Fix rotation issues

VikParuchuri · May 6, 2024 · 6198478 · 6198478
1 parent 9086dd5
commit 6198478
Show file tree

Hide file tree

Showing 8 changed files with 18 additions and 19 deletions.
diff --git a/marker/models.py b/marker/models.py
@@ -50,7 +50,7 @@ def load_all_models(langs=None):
     layout = setup_layout_model()
     order = setup_order_model()
     edit = load_editing_model()
-    ocr = setup_recognition_model(langs) if settings.OCR_ENGINE_INTERNAL == "surya" else None
+    ocr = setup_recognition_model(langs) if settings.OCR_ENGINE == "surya" else None
     texify = setup_texify_model()
     model_lst = [texify, layout, order, edit, detection, ocr]
     return model_lst
diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py
@@ -63,7 +63,7 @@ def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3)
         total_intersection = 0
         for block in page.blocks:
             for line in block.lines:
-                intersection_pct = box_intersection_pct(line.bbox, detected_bbox)
+                intersection_pct = box_intersection_pct(detected_bbox, line.bbox)
                 total_intersection += intersection_pct
         if total_intersection > intersect_thresh:
             found_lines += 1

diff --git a/marker/ocr/lang.py b/marker/ocr/lang.py
@@ -5,7 +5,7 @@
 
 
 def replace_langs_with_codes(langs):
-    if settings.OCR_ENGINE_INTERNAL == "surya":
+    if settings.OCR_ENGINE == "surya":
         for i, lang in enumerate(langs):
             if lang in LANGUAGE_TO_CODE:
                 langs[i] = LANGUAGE_TO_CODE[lang]
@@ -17,7 +17,7 @@ def replace_langs_with_codes(langs):
 
 
 def validate_langs(langs):
-    if settings.OCR_ENGINE_INTERNAL == "surya":
+    if settings.OCR_ENGINE == "surya":
         for lang in langs:
             if lang not in CODE_TO_LANGUAGE:
                 raise ValueError(f"Invalid language code {lang} for Surya OCR")

diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
@@ -28,7 +28,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor
             ocr_idxs.append(pnum)
             ocr_pages += 1
 
-    ocr_method = settings.OCR_ENGINE_INTERNAL
+    ocr_method = settings.OCR_ENGINE
     if ocr_method == "surya":
         new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages)
     else:

diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py
@@ -57,12 +57,18 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
     page_bbox = page["bbox"]
     page_width = abs(page_bbox[2] - page_bbox[0])
     page_height = abs(page_bbox[3] - page_bbox[1])
+    rotation = page["rotation"]
+
+    # Swap width and height when the page is rotated by 90 or 270 degrees
+    if rotation == 90 or rotation == 270:
+        page_width, page_height = page_height, page_width
+
     page_bbox = [0, 0, page_width, page_height]
     out_page = Page(
         blocks=page_blocks,
         pnum=page["page"],
         bbox=page_bbox,
-        rotation=page["rotation"],
+        rotation=rotation,
         char_blocks=page["blocks"]
     )
     return out_page

diff --git a/marker/settings.py b/marker/settings.py
@@ -44,7 +44,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
 
     # OCR
     INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
-    OCR_ENGINE: Optional[str] = None # Which OCR engine to use, either "surya" or "ocrmypdf".  Defaults to "ocrmypdf" on CPU, "surya" on GPU.
+    OCR_ENGINE: Optional[str] = "surya" # Which OCR engine to use, either "surya" or "ocrmypdf".  Defaults to "surya".
     OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
 
     ## Surya
@@ -56,13 +56,6 @@ def TORCH_DEVICE_MODEL(self) -> str:
     TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
     TESSDATA_PREFIX: str = ""
 
-    @computed_field
-    def OCR_ENGINE_INTERNAL(self) -> str:
-        if self.OCR_ENGINE is not None:
-            return self.OCR_ENGINE
-
-        return "surya"
-
     # Texify model
     TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
     TEXIFY_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for texify

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,7 +39,7 @@ texify = "^0.1.8"
 rapidfuzz = "^3.8.1"
 surya-ocr = "^0.4.0"
 filetype = "^1.2.0"
-pdftext = "^0.3.4"
+pdftext = "^0.3.6"
 regex = "^2024.4.28"
 
 [tool.poetry.group.dev.dependencies]