diff --git a/marker/models.py b/marker/models.py index 72cb2b62..a77fb7e1 100644 --- a/marker/models.py +++ b/marker/models.py @@ -50,7 +50,7 @@ def load_all_models(langs=None): layout = setup_layout_model() order = setup_order_model() edit = load_editing_model() - ocr = setup_recognition_model(langs) if settings.OCR_ENGINE_INTERNAL == "surya" else None + ocr = setup_recognition_model(langs) if settings.OCR_ENGINE == "surya" else None texify = setup_texify_model() model_lst = [texify, layout, order, edit, detection, ocr] return model_lst diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py index 2fdb9d8e..d7bca5bb 100644 --- a/marker/ocr/heuristics.py +++ b/marker/ocr/heuristics.py @@ -63,7 +63,7 @@ def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3) total_intersection = 0 for block in page.blocks: for line in block.lines: - intersection_pct = box_intersection_pct(line.bbox, detected_bbox) + intersection_pct = box_intersection_pct(detected_bbox, line.bbox) total_intersection += intersection_pct if total_intersection > intersect_thresh: found_lines += 1 diff --git a/marker/ocr/lang.py b/marker/ocr/lang.py index 8240b057..82d6cc0e 100644 --- a/marker/ocr/lang.py +++ b/marker/ocr/lang.py @@ -5,7 +5,7 @@ def replace_langs_with_codes(langs): - if settings.OCR_ENGINE_INTERNAL == "surya": + if settings.OCR_ENGINE == "surya": for i, lang in enumerate(langs): if lang in LANGUAGE_TO_CODE: langs[i] = LANGUAGE_TO_CODE[lang] @@ -17,7 +17,7 @@ def replace_langs_with_codes(langs): def validate_langs(langs): - if settings.OCR_ENGINE_INTERNAL == "surya": + if settings.OCR_ENGINE == "surya": for lang in langs: if lang not in CODE_TO_LANGUAGE: raise ValueError(f"Invalid language code {lang} for Surya OCR") diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py index 6da62d8d..24dfbbc7 100644 --- a/marker/ocr/recognition.py +++ b/marker/ocr/recognition.py @@ -28,7 +28,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], 
rec_model, parallel_factor ocr_idxs.append(pnum) ocr_pages += 1 - ocr_method = settings.OCR_ENGINE_INTERNAL + ocr_method = settings.OCR_ENGINE if ocr_method == "surya": new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages) else: diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py index bf10e906..4d5aa317 100644 --- a/marker/pdf/extract_text.py +++ b/marker/pdf/extract_text.py @@ -57,12 +57,18 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page: page_bbox = page["bbox"] page_width = abs(page_bbox[2] - page_bbox[0]) page_height = abs(page_bbox[3] - page_bbox[1]) + rotation = page["rotation"] + + # Flip width and height if rotated + if rotation == 90 or rotation == 270: + page_width, page_height = page_height, page_width + page_bbox = [0, 0, page_width, page_height] out_page = Page( blocks=page_blocks, pnum=page["page"], bbox=page_bbox, - rotation=page["rotation"], + rotation=rotation, char_blocks=page["blocks"] ) return out_page diff --git a/marker/settings.py b/marker/settings.py index 9bdc1490..5566e5fb 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -44,7 +44,7 @@ def TORCH_DEVICE_MODEL(self) -> str: # OCR INVALID_CHARS: List[str] = [chr(0xfffd), "�"] - OCR_ENGINE: Optional[str] = None # Which OCR engine to use, either "surya" or "ocrmypdf". Defaults to "ocrmypdf" on CPU, "surya" on GPU. + OCR_ENGINE: Optional[str] = "surya" # Which OCR engine to use, either "surya" or "ocrmypdf". Defaults to "surya".
OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted ## Surya @@ -56,13 +56,6 @@ def TORCH_DEVICE_MODEL(self) -> str: TESSERACT_TIMEOUT: int = 20 # When to give up on OCR TESSDATA_PREFIX: str = "" - @computed_field - def OCR_ENGINE_INTERNAL(self) -> str: - if self.OCR_ENGINE is not None: - return self.OCR_ENGINE - - return "surya" - # Texify model TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify TEXIFY_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for texify diff --git a/poetry.lock b/poetry.lock index 925699e8..50eb44e0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2525,13 +2525,13 @@ image = ["Pillow"] [[package]] name = "pdftext" -version = "0.3.5" +version = "0.3.6" description = "Extract structured text from pdfs quickly" optional = false python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9" files = [ - {file = "pdftext-0.3.5-py3-none-any.whl", hash = "sha256:2a1649b1f2b8ea563fd4f2a3a7227afb0693622b5e3820bca390817d92f228c7"}, - {file = "pdftext-0.3.5.tar.gz", hash = "sha256:bd2c4c918889894488b18fa6395eff77138dcb8762fc3c44f08a402597618d41"}, + {file = "pdftext-0.3.6-py3-none-any.whl", hash = "sha256:82c6b0c1e3e1116446c9a5e31f1e15b078cf9195e1cff608e24f9fd5826a88df"}, + {file = "pdftext-0.3.6.tar.gz", hash = "sha256:91be26c76c2a496054d64875edf17349dbf5c17c40bb47f844dc0d9b95d4b7e2"}, ] [package.dependencies] @@ -4990,4 +4990,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13,!=3.9.7" -content-hash = "459483572dd8347587db50c0e627b839b6b061af2af022ab8d893c70905b04cb" +content-hash = "8759c2dc6b9d345ae966f2fe10bb8ee9a2bb93c2d6a07ec2a7d2ec4d57bd3b2c" diff --git a/pyproject.toml b/pyproject.toml index ff0c2ff4..219de389 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ texify = "^0.1.8" rapidfuzz = "^3.8.1" surya-ocr = "^0.4.0" filetype = 
"^1.2.0" -pdftext = "^0.3.4" +pdftext = "^0.3.6" regex = "^2024.4.28" [tool.poetry.group.dev.dependencies]