From 844833f43055cca01865fbd892c8d55bb1afd632 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 7 Dec 2023 11:13:49 -0800 Subject: [PATCH] Work with rotation --- README.md | 4 ++++ marker/bbox.py | 22 +++++++++++++++++++++- marker/debug/data.py | 7 +++++-- marker/extract_text.py | 33 +++++++++++++++++++++++++++++---- marker/schema.py | 4 ++-- marker/settings.py | 18 ++++++++++++++++-- 6 files changed, 77 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 72cf3d56..cbb91905 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,10 @@ The above results are with marker and nougat setup so they each take ~3GB of VRA See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instructions on how to run your own benchmarks. +# Community + +[Discord](https://discord.gg/KuZwXNGnfH) is where we discuss future development. + # Limitations PDF is a tricky format, so marker will not always work perfectly. Here are some known limitations that are on the roadmap to address: diff --git a/marker/bbox.py b/marker/bbox.py index a8437b3f..fc85dab2 100644 --- a/marker/bbox.py +++ b/marker/bbox.py @@ -1,3 +1,5 @@ +import fitz as pymupdf + def should_merge_blocks(box1, box2, tol=5): # Within tol y px, and to the right within tol px merge = [ @@ -58,4 +60,22 @@ def unnormalize_box(bbox, width, height): height * (bbox[1] / 1000), width * (bbox[2] / 1000), height * (bbox[3] / 1000), - ] \ No newline at end of file + ] + + +def correct_rotation(bbox, page): + #bbox base is (x0, y0, x1, y1) + rotation = page.rotation + if rotation == 0: + return bbox + + tl = pymupdf.Point(bbox[0], bbox[1]) * page.rotation_matrix + br = pymupdf.Point(bbox[2], bbox[3]) * page.rotation_matrix + if rotation == 90: + bbox = [br[0], tl[1], tl[0], br[1]] + elif rotation == 180: + bbox = [br[0], br[1], tl[0], tl[1]] + elif rotation == 270: + bbox = [tl[0], br[1], br[0], tl[1]] + + return bbox \ No newline at end of file diff --git a/marker/debug/data.py b/marker/debug/data.py 
index 9abbd20b..8d6ac537 100644 --- a/marker/debug/data.py +++ b/marker/debug/data.py @@ -14,6 +14,9 @@ def dump_nougat_debug_data(doc, images, converted_spans): if not settings.DEBUG_DATA_FOLDER: return + if len(images) == 0: + return + # We attempted one conversion per image assert len(converted_spans) == len(images) @@ -37,7 +40,7 @@ def dump_nougat_debug_data(doc, images, converted_spans): debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json") with open(debug_file, "w+") as f: - json.dump(data_lines, f, indent=4) + json.dump(data_lines, f) def dump_bbox_debug_data(doc, blocks: List[Page]): @@ -70,7 +73,7 @@ def dump_bbox_debug_data(doc, blocks: List[Page]): debug_data.append(page_data) with open(debug_file, "w+") as f: - json.dump(debug_data, f, indent=4) + json.dump(debug_data, f) diff --git a/marker/extract_text.py b/marker/extract_text.py index 81de2969..57d115b0 100644 --- a/marker/extract_text.py +++ b/marker/extract_text.py @@ -3,6 +3,7 @@ from spellchecker import SpellChecker +from marker.bbox import correct_rotation from marker.ocr.page import ocr_entire_page from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer from marker.settings import settings @@ -12,8 +13,27 @@ os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX +def sort_rotated_text(page_blocks, tolerance=1.25): + vertical_groups = {} + for block in page_blocks: + group_key = round(block.bbox[1] / tolerance) * tolerance + if group_key not in vertical_groups: + vertical_groups[group_key] = [] + vertical_groups[group_key].append(block) + + # Sort each group horizontally and flatten the groups into a single list + sorted_page_blocks = [] + for _, group in sorted(vertical_groups.items()): + sorted_group = sorted(group, key=lambda x: x.bbox[0]) + sorted_page_blocks.extend(sorted_group) + + return sorted_page_blocks + + def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], 
int]: page = doc[pnum] + rotation = page.rotation + if ocr: blocks = ocr_entire_page(page, tess_lang, spellchecker) else: @@ -30,7 +50,7 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona bbox = s["bbox"] span_obj = Span( text=block_text, - bbox=bbox, + bbox=correct_rotation(bbox, page), span_id=f"{pnum}_{span_id}", font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font color=s["color"], @@ -41,19 +61,23 @@ def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optiona span_id += 1 line_obj = Line( spans=spans, - bbox=l["bbox"] + bbox=correct_rotation(l["bbox"], page), ) # Only select valid lines, with positive bboxes if line_obj.area > 0: block_lines.append(line_obj) block_obj = Block( lines=block_lines, - bbox=block["bbox"], + bbox=correct_rotation(block["bbox"], page), pnum=pnum ) # Only select blocks with multiple lines if len(block_lines) > 0: page_blocks.append(block_obj) + + # If the page was rotated, sort the text again + if rotation > 0: + page_blocks = sort_rotated_text(page_blocks) return page_blocks @@ -80,8 +104,9 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no not disable_ocr ] if all(conditions) or settings.OCR_ALL_PAGES: + page = doc[pnum] blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True) - page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox) + page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation) ocr_pages = 1 if len(blocks) == 0: ocr_failed = 1 diff --git a/marker/schema.py b/marker/schema.py index 87da416f..f08153a9 100644 --- a/marker/schema.py +++ b/marker/schema.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import List, Optional +from typing import List, Optional, Tuple from pydantic import BaseModel, field_validator import ftfy @@ -20,7 +20,6 @@ def find_span_type(span, page_blocks): class BboxElement(BaseModel): bbox: List[float] - 
@field_validator('bbox') @classmethod def check_4_elements(cls, v: List[float]) -> List[float]: @@ -134,6 +133,7 @@ class Page(BboxElement): blocks: List[Block] pnum: int column_count: Optional[int] = None + rotation: Optional[int] = None # Rotation degrees of the page def get_nonblank_lines(self): lines = self.get_all_lines() diff --git a/marker/settings.py b/marker/settings.py index 8701f5d9..1af597f8 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -54,8 +54,22 @@ class Settings(BaseSettings): # Nougat model NOUGAT_MODEL_MAX: int = 512 # Max inference length for nougat NOUGAT_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for nougat - NOUGAT_HALLUCINATION_WORDS: List[str] = ["[MISSING_PAGE_POST]", "## References\n", "**Figure Captions**\n", "Footnote", - "\par\par\par", "## Chapter", "Fig.", "particle", "[REPEATS]", "[TRUNCATED]", "### ", "effective field strength", "\Phi_{\rm eff}"] + NOUGAT_HALLUCINATION_WORDS: List[str] = [ + "[MISSING_PAGE_POST]", + "## References\n", + "**Figure Captions**\n", + "Footnote", + "\\par\\par\\par", + "## Chapter", + "Fig.", + "particle", + "[REPEATS]", + "[TRUNCATED]", + "### ", + "effective field strength", + "\\Phi_{\\rm eff}", # backslashes escaped so "\rm" is not read as a carriage return + "m" + "\\mathbf{\\mathbf" + ] NOUGAT_DPI: int = 96 # DPI to render images at, matches default settings for nougat NOUGAT_MODEL_NAME: str = "0.1.0-small" # Name of the model to use NOUGAT_BATCH_SIZE: int = 6 if TORCH_DEVICE == "cuda" else 1 # Batch size for nougat, don't batch on cpu