Skip to content

Commit

Permalink
Fix rotation issues
Browse files (browse the repository at this point in the history)
  • Loading branch information
VikParuchuri committed May 6, 2024
1 parent 9086dd5 commit 6198478
Show file tree
Hide file tree
Showing 8 changed files with 18 additions and 19 deletions.
2 changes: 1 addition & 1 deletion marker/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def load_all_models(langs=None):
layout = setup_layout_model()
order = setup_order_model()
edit = load_editing_model()
ocr = setup_recognition_model(langs) if settings.OCR_ENGINE_INTERNAL == "surya" else None
ocr = setup_recognition_model(langs) if settings.OCR_ENGINE == "surya" else None
texify = setup_texify_model()
model_lst = [texify, layout, order, edit, detection, ocr]
return model_lst
2 changes: 1 addition & 1 deletion marker/ocr/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3)
total_intersection = 0
for block in page.blocks:
for line in block.lines:
intersection_pct = box_intersection_pct(line.bbox, detected_bbox)
intersection_pct = box_intersection_pct(detected_bbox, line.bbox)
total_intersection += intersection_pct
if total_intersection > intersect_thresh:
found_lines += 1
Expand Down
4 changes: 2 additions & 2 deletions marker/ocr/lang.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


def replace_langs_with_codes(langs):
if settings.OCR_ENGINE_INTERNAL == "surya":
if settings.OCR_ENGINE == "surya":
for i, lang in enumerate(langs):
if lang in LANGUAGE_TO_CODE:
langs[i] = LANGUAGE_TO_CODE[lang]
Expand All @@ -17,7 +17,7 @@ def replace_langs_with_codes(langs):


def validate_langs(langs):
if settings.OCR_ENGINE_INTERNAL == "surya":
if settings.OCR_ENGINE == "surya":
for lang in langs:
if lang not in CODE_TO_LANGUAGE:
raise ValueError(f"Invalid language code {lang} for Surya OCR")
Expand Down
2 changes: 1 addition & 1 deletion marker/ocr/recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, parallel_factor
ocr_idxs.append(pnum)
ocr_pages += 1

ocr_method = settings.OCR_ENGINE_INTERNAL
ocr_method = settings.OCR_ENGINE
if ocr_method == "surya":
new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages)
else:
Expand Down
8 changes: 7 additions & 1 deletion marker/pdf/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,18 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
page_bbox = page["bbox"]
page_width = abs(page_bbox[2] - page_bbox[0])
page_height = abs(page_bbox[3] - page_bbox[1])
rotation = page["rotation"]

# Flip width and height if rotated
if rotation == 90 or rotation == 270:
page_width, page_height = page_height, page_width

page_bbox = [0, 0, page_width, page_height]
out_page = Page(
blocks=page_blocks,
pnum=page["page"],
bbox=page_bbox,
rotation=page["rotation"],
rotation=rotation,
char_blocks=page["blocks"]
)
return out_page
Expand Down
9 changes: 1 addition & 8 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def TORCH_DEVICE_MODEL(self) -> str:

# OCR
INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
OCR_ENGINE: Optional[str] = None # Which OCR engine to use, either "surya" or "ocrmypdf". Defaults to "ocrmypdf" on CPU, "surya" on GPU.
OCR_ENGINE: Optional[str] = "surya" # Which OCR engine to use, either "surya" or "ocrmypdf". Defaults to "surya".
OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted

## Surya
Expand All @@ -56,13 +56,6 @@ def TORCH_DEVICE_MODEL(self) -> str:
TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
TESSDATA_PREFIX: str = ""

@computed_field
def OCR_ENGINE_INTERNAL(self) -> str:
if self.OCR_ENGINE is not None:
return self.OCR_ENGINE

return "surya"

# Texify model
TEXIFY_MODEL_MAX: int = 384 # Max inference length for texify
TEXIFY_TOKEN_BUFFER: int = 256 # Number of tokens to buffer above max for texify
Expand Down
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ texify = "^0.1.8"
rapidfuzz = "^3.8.1"
surya-ocr = "^0.4.0"
filetype = "^1.2.0"
pdftext = "^0.3.4"
pdftext = "^0.3.6"
regex = "^2024.4.28"

[tool.poetry.group.dev.dependencies]
Expand Down

0 comments on commit 6198478

Please sign in to comment.