Skip to content

Commit

Permalink
Remove pymupdf
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Apr 29, 2024
1 parent fd261fb commit 51990c8
Show file tree
Hide file tree
Showing 18 changed files with 1,832 additions and 1,688 deletions.
22 changes: 1 addition & 21 deletions marker/bbox.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import fitz as pymupdf

def should_merge_blocks(box1, box2, tol=5):
# Within tol y px, and to the right within tol px
merge = [
Expand Down Expand Up @@ -60,22 +58,4 @@ def unnormalize_box(bbox, width, height):
height * (bbox[1] / 1000),
width * (bbox[2] / 1000),
height * (bbox[3] / 1000),
]


def correct_rotation(bbox, page):
    """Re-express a bounding box for a page that carries a rotation.

    Args:
        bbox: Box given as (x0, y0, x1, y1).
        page: Page object exposing ``rotation`` and ``rotation_matrix``
            (presumably a PyMuPDF page -- confirm against the caller).

    Returns:
        The input ``bbox`` object itself when the rotation is 0 or is not one
        of 90/180/270; otherwise a new 4-item list built from the two corner
        points after multiplying them by ``page.rotation_matrix``.
    """
    angle = page.rotation
    if angle == 0:
        # Nothing to correct on an unrotated page
        return bbox

    matrix = page.rotation_matrix
    top_left = pymupdf.Point(bbox[0], bbox[1]) * matrix
    bottom_right = pymupdf.Point(bbox[2], bbox[3]) * matrix

    # The transformed corners swap roles depending on the rotation, so the
    # coordinates are picked per angle to rebuild the box
    reordered = {
        90: [bottom_right[0], top_left[1], top_left[0], bottom_right[1]],
        180: [bottom_right[0], bottom_right[1], top_left[0], top_left[1]],
        270: [top_left[0], bottom_right[1], bottom_right[0], top_left[1]],
    }
    return reordered.get(angle, bbox)
]
8 changes: 4 additions & 4 deletions marker/cleaners/code.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from marker.schema import Span, Line, Page
import re
from typing import List
import fitz as pymupdf


def is_code_linelen(lines, thresh=60):
Expand Down Expand Up @@ -102,13 +101,13 @@ def indent_blocks(blocks: List[Page]):
if col_width == 0 and len(span.text) > 0:
col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
text += span.text
lines.append((pymupdf.Rect(line.bbox), text))
lines.append((line.bbox, text))

block_text = ""
blank_line = False
for line in lines:
text = line[1]
prefix = " " * int((line[0].x0 - min_left) / col_width)
prefix = " " * int((line[0][0] - min_left) / col_width)
current_line_blank = len(text.strip()) == 0
if blank_line and current_line_blank:
# Don't put multiple blank lines in a row
Expand All @@ -120,9 +119,10 @@ def indent_blocks(blocks: List[Page]):
new_span = Span(
text=block_text,
bbox=block.bbox,
color=block.lines[0].spans[0].color,
span_id=f"{span_counter}_fix_code",
font=block.lines[0].spans[0].font,
font_weight=block.lines[0].spans[0].font_weight,
font_size=block.lines[0].spans[0].font_size,
block_type="Code"
)
span_counter += 1
Expand Down
8 changes: 4 additions & 4 deletions marker/cleaners/equations.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from marker.bbox import should_merge_blocks, merge_boxes
from marker.debug.data import dump_equation_debug_data
from marker.pdf.images import render_image
from marker.settings import settings
from marker.schema import Page, Span, Line, Block, BlockType
import os
Expand Down Expand Up @@ -51,9 +52,7 @@ def mask_bbox(png_image, bbox, selected_bboxes):


def get_masked_image(page, bbox, selected_bboxes):
pix = page.get_pixmap(dpi=settings.TEXIFY_DPI, clip=bbox)
png = pix.pil_tobytes(format="PNG")
png_image = Image.open(io.BytesIO(png))
png_image = render_image(page, settings.TEXIFY_DPI)
png_image = mask_bbox(png_image, bbox, selected_bboxes)
png_image = png_image.convert("RGB")
return png_image
Expand Down Expand Up @@ -212,7 +211,8 @@ def replace_blocks_with_latex(page_blocks: Page, merged_boxes, reformat_regions,
bbox=merged_boxes[current_region],
span_id=f"{pnum}_{idx}_fixeq",
font="Latex",
color=0,
font_weight=0,
font_size=0,
block_type="Formula"
)
],
Expand Down
2 changes: 1 addition & 1 deletion marker/cleaners/headers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
from collections import Counter, defaultdict
from itertools import chain
from thefuzz import fuzz
from rapidfuzz import fuzz

from sklearn.cluster import DBSCAN
import numpy as np
Expand Down
4 changes: 2 additions & 2 deletions marker/cleaners/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from tabulate import tabulate
from typing import List
import re
import textwrap


def merge_table_blocks(blocks: List[Page]):
Expand Down Expand Up @@ -84,7 +83,8 @@ def create_new_tables(blocks: List[Page]):
bbox=block.bbox,
span_id=f"{table_idx}_fix_table",
font="Table",
color=0,
font_size=0,
font_weight=0,
block_type="Table",
text=new_text
)
Expand Down
24 changes: 11 additions & 13 deletions marker/convert.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import fitz as pymupdf
import pypdfium2 as pdfium

from marker.cleaners.table import merge_table_blocks, create_new_tables
from marker.debug.data import dump_bbox_debug_data
Expand All @@ -25,10 +25,10 @@ def find_filetype(fpath):
# The mimetype is not always consistent, so use in to check the most common formats
if "pdf" in mimetype:
return "pdf"
elif "epub" in mimetype:
return "epub"
elif "mobi" in mimetype:
return "mobi"
#elif "epub" in mimetype:
# return "epub"
#elif "mobi" in mimetype:
# return "mobi"
elif mimetype in settings.SUPPORTED_FILETYPES:
return settings.SUPPORTED_FILETYPES[mimetype]
else:
Expand All @@ -47,10 +47,12 @@ def get_length_of_text(fname: str) -> int:
if filetype == "other":
return 0

doc = pymupdf.open(fname, filetype=filetype)
doc = pdfium.PdfDocument(fname)
full_text = ""
for page in doc:
full_text += page.get_text("text", sort=True, flags=settings.TEXT_FLAGS)
for page_idx in range(len(doc)):
page = doc.get_page(page_idx)
text_page = page.get_textpage()
full_text += text_page.get_text_bounded()

return len(full_text)

Expand Down Expand Up @@ -81,11 +83,7 @@ def convert_single_pdf(

out_meta["filetype"] = filetype

doc = pymupdf.open(fname, filetype=filetype)
if filetype != "pdf":
conv = doc.convert_to_pdf()
doc = pymupdf.open("pdf", conv)

doc = pdfium.PdfDocument(fname)
blocks, toc, ocr_stats = get_text_blocks(
doc,
tess_lang,
Expand Down
5 changes: 2 additions & 3 deletions marker/debug/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import zlib
from typing import List

from marker.pdf.images import render_image
from marker.schema import Page
from marker.settings import settings
from PIL import Image
Expand Down Expand Up @@ -54,9 +55,7 @@ def dump_bbox_debug_data(doc, blocks: List[Page]):
for idx, page_blocks in enumerate(blocks):
page = doc[idx]

pix = page.get_pixmap(dpi=settings.TEXIFY_DPI, annots=False, clip=page_blocks.bbox)
png = pix.pil_tobytes(format="PNG")
png_image = Image.open(io.BytesIO(png))
png_image = render_image(page, dpi=settings.TEXIFY_DPI)
width, height = png_image.size
max_dimension = 6000
if width > max_dimension or height > max_dimension:
Expand Down
126 changes: 51 additions & 75 deletions marker/extract_text.py
Original file line number Diff line number Diff line change
@@ -1,111 +1,81 @@
import os
from typing import Tuple, List, Optional
from typing import List, Optional

from spellchecker import SpellChecker
import pypdfium2.internal as pdfium_i

from marker.bbox import correct_rotation
from marker.ocr.page import ocr_entire_page
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
from marker.settings import settings
from marker.schema import Span, Line, Block, Page
from concurrent.futures import ThreadPoolExecutor
from pdftext.extraction import dictionary_output

os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX


def sort_rotated_text(page_blocks, tolerance=1.25):
    """Reorder blocks by banded ``bbox[1]`` first, then by ``bbox[0]``.

    Args:
        page_blocks: Iterable of objects exposing a ``bbox`` sequence.
        tolerance: Band size; each block's ``bbox[1]`` is snapped to the
            nearest multiple of this value to decide which band it joins.

    Returns:
        A new list with bands in ascending key order and, inside each band,
        blocks in ascending ``bbox[0]`` order.
    """
    bands = {}
    for blk in page_blocks:
        # Blocks whose bbox[1] rounds to the same multiple share a band
        band_key = round(blk.bbox[1] / tolerance) * tolerance
        bands.setdefault(band_key, []).append(blk)

    # Walk the bands in key order, sorting each one horizontally
    ordered = []
    for band_key in sorted(bands):
        ordered.extend(sorted(bands[band_key], key=lambda b: b.bbox[0]))
    return ordered


def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> Tuple[List[Block], int]:
page = doc[pnum]
rotation = page.rotation

if ocr:
blocks = ocr_entire_page(page, tess_lang, spellchecker)
else:
blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS)["blocks"]

def pdftext_format_to_blocks(page, pnum: int) -> List[Block]:
page_blocks = []
span_id = 0
for block_idx, block in enumerate(blocks):
for block_idx, block in enumerate(page["blocks"]):
block_lines = []
for l in block["lines"]:
spans = []
for i, s in enumerate(l["spans"]):
block_text = s["text"]
bbox = s["bbox"]
span_obj = Span(
text=block_text,
bbox=correct_rotation(bbox, page),
bbox=s["bbox"],
span_id=f"{pnum}_{span_id}",
font=f"{s['font']}_{font_flags_decomposer(s['flags'])}", # Add font flags to end of font
color=s["color"],
ascender=s["ascender"],
descender=s["descender"],
font=f"{s['font']['name']}_{font_flags_decomposer(s['font']['flags'])}", # Add font flags to end of font
font_weight=s["font"]["weight"],
font_size=s["font"]["size"],
)
spans.append(span_obj) # Text, bounding box, span id
span_id += 1
line_obj = Line(
spans=spans,
bbox=correct_rotation(l["bbox"], page),
bbox=l["bbox"],
)
# Only select valid lines, with positive bboxes
if line_obj.area > 0:
if line_obj.area >= 0:
block_lines.append(line_obj)
block_obj = Block(
lines=block_lines,
bbox=correct_rotation(block["bbox"], page),
bbox=block["bbox"],
pnum=pnum
)
# Only select blocks with multiple lines
# Only select blocks with lines
if len(block_lines) > 0:
page_blocks.append(block_obj)
out_page = Page(
blocks=page_blocks,
pnum=page["page"],
bbox=page["bbox"],
rotation=page["rotation"],
)
return out_page

# If the page was rotated, sort the text again
if rotation > 0:
page_blocks = sort_rotated_text(page_blocks)
return page_blocks


def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no_text: bool, disable_ocr: bool = False, min_ocr_page: int = 2):
def ocr_page(doc, pnum, page: Page, tess_lang: str):
ocr_pages = 0
ocr_success = 0
ocr_failed = 0
spellchecker = None
page_bbox = doc[pnum].bound()
if spell_lang:
spellchecker = SpellChecker(language=spell_lang)

blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker)
blocks = get_single_page_blocks(doc, pnum, tess_lang)
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)

# OCR page if we got minimal text, or if we got too many spaces
conditions = [
(
no_text # Full doc has no text, and needs full OCR
or
(len(page_obj.prelim_text) > 0 and detect_bad_ocr(page_obj.prelim_text, spellchecker)) # Bad OCR
(len(page_obj.prelim_text) > 0 and detect_bad_ocr(page_obj.prelim_text)) # Bad OCR
),
min_ocr_page < pnum < len(doc) - 1,
not disable_ocr
]
if all(conditions) or settings.OCR_ALL_PAGES:
page = doc[pnum]
blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True)
blocks = get_single_page_blocks(doc, pnum, tess_lang, ocr=True)
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
ocr_pages = 1
if len(blocks) == 0:
Expand All @@ -116,37 +86,43 @@ def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no


def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: Optional[int] = None, parallel: int = settings.OCR_PARALLEL_WORKERS):
all_blocks = []
toc = doc.get_toc()
toc = get_toc(doc)
ocr_pages = 0
ocr_failed = 0
ocr_success = 0
# This is a thread because most of the work happens in a separate process (tesseract)
range_end = len(doc)
no_text = len(naive_get_text(doc).strip()) == 0

page_range = range(len(doc))
if max_pages:
range_end = min(max_pages, len(doc))
with ThreadPoolExecutor(max_workers=parallel) as pool:
args_list = [(doc, pnum, tess_lang, spell_lang, no_text) for pnum in range(range_end)]
if parallel == 1:
func = map
else:
func = pool.map
results = func(lambda a: convert_single_page(*a), args_list)
page_range = range(range_end)

for result in results:
page_obj, ocr_stats = result
all_blocks.append(page_obj)
ocr_pages += ocr_stats["ocr_pages"]
ocr_failed += ocr_stats["ocr_failed"]
ocr_success += ocr_stats["ocr_success"]
all_blocks = dictionary_output(doc, page_range=page_range)
all_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(all_blocks)]

return all_blocks, toc, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}


def naive_get_text(doc):
full_text = ""
for page in doc:
full_text += page.get_text("text", sort=True, flags=settings.TEXT_FLAGS)
full_text += "\n"
for page_idx in range(len(doc)):
page = doc.get_page(page_idx)
text_page = page.get_textpage()
full_text += text_page.get_text_bounded() + "\n"
return full_text


def get_toc(doc, max_depth=15):
    """Collect the document's table of contents as a list of plain dicts.

    Args:
        doc: Document object exposing ``get_toc(max_depth=...)``
            (presumably a pypdfium2 ``PdfDocument`` -- confirm against the caller).
        max_depth: Forwarded unchanged to ``doc.get_toc``.

    Returns:
        One dict per outline item, in iteration order, with the keys
        ``title``, ``level``, ``is_closed``, ``n_kids``, ``page_index``,
        ``view_mode`` and ``view_pos``. ``view_mode`` is looked up in
        ``pdfium_i.ViewmodeToStr`` and is ``None`` when the lookup misses.
    """
    return [
        {
            "title": entry.title,
            "level": entry.level,
            "is_closed": entry.is_closed,
            "n_kids": entry.n_kids,
            "page_index": entry.page_index,
            "view_mode": pdfium_i.ViewmodeToStr.get(entry.view_mode),
            "view_pos": entry.view_pos,
        }
        for entry in doc.get_toc(max_depth=max_depth)
    ]
Loading

0 comments on commit 51990c8

Please sign in to comment.