Skip to content

Commit

Permalink
Work on tables
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 7, 2024
1 parent 6198478 commit 77a99f3
Show file tree
Hide file tree
Showing 15 changed files with 295 additions and 143 deletions.
5 changes: 2 additions & 3 deletions marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pypdfium2 as pdfium

from marker.cleaners.table import arrange_table_rows
from marker.tables.table import format_tables
from marker.debug.data import dump_bbox_debug_data
from marker.layout.layout import surya_layout, annotate_block_types
from marker.layout.order import surya_order, sort_blocks_in_reading_order
Expand All @@ -25,7 +25,6 @@
from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text

from typing import List, Dict, Tuple, Optional
import re
from marker.settings import settings


Expand Down Expand Up @@ -107,7 +106,7 @@ def convert_single_pdf(
indent_blocks(pages)

# Fix table blocks
table_count = arrange_table_rows(pages)
table_count = format_tables(pages)
out_meta["block_stats"]["table"] = table_count

for page in pages:
Expand Down
28 changes: 18 additions & 10 deletions marker/equations/equations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
from typing import List

from marker.debug.data import dump_equation_debug_data
from marker.equations.images import get_equation_image
from marker.equations.inference import get_total_texify_tokens, get_latex_batched
from marker.pdf.images import render_bbox_image
from marker.schema.bbox import rescale_bbox
from marker.schema.page import Page
from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines
from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines, find_insert_block
from marker.settings import settings


Expand All @@ -30,21 +30,29 @@ def find_equation_blocks(page, processor):
if region_idx not in insert_points:
insert_points[region_idx] = (block_idx, line_idx)

# Account for regions where the lines were not detected
for region_idx, region in enumerate(equation_regions):
if region_idx in insert_points:
continue

insert_points[region_idx] = (find_insert_block(page.blocks, region), 0)

block_lines_to_remove = defaultdict(set)
for region_idx, equation_region in enumerate(equation_regions):
if region_idx not in equation_lines or len(equation_lines[region_idx]) == 0:
continue
equation_block = equation_lines[region_idx]
equation_insert = insert_points[region_idx]
block_text = " ".join([line.prelim_text for line in equation_block])
equation_bbox = bbox_from_lines(equation_block)
block_text = ""
total_tokens = 0
else:
equation_block = equation_lines[region_idx]
block_text = " ".join([line.prelim_text for line in equation_block])
total_tokens = get_total_texify_tokens(block_text, processor)

total_tokens = get_total_texify_tokens(block_text, processor)
equation_insert = insert_points[region_idx]
equation_insert_line_idx = equation_insert[1]
equation_insert_line_idx -= len(
[x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]])

selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_bbox]
selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_region]
if total_tokens < settings.TEXIFY_MODEL_MAX:
# Account for the lines we're about to remove
for item in lines_to_remove[region_idx]:
Expand Down Expand Up @@ -144,7 +152,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings.
for page_idx, page_equation_blocks in enumerate(equation_blocks):
page_obj = doc[page_idx]
for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks):
png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox)
png_image = render_bbox_image(page_obj, pages[page_idx], equation_bbox)

images.append(png_image)
token_counts.append(token_count)
Expand Down
19 changes: 0 additions & 19 deletions marker/equations/images.py

This file was deleted.

53 changes: 53 additions & 0 deletions marker/images/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from marker.pdf.images import render_bbox_image
from marker.schema.bbox import rescale_bbox
from marker.schema.block import find_insert_block, Span


def find_image_blocks(page):
    """Find figure/picture regions on a page and pick an insertion point for each.

    Layout boxes labelled "Figure" or "Picture" are rescaled from the layout
    image's coordinate space into the page's coordinate space. Every text line
    that overlaps a region by more than 80% has its spans cleared, and the
    first such line becomes the region's insertion point. Regions with no
    overlapping line fall back to the block chosen by ``find_insert_block``,
    at line index 0.

    Args:
        page: Page whose ``layout``, ``bbox`` and ``blocks`` are read. Lines
            covered by an image region are mutated in place.

    Returns:
        One ``[block_idx, line_idx, region_bbox]`` list per image region, in
        the order the regions appear in ``page.layout.bboxes``.
    """
    regions = [
        rescale_bbox(page.layout.image_bbox, page.bbox, layout_box.bbox)
        for layout_box in page.layout.bboxes
        if layout_box.label in ["Figure", "Picture"]
    ]

    first_hit = {}
    for region_idx, region in enumerate(regions):
        for block_idx, block in enumerate(page.blocks):
            for line_idx, line in enumerate(block.lines):
                if not (line.intersection_pct(region) > .8):
                    continue

                # Text lying inside the image is dropped from the block
                line.spans = []
                # Only the first covered line is remembered for this region
                first_hit.setdefault(region_idx, (block_idx, line_idx))

    # Regions that no detected line overlapped still need somewhere to go
    for region_idx, region in enumerate(regions):
        if region_idx not in first_hit:
            first_hit[region_idx] = (find_insert_block(page.blocks, region), 0)

    return [
        [first_hit[region_idx][0], first_hit[region_idx][1], region]
        for region_idx, region in enumerate(regions)
    ]


def extract_images(page):
    """Render every image region on a page and reference it from the page text.

    For each ``(block_idx, line_idx, bbox)`` entry produced by
    ``find_image_blocks``, the region is rendered with ``render_bbox_image``,
    a ``Span`` containing a markdown image link is appended to the chosen
    line, and the rendered image is appended to ``page.images``.

    Args:
        page: Page to process. Its blocks/lines and ``images`` list are
            mutated in place; nothing is returned.
    """
    image_blocks = find_image_blocks(page)

    for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks):
        block = page.blocks[block_idx]
        image = render_bbox_image(page.page_obj, page, bbox)
        # The filename doubles as alt text and link target in the markdown
        image_filename = f"{page.pnum}_image_{image_idx}.png"
        image_markdown = f"![{image_filename}]({image_filename})"
        image_span = Span(
            bbox=bbox,
            text=image_markdown,
            font="Image",
            rotation=0,
            font_weight=0,
            font_size=0,
            image=True
        )
        block.lines[line_idx].spans.append(image_span)

        # The images field may still be unset (None) the first time an image
        # is stored; create the list lazily so pages without images are left
        # untouched.
        if page.images is None:
            page.images = []
        page.images.append(image)
20 changes: 2 additions & 18 deletions marker/layout/order.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from surya.ordering import batch_ordering

from marker.pdf.images import render_image
from marker.pdf.utils import sort_block_group
from marker.schema.bbox import rescale_bbox
from marker.schema.page import Page
from marker.settings import settings
Expand Down Expand Up @@ -55,21 +56,4 @@ def sort_blocks_in_reading_order(pages: List[Page]):
block_group = sort_block_group(block_groups[position])
new_blocks.extend(block_group)

page.blocks = new_blocks


def sort_block_group(blocks, tolerance=1.25):
    """Order blocks top-to-bottom, then left-to-right within each row.

    Blocks are bucketed into rows by snapping ``bbox[1]`` to the nearest
    multiple of ``tolerance``. Rows are emitted in ascending order of that
    snapped value, and the blocks of each row are sorted by ``bbox[0]``.

    Args:
        blocks: Iterable of objects exposing a ``bbox`` attribute.
        tolerance: Bucket height used when snapping the vertical coordinate.

    Returns:
        A new list containing the same blocks in sorted order.
    """
    rows = {}
    for blk in blocks:
        row_key = round(blk.bbox[1] / tolerance) * tolerance
        rows.setdefault(row_key, []).append(blk)

    # Walk the rows from top to bottom, flattening each one left to right
    return [
        blk
        for row_key in sorted(rows)
        for blk in sorted(rows[row_key], key=lambda b: b.bbox[0])
    ]
page.blocks = new_blocks
2 changes: 1 addition & 1 deletion marker/ocr/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]):
return len(full_text.strip()) == 0


def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3):
def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6):
found_lines = 0
for detected_line in page.text_lines.bboxes:

Expand Down
5 changes: 3 additions & 2 deletions marker/pdf/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pypdfium2 as pdfium
import pypdfium2.internal as pdfium_i

from marker.pdf.utils import find_filetype, font_flags_decomposer
from marker.pdf.utils import find_filetype, font_flags_decomposer, sort_block_group
from marker.ocr.heuristics import detect_bad_ocr
from marker.settings import settings
from marker.schema.block import Span, Line, Block
Expand Down Expand Up @@ -63,13 +63,14 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
if rotation == 90 or rotation == 270:
page_width, page_height = page_height, page_width

char_blocks = page["blocks"]
page_bbox = [0, 0, page_width, page_height]
out_page = Page(
blocks=page_blocks,
pnum=page["page"],
bbox=page_bbox,
rotation=rotation,
char_blocks=page["blocks"]
char_blocks=char_blocks
)
return out_page

Expand Down
19 changes: 18 additions & 1 deletion marker/pdf/images.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import pypdfium2 as pdfium
from pypdfium2 import PdfPage

from marker.schema.page import Page
from marker.schema.bbox import rescale_bbox
from marker.settings import settings


def render_image(page: pdfium.PdfPage, dpi):
Expand All @@ -7,4 +12,16 @@ def render_image(page: pdfium.PdfPage, dpi):
draw_annots=False
).to_pil()
image = image.convert("RGB")
return image
return image


def render_bbox_image(page_obj: PdfPage, page: Page, bbox):
    """Render a PDF page and return only the part inside ``bbox``.

    The page is rendered with ``render_image`` at ``settings.IMAGE_DPI``.
    ``bbox`` is then rescaled from the page's coordinate space (``page.bbox``)
    to the pixel bounds of the rendered image before cropping.

    Args:
        page_obj: The pdfium page to render.
        page: Page whose ``bbox`` gives the coordinate space ``bbox`` is in.
        bbox: Region to cut out, in the same coordinates as ``page.bbox``.

    Returns:
        The cropped region as an RGB image.
    """
    full_page = render_image(page_obj, settings.IMAGE_DPI)

    # The rendered image's pixel bounds are the target space for the rescale
    pixel_bounds = [0, 0, full_page.size[0], full_page.size[1]]
    crop_box = rescale_bbox(page.bbox, pixel_bounds, bbox)

    return full_page.crop(crop_box).convert("RGB")
22 changes: 22 additions & 0 deletions marker/pdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,25 @@ def font_flags_decomposer(flags: Optional[int]) -> str:
flag_descriptions.append("use_extern_attr")

return "_".join(flag_descriptions)


def sort_block_group(blocks, tolerance=1.25):
    """Order blocks top-to-bottom, then left-to-right within each row.

    Each block may either expose a ``bbox`` attribute or be a mapping with a
    ``"bbox"`` key. Blocks are bucketed into rows by snapping ``bbox[1]`` to
    the nearest multiple of ``tolerance``. Rows are emitted in ascending
    order of that snapped value, and each row is sorted by ``bbox[0]``.

    Args:
        blocks: Iterable of block objects or block mappings.
        tolerance: Bucket height used when snapping the vertical coordinate.

    Returns:
        A new list containing the same blocks in sorted order.
    """
    def _bbox_of(item):
        # Attribute access wins; anything without a bbox attribute is
        # treated as a mapping
        return item.bbox if hasattr(item, "bbox") else item["bbox"]

    rows = {}
    for item in blocks:
        row_key = round(_bbox_of(item)[1] / tolerance) * tolerance
        rows.setdefault(row_key, []).append(item)

    # Emit rows from top to bottom, each one ordered left to right
    ordered = []
    for row_key in sorted(rows):
        ordered.extend(sorted(rows[row_key], key=lambda item: _bbox_of(item)[0]))
    return ordered
21 changes: 21 additions & 0 deletions marker/schema/block.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import math
from typing import List, Optional

from pydantic import field_validator
Expand All @@ -19,6 +20,7 @@ class Span(BboxElement):
font_size: float
bold: Optional[bool] = None
italic: Optional[bool] = None
image: Optional[bool] = None


@field_validator('text')
Expand Down Expand Up @@ -98,3 +100,22 @@ def split_block_lines(block: Block, split_line_idx: int):
new_blocks.append(Block(lines=block.lines[:split_line_idx], bbox=bbox_from_lines(block.lines[:split_line_idx]), pnum=block.pnum))
new_blocks.append(Block(lines=block.lines[split_line_idx:], bbox=bbox_from_lines(block.lines[split_line_idx:]), pnum=block.pnum))
return new_blocks


def find_insert_block(blocks: List[Block], bbox):
    """Return the index of the block whose origin corner is closest to ``bbox``.

    Distance is the Euclidean distance between the first two coordinates
    (``bbox[0]``, ``bbox[1]``) of each block's bbox and those of ``bbox``.
    When several blocks are equally close, the earliest one wins.

    Args:
        blocks: Candidate blocks, each expected to expose a ``bbox``.
        bbox: Bounding box to place; only its first two coordinates are used.

    Returns:
        Index into ``blocks`` of the nearest block, or 0 when ``blocks`` is
        empty or no block has a usable bbox.
    """
    nearest_match = None
    match_dist = None
    for idx, block in enumerate(blocks):
        try:
            dist = math.hypot(block.bbox[0] - bbox[0], block.bbox[1] - bbox[1])
        except (AttributeError, IndexError, KeyError, TypeError):
            # Best effort: a block with a missing or malformed bbox is
            # skipped rather than aborting the search. Unrelated errors are
            # no longer swallowed.
            continue

        if match_dist is None or dist < match_dist:
            nearest_match = idx
            match_dist = dist

    return 0 if nearest_match is None else nearest_match


3 changes: 2 additions & 1 deletion marker/schema/page.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from collections import Counter
from typing import List, Optional, Dict
from typing import List, Optional, Dict, Any

from marker.schema.bbox import BboxElement
from marker.schema.block import Block, Span
Expand All @@ -15,6 +15,7 @@ class Page(BboxElement):
order: Optional[OrderResult] = None
ocr_method: Optional[str] = None # One of "surya" or "tesseract"
char_blocks: Optional[List[Dict]] = None # Blocks with character-level data from pdftext
images: Optional[List[Any]] = None # Images to save along with the page, need Any to avoid pydantic error

def get_nonblank_lines(self):
lines = self.get_all_lines()
Expand Down
1 change: 1 addition & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
class Settings(BaseSettings):
# General
TORCH_DEVICE: Optional[str] = None
IMAGE_DPI: int = 96 # DPI to render images pulled from pdf at

@computed_field
@property
Expand Down
Loading

0 comments on commit 77a99f3

Please sign in to comment.