diff --git a/marker/convert.py b/marker/convert.py index f3965be6..f475d3d4 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -6,7 +6,7 @@ import pypdfium2 as pdfium -from marker.cleaners.table import arrange_table_rows +from marker.tables.table import format_tables from marker.debug.data import dump_bbox_debug_data from marker.layout.layout import surya_layout, annotate_block_types from marker.layout.order import surya_order, sort_blocks_in_reading_order @@ -25,7 +25,6 @@ from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text from typing import List, Dict, Tuple, Optional -import re from marker.settings import settings @@ -107,7 +106,7 @@ def convert_single_pdf( indent_blocks(pages) # Fix table blocks - table_count = arrange_table_rows(pages) + table_count = format_tables(pages) out_meta["block_stats"]["table"] = table_count for page in pages: diff --git a/marker/equations/equations.py b/marker/equations/equations.py index da23b136..32df8d4d 100644 --- a/marker/equations/equations.py +++ b/marker/equations/equations.py @@ -3,11 +3,11 @@ from typing import List from marker.debug.data import dump_equation_debug_data -from marker.equations.images import get_equation_image from marker.equations.inference import get_total_texify_tokens, get_latex_batched +from marker.pdf.images import render_bbox_image from marker.schema.bbox import rescale_bbox from marker.schema.page import Page -from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines +from marker.schema.block import Line, Span, Block, bbox_from_lines, split_block_lines, find_insert_block from marker.settings import settings @@ -30,21 +30,29 @@ def find_equation_blocks(page, processor): if region_idx not in insert_points: insert_points[region_idx] = (block_idx, line_idx) + # Account for regions where the lines were not detected + for region_idx, region in enumerate(equation_regions): + if region_idx in insert_points: + continue + + insert_points[region_idx] = (find_insert_block(page.blocks, region), 0) + block_lines_to_remove = defaultdict(set) for region_idx, equation_region in enumerate(equation_regions): if region_idx not in equation_lines or len(equation_lines[region_idx]) == 0: - continue - equation_block = equation_lines[region_idx] - equation_insert = insert_points[region_idx] - block_text = " ".join([line.prelim_text for line in equation_block]) - equation_bbox = bbox_from_lines(equation_block) + block_text = "" + total_tokens = 0 + else: + equation_block = equation_lines[region_idx] + block_text = " ".join([line.prelim_text for line in equation_block]) + total_tokens = get_total_texify_tokens(block_text, processor) - total_tokens = get_total_texify_tokens(block_text, processor) + equation_insert = insert_points[region_idx] equation_insert_line_idx = equation_insert[1] equation_insert_line_idx -= len( [x for x in lines_to_remove[region_idx] if x[0] == equation_insert[0] and x[1] < equation_insert[1]]) - selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_bbox] + selected_blocks = [equation_insert[0], equation_insert_line_idx, total_tokens, block_text, equation_region] if total_tokens < settings.TEXIFY_MODEL_MAX: # Account for the lines we're about to remove for item in lines_to_remove[region_idx]: @@ -144,7 +152,7 @@ def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings. for page_idx, page_equation_blocks in enumerate(equation_blocks): page_obj = doc[page_idx] for equation_idx, (insert_block_idx, insert_line_idx, token_count, block_text, equation_bbox) in enumerate(page_equation_blocks): - png_image = get_equation_image(page_obj, pages[page_idx], equation_bbox) + png_image = render_bbox_image(page_obj, pages[page_idx], equation_bbox) images.append(png_image) token_counts.append(token_count) diff --git a/marker/equations/images.py b/marker/equations/images.py deleted file mode 100644 index f6d4644f..00000000 --- a/marker/equations/images.py +++ /dev/null @@ -1,19 +0,0 @@ -from pypdfium2 import PdfPage - -from marker.pdf.images import render_image -from marker.schema.bbox import rescale_bbox -from marker.schema.page import Page -from marker.settings import settings - - -def get_equation_image(page_obj: PdfPage, page: Page, bbox): - rescaled_bboxes = [] - png_image = render_image(page_obj, settings.TEXIFY_DPI) - # Rescale original pdf bbox bounds to match png image size - png_bbox = [0, 0, png_image.size[0], png_image.size[1]] - rescaled_merged = rescale_bbox(page.bbox, png_bbox, bbox) - - # Crop out only the equation image - png_image = png_image.crop(rescaled_merged) - png_image = png_image.convert("RGB") - return png_image diff --git a/marker/images/extract.py b/marker/images/extract.py new file mode 100644 index 00000000..ae7d3367 --- /dev/null +++ b/marker/images/extract.py @@ -0,0 +1,53 @@ +from marker.pdf.images import render_bbox_image +from marker.schema.bbox import rescale_bbox +from marker.schema.block import find_insert_block, Span + + +def find_image_blocks(page): + image_blocks = [] + image_regions = [l.bbox for l in page.layout.bboxes if l.label in ["Figure", "Picture"]] + image_regions = [rescale_bbox(page.layout.image_bbox, page.bbox, b) for b in image_regions] + + insert_points = {} + for region_idx, region in enumerate(image_regions): + for block_idx, block in enumerate(page.blocks): + for line_idx, line in enumerate(block.lines): + if line.intersection_pct(region) > .8: + line.spans = [] # We will remove this line from the block + + if region_idx not in insert_points: + insert_points[region_idx] = (block_idx, line_idx) + + # Account for images with no detected lines + for region_idx, region in enumerate(image_regions): + if region_idx in insert_points: + continue + + insert_points[region_idx] = (find_insert_block(page.blocks, region), 0) + + for region_idx, image_region in enumerate(image_regions): + image_insert = insert_points[region_idx] + image_blocks.append([image_insert[0], image_insert[1], image_region]) + + return image_blocks + + +def extract_images(page): + image_blocks = find_image_blocks(page) + + for image_idx, (block_idx, line_idx, bbox) in enumerate(image_blocks): + block = page.blocks[block_idx] + image = render_bbox_image(page.page_obj, page, bbox) + image_filename = f"{page.pnum}_image_{image_idx}.png" + image_markdown = f"![{image_filename}]({image_filename})" + image_span = Span( + bbox=bbox, + text=image_markdown, + font="Image", + rotation=0, + font_weight=0, + font_size=0, + image=True + ) + block.lines[line_idx].spans.append(image_span) + page.images.append(image) diff --git a/marker/layout/order.py b/marker/layout/order.py index 9833f5de..76f9fbc0 100644 --- a/marker/layout/order.py +++ b/marker/layout/order.py @@ -4,6 +4,7 @@ from surya.ordering import batch_ordering from marker.pdf.images import render_image +from marker.pdf.utils import sort_block_group from marker.schema.bbox import rescale_bbox from marker.schema.page import Page from marker.settings import settings @@ -55,21 +56,4 @@ def sort_blocks_in_reading_order(pages: List[Page]): block_group = sort_block_group(block_groups[position]) new_blocks.extend(block_group) - page.blocks = new_blocks - - -def sort_block_group(blocks, tolerance=1.25): - vertical_groups = {} - for block in blocks: - group_key = round(block.bbox[1] / tolerance) * tolerance - if group_key not in vertical_groups: - vertical_groups[group_key] = [] - vertical_groups[group_key].append(block) - - # Sort each group horizontally and flatten the groups into a single list - sorted_blocks = [] - for _, group in sorted(vertical_groups.items()): - sorted_group = sorted(group, key=lambda x: x.bbox[0]) - sorted_blocks.extend(sorted_group) - - return sorted_blocks \ No newline at end of file + page.blocks = new_blocks \ No newline at end of file diff --git a/marker/ocr/heuristics.py b/marker/ocr/heuristics.py index d7bca5bb..ffe6e422 100644 --- a/marker/ocr/heuristics.py +++ b/marker/ocr/heuristics.py @@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]): return len(full_text.strip()) == 0 -def detected_line_coverage(page: Page, intersect_thresh=.4, detection_thresh=.3): +def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.6): found_lines = 0 for detected_line in page.text_lines.bboxes: diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py index 4d5aa317..ea9182e1 100644 --- a/marker/pdf/extract_text.py +++ b/marker/pdf/extract_text.py @@ -4,7 +4,7 @@ import pypdfium2 as pdfium import pypdfium2.internal as pdfium_i -from marker.pdf.utils import find_filetype, font_flags_decomposer +from marker.pdf.utils import find_filetype, font_flags_decomposer, sort_block_group from marker.ocr.heuristics import detect_bad_ocr from marker.settings import settings from marker.schema.block import Span, Line, Block @@ -63,13 +63,14 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page: if rotation == 90 or rotation == 270: page_width, page_height = page_height, page_width + char_blocks = page["blocks"] page_bbox = [0, 0, page_width, page_height] out_page = Page( blocks=page_blocks, pnum=page["page"], bbox=page_bbox, rotation=rotation, - char_blocks=page["blocks"] + char_blocks=char_blocks ) return out_page diff --git a/marker/pdf/images.py b/marker/pdf/images.py index 2264c28c..1bf24b56 100644 --- a/marker/pdf/images.py +++ b/marker/pdf/images.py @@ -1,4 +1,9 @@ import pypdfium2 as pdfium +from pypdfium2 import PdfPage + +from marker.schema.page import Page +from marker.schema.bbox import rescale_bbox +from marker.settings import settings def render_image(page: pdfium.PdfPage, dpi): @@ -7,4 +12,16 @@ def render_image(page: pdfium.PdfPage, dpi): draw_annots=False ).to_pil() image = image.convert("RGB") - return image \ No newline at end of file + return image + + +def render_bbox_image(page_obj: PdfPage, page: Page, bbox): + png_image = render_image(page_obj, settings.IMAGE_DPI) + # Rescale original pdf bbox bounds to match png image size + png_bbox = [0, 0, png_image.size[0], png_image.size[1]] + rescaled_merged = rescale_bbox(page.bbox, png_bbox, bbox) + + # Crop out only the equation image + png_image = png_image.crop(rescaled_merged) + png_image = png_image.convert("RGB") + return png_image \ No newline at end of file diff --git a/marker/pdf/utils.py b/marker/pdf/utils.py index 1512c17b..e15e9f37 100644 --- a/marker/pdf/utils.py +++ b/marker/pdf/utils.py @@ -52,3 +52,25 @@ def font_flags_decomposer(flags: Optional[int]) -> str: flag_descriptions.append("use_extern_attr") return "_".join(flag_descriptions) + + +def sort_block_group(blocks, tolerance=1.25): + vertical_groups = {} + for block in blocks: + if hasattr(block, "bbox"): + bbox = block.bbox + else: + bbox = block["bbox"] + + group_key = round(bbox[1] / tolerance) * tolerance + if group_key not in vertical_groups: + vertical_groups[group_key] = [] + vertical_groups[group_key].append(block) + + # Sort each group horizontally and flatten the groups into a single list + sorted_blocks = [] + for _, group in sorted(vertical_groups.items()): + sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0]) + sorted_blocks.extend(sorted_group) + + return sorted_blocks diff --git a/marker/schema/block.py b/marker/schema/block.py index 1220b698..50ae95c6 100644 --- a/marker/schema/block.py +++ b/marker/schema/block.py @@ -1,3 +1,4 @@ +import math from typing import List, Optional from pydantic import field_validator @@ -19,6 +20,7 @@ class Span(BboxElement): font_size: float bold: Optional[bool] = None italic: Optional[bool] = None + image: Optional[bool] = None @field_validator('text') @@ -98,3 +100,22 @@ def split_block_lines(block: Block, split_line_idx: int): new_blocks.append(Block(lines=block.lines[:split_line_idx], bbox=bbox_from_lines(block.lines[:split_line_idx]), pnum=block.pnum)) new_blocks.append(Block(lines=block.lines[split_line_idx:], bbox=bbox_from_lines(block.lines[split_line_idx:]), pnum=block.pnum)) return new_blocks + + +def find_insert_block(blocks: List[Block], bbox): + nearest_match = None + match_dist = None + for idx, block in enumerate(blocks): + try: + dist = math.sqrt((block.bbox[1] - bbox[1]) ** 2 + (block.bbox[0] - bbox[0]) ** 2) + except Exception as e: + continue + + if nearest_match is None or dist < match_dist: + nearest_match = idx + match_dist = dist + if nearest_match is None: + return 0 + return nearest_match + + diff --git a/marker/schema/page.py b/marker/schema/page.py index 407939eb..c4fca410 100644 --- a/marker/schema/page.py +++ b/marker/schema/page.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Any from marker.schema.bbox import BboxElement from marker.schema.block import Block, Span @@ -15,6 +15,7 @@ class Page(BboxElement): order: Optional[OrderResult] = None ocr_method: Optional[str] = None # One of "surya" or "tesseract" char_blocks: Optional[List[Dict]] = None # Blocks with character-level data from pdftext + images: Optional[List[Any]] = None # Images to save along with the page, need Any to avoid pydantic error def get_nonblank_lines(self): lines = self.get_all_lines() diff --git a/marker/settings.py b/marker/settings.py index 5566e5fb..5eb85bed 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -10,6 +10,7 @@ class Settings(BaseSettings): # General TORCH_DEVICE: Optional[str] = None + IMAGE_DPI: int = 96 # DPI to render images pulled from pdf at @computed_field @property diff --git a/marker/tables/cells.py b/marker/tables/cells.py new file mode 100644 index 00000000..d4524314 --- /dev/null +++ b/marker/tables/cells.py @@ -0,0 +1,89 @@ +from marker.schema.bbox import rescale_bbox, box_intersection_pct +from marker.schema.page import Page + + +def find_row_separators(page: Page, table_box, round_factor=4): + top_edges = [] + bottom_edges = [] + + line_boxes = [p.bbox for p in page.text_lines.bboxes] + line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes] + line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8] + + min_count = len(line_boxes) / 3 + + for cell in line_boxes: + top_edges.append(cell[1] / round_factor * round_factor) + bottom_edges.append(cell[3] / round_factor * round_factor) + + top_edges = [t for t in top_edges if top_edges.count(t) > min_count] + bottom_edges = [b for b in bottom_edges if bottom_edges.count(b) > min_count] + + unique_top = sorted(list(set(top_edges))) + unique_bottom = sorted(list(set(bottom_edges))) + + separators = min([unique_top, unique_bottom], key=len) + + # Add the top and bottom of the page as separators, to grab all possible cells + separators.append(page.bbox[3]) + separators.insert(0, page.bbox[1]) + return separators + + +def find_column_separators(page: Page, table_box, round_factor=4): + left_edges = [] + right_edges = [] + centers = [] + + line_boxes = [p.bbox for p in page.text_lines.bboxes] + line_boxes = [rescale_bbox(page.text_lines.image_bbox, page.bbox, l) for l in line_boxes] + line_boxes = [l for l in line_boxes if box_intersection_pct(l, table_box) > .8] + + min_count = len(line_boxes) / 3 + for cell in line_boxes: + left_edges.append(cell[0] / round_factor * round_factor) + right_edges.append(cell[2] / round_factor * round_factor) + centers.append((cell[0] + cell[2]) / 2 * round_factor / round_factor) + + left_edges = [l for l in left_edges if left_edges.count(l) > min_count] + right_edges = [r for r in right_edges if right_edges.count(r) > min_count] + centers = [c for c in centers if centers.count(c) > min_count] + + unique_left = sorted(list(set(left_edges))) + unique_right = sorted(list(set(right_edges))) + unique_center = sorted(list(set(centers))) + + # Find list with minimum length + separators = min([unique_left, unique_right, unique_center], key=len) + separators.append(page.bbox[2]) + separators.insert(0, page.bbox[0]) + return separators + + +def assign_cells_to_columns(page, table_box, rows, round_factor=4, tolerance=4): + separators = find_column_separators(page, table_box, round_factor=round_factor) + new_rows = [] + additional_column_index = 0 + for row in rows: + new_row = {} + last_col_index = -1 + for cell in row: + left_edge = cell[0][0] + column_index = -1 + for i, separator in enumerate(separators): + if left_edge - tolerance < separator and last_col_index < i: + column_index = i + break + if column_index == -1: + column_index = len(separators) + additional_column_index + additional_column_index += 1 + new_row[column_index] = cell[1] + last_col_index = column_index + additional_column_index = 0 + + flat_row = [] + for cell_idx, cell in enumerate(sorted(new_row.items())): + flat_row.extend([""] * (cell[0] - cell_idx) + [cell[1]]) + new_rows.append(flat_row) + + return new_rows diff --git a/marker/cleaners/table.py b/marker/tables/table.py similarity index 64% rename from marker/cleaners/table.py rename to marker/tables/table.py index fe33e0e0..d99b758e 100644 --- a/marker/cleaners/table.py +++ b/marker/tables/table.py @@ -1,45 +1,13 @@ +from collections import defaultdict + from marker.schema.bbox import merge_boxes, box_intersection_pct, rescale_bbox from marker.schema.block import Line, Span, Block from marker.schema.page import Page from tabulate import tabulate -from typing import List, Dict -import re - - -def sort_table_blocks(blocks, tolerance=5): - vertical_groups = {} - for block in blocks: - if hasattr(block, "bbox"): - bbox = block.bbox - else: - bbox = block["bbox"] - group_key = round(bbox[1] / tolerance) * tolerance - if group_key not in vertical_groups: - vertical_groups[group_key] = [] - vertical_groups[group_key].append(block) - - # Sort each group horizontally and flatten the groups into a single list - sorted_blocks = [] - for _, group in sorted(vertical_groups.items()): - sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0]) - sorted_blocks.extend(sorted_group) - - return sorted_blocks +from typing import List - -def replace_dots(text): - dot_pattern = re.compile(r'(\s*\.\s*){4,}') - dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL) - - if dot_multiline_pattern.match(text): - text = dot_pattern.sub(' ', text) - return text - - -def replace_newlines(text): - # Replace all newlines - newline_pattern = re.compile(r'[\r\n]+') - return newline_pattern.sub(' ', text.strip()) +from marker.tables.cells import assign_cells_to_columns, find_row_separators, find_column_separators +from marker.tables.utils import sort_table_blocks, replace_dots, replace_newlines def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]: @@ -73,77 +41,45 @@ def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]: return table_rows -def assign_cells_to_columns(rows, round_factor=4, tolerance=4): - left_edges = [] - right_edges = [] - centers = [] - - for row in rows: - for cell in row: - left_edges.append(cell[0][0] / round_factor * round_factor) - right_edges.append(cell[0][2] / round_factor * round_factor) - centers.append((cell[0][0] + cell[0][2]) / 2 * round_factor / round_factor) - - unique_left = sorted(list(set(left_edges))) - unique_right = sorted(list(set(right_edges))) - unique_center = sorted(list(set(centers))) - - # Find list with minimum length - separators = min([unique_left, unique_right, unique_center], key=len) - - new_rows = [] - for row in rows: - new_row = {} - last_col_index = -1 - for cell in row: - left_edge = cell[0][0] - column_index = -1 - for i, separator in enumerate(separators): - if left_edge - tolerance < separator and last_col_index < i: - column_index = i - break - if column_index == -1: - column_index = cell[0][0] # Assign a new column - new_row[column_index] = cell[1] - last_col_index = column_index - - flat_row = [cell[1] for cell in sorted(new_row.items())] - min_column_index = min(new_row.keys()) - flat_row = [""] * min_column_index + flat_row - new_rows.append(flat_row) - - return new_rows - - def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: page_width = page.width table_rows = [] table_cell = "" cell_bbox = None - prev_end = None + prev_char = False table_row = [] sorted_char_blocks = sort_table_blocks(page.char_blocks) + for block_idx, block in enumerate(sorted_char_blocks): - sorted_block_lines = sort_table_blocks(block["lines"]) - for line_idx, line in enumerate(sorted_block_lines): + sorted_lines = sort_table_blocks(block["lines"]) + for line_idx, line in enumerate(sorted_lines): line_bbox = line["bbox"] intersect_pct = box_intersection_pct(line_bbox, table_box) - if intersect_pct < .5: + if intersect_pct < .7: continue for span in line["spans"]: for char in span["chars"]: x_start, y_start, x_end, y_end = char["bbox"] + if cell_bbox is None: cell_bbox = char["bbox"] else: + # Find boundaries of cell bbox before merging + cell_x_start, cell_y_start, cell_x_end, cell_y_end = cell_bbox + cell_x_start /= page_width + cell_x_end /= page_width + cell_bbox = merge_boxes(cell_bbox, char["bbox"]) x_start /= page_width x_end /= page_width + cell_content = replace_dots(replace_newlines(table_cell)) - if prev_end is None or abs(x_start - prev_end) < space_tol: # Check if we are in the same cell + if not prev_char: # First char + table_cell += char["char"] + elif cell_x_start - space_tol < x_start < cell_x_end + space_tol: # Check if we are in the same cell table_cell += char["char"] - elif x_start > prev_end - space_tol: # Check if we are on the same line + elif x_start > cell_x_end - space_tol: # Same line, new cell, check against cell bbox if len(table_cell) > 0: table_row.append((cell_bbox, cell_content)) table_cell = char["char"] @@ -156,16 +92,18 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]: if len(table_row) > 0: table_rows.append(table_row) table_row = [] - prev_end = x_end + prev_char = True + if len(table_cell) > 0: table_row.append((cell_bbox, replace_dots(replace_newlines(table_cell)))) if len(table_row) > 0: table_rows.append(table_row) - table_rows = assign_cells_to_columns(table_rows) + + table_rows = assign_cells_to_columns(page, table_box, table_rows) return table_rows -def arrange_table_rows(pages: List[Page]): +def format_tables(pages: List[Page]): # Formats tables nicely into github flavored markdown table_count = 0 for page in pages: diff --git a/marker/tables/utils.py b/marker/tables/utils.py new file mode 100644 index 00000000..b7efdabb --- /dev/null +++ b/marker/tables/utils.py @@ -0,0 +1,37 @@ +import re + + +def sort_table_blocks(blocks, tolerance=5): + vertical_groups = {} + for block in blocks: + if hasattr(block, "bbox"): + bbox = block.bbox + else: + bbox = block["bbox"] + group_key = round(bbox[1] / tolerance) * tolerance + if group_key not in vertical_groups: + vertical_groups[group_key] = [] + vertical_groups[group_key].append(block) + + # Sort each group horizontally and flatten the groups into a single list + sorted_blocks = [] + for _, group in sorted(vertical_groups.items()): + sorted_group = sorted(group, key=lambda x: x.bbox[0] if hasattr(x, "bbox") else x["bbox"][0]) + sorted_blocks.extend(sorted_group) + + return sorted_blocks + + +def replace_dots(text): + dot_pattern = re.compile(r'(\s*\.\s*){4,}') + dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL) + + if dot_multiline_pattern.match(text): + text = dot_pattern.sub(' ', text) + return text + + +def replace_newlines(text): + # Replace all newlines + newline_pattern = re.compile(r'[\r\n]+') + return newline_pattern.sub(' ', text.strip())