Skip to content

Commit

Permalink
Sort character blocks for pdf text
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 3, 2024
1 parent df6f8fc commit c22d32e
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 9 deletions.
30 changes: 23 additions & 7 deletions marker/cleaners/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,23 @@
import re


def sort_char_blocks(blocks, tolerance=1.25):
    """Return ``blocks`` reordered row by row, left to right within a row.

    Each block is assigned to a row by snapping ``block["bbox"][1]`` to the
    nearest multiple of ``tolerance``; blocks that snap to the same value
    are treated as one row. Rows are emitted in ascending order of the
    snapped value, and inside a row blocks are ordered by ``block["bbox"][0]``.

    NOTE(review): presumably ``bbox`` is ``[x0, y0, x1, y1]`` so index 1 is
    the top edge and index 0 the left edge -- confirm against the caller.

    Args:
        blocks: Iterable of mappings, each with a ``"bbox"`` sequence.
        tolerance: Bucket size used to snap the ``bbox[1]`` coordinate.

    Returns:
        A new list containing the same block objects in sorted order.
    """
    rows = {}
    for item in blocks:
        # Snap the coordinate so nearly-aligned blocks share one row key.
        row_key = round(item["bbox"][1] / tolerance) * tolerance
        rows.setdefault(row_key, []).append(item)

    # Walk the rows in ascending key order, sorting each one horizontally
    # before appending it to the flat result.
    ordered = []
    for row_key in sorted(rows):
        ordered += sorted(rows[row_key], key=lambda item: item["bbox"][0])

    return ordered


def replace_dots(text):
dot_pattern = re.compile(r'(\s*\.\s*){4,}')
dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
Expand All @@ -21,31 +38,28 @@ def replace_newlines(text):
return newline_pattern.sub(' ', text.strip())


def get_table_surya(page, table_box, y_tol=.005) -> List[List[str]]:
def get_table_surya(page, table_box, space_tol=.01) -> List[List[str]]:
table_rows = []
table_row = []
x_position = None
y_position = None
for block_idx, block in enumerate(page.blocks):
for line_idx, line in enumerate(block.lines):
line_bbox = line.bbox
intersect_pct = box_intersection_pct(line_bbox, table_box)
if intersect_pct < .5 or len(line.spans) == 0:
continue
normed_y_start = line_bbox[1] / page.height
normed_x_start = line_bbox[0] / page.width
normed_x_end = line_bbox[2] / page.width

cells = [[s.bbox, s.text] for s in line.spans]
if x_position is None or (normed_x_start > x_position and abs(normed_y_start - y_position) < y_tol):
if x_position is None or normed_x_start > x_position - space_tol:
# Same row
table_row.extend(cells)
else:
# New row
if len(table_row) > 0:
table_rows.append(table_row)
table_row = cells
y_position = normed_y_start
x_position = normed_x_end
if len(table_row) > 0:
table_rows.append(table_row)
Expand Down Expand Up @@ -102,8 +116,10 @@ def get_table_pdftext(page: Page, table_box, space_tol=.01) -> List[List[str]]:
cell_bbox = None
prev_end = None
table_row = []
for block_idx, block in enumerate(page.char_blocks):
for line_idx, line in enumerate(block["lines"]):
sorted_char_blocks = sort_char_blocks(page.char_blocks)
for block_idx, block in enumerate(sorted_char_blocks):
sorted_block_lines = sort_char_blocks(block["lines"])
for line_idx, line in enumerate(sorted_block_lines):
line_bbox = line["bbox"]
intersect_pct = box_intersection_pct(line_bbox, table_box)
if intersect_pct < .5:
Expand Down
8 changes: 8 additions & 0 deletions marker/cleaners/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import re


def cleanup_text(full_text):
    """Normalize blank-line runs and non-breaking spaces in ``full_text``.

    Runs of three or more newlines, and runs of three or more
    "newline followed by one whitespace character" pairs, are each
    collapsed to exactly two newlines. Afterwards every ``\\xa0``
    character is replaced with a plain space.

    Args:
        full_text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Apply the newline patterns in this order; the second one handles
    # blank lines that still contain a whitespace character.
    for pattern in (r'\n{3,}', r'(\n\s){3,}'):
        full_text = re.sub(pattern, '\n\n', full_text)
    # Replace non-breaking spaces last, after the newline collapsing.
    return full_text.replace('\xa0', ' ')
6 changes: 4 additions & 2 deletions marker/convert.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import warnings

from marker.cleaners.text import cleanup_text

warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings

import pypdfium2 as pdfium
Expand Down Expand Up @@ -131,8 +134,7 @@ def convert_single_pdf(
full_text = get_full_text(text_blocks)

# Handle empty blocks being joined
full_text = re.sub(r'\n{3,}', '\n\n', full_text)
full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
full_text = cleanup_text(full_text)

# Replace bullet characters with a -
full_text = replace_bullets(full_text)
Expand Down
2 changes: 2 additions & 0 deletions marker/postprocessors/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ def line_separator(line1, line2, block_type, is_continuation=False):
return line1 + "\n\n" + line2
elif block_type == "Formula":
return line1 + " " + line2
elif block_type == "Table":
return line1 + "\n\n" + line2
else:
return line1 + "\n" + line2

Expand Down

0 comments on commit c22d32e

Please sign in to comment.