Skip to content

Commit

Permalink
Improve block merges, initial header levels
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Oct 16, 2024
1 parent 15f5f2d commit 78acbc0
Show file tree
Hide file tree
Showing 10 changed files with 1,887 additions and 1,660 deletions.
64 changes: 64 additions & 0 deletions marker/cleaners/headings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from collections import defaultdict
from typing import List
import numpy as np
from sklearn.cluster import KMeans

from marker.settings import settings
from marker.schema.bbox import rescale_bbox
Expand Down Expand Up @@ -57,3 +60,64 @@ def split_heading_blocks(pages: List[Page]):
new_blocks.append(copied_block)

page.blocks = new_blocks


def bucket_headings(line_heights, num_levels=settings.HEADING_LEVEL_COUNT):
    """Group heading line heights into contiguous height ranges.

    The heights are clustered with 1-D k-means, then walked in ascending
    order. Whenever the cluster label changes, the run collected so far is
    closed as its own range if the new cluster's mean, scaled by
    ``settings.HEADING_MERGE_THRESHOLD``, is still below the previous
    cluster's mean; otherwise the new cluster is folded into the current
    range.

    Args:
        line_heights: Iterable of numeric line heights. May be empty.
        num_levels: Upper bound on the number of clusters to look for.

    Returns:
        A list of ``(min_height, max_height)`` tuples sorted by
        ``min_height`` in descending order (largest headings first).
        Empty when ``line_heights`` is empty.
    """
    data = np.asarray(line_heights, dtype=float).reshape(-1, 1)
    if data.size == 0:
        # Nothing to cluster; KMeans would raise on an empty array.
        return []

    # KMeans raises when n_clusters exceeds the number of samples and warns
    # when it exceeds the number of distinct values, so cap the request.
    n_clusters = max(1, min(num_levels, len(np.unique(data))))
    labels = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto").fit_predict(data)
    cluster_means = {label: np.mean(data[labels == label, 0]) for label in np.unique(labels)}

    # Sort rows by height while keeping each height paired with its own
    # label. Sorting a stacked (value, label) array with axis=0 would sort
    # the two columns independently and scramble the pairing.
    order = np.argsort(data[:, 0], kind="stable")

    label_min = None
    label_max = None
    heading_ranges = []
    prev_cluster = None
    for value, label in zip(data[order, 0], labels[order]):
        if prev_cluster is not None and label != prev_cluster:
            prev_cluster_mean = cluster_means[prev_cluster]
            cluster_mean = cluster_means[label]
            # NOTE(review): with a threshold below 1 this closes the range
            # unless the new cluster is much larger than the previous one;
            # confirm that this is the intended direction of the comparison.
            if cluster_mean * settings.HEADING_MERGE_THRESHOLD < prev_cluster_mean:
                heading_ranges.append((label_min, label_max))
                label_min = None
                label_max = None

        label_min = value if label_min is None else min(label_min, value)
        label_max = value if label_max is None else max(label_max, value)
        prev_cluster = label

    # Close the final open range.
    if label_min is not None:
        heading_ranges.append((label_min, label_max))

    # Largest heights first.
    heading_ranges.sort(key=lambda bounds: bounds[0], reverse=True)

    return heading_ranges


def infer_heading_levels(pages: List[Page]):
    """Assign ``heading_level`` to every Title / Section-header block.

    Line heights of all heading blocks across ``pages`` are bucketed into
    height ranges by ``bucket_headings`` (largest range first). Each heading
    block then receives the 1-based index of the first range whose minimum
    height does not exceed the block's average line height, so the largest
    headings get level 1. Blocks that match no range (or have no lines) fall
    back to ``settings.HEADING_DEFAULT_LEVEL``, capped at the number of
    ranges found when there are any.

    Args:
        pages: Pages whose blocks are updated in place.

    Returns:
        None. ``block.heading_level`` is set on heading blocks.
    """
    heading_types = ("Title", "Section-header")

    def _line_heights(block):
        # Use the shorter side of each line box to account for rotation.
        return [min(line.height, line.width) for line in block.lines]

    heading_blocks = [
        block
        for page in pages
        for block in page.blocks
        if block.block_type in heading_types
    ]
    all_line_heights = [
        height
        for block in heading_blocks
        for height in _line_heights(block)
    ]

    # Skip clustering entirely when there is nothing to cluster.
    heading_ranges = bucket_headings(all_line_heights) if all_line_heights else []

    if heading_ranges:
        default_level = min(len(heading_ranges), settings.HEADING_DEFAULT_LEVEL)
    else:
        # Without any ranges, min(len(...), default) would be 0, which is
        # not a usable heading level.
        default_level = settings.HEADING_DEFAULT_LEVEL

    for block in heading_blocks:
        block_heights = _line_heights(block)
        if block_heights:  # Guard against division by zero on empty blocks
            avg_height = sum(block_heights) / len(block_heights)
            # heading_ranges is ordered largest-first, so the position in
            # the list is the heading depth: index 0 -> level 1.
            for idx, (min_height, _max_height) in enumerate(heading_ranges):
                if avg_height >= min_height:
                    block.heading_level = idx + 1
                    break

        if block.heading_level is None:
            block.heading_level = default_level
3 changes: 2 additions & 1 deletion marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from marker.pdf.utils import find_filetype
from marker.cleaners.code import identify_code_blocks, indent_blocks
from marker.cleaners.bullets import replace_bullets
from marker.cleaners.headings import split_heading_blocks
from marker.cleaners.headings import split_heading_blocks, infer_heading_levels
from marker.cleaners.fontstyle import find_bold_italic
from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
from marker.cleaners.text import cleanup_text
Expand Down Expand Up @@ -145,6 +145,7 @@ def convert_single_pdf(

# Split out headers
split_heading_blocks(pages)
infer_heading_levels(pages)
find_bold_italic(pages)

# Copy to avoid changing original data
Expand Down
7 changes: 5 additions & 2 deletions marker/debug/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,15 @@ def dump_bbox_debug_data(doc, fname, blocks: List[Page]):
png_image.save(img_bytes, format="WEBP", lossless=True, quality=100)
b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")

page_data = page_blocks.model_dump()
page_data["image"] = b64_image
page_data = page_blocks.model_dump(exclude=["images", "layout", "text_lines"])
page_data["layout"] = page_blocks.layout.model_dump(exclude=["segmentation_map"])
page_data["text_lines"] = page_blocks.text_lines.model_dump(exclude=["heatmap", "affinity_map"])
#page_data["image"] = b64_image
debug_data.append(page_data)

with open(debug_file, "w+") as f:
json.dump(debug_data, f)
print(f"Dumped bbox debug data to {debug_file}")



32 changes: 31 additions & 1 deletion marker/layout/layout.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from collections import defaultdict
from typing import List

from surya.layout import batch_layout_detection

from marker.pdf.images import render_image
from marker.schema.bbox import rescale_bbox
from marker.schema.block import bbox_from_lines
from marker.schema.page import Page
from marker.settings import settings

Expand Down Expand Up @@ -45,4 +47,32 @@ def annotate_block_types(pages: List[Page]):
if i in max_intersections:
j = max_intersections[i][1]
block_type = page.layout.bboxes[j].label
block.block_type = block_type
block.block_type = block_type


# Merge blocks together, preserving pdf order
curr_layout_idx = None
curr_layout_block = None
new_blocks = []
for i in range(len(page.blocks)):
if i not in max_intersections:
if curr_layout_block is not None:
curr_layout_block.bbox = bbox_from_lines(curr_layout_block.lines)
new_blocks.append(curr_layout_block)
curr_layout_block = None
curr_layout_idx = None
new_blocks.append(page.blocks[i])
elif max_intersections[i][1] != curr_layout_idx:
if curr_layout_block is not None:
curr_layout_block.bbox = bbox_from_lines(curr_layout_block.lines)
new_blocks.append(curr_layout_block)
curr_layout_block = page.blocks[i].copy()
curr_layout_idx = max_intersections[i][1]
else:
curr_layout_block.lines.extend(page.blocks[i].lines)

if curr_layout_block is not None:
curr_layout_block.bbox = bbox_from_lines(curr_layout_block.lines)
new_blocks.append(curr_layout_block)

page.blocks = new_blocks
20 changes: 11 additions & 9 deletions marker/pdf/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
page_blocks = []
span_id = 0
for block_idx, block in enumerate(page["blocks"]):
block_lines = []
for l in block["lines"]:
block_lines = []
spans = []
for i, s in enumerate(l["spans"]):
block_text = s["text"]
Expand All @@ -44,14 +44,16 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
# Only select valid lines, with positive bboxes
if line_obj.area >= 0:
block_lines.append(line_obj)
block_obj = Block(
lines=block_lines,
bbox=block["bbox"],
pnum=pnum
)
# Only select blocks with lines
if len(block_lines) > 0:
page_blocks.append(block_obj)

# Each block is a single line
block_obj = Block(
lines=block_lines,
bbox=l["bbox"],
pnum=pnum
)
# Only select blocks with lines
if len(block_lines) > 0:
page_blocks.append(block_obj)

page_bbox = page["bbox"]
page_width = abs(page_bbox[2] - page_bbox[0])
Expand Down
14 changes: 9 additions & 5 deletions marker/postprocessors/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,19 @@ def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
lines=block_lines,
pnum=block.pnum,
bbox=block.bbox,
block_type=block.block_type
block_type=block.block_type,
heading_level=block.heading_level
))
merged_blocks.append(page_blocks)

return merged_blocks


def block_surround(text, block_type):
def block_surround(text, block_type, heading_level):
if block_type == "Section-header":
if not text.startswith("#"):
text = "\n## " + text.strip().title() + "\n"
asterisks = "#" * heading_level if heading_level is not None else "##"
text = f"\n{asterisks} " + text.strip().title() + "\n"
elif block_type == "Title":
if not text.startswith("#"):
text = "# " + text.strip().title() + "\n"
Expand Down Expand Up @@ -144,20 +146,22 @@ def merge_lines(blocks: List[List[MergedBlock]]):
prev_line = None
block_text = ""
block_type = ""
block_heading_level = None

for idx, page in enumerate(blocks):
for block in page:
block_type = block.block_type
if block_type != prev_type and prev_type:
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type),
text=block_surround(block_text, prev_type, block_heading_level),
block_type=prev_type
)
)
block_text = ""

prev_type = block_type
block_heading_level = block.heading_level
# Join lines in the block together properly
for i, line in enumerate(block.lines):
line_height = line.bbox[3] - line.bbox[1]
Expand All @@ -176,7 +180,7 @@ def merge_lines(blocks: List[List[MergedBlock]]):
# Append the final block
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type),
text=block_surround(block_text, prev_type, block_heading_level),
block_type=block_type
)
)
Expand Down
1 change: 1 addition & 0 deletions marker/schema/block.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class Block(BboxElement):
lines: List[Line]
pnum: int
block_type: Optional[str] = None
heading_level: Optional[int] = None

@property
def prelim_text(self):
Expand Down
1 change: 1 addition & 0 deletions marker/schema/merged.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class MergedBlock(BboxElement):
lines: List[MergedLine]
pnum: int
block_type: Optional[str]
heading_level: Optional[int] = None


class FullyMergedBlock(BaseModel):
Expand Down
5 changes: 5 additions & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ def TORCH_DEVICE_MODEL(self) -> str:
# Table models
SURYA_TABLE_DPI: int = 192

# Headings
HEADING_LEVEL_COUNT: int = 4
HEADING_MERGE_THRESHOLD: float = .25
HEADING_DEFAULT_LEVEL: int = 2

# Debug
DEBUG: bool = False # Enable debug logging
DEBUG_DATA_FOLDER: Optional[str] = None
Expand Down
Loading

0 comments on commit 78acbc0

Please sign in to comment.