Improve block merges, initial header levels

VikParuchuri · Oct 16, 2024 · 78acbc0 · 78acbc0
1 parent 15f5f2d
commit 78acbc0
Show file tree

Hide file tree

Showing 10 changed files with 1,887 additions and 1,660 deletions.
diff --git a/marker/cleaners/headings.py b/marker/cleaners/headings.py
@@ -1,4 +1,7 @@
+from collections import defaultdict
 from typing import List
+import numpy as np
+from sklearn.cluster import KMeans
 
 from marker.settings import settings
 from marker.schema.bbox import rescale_bbox
@@ -57,3 +60,64 @@ def split_heading_blocks(pages: List[Page]):
                 new_blocks.append(copied_block)
 
         page.blocks = new_blocks
+
+
+def bucket_headings(line_heights, num_levels=settings.HEADING_LEVEL_COUNT):
+    """
+    Group heading line heights into height ranges, one range per heading level.
+
+    The heights are clustered with 1-D KMeans, then walked in ascending order.
+    Each time the cluster label changes, a new range is started only if
+    `cluster_mean * settings.HEADING_MERGE_THRESHOLD < prev_cluster_mean`;
+    otherwise the new cluster is folded into the range that is currently open.
+
+    Args:
+        line_heights: Iterable of numeric line heights taken from heading blocks.
+        num_levels: Maximum number of clusters (heading levels) to look for.
+
+    Returns:
+        A list of `(min_height, max_height)` tuples sorted by `min_height` in
+        descending order (tallest range first). Empty if `line_heights` is empty.
+    """
+    heights = np.asarray(line_heights, dtype=float).reshape(-1)
+    if heights.size == 0:
+        # Nothing to cluster; KMeans would raise on an empty sample set
+        return []
+
+    # KMeans raises when n_clusters exceeds the number of samples, and warns when it
+    # exceeds the number of distinct points, so never ask for more than that
+    n_clusters = max(1, min(num_levels, len(np.unique(heights))))
+    labels = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto").fit_predict(heights.reshape(-1, 1))
+    cluster_means = {label: np.mean(heights[labels == label]) for label in np.unique(labels)}
+
+    # Order heights and labels with the SAME permutation. Sorting a stacked
+    # (height, label) array with np.sort(axis=0) sorts each column independently
+    # and breaks the pairing between a height and its cluster label.
+    order = np.argsort(heights, kind="stable")
+
+    label_max = None
+    label_min = None
+    heading_ranges = []
+    prev_cluster = None
+    for value, label in zip(heights[order], labels[order]):
+        value = float(value)
+        if prev_cluster is not None and label != prev_cluster:
+            prev_cluster_mean = cluster_means[prev_cluster]
+            cluster_mean = cluster_means[label]
+            if cluster_mean * settings.HEADING_MERGE_THRESHOLD < prev_cluster_mean:
+                # Close the open range and start a fresh one for the new cluster
+                heading_ranges.append((label_min, label_max))
+                label_min = None
+                label_max = None
+
+        label_min = value if label_min is None else min(label_min, value)
+        label_max = value if label_max is None else max(label_max, value)
+        prev_cluster = label
+
+    # Flush the range that was still open when the loop ended
+    if label_min is not None:
+        heading_ranges.append((label_min, label_max))
+
+    # Tallest range first
+    heading_ranges = sorted(heading_ranges, key=lambda x: x[0], reverse=True)
+
+    return heading_ranges
+
+
+def infer_heading_levels(pages: List[Page]):
+    """
+    Set `heading_level` on every "Title" / "Section-header" block, in place.
+
+    Line heights of all heading blocks in the document are bucketed into height
+    ranges with `bucket_headings`. Each heading block is then given the level of
+    the first range whose minimum height its average line height reaches. The
+    ranges come back tallest-first, so the tallest headings get level 1.
+
+    Blocks that match no range (or that have no lines) fall back to
+    `settings.HEADING_DEFAULT_LEVEL`, capped at the number of ranges found and
+    never lower than 1.
+
+    Args:
+        pages: Pages whose blocks already have `block_type` assigned.
+    """
+    heading_types = ["Title", "Section-header"]
+
+    all_line_heights = []
+    for page in pages:
+        for block in page.blocks:
+            if block.block_type not in heading_types:
+                continue
+
+            block_heights = [min(l.height, l.width) for l in block.lines] # Account for rotation
+            all_line_heights.extend(block_heights)
+
+    # Skip clustering entirely when the document has no heading lines
+    heading_ranges = bucket_headings(all_line_heights) if all_line_heights else []
+    # Level 0 would render as a heading with no '#' at all, so clamp to at least 1
+    default_level = max(1, min(len(heading_ranges), settings.HEADING_DEFAULT_LEVEL))
+
+    for page in pages:
+        for block in page.blocks:
+            if block.block_type not in heading_types:
+                continue
+
+            block_heights = [min(l.height, l.width) for l in block.lines] # Account for rotation
+            if block_heights:
+                avg_height = sum(block_heights) / len(block_heights)
+                for idx, (min_height, _) in enumerate(heading_ranges):
+                    if avg_height >= min_height:
+                        # heading_ranges is ordered tallest first, so index 0 is the top level
+                        block.heading_level = idx + 1
+                        break
+
+            if block.heading_level is None:
+                block.heading_level = default_level
diff --git a/marker/convert.py b/marker/convert.py
@@ -22,7 +22,7 @@
 from marker.pdf.utils import find_filetype
 from marker.cleaners.code import identify_code_blocks, indent_blocks
 from marker.cleaners.bullets import replace_bullets
-from marker.cleaners.headings import split_heading_blocks
+from marker.cleaners.headings import split_heading_blocks, infer_heading_levels
 from marker.cleaners.fontstyle import find_bold_italic
 from marker.postprocessors.markdown import merge_spans, merge_lines, get_full_text
 from marker.cleaners.text import cleanup_text
@@ -145,6 +145,7 @@ def convert_single_pdf(
 
     # Split out headers
     split_heading_blocks(pages)
+    infer_heading_levels(pages)
     find_bold_italic(pages)
 
     # Copy to avoid changing original data

diff --git a/marker/debug/data.py b/marker/debug/data.py
@@ -65,12 +65,15 @@ def dump_bbox_debug_data(doc, fname, blocks: List[Page]):
         png_image.save(img_bytes, format="WEBP", lossless=True, quality=100)
         b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
 
-        page_data = page_blocks.model_dump()
-        page_data["image"] = b64_image
+        page_data = page_blocks.model_dump(exclude=["images", "layout", "text_lines"])
+        page_data["layout"] = page_blocks.layout.model_dump(exclude=["segmentation_map"])
+        page_data["text_lines"] = page_blocks.text_lines.model_dump(exclude=["heatmap", "affinity_map"])
+        #page_data["image"] = b64_image
         debug_data.append(page_data)
 
     with open(debug_file, "w+") as f:
         json.dump(debug_data, f)
+    print(f"Dumped bbox debug data to {debug_file}")
 
 
 
diff --git a/marker/layout/layout.py b/marker/layout/layout.py
@@ -1,9 +1,11 @@
+from collections import defaultdict
 from typing import List
 
 from surya.layout import batch_layout_detection
 
 from marker.pdf.images import render_image
 from marker.schema.bbox import rescale_bbox
+from marker.schema.block import bbox_from_lines
 from marker.schema.page import Page
 from marker.settings import settings
 
@@ -45,4 +47,32 @@ def annotate_block_types(pages: List[Page]):
             if i in max_intersections:
                 j = max_intersections[i][1]
                 block_type = page.layout.bboxes[j].label
-            block.block_type = block_type
+            block.block_type = block_type
+
+
+        # Merge blocks together, preserving pdf order
+        curr_layout_idx = None
+        curr_layout_block = None
+        new_blocks = []
+        for i in range(len(page.blocks)):
+            if i not in max_intersections:
+                if curr_layout_block is not None:
+                    curr_layout_block.bbox = bbox_from_lines(curr_layout_block.lines)
+                    new_blocks.append(curr_layout_block)
+                curr_layout_block = None
+                curr_layout_idx = None
+                new_blocks.append(page.blocks[i])
+            elif max_intersections[i][1] != curr_layout_idx:
+                if curr_layout_block is not None:
+                    curr_layout_block.bbox = bbox_from_lines(curr_layout_block.lines)
+                    new_blocks.append(curr_layout_block)
+                curr_layout_block = page.blocks[i].copy()
+                curr_layout_idx = max_intersections[i][1]
+            else:
+                curr_layout_block.lines.extend(page.blocks[i].lines)
+
+        if curr_layout_block is not None:
+            curr_layout_block.bbox = bbox_from_lines(curr_layout_block.lines)
+            new_blocks.append(curr_layout_block)
+
+        page.blocks = new_blocks
diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py
@@ -17,8 +17,8 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
     page_blocks = []
     span_id = 0
     for block_idx, block in enumerate(page["blocks"]):
-        block_lines = []
         for l in block["lines"]:
+            block_lines = []
             spans = []
             for i, s in enumerate(l["spans"]):
                 block_text = s["text"]
@@ -44,14 +44,16 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
             # Only select valid lines, with positive bboxes
             if line_obj.area >= 0:
                 block_lines.append(line_obj)
-        block_obj = Block(
-            lines=block_lines,
-            bbox=block["bbox"],
-            pnum=pnum
-        )
-        # Only select blocks with lines
-        if len(block_lines) > 0:
-            page_blocks.append(block_obj)
+
+            # Each block is a single line
+            block_obj = Block(
+                lines=block_lines,
+                bbox=l["bbox"],
+                pnum=pnum
+            )
+            # Only select blocks with lines
+            if len(block_lines) > 0:
+                page_blocks.append(block_obj)
 
     page_bbox = page["bbox"]
     page_width = abs(page_bbox[2] - page_bbox[0])

diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py
@@ -66,17 +66,19 @@ def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
                     lines=block_lines,
                     pnum=block.pnum,
                     bbox=block.bbox,
-                    block_type=block.block_type
+                    block_type=block.block_type,
+                    heading_level=block.heading_level
                 ))
         merged_blocks.append(page_blocks)
 
     return merged_blocks
 
 
-def block_surround(text, block_type):
+def block_surround(text, block_type, heading_level):
     if block_type == "Section-header":
         if not text.startswith("#"):
-            text = "\n## " + text.strip().title() + "\n"
+            asterisks = "#" * heading_level if heading_level is not None else "##"
+            text = f"\n{asterisks} " + text.strip().title() + "\n"
     elif block_type == "Title":
         if not text.startswith("#"):
             text = "# " + text.strip().title() + "\n"
@@ -144,20 +146,22 @@ def merge_lines(blocks: List[List[MergedBlock]]):
     prev_line = None
     block_text = ""
     block_type = ""
+    block_heading_level = None
 
     for idx, page in enumerate(blocks):
         for block in page:
             block_type = block.block_type
             if block_type != prev_type and prev_type:
                 text_blocks.append(
                     FullyMergedBlock(
-                        text=block_surround(block_text, prev_type),
+                        text=block_surround(block_text, prev_type, block_heading_level),
                         block_type=prev_type
                     )
                 )
                 block_text = ""
 
             prev_type = block_type
+            block_heading_level = block.heading_level
             # Join lines in the block together properly
             for i, line in enumerate(block.lines):
                 line_height = line.bbox[3] - line.bbox[1]
@@ -176,7 +180,7 @@ def merge_lines(blocks: List[List[MergedBlock]]):
     # Append the final block
     text_blocks.append(
         FullyMergedBlock(
-            text=block_surround(block_text, prev_type),
+            text=block_surround(block_text, prev_type, block_heading_level),
             block_type=block_type
         )
     )

diff --git a/marker/schema/block.py b/marker/schema/block.py
@@ -45,6 +45,7 @@ class Block(BboxElement):
     lines: List[Line]
     pnum: int
     block_type: Optional[str] = None
+    heading_level: Optional[int] = None
 
     @property
     def prelim_text(self):

diff --git a/marker/schema/merged.py b/marker/schema/merged.py
@@ -19,6 +19,7 @@ class MergedBlock(BboxElement):
     lines: List[MergedLine]
     pnum: int
     block_type: Optional[str]
+    heading_level: Optional[int] = None
 
 
 class FullyMergedBlock(BaseModel):

diff --git a/marker/settings.py b/marker/settings.py
@@ -78,6 +78,11 @@ def TORCH_DEVICE_MODEL(self) -> str:
     # Table models
     SURYA_TABLE_DPI: int = 192
 
+    # Headings
+    HEADING_LEVEL_COUNT: int = 4
+    HEADING_MERGE_THRESHOLD: float = .25
+    HEADING_DEFAULT_LEVEL: int = 2
+
     # Debug
     DEBUG: bool = False # Enable debug logging
     DEBUG_DATA_FOLDER: Optional[str] = None