Fix code block formatting

VikParuchuri committed on May 8, 2024 · aaef442
1 parent 287f546
commit aaef442
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 80 deletions.
diff --git a/marker/cleaners/code.py b/marker/cleaners/code.py
@@ -1,10 +1,13 @@
+from collections import Counter
+from statistics import mean, median
+
 from marker.schema.block import Span, Line
 from marker.schema.page import Page
 import re
 from typing import List
 
 
-def is_code_linelen(lines, thresh=60):
+def is_code_linelen(lines, thresh=80):
     # Decide based on chars per newline threshold
     total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
     total_newlines = max(len(lines) - 1, 1)
@@ -23,64 +26,61 @@ def comment_count(lines):
 
 def identify_code_blocks(pages: List[Page]):
     code_block_count = 0
-    font_info = None
-    for page in pages:
-        stats = page.get_font_stats()
-        if font_info is None:
-            font_info = stats
-        else:
-            font_info += stats
-    try:
-        most_common_font = font_info.most_common(1)[0][0]
-    except IndexError:
-        print(f"Could not find most common font")
-        most_common_font = None
-
-    last_block = None
+    font_sizes = []
+    line_heights = []
     for page in pages:
-        try:
-            min_start = page.get_min_line_start()
-        except IndexError:
-            continue
+        font_sizes += page.get_font_sizes()
+        line_heights += page.get_line_heights()
+
+    avg_font_size = None
+    avg_line_height = None
+    if len(font_sizes) > 0:
+        avg_line_height = median(line_heights)
+        avg_font_size = mean(font_sizes)
 
+    for page in pages:
         for block in page.blocks:
             if block.block_type != "Text":
                 last_block = block
                 continue
 
+            # Ensure we have lines and spans
+            if len(block.lines) == 0:
+                continue
+            if sum([len(line.spans) for line in block.lines]) == 0:
+                continue
+
+            min_start = block.get_min_line_start()
+
             is_indent = []
             line_fonts = []
+            line_font_sizes = []
+            block_line_heights = []
             for line in block.lines:
-                fonts = [span.font for span in line.spans]
-                line_fonts += fonts
-                line_start = line.bbox[0]
-                if line_start > min_start:
-                    is_indent.append(True)
-                else:
-                    is_indent.append(False)
+                line_fonts += [span.font for span in line.spans]
+                line_font_sizes += [span.font_size for span in line.spans]
+                block_line_heights.append(line.bbox[3] - line.bbox[1])
+
+                is_indent.append(line.bbox[0] > min_start)
+
             comment_lines = comment_count([line.prelim_text for line in block.lines])
             is_code = [
                 len(block.lines) > 3,
-                sum([f != most_common_font for f in line_fonts]) > len(line_fonts) * .8,  # At least 80% of the fonts are not the most common, since code usually uses a different font from the main body text
                 is_code_linelen(block.lines),
-                (
-                    sum(is_indent) > len(block.lines) * .2
-                    or
-                    comment_lines > len(block.lines) * .2
-                 ), # 20% lines indented or 20% of the lines are comments
+                sum(is_indent) + comment_lines > len(block.lines) * .3,
             ]
 
-            # Check if previous block is code, and this block is indented
-            is_code_prev = [
-                last_block and last_block.block_type == "Code",
-                sum(is_indent) >= len(block.lines) * .8 # At least 80% indented
-            ]
+            if avg_font_size is not None:
+                font_checks = [
+                    mean(line_font_sizes) <= avg_font_size, # Lower than average font size and line height
+                    mean(block_line_heights) < avg_line_height
+                ]
+                is_code += font_checks
 
-            if all(is_code) or all(is_code_prev):
+            if all(is_code):
                 code_block_count += 1
                 block.block_type = "Code"
 
-            last_block = block
     return code_block_count
 
 

diff --git a/marker/equations/equations.py b/marker/equations/equations.py
@@ -90,7 +90,7 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
             lines=[Line(
                 spans=[
                     Span(
-                        text="\n\n" + block_text.replace("\n", " ") + "\n\n",
+                        text=block_text.replace("\n", " "),
                         bbox=equation_bbox,
                         span_id=f"{pnum}_{idx}_fixeq",
                         font="Latex",
@@ -109,7 +109,7 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
             fail_count += 1
         else:
             success_count += 1
-            new_block.lines[0].spans[0].text = latex_text
+            new_block.lines[0].spans[0].text = latex_text.replace("\n", " ")
             converted_spans.append(deepcopy(new_block.lines[0].spans[0]))
 
         # Add in the new LaTeX block

diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py
@@ -12,6 +12,7 @@ def escape_markdown(text):
     escaped_text = re.sub(characters_to_escape, r'\\\g<0>', text)
     return escaped_text
 
+
 def surround_text(s, char_to_insert):
     leading_whitespace = re.match(r'^(\s*)', s).group(1)
     trailing_whitespace = re.search(r'(\s*)$', s).group(1)
@@ -82,43 +83,45 @@ def block_surround(text, block_type):
     elif block_type == "List-item":
         text = escape_markdown(text)
     elif block_type == "Code":
-        text = "\n" + escape_markdown(text) + "\n"
+        text = "\n```\n" + text + "\n```\n"
     elif block_type == "Text":
         text = escape_markdown(text)
+    elif block_type == "Formula":
+        if text.strip().startswith("$$") and text.strip().endswith("$$"):
+            text = text.strip()
+            text = "\n" + text + "\n"
     return text
 
 
 def line_separator(line1, line2, block_type, is_continuation=False):
     # Should cover latin-derived languages and russian
-    lowercase_letters = r'(\p{Lo}+|\p{Ll}+)'
+    lowercase_letters = r'\p{Lo}+|\d+'
     # Remove hyphen in current line if next line and current line appear to be joined
     hyphen_pattern = regex.compile(rf'.*[{lowercase_letters}][-]\s?$', regex.DOTALL)
     if line1 and hyphen_pattern.match(line1) and regex.match(rf"^\s?[{lowercase_letters}]", line2):
         # Split on — or - from the right
-        line1 = re.split(r"[-—]\s?$", line1)[0]
+        line1 = regex.split(r"[-—]\s?$", line1)[0]
         return line1.rstrip() + line2.lstrip()
 
-    all_letters = r'\p{L}+'
-    sentence_continuations = r',;(—'
-    sentence_ends = r'。ๆ.?!'
-    line_end_pattern = regex.compile(rf'.*[{lowercase_letters}{sentence_continuations}]\s?$', regex.DOTALL)
+    all_letters = r'\p{L}+|\d+'
+    sentence_continuations = r',;\(\—'
+    sentence_ends = r'。ๆ\.?!'
+    line_end_pattern = regex.compile(rf'.*[{lowercase_letters}][{sentence_continuations}]?\s?$', regex.DOTALL)
     line_start_pattern = regex.compile(rf'^\s?[{all_letters}]', regex.DOTALL)
     sentence_end_pattern = regex.compile(rf'.*[{sentence_ends}]\s?$', regex.DOTALL)
 
     if block_type in ["Title", "Section-header"]:
         return line1.rstrip() + " " + line2.lstrip()
+    elif block_type == "Formula":
+        return line1 + "\n" + line2
     elif line_end_pattern.match(line1) and line_start_pattern.match(line2) and block_type == "Text":
         return line1.rstrip() + " " + line2.lstrip()
     elif is_continuation:
         return line1.rstrip() + " " + line2.lstrip()
     elif block_type == "Text" and sentence_end_pattern.match(line1):
         return line1 + "\n\n" + line2
-    elif block_type == "Formula":
-        return line1 + " " + line2
     elif block_type == "Table":
         return line1 + "\n\n" + line2
-    elif block_type in ["Formula"]:
-        return line1.rstrip() + "\n\n" + line2.lstrip()
     else:
         return line1 + "\n" + line2
 

diff --git a/marker/schema/block.py b/marker/schema/block.py
@@ -74,12 +74,11 @@ def filter_bad_span_types(self):
                 new_lines.append(line)
         self.lines = new_lines
 
-    def font_info(self, prop="font"):
-        font_info = []
-        for line in self.lines:
-            for span in line.spans:
-                font_info.append(getattr(span, prop))
-        return font_info
+    def get_min_line_start(self):
+        line_starts = [line.start for line in self.lines]
+        if len(line_starts) == 0:
+            return None
+        return min(line_starts)
 
 
 def bbox_from_lines(lines: List[Line]):

diff --git a/marker/schema/page.py b/marker/schema/page.py
@@ -31,30 +31,13 @@ def get_nonblank_spans(self) -> List[Span]:
         spans = [s for l in lines for s in l.spans if s.text.strip()]
         return spans
 
-    def get_font_stats(self):
-        fonts = [s.font for s in self.get_nonblank_spans()]
-        font_counts = Counter(fonts)
-        return font_counts
+    def get_font_sizes(self):
+        font_sizes = [s.font_size for s in self.get_nonblank_spans()]
+        return font_sizes
 
-    def get_line_height_stats(self):
+    def get_line_heights(self):
         heights = [l.bbox[3] - l.bbox[1] for l in self.get_nonblank_lines()]
-        height_counts = Counter(heights)
-        return height_counts
-
-    def get_line_start_stats(self):
-        starts = [l.bbox[0] for l in self.get_nonblank_lines()]
-        start_counts = Counter(starts)
-        return start_counts
-
-    def get_min_line_start(self):
-        starts = []
-        for block in self.blocks:
-            for line in block.lines:
-                if line.spans and block.block_type == "Text":
-                    starts.append(line.bbox[0])
-        if len(starts) == 0:
-            raise IndexError("No lines found")
-        return min(starts)
+        return heights
 
     @property
     def prelim_text(self):