Skip to content

Commit

Permalink
Fix code block formatting
Browse files (browse the repository at this point in the history)
  • Loading branch information
VikParuchuri committed May 8, 2024
1 parent 287f546 commit aaef442
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 80 deletions.
78 changes: 39 additions & 39 deletions marker/cleaners/code.py
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,13 @@
from collections import Counter
from statistics import mean, median

from marker.schema.block import Span, Line
from marker.schema.page import Page
import re
from typing import List


def is_code_linelen(lines, thresh=60):
def is_code_linelen(lines, thresh=80):
# Decide based on chars per newline threshold
total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
total_newlines = max(len(lines) - 1, 1)
Expand All @@ -23,64 +26,61 @@ def comment_count(lines):

def identify_code_blocks(pages: List[Page]):
code_block_count = 0
font_info = None
for page in pages:
stats = page.get_font_stats()
if font_info is None:
font_info = stats
else:
font_info += stats
try:
most_common_font = font_info.most_common(1)[0][0]
except IndexError:
print(f"Could not find most common font")
most_common_font = None

last_block = None
font_sizes = []
line_heights = []
for page in pages:
try:
min_start = page.get_min_line_start()
except IndexError:
continue
font_sizes += page.get_font_sizes()
line_heights += page.get_line_heights()

avg_font_size = None
avg_line_height = None
if len(font_sizes) > 0:
avg_line_height = median(line_heights)
avg_font_size = mean(font_sizes)

for page in pages:
for block in page.blocks:
if block.block_type != "Text":
last_block = block
continue

# Ensure we have lines and spans
if len(block.lines) == 0:
continue
if sum([len(line.spans) for line in block.lines]) == 0:
continue

min_start = block.get_min_line_start()

is_indent = []
line_fonts = []
line_font_sizes = []
block_line_heights = []
for line in block.lines:
fonts = [span.font for span in line.spans]
line_fonts += fonts
line_start = line.bbox[0]
if line_start > min_start:
is_indent.append(True)
else:
is_indent.append(False)
line_fonts += [span.font for span in line.spans]
line_font_sizes += [span.font_size for span in line.spans]
block_line_heights.append(line.bbox[3] - line.bbox[1])

is_indent.append(line.bbox[0] > min_start)

comment_lines = comment_count([line.prelim_text for line in block.lines])
is_code = [
len(block.lines) > 3,
sum([f != most_common_font for f in line_fonts]) > len(line_fonts) * .8, # At least 80% of the fonts are not the most common, since code usually uses a different font from the main body text
is_code_linelen(block.lines),
(
sum(is_indent) > len(block.lines) * .2
or
comment_lines > len(block.lines) * .2
), # 20% lines indented or 20% of the lines are comments
sum(is_indent) + comment_lines > len(block.lines) * .3,
]

# Check if previous block is code, and this block is indented
is_code_prev = [
last_block and last_block.block_type == "Code",
sum(is_indent) >= len(block.lines) * .8 # At least 80% indented
]
if avg_font_size is not None:
font_checks = [
mean(line_font_sizes) <= avg_font_size, # Lower than average font size and line height
mean(block_line_heights) < avg_line_height
]
is_code += font_checks

if all(is_code) or all(is_code_prev):
if all(is_code):
code_block_count += 1
block.block_type = "Code"

last_block = block
return code_block_count


Expand Down
4 changes: 2 additions & 2 deletions marker/equations/equations.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -90,7 +90,7 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
lines=[Line(
spans=[
Span(
text="\n\n" + block_text.replace("\n", " ") + "\n\n",
text=block_text.replace("\n", " "),
bbox=equation_bbox,
span_id=f"{pnum}_{idx}_fixeq",
font="Latex",
Expand All @@ -109,7 +109,7 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu
fail_count += 1
else:
success_count += 1
new_block.lines[0].spans[0].text = latex_text
new_block.lines[0].spans[0].text = latex_text.replace("\n", " ")
converted_spans.append(deepcopy(new_block.lines[0].spans[0]))

# Add in the new LaTeX block
Expand Down
25 changes: 14 additions & 11 deletions marker/postprocessors/markdown.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,6 +12,7 @@ def escape_markdown(text):
escaped_text = re.sub(characters_to_escape, r'\\\g<0>', text)
return escaped_text


def surround_text(s, char_to_insert):
leading_whitespace = re.match(r'^(\s*)', s).group(1)
trailing_whitespace = re.search(r'(\s*)$', s).group(1)
Expand Down Expand Up @@ -82,43 +83,45 @@ def block_surround(text, block_type):
elif block_type == "List-item":
text = escape_markdown(text)
elif block_type == "Code":
text = "\n" + escape_markdown(text) + "\n"
text = "\n```\n" + text + "\n```\n"
elif block_type == "Text":
text = escape_markdown(text)
elif block_type == "Formula":
if text.strip().startswith("$$") and text.strip().endswith("$$"):
text = text.strip()
text = "\n" + text + "\n"
return text


def line_separator(line1, line2, block_type, is_continuation=False):
# Should cover latin-derived languages and russian
lowercase_letters = r'(\p{Lo}+|\p{Ll}+)'
lowercase_letters = r'\p{Lo}+|\d+'
# Remove hyphen in current line if next line and current line appear to be joined
hyphen_pattern = regex.compile(rf'.*[{lowercase_letters}][-]\s?$', regex.DOTALL)
if line1 and hyphen_pattern.match(line1) and regex.match(rf"^\s?[{lowercase_letters}]", line2):
# Split on — or - from the right
line1 = re.split(r"[-—]\s?$", line1)[0]
line1 = regex.split(r"[-—]\s?$", line1)[0]
return line1.rstrip() + line2.lstrip()

all_letters = r'\p{L}+'
sentence_continuations = r',;(—'
sentence_ends = r'。ๆ.?!'
line_end_pattern = regex.compile(rf'.*[{lowercase_letters}{sentence_continuations}]\s?$', regex.DOTALL)
all_letters = r'\p{L}+|\d+'
sentence_continuations = r',;\(\—'
sentence_ends = r'。ๆ\.?!'
line_end_pattern = regex.compile(rf'.*[{lowercase_letters}][{sentence_continuations}]?\s?$', regex.DOTALL)
line_start_pattern = regex.compile(rf'^\s?[{all_letters}]', regex.DOTALL)
sentence_end_pattern = regex.compile(rf'.*[{sentence_ends}]\s?$', regex.DOTALL)

if block_type in ["Title", "Section-header"]:
return line1.rstrip() + " " + line2.lstrip()
elif block_type == "Formula":
return line1 + "\n" + line2
elif line_end_pattern.match(line1) and line_start_pattern.match(line2) and block_type == "Text":
return line1.rstrip() + " " + line2.lstrip()
elif is_continuation:
return line1.rstrip() + " " + line2.lstrip()
elif block_type == "Text" and sentence_end_pattern.match(line1):
return line1 + "\n\n" + line2
elif block_type == "Formula":
return line1 + " " + line2
elif block_type == "Table":
return line1 + "\n\n" + line2
elif block_type in ["Formula"]:
return line1.rstrip() + "\n\n" + line2.lstrip()
else:
return line1 + "\n" + line2

Expand Down
11 changes: 5 additions & 6 deletions marker/schema/block.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -74,12 +74,11 @@ def filter_bad_span_types(self):
new_lines.append(line)
self.lines = new_lines

def font_info(self, prop="font"):
font_info = []
for line in self.lines:
for span in line.spans:
font_info.append(getattr(span, prop))
return font_info
def get_min_line_start(self):
line_starts = [line.start for line in self.lines]
if len(line_starts) == 0:
return None
return min(line_starts)


def bbox_from_lines(lines: List[Line]):
Expand Down
27 changes: 5 additions & 22 deletions marker/schema/page.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -31,30 +31,13 @@ def get_nonblank_spans(self) -> List[Span]:
spans = [s for l in lines for s in l.spans if s.text.strip()]
return spans

def get_font_stats(self):
fonts = [s.font for s in self.get_nonblank_spans()]
font_counts = Counter(fonts)
return font_counts
def get_font_sizes(self):
font_sizes = [s.font_size for s in self.get_nonblank_spans()]
return font_sizes

def get_line_height_stats(self):
def get_line_heights(self):
heights = [l.bbox[3] - l.bbox[1] for l in self.get_nonblank_lines()]
height_counts = Counter(heights)
return height_counts

def get_line_start_stats(self):
starts = [l.bbox[0] for l in self.get_nonblank_lines()]
start_counts = Counter(starts)
return start_counts

def get_min_line_start(self):
starts = []
for block in self.blocks:
for line in block.lines:
if line.spans and block.block_type == "Text":
starts.append(line.bbox[0])
if len(starts) == 0:
raise IndexError("No lines found")
return min(starts)
return heights

@property
def prelim_text(self):
Expand Down

0 comments on commit aaef442

Please sign in to comment.