diff --git a/marker/cleaners/code.py b/marker/cleaners/code.py index a444103d..973c323e 100644 --- a/marker/cleaners/code.py +++ b/marker/cleaners/code.py @@ -1,10 +1,13 @@ +from collections import Counter +from statistics import mean, median + from marker.schema.block import Span, Line from marker.schema.page import Page import re from typing import List -def is_code_linelen(lines, thresh=60): +def is_code_linelen(lines, thresh=80): # Decide based on chars per newline threshold total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines) total_newlines = max(len(lines) - 1, 1) @@ -23,64 +26,61 @@ def comment_count(lines): def identify_code_blocks(pages: List[Page]): code_block_count = 0 - font_info = None - for page in pages: - stats = page.get_font_stats() - if font_info is None: - font_info = stats - else: - font_info += stats - try: - most_common_font = font_info.most_common(1)[0][0] - except IndexError: - print(f"Could not find most common font") - most_common_font = None - - last_block = None + font_sizes = [] + line_heights = [] for page in pages: - try: - min_start = page.get_min_line_start() - except IndexError: - continue + font_sizes += page.get_font_sizes() + line_heights += page.get_line_heights() + + avg_font_size = None + avg_line_height = None + if len(font_sizes) > 0: + avg_line_height = median(line_heights) + avg_font_size = mean(font_sizes) + for page in pages: for block in page.blocks: if block.block_type != "Text": last_block = block continue + # Ensure we have lines and spans + if len(block.lines) == 0: + continue + if sum([len(line.spans) for line in block.lines]) == 0: + continue + + min_start = block.get_min_line_start() + is_indent = [] line_fonts = [] + line_font_sizes = [] + block_line_heights = [] for line in block.lines: - fonts = [span.font for span in line.spans] - line_fonts += fonts - line_start = line.bbox[0] - if line_start > min_start: - is_indent.append(True) - else: - is_indent.append(False) + line_fonts += [span.font for span in line.spans] + line_font_sizes += [span.font_size for span in line.spans] + block_line_heights.append(line.bbox[3] - line.bbox[1]) + + is_indent.append(line.bbox[0] > min_start) + comment_lines = comment_count([line.prelim_text for line in block.lines]) is_code = [ len(block.lines) > 3, - sum([f != most_common_font for f in line_fonts]) > len(line_fonts) * .8, # At least 80% of the fonts are not the most common, since code usually uses a different font from the main body text is_code_linelen(block.lines), - ( - sum(is_indent) > len(block.lines) * .2 - or - comment_lines > len(block.lines) * .2 - ), # 20% lines indented or 20% of the lines are comments + sum(is_indent) + comment_lines > len(block.lines) * .3, ] - # Check if previous block is code, and this block is indented - is_code_prev = [ - last_block and last_block.block_type == "Code", - sum(is_indent) >= len(block.lines) * .8 # At least 80% indented - ] + if avg_font_size is not None: + font_checks = [ + mean(line_font_sizes) <= avg_font_size, # Lower than average font size and line height + mean(block_line_heights) < avg_line_height + ] + is_code += font_checks - if all(is_code) or all(is_code_prev): + if all(is_code): code_block_count += 1 block.block_type = "Code" - last_block = block return code_block_count diff --git a/marker/equations/equations.py b/marker/equations/equations.py index 462bd405..8fb7cde1 100644 --- a/marker/equations/equations.py +++ b/marker/equations/equations.py @@ -90,7 +90,7 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu lines=[Line( spans=[ Span( - text="\n\n" + block_text.replace("\n", " ") + "\n\n", + text=block_text.replace("\n", " "), bbox=equation_bbox, span_id=f"{pnum}_{idx}_fixeq", font="Latex", @@ -109,7 +109,7 @@ def insert_latex_block(page_blocks: Page, page_equation_blocks, predictions, pnu fail_count += 1 else: success_count += 1 - new_block.lines[0].spans[0].text = latex_text + new_block.lines[0].spans[0].text = latex_text.replace("\n", " ") converted_spans.append(deepcopy(new_block.lines[0].spans[0])) # Add in the new LaTeX block diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py index fdba861f..a27c67c7 100644 --- a/marker/postprocessors/markdown.py +++ b/marker/postprocessors/markdown.py @@ -12,6 +12,7 @@ def escape_markdown(text): escaped_text = re.sub(characters_to_escape, r'\\\g<0>', text) return escaped_text + def surround_text(s, char_to_insert): leading_whitespace = re.match(r'^(\s*)', s).group(1) trailing_whitespace = re.search(r'(\s*)$', s).group(1) @@ -82,43 +83,45 @@ def block_surround(text, block_type): elif block_type == "List-item": text = escape_markdown(text) elif block_type == "Code": - text = "\n" + escape_markdown(text) + "\n" + text = "\n```\n" + text + "\n```\n" elif block_type == "Text": text = escape_markdown(text) + elif block_type == "Formula": + if text.strip().startswith("$$") and text.strip().endswith("$$"): + text = text.strip() + text = "\n" + text + "\n" return text def line_separator(line1, line2, block_type, is_continuation=False): # Should cover latin-derived languages and russian - lowercase_letters = r'(\p{Lo}+|\p{Ll}+)' + lowercase_letters = r'\p{Lo}+|\d+' # Remove hyphen in current line if next line and current line appear to be joined hyphen_pattern = regex.compile(rf'.*[{lowercase_letters}][-]\s?$', regex.DOTALL) if line1 and hyphen_pattern.match(line1) and regex.match(rf"^\s?[{lowercase_letters}]", line2): # Split on — or - from the right - line1 = re.split(r"[-—]\s?$", line1)[0] + line1 = regex.split(r"[-—]\s?$", line1)[0] return line1.rstrip() + line2.lstrip() - all_letters = r'\p{L}+' - sentence_continuations = r',;(—' - sentence_ends = r'。ๆ.?!' - line_end_pattern = regex.compile(rf'.*[{lowercase_letters}{sentence_continuations}]\s?$', regex.DOTALL) + all_letters = r'\p{L}+|\d+' + sentence_continuations = r',;\(\—' + sentence_ends = r'。ๆ\.?!' + line_end_pattern = regex.compile(rf'.*[{lowercase_letters}][{sentence_continuations}]?\s?$', regex.DOTALL) line_start_pattern = regex.compile(rf'^\s?[{all_letters}]', regex.DOTALL) sentence_end_pattern = regex.compile(rf'.*[{sentence_ends}]\s?$', regex.DOTALL) if block_type in ["Title", "Section-header"]: return line1.rstrip() + " " + line2.lstrip() + elif block_type == "Formula": + return line1 + "\n" + line2 elif line_end_pattern.match(line1) and line_start_pattern.match(line2) and block_type == "Text": return line1.rstrip() + " " + line2.lstrip() elif is_continuation: return line1.rstrip() + " " + line2.lstrip() elif block_type == "Text" and sentence_end_pattern.match(line1): return line1 + "\n\n" + line2 - elif block_type == "Formula": - return line1 + " " + line2 elif block_type == "Table": return line1 + "\n\n" + line2 - elif block_type in ["Formula"]: - return line1.rstrip() + "\n\n" + line2.lstrip() else: return line1 + "\n" + line2 diff --git a/marker/schema/block.py b/marker/schema/block.py index 50ae95c6..1ecd1a6a 100644 --- a/marker/schema/block.py +++ b/marker/schema/block.py @@ -74,12 +74,11 @@ def filter_bad_span_types(self): new_lines.append(line) self.lines = new_lines - def font_info(self, prop="font"): - font_info = [] - for line in self.lines: - for span in line.spans: - font_info.append(getattr(span, prop)) - return font_info + def get_min_line_start(self): + line_starts = [line.start for line in self.lines] + if len(line_starts) == 0: + return None + return min(line_starts) def bbox_from_lines(lines: List[Line]): diff --git a/marker/schema/page.py b/marker/schema/page.py index c4fca410..bf2bc6f9 100644 --- a/marker/schema/page.py +++ b/marker/schema/page.py @@ -31,30 +31,13 @@ def get_nonblank_spans(self) -> List[Span]: spans = [s for l in lines for s in l.spans if s.text.strip()] return spans - def get_font_stats(self): - fonts = [s.font for s in self.get_nonblank_spans()] - font_counts = Counter(fonts) - return font_counts + def get_font_sizes(self): + font_sizes = [s.font_size for s in self.get_nonblank_spans()] + return font_sizes - def get_line_height_stats(self): + def get_line_heights(self): heights = [l.bbox[3] - l.bbox[1] for l in self.get_nonblank_lines()] - height_counts = Counter(heights) - return height_counts - - def get_line_start_stats(self): - starts = [l.bbox[0] for l in self.get_nonblank_lines()] - start_counts = Counter(starts) - return start_counts - - def get_min_line_start(self): - starts = [] - for block in self.blocks: - for line in block.lines: - if line.spans and block.block_type == "Text": - starts.append(line.bbox[0]) - if len(starts) == 0: - raise IndexError("No lines found") - return min(starts) + return heights @property def prelim_text(self):