diff --git a/marker/cleaners/headings.py b/marker/cleaners/headings.py index 8cdcb9f6..8be2bfa8 100644 --- a/marker/cleaners/headings.py +++ b/marker/cleaners/headings.py @@ -63,6 +63,9 @@ def split_heading_blocks(pages: List[Page]): def bucket_headings(line_heights, num_levels=settings.HEADING_LEVEL_COUNT): + if len(line_heights) <= num_levels: + return [] + data = np.asarray(line_heights).reshape(-1, 1) labels = KMeans(n_clusters=num_levels, random_state=0, n_init="auto").fit_predict(data) data_labels = np.concatenate([data, labels.reshape(-1, 1)], axis=1) @@ -119,5 +122,5 @@ def infer_heading_levels(pages: List[Page], height_tol=.99): break if block.heading_level is None: - block.heading_level = min(len(heading_ranges), settings.HEADING_DEFAULT_LEVEL) + block.heading_level = settings.HEADING_DEFAULT_LEVEL diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py index 5029df3a..fdf40a22 100644 --- a/marker/postprocessors/markdown.py +++ b/marker/postprocessors/markdown.py @@ -143,7 +143,7 @@ def block_separator(prev_block: FullyMergedBlock, block: FullyMergedBlock): return sep + block.text -def merge_lines(blocks: List[List[MergedBlock]]): +def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=10): text_blocks = [] prev_type = None prev_line = None @@ -171,8 +171,9 @@ def merge_lines(blocks: List[List[MergedBlock]]): line_height = line.bbox[3] - line.bbox[1] prev_line_height = prev_line.bbox[3] - prev_line.bbox[1] if prev_line else 0 prev_line_x = prev_line.bbox[0] if prev_line else 0 + vertical_dist = min(abs(line.bbox[1] - prev_line.bbox[3]), abs(line.bbox[3] - prev_line.bbox[1])) if prev_line else 0 prev_line = line - is_continuation = line_height == prev_line_height and line.bbox[0] == prev_line_x + is_continuation = line_height == prev_line_height and line.bbox[0] == prev_line_x and vertical_dist < max_block_gap if block_text: block_text = line_separator(block_text, line.text, block_type, is_continuation) else: