Skip to content

Commit

Permalink
Fix bug with headers
Browse files (browse the repository at this point in the history)
  • Loading branch information
VikParuchuri committed Oct 17, 2024
1 parent eb2a205 commit db27e98
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 3 deletions.
5 changes: 4 additions & 1 deletion marker/cleaners/headings.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ def split_heading_blocks(pages: List[Page]):


def bucket_headings(line_heights, num_levels=settings.HEADING_LEVEL_COUNT):
+ if len(line_heights) <= num_levels:
+ return []

data = np.asarray(line_heights).reshape(-1, 1)
labels = KMeans(n_clusters=num_levels, random_state=0, n_init="auto").fit_predict(data)
data_labels = np.concatenate([data, labels.reshape(-1, 1)], axis=1)
Expand Down Expand Up @@ -119,5 +122,5 @@ def infer_heading_levels(pages: List[Page], height_tol=.99):
break

if block.heading_level is None:
- block.heading_level = min(len(heading_ranges), settings.HEADING_DEFAULT_LEVEL)
+ block.heading_level = settings.HEADING_DEFAULT_LEVEL

5 changes: 3 additions & 2 deletions marker/postprocessors/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def block_separator(prev_block: FullyMergedBlock, block: FullyMergedBlock):
return sep + block.text


- def merge_lines(blocks: List[List[MergedBlock]]):
+ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=10):
text_blocks = []
prev_type = None
prev_line = None
Expand Down Expand Up @@ -171,8 +171,9 @@ def merge_lines(blocks: List[List[MergedBlock]]):
line_height = line.bbox[3] - line.bbox[1]
prev_line_height = prev_line.bbox[3] - prev_line.bbox[1] if prev_line else 0
prev_line_x = prev_line.bbox[0] if prev_line else 0
+ vertical_dist = min(abs(line.bbox[1] - prev_line.bbox[3]), abs(line.bbox[3] - prev_line.bbox[1])) if prev_line else 0
prev_line = line
- is_continuation = line_height == prev_line_height and line.bbox[0] == prev_line_x
+ is_continuation = line_height == prev_line_height and line.bbox[0] == prev_line_x and vertical_dist < max_block_gap
if block_text:
block_text = line_separator(block_text, line.text, block_type, is_continuation)
else:
Expand Down

0 comments on commit db27e98

Please sign in to comment.