Skip to content

Commit

Permalink
Initial working version
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 1, 2024
1 parent efa115c commit 4660a3d
Show file tree
Hide file tree
Showing 19 changed files with 217 additions and 91 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ python convert.py /path/to/input/folder /path/to/output/folder --workers 10 --ma
}
```

You can use language names or codes. See [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py) for a full list.
You can use language names or codes. The exact codes depend on the OCR engine. See [here](https://github.com/VikParuchuri/surya/blob/master/surya/languages.py) for the full list of surya codes, and [here](https://tesseract-ocr.github.io/tessdoc/Data-Files#data-files-for-version-400-november-29-2016) for the tesseract codes.

## Convert multiple files on multiple GPUs

Expand Down
22 changes: 10 additions & 12 deletions marker/cleaners/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ def comment_count(lines):
return sum([1 for line in lines if pattern.match(line)])


def identify_code_blocks(blocks: List[Page]):
def identify_code_blocks(pages: List[Page]):
code_block_count = 0
font_info = None
for p in blocks:
stats = p.get_font_stats()
for page in pages:
stats = page.get_font_stats()
if font_info is None:
font_info = stats
else:
Expand All @@ -37,14 +37,14 @@ def identify_code_blocks(blocks: List[Page]):
most_common_font = None

last_block = None
for page in blocks:
for page in pages:
try:
min_start = page.get_min_line_start()
except IndexError:
continue

for block in page.blocks:
if block.most_common_block_type() != "Text":
if block.block_type != "Text":
last_block = block
continue

Expand Down Expand Up @@ -72,24 +72,23 @@ def identify_code_blocks(blocks: List[Page]):

# Check if previous block is code, and this block is indented
is_code_prev = [
last_block and last_block.most_common_block_type() == "Code",
last_block and last_block.block_type == "Code",
sum(is_indent) >= len(block.lines) * .8 # At least 80% indented
]

if all(is_code) or all(is_code_prev):
code_block_count += 1
block.set_block_type("Code")
block.block_type = "Code"

last_block = block
return code_block_count


def indent_blocks(blocks: List[Page]):
def indent_blocks(pages: List[Page]):
span_counter = 0
for page in blocks:
for page in pages:
for block in page.blocks:
block_types = [span.block_type for line in block.lines for span in line.spans]
if "Code" not in block_types:
if block.block_type != "Code":
continue

lines = []
Expand Down Expand Up @@ -124,7 +123,6 @@ def indent_blocks(blocks: List[Page]):
font=block.lines[0].spans[0].font,
font_weight=block.lines[0].spans[0].font_weight,
font_size=block.lines[0].spans[0].font_size,
block_type="Code"
)
span_counter += 1
block.lines = [Line(spans=[new_span], bbox=block.bbox)]
34 changes: 18 additions & 16 deletions marker/cleaners/equations.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from marker.debug.data import dump_equation_debug_data
from marker.pdf.images import render_image
from marker.settings import settings
from marker.schema.schema import Span, Line, Block, BlockType
from marker.schema.schema import Span, Line, Block
from marker.schema.page import Page
import os

Expand Down Expand Up @@ -79,7 +79,7 @@ def get_total_texify_tokens(text, processor):
return len(tokens["input_ids"])


def find_page_equation_regions(page):
def find_page_equation_regions(page, processor):
i = 0
reformatted_blocks = set()
reformat_regions = []
Expand All @@ -106,7 +106,7 @@ def find_page_equation_regions(page):
prev_block = page.blocks[j]
prev_bbox = prev_block.bbox
prelim_block_text = prev_block.prelim_text + " " + block_text
if get_total_texify_tokens(prelim_block_text) >= settings.TEXIFY_MODEL_MAX:
if get_total_texify_tokens(prelim_block_text, processor) >= settings.TEXIFY_MODEL_MAX:
break

block_text = prelim_block_text
Expand All @@ -120,7 +120,7 @@ def find_page_equation_regions(page):
while (should_merge_blocks(bbox, next_bbox) or next_block.block_type in ["Formula"]) and i + 1 < len(page.blocks):
bbox = merge_boxes(bbox, next_bbox)
prelim_block_text = block_text + " " + next_block.prelim_text
if get_total_texify_tokens(prelim_block_text) >= settings.TEXIFY_MODEL_MAX:
if get_total_texify_tokens(prelim_block_text, processor) >= settings.TEXIFY_MODEL_MAX:
break

block_text = prelim_block_text
Expand All @@ -130,7 +130,7 @@ def find_page_equation_regions(page):
next_block = page.blocks[i + 1]
next_bbox = next_block.bbox

total_tokens = get_total_texify_tokens(block_text)
total_tokens = get_total_texify_tokens(block_text, processor)
ordered_blocks = sorted(([sb[0] for sb in selected_blocks]))
if total_tokens < settings.TEXIFY_MODEL_MAX:
# Get indices of all blocks to merge
Expand Down Expand Up @@ -160,7 +160,7 @@ def get_bboxes_for_region(page, region):
return bboxes, merged_box


def replace_blocks_with_latex(page_blocks: Page, merged_boxes, reformat_regions, predictions, pnum):
def replace_blocks_with_latex(page_blocks: Page, merged_boxes, reformat_regions, predictions, pnum, processor):
new_blocks = []
converted_spans = []
current_region = 0
Expand All @@ -178,7 +178,7 @@ def replace_blocks_with_latex(page_blocks: Page, merged_boxes, reformat_regions,
latex_text = predictions[current_region]
conditions = [
len(latex_text) > 0,
get_total_texify_tokens(latex_text) < settings.TEXIFY_MODEL_MAX, # Make sure we didn't run to the overall token max
get_total_texify_tokens(latex_text, processor) < settings.TEXIFY_MODEL_MAX, # Make sure we didn't run to the overall token max
len(latex_text) > len(orig_block_text) * .8,
len(latex_text.strip()) > 0
]
Expand Down Expand Up @@ -209,21 +209,22 @@ def replace_blocks_with_latex(page_blocks: Page, merged_boxes, reformat_regions,
new_blocks.append(Block(
lines=[block_line],
bbox=merged_boxes[current_region],
pnum=pnum
pnum=pnum,
block_type="Formula"
))
current_region += 1
return new_blocks, success_count, fail_count, converted_spans


def replace_equations(doc, blocks: List[Page], texify_model, batch_size=settings.TEXIFY_BATCH_SIZE):
def replace_equations(doc, pages: List[Page], texify_model, batch_size=settings.TEXIFY_BATCH_SIZE):
unsuccessful_ocr = 0
successful_ocr = 0

# Find potential equation regions, and length of text in each region
reformat_regions = []
reformat_region_lens = []
for pnum, page in enumerate(blocks):
regions, region_lens = find_page_equation_regions(page)
for pnum, page in enumerate(pages):
regions, region_lens = find_page_equation_regions(page, texify_model.processor)
reformat_regions.append(regions)
reformat_region_lens.append(region_lens)

Expand All @@ -236,7 +237,7 @@ def replace_equations(doc, blocks: List[Page], texify_model, batch_size=settings
for page_idx, reformat_regions_page in enumerate(reformat_regions):
page_obj = doc[page_idx]
for reformat_region in reformat_regions_page:
bboxes, merged_box = get_bboxes_for_region(blocks[page_idx], reformat_region)
bboxes, merged_box = get_bboxes_for_region(pages[page_idx], reformat_region)
png_image = get_masked_image(page_obj, merged_box, bboxes)
images.append(png_image)
merged_boxes.append(merged_box)
Expand All @@ -251,19 +252,20 @@ def replace_equations(doc, blocks: List[Page], texify_model, batch_size=settings
page_predictions = predictions[page_start:page_start + len(reformat_regions_page)]
page_boxes = merged_boxes[page_start:page_start + len(reformat_regions_page)]
new_page_blocks, success_count, fail_count, converted_span = replace_blocks_with_latex(
blocks[page_idx],
pages[page_idx],
page_boxes,
reformat_regions_page,
page_predictions,
page_idx
page_idx,
texify_model.processor
)
converted_spans.extend(converted_span)
blocks[page_idx].blocks = new_page_blocks
pages[page_idx].blocks = new_page_blocks
page_start += len(reformat_regions_page)
successful_ocr += success_count
unsuccessful_ocr += fail_count

# If debug mode is on, dump out conversions for comparison
dump_equation_debug_data(doc, images, converted_spans)

return blocks, {"successful_ocr": successful_ocr, "unsuccessful_ocr": unsuccessful_ocr, "equations": eq_count}
return pages, {"successful_ocr": successful_ocr, "unsuccessful_ocr": unsuccessful_ocr, "equations": eq_count}
10 changes: 6 additions & 4 deletions marker/cleaners/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@ def merge_table_blocks(blocks: List[Page]):
new_page_blocks = []
pnum = page.pnum
for block in page.blocks:
if block.most_common_block_type() != "Table":
if block.block_type != "Table":
if len(current_lines) > 0:
new_block = Block(
lines=deepcopy(current_lines),
pnum=pnum,
bbox=current_bbox
bbox=current_bbox,
block_type="Table"
)
new_page_blocks.append(new_block)
current_lines = []
Expand All @@ -38,7 +39,8 @@ def merge_table_blocks(blocks: List[Page]):
new_block = Block(
lines=deepcopy(current_lines),
pnum=pnum,
bbox=current_bbox
bbox=current_bbox,
block_type="Table"
)
new_page_blocks.append(new_block)
current_lines = []
Expand All @@ -54,7 +56,7 @@ def create_new_tables(blocks: List[Page]):

for page in blocks:
for block in page.blocks:
if block.most_common_block_type() != "Table" or len(block.lines) < 3:
if block.block_type != "Table" or len(block.lines) < 3:
continue

table_rows = []
Expand Down
5 changes: 4 additions & 1 deletion marker/convert.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import warnings
warnings.filterwarnings("ignore", category=UserWarning) # Filter torch pytree user warnings

import pypdfium2 as pdfium

from marker.cleaners.table import merge_table_blocks, create_new_tables
Expand Down Expand Up @@ -113,7 +116,7 @@ def convert_single_pdf(

# Copy to avoid changing original data
merged_lines = merge_spans(filtered)
text_blocks = merge_lines(merged_lines, filtered)
text_blocks = merge_lines(merged_lines)
text_blocks = filter_common_titles(text_blocks)
full_text = get_full_text(text_blocks)

Expand Down
39 changes: 20 additions & 19 deletions marker/layout/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from surya.layout import batch_layout_detection

from marker.pdf.images import render_image
from marker.pdf.utils import rescale_bbox
from marker.schema.page import Page
from marker.settings import settings

Expand All @@ -17,23 +18,23 @@ def surya_layout(doc, pages: List[Page], layout_model):
page.layout = layout_result


def annotate_block_types(page):
max_intersections = {}
for i, block in enumerate(page.blocks):
bbox = block.bbox
for j, layout_block in enumerate(page.layout.bboxes):
layout_bbox = layout_block.bbox
intersection_pct = bbox.intersection_pct(layout_bbox)
if i not in max_intersections:
max_intersections[i] = (intersection_pct, j)
elif intersection_pct > max_intersections[i][0]:
max_intersections[i] = (intersection_pct, j)

for i, block in enumerate(page.blocks):
block = page.blocks[i]
if i in max_intersections:
j = max_intersections[i][1]
block_type = page.layout.bboxes[j].label
else:
def annotate_block_types(pages: List[Page]):
for page in pages:
max_intersections = {}
for i, block in enumerate(page.blocks):
for j, layout_block in enumerate(page.layout.bboxes):
layout_bbox = layout_block.bbox
layout_bbox = rescale_bbox(page.layout.image_bbox, page.bbox, layout_bbox)
intersection_pct = block.intersection_pct(layout_bbox)
if i not in max_intersections:
max_intersections[i] = (intersection_pct, j)
elif intersection_pct > max_intersections[i][0]:
max_intersections[i] = (intersection_pct, j)

for i, block in enumerate(page.blocks):
block = page.blocks[i]
block_type = "Text"
block.block_type = block_type
if i in max_intersections:
j = max_intersections[i][1]
block_type = page.layout.bboxes[j].label
block.block_type = block_type
2 changes: 2 additions & 0 deletions marker/layout/order.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from surya.ordering import batch_ordering

from marker.pdf.images import render_image
from marker.pdf.utils import rescale_bbox
from marker.schema.page import Page
from marker.settings import settings

Expand Down Expand Up @@ -32,6 +33,7 @@ def sort_blocks_in_reading_order(pages: List[Page]):
for order_box in order.bboxes:
order_bbox = order_box.bbox
position = order_box.position
order_bbox = rescale_bbox(order.image_bbox, page.bbox, order_bbox)
block_intersection = block.intersection_pct(order_bbox)
if i not in block_positions:
block_positions[i] = (block_intersection, position)
Expand Down
2 changes: 1 addition & 1 deletion marker/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ def configure_logging():
logging.getLogger('PIL').setLevel(logging.ERROR)
logging.getLogger('fitz').setLevel(logging.ERROR)
logging.getLogger('ocrmypdf').setLevel(logging.ERROR)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
14 changes: 6 additions & 8 deletions marker/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,12 @@ def surround_text(s, char_to_insert):
return final_string


def merge_spans(blocks):
def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]:
merged_blocks = []
for page in blocks:
for page in pages:
page_blocks = []
for blocknum, block in enumerate(page.blocks):
block_lines = []
block_types = []
for linenum, line in enumerate(block.lines):
line_text = ""
if len(line.spans) == 0:
Expand All @@ -37,7 +36,6 @@ def merge_spans(blocks):
break

fonts.append(font)
block_types.append(span.block_type)
span_text = span.text

# Don't bold or italicize very short sequences
Expand All @@ -58,7 +56,7 @@ def merge_spans(blocks):
lines=block_lines,
pnum=block.pnum,
bbox=block.bbox,
block_types=block_types
block_type=block.block_type
))
merged_blocks.append(page_blocks)

Expand Down Expand Up @@ -118,16 +116,16 @@ def block_separator(line1, line2, block_type1, block_type2):
return sep + line2


def merge_lines(blocks, page_blocks: List[Page]):
def merge_lines(blocks: List[List[MergedBlock]]):
text_blocks = []
prev_type = None
prev_line = None
block_text = ""
block_type = ""
common_line_heights = [p.get_line_height_stats() for p in page_blocks]

for page in blocks:
for block in page:
block_type = block.most_common_block_type()
block_type = block.block_type
if block_type != prev_type and prev_type:
text_blocks.append(
FullyMergedBlock(
Expand Down
Loading

0 comments on commit 4660a3d

Please sign in to comment.