Commit 5f6079f
Add github action, debug settings
VikParuchuri committed Dec 5, 2023
1 parent c3d8b1d commit 5f6079f
Showing 6 changed files with 154 additions and 7 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,37 @@
name: Integration test with benchmark

on: [push]

env:
  TESSDATA_PREFIX: "/usr/share/tesseract-ocr/5/tessdata"
  TORCH_DEVICE: "cpu"
  OCR_ENGINE: "tesseract" # So we don't have to install ghostscript, which takes a long time

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.12
        uses: actions/setup-python@v4
        with:
          python-version: 3.12
      - name: Install system dependencies
        run: cat scripts/install/apt-requirements.txt | xargs sudo apt-get install -y
      - name: Install tesseract 5
        run: bash scripts/install/tesseract_5_install.sh
      - name: Install python dependencies
        run: |
          pip install poetry
          poetry install
      - name: Download benchmark data
        run: |
          wget "https://drive.google.com/uc?export=download&id=1ktVDYPEeyHlKLaF56FnHjI5VjVnYa1xL" -O benchmark_data.zip
          unzip benchmark_data.zip
      - name: Run benchmark test
        run: |
          poetry run python benchmark.py benchmark_data/pdfs benchmark_data/references report.json
          poetry run python scripts/verify_benchmark_scores.py report.json
19 changes: 13 additions & 6 deletions marker/cleaners/equations.py
@@ -1,19 +1,18 @@
import io
from concurrent.futures import ThreadPoolExecutor
from copy import deepcopy
from functools import partial
from typing import List

import torch
from nougat import NougatModel
-from nougat.postprocessing import close_envs, markdown_compatible
+from nougat.postprocessing import markdown_compatible
from nougat.utils.checkpoint import get_checkpoint
import re
from PIL import Image, ImageDraw
import fitz as pymupdf
from nougat.utils.dataset import ImageDataset

-from marker.bbox import should_merge_blocks, merge_boxes, multiple_boxes_intersect
+from marker.bbox import should_merge_blocks, merge_boxes
+from marker.debug.data import dump_nougat_debug_data
from marker.settings import settings
from marker.schema import Page, Span, Line, Block, BlockType
from nougat.utils.device import move_to_device
@@ -209,6 +208,7 @@ def get_bboxes_for_region(page, region):

def replace_blocks_with_nougat_predictions(page_blocks: Page, merged_boxes, reformat_regions, predictions, pnum, nougat_model):
    new_blocks = []
+    converted_spans = []
    current_region = 0
    idx = 0
    success_count = 0
@@ -233,6 +233,7 @@ def replace_blocks_with_nougat_predictions(page_blocks: Page, merged_boxes, refo
        idx = reformat_regions[current_region][-1] + 1
        if not all(conditions):
            fail_count += 1
+            converted_spans.append(None)
            for i in reformat_regions[current_region]:
                new_blocks.append(page_blocks.blocks[i])
        else:
@@ -250,13 +251,14 @@ def replace_blocks_with_nougat_predictions(page_blocks: Page, merged_boxes, refo
                ],
                bbox=merged_boxes[current_region]
            )
+            converted_spans.append(deepcopy(block_line.spans[0]))
            new_blocks.append(Block(
                lines=[block_line],
                bbox=merged_boxes[current_region],
                pnum=pnum
            ))
            current_region += 1
-    return new_blocks, success_count, fail_count
+    return new_blocks, success_count, fail_count, converted_spans


def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]], nougat_model, batch_size=settings.NOUGAT_BATCH_SIZE):
@@ -290,20 +292,25 @@ def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]

    # Replace blocks with predictions
    page_start = 0
+    converted_spans = []
    for page_idx, reformat_regions_page in enumerate(reformat_regions):
        page_predictions = predictions[page_start:page_start + len(reformat_regions_page)]
        page_boxes = merged_boxes[page_start:page_start + len(reformat_regions_page)]
-        new_page_blocks, success_count, fail_count = replace_blocks_with_nougat_predictions(
+        new_page_blocks, success_count, fail_count, converted_span = replace_blocks_with_nougat_predictions(
            blocks[page_idx],
            page_boxes,
            reformat_regions_page,
            page_predictions,
            page_idx,
            nougat_model
        )
+        converted_spans.extend(converted_span)
        blocks[page_idx].blocks = new_page_blocks
        page_start += len(reformat_regions_page)
        successful_ocr += success_count
        unsuccessful_ocr += fail_count

+    # If debug mode is on, dump out conversions for comparison
+    dump_nougat_debug_data(doc, images, converted_spans)

    return blocks, {"successful_ocr": successful_ocr, "unsuccessful_ocr": unsuccessful_ocr, "equations": eq_count}
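A note on the converted_spans bookkeeping above: the list stays index-aligned with the images sent to Nougat, because rejected regions append None instead of being skipped, and the debug dump asserts that alignment. A minimal sketch of the invariant, using illustrative stand-in data rather than marker's real types:

# Sketch of the alignment invariant (stand-in data, not marker's real objects).
images = ["img_a", "img_b", "img_c"]      # one rendered image per reformat region
predictions = ["\\alpha + 1", "", "x^2"]  # "" stands for a rejected conversion

converted_spans = []
for pred in predictions:
    if not pred:
        converted_spans.append(None)      # placeholder keeps indices aligned
    else:
        converted_spans.append(pred)      # marker appends a deepcopy of the new Span

assert len(converted_spans) == len(images)  # what dump_nougat_debug_data asserts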
4 changes: 4 additions & 0 deletions marker/convert.py
@@ -1,6 +1,7 @@
import fitz as pymupdf

from marker.cleaners.table import merge_table_blocks, create_new_tables
+from marker.debug.data import dump_bbox_debug_data
from marker.extract_text import get_text_blocks
from marker.cleaners.headers import filter_header_footer, filter_common_titles
from marker.cleaners.equations import replace_equations
@@ -117,6 +118,9 @@ def convert_single_pdf(

    annotate_spans(blocks, block_types)

+    # Dump debug data if flags are set
+    dump_bbox_debug_data(doc, blocks)

    blocks = order_blocks(
        doc,
        blocks,
76 changes: 76 additions & 0 deletions marker/debug/data.py
@@ -0,0 +1,76 @@
import base64
import json
import os
import zlib
from typing import List

from marker.schema import Page
from marker.settings import settings
from PIL import Image
import io


def dump_nougat_debug_data(doc, images, converted_spans):
    if not settings.DEBUG or not settings.DEBUG_DATA_FOLDER:
        return

    # We attempted one conversion per image
    assert len(converted_spans) == len(images)

    data_lines = []
    for idx, (image, converted_span) in enumerate(zip(images, converted_spans)):
        if converted_span is None:
            continue
        # Image is a BytesIO object
        pil_image = Image.open(image)
        img_bytes = io.BytesIO()
        pil_image.save(img_bytes, format="WEBP", lossless=True)
        b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
        data_lines.append({
            "image": b64_image,
            "text": converted_span.text,
            "bbox": converted_span.bbox
        })

    # Remove extension from doc name
    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]

    debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
    with open(debug_file, "w+") as f:
        json.dump(data_lines, f, indent=4)


def dump_bbox_debug_data(doc, blocks: List[Page]):
    if not settings.DEBUG or not settings.DEBUG_DATA_FOLDER:
        return

    # Remove extension from doc name
    doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]

    debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
    debug_data = []
    for idx, page_blocks in enumerate(blocks):
        page = doc[idx]

        pix = page.get_pixmap(dpi=settings.NOUGAT_DPI, annots=False, clip=page_blocks.bbox)
        png = pix.pil_tobytes(format="PNG")
        png_image = Image.open(io.BytesIO(png))
        width, height = png_image.size
        max_dimension = 6000
        if width > max_dimension or height > max_dimension:
            scaling_factor = min(max_dimension / width, max_dimension / height)
            png_image = png_image.resize((int(width * scaling_factor), int(height * scaling_factor)), Image.ANTIALIAS)

        img_bytes = io.BytesIO()
        png_image.save(img_bytes, format="WEBP", lossless=True, quality=100)
        b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")

        page_data = page_blocks.model_dump()
        page_data["image"] = b64_image
        debug_data.append(page_data)

    with open(debug_file, "w+") as f:
        json.dump(debug_data, f, indent=4)



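The dump files are plain JSON, so they can be inspected without marker. A minimal sketch for decoding an _equations.json dump back into images; the input path is illustrative, since real dumps land in settings.DEBUG_DATA_FOLDER:

import base64
import io
import json

from PIL import Image

# Illustrative path; real dumps are written to settings.DEBUG_DATA_FOLDER.
with open("debug_data/mydoc_equations.json") as f:
    entries = json.load(f)

for idx, entry in enumerate(entries):
    # Each "image" field is a base64-encoded lossless WEBP.
    img = Image.open(io.BytesIO(base64.b64decode(entry["image"])))
    print(idx, entry["bbox"], entry["text"][:60])
    img.save(f"equation_{idx}.webp")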
5 changes: 4 additions & 1 deletion marker/settings.py
@@ -13,7 +13,6 @@ class Settings(BaseSettings):
    TORCH_DEVICE: str = "cpu"
    INFERENCE_RAM: int = 40 # How much VRAM each GPU has (in GB).
    VRAM_PER_TASK: float = 2.5 # How much VRAM to allocate per task (in GB). Peak marker VRAM usage is around 3GB, but avg across workers is lower.
-    DEBUG: bool = False # Enable debug logging
    DEFAULT_LANG: str = "English" # Default language we assume files to be in, should be one of the keys in TESSERACT_LANGUAGES

    SUPPORTED_FILETYPES: Dict = {
@@ -85,6 +84,10 @@ class Settings(BaseSettings):
    RAY_DASHBOARD_HOST: str = "127.0.0.1"
    RAY_CORES_PER_WORKER: int = 1 # How many cpu cores to allocate per worker

+    # Debug
+    DEBUG: bool = False # Enable debug logging
+    DEBUG_DATA_FOLDER: Optional[str] = None

    @computed_field
    @property
    def CUDA(self) -> bool:
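Since Settings extends pydantic's BaseSettings, the new fields should be overridable from the environment like the existing ones. A sketch, assuming the settings object is instantiated when marker.settings is first imported:

import os

# Illustrative values; set these before marker.settings is first imported.
os.environ["DEBUG"] = "true"
os.environ["DEBUG_DATA_FOLDER"] = "debug_data"
os.makedirs("debug_data", exist_ok=True)

from marker.settings import settings

assert settings.DEBUG is True
assert settings.DEBUG_DATA_FOLDER == "debug_data"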
20 changes: 20 additions & 0 deletions scripts/verify_benchmark_scores.py
@@ -0,0 +1,20 @@
import json
import argparse


def verify_scores(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)

    multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
    switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]

    if multicolcnn_score <= 0.4 or switch_trans_score <= 0.4:
        raise ValueError("One or more scores are below the required threshold of 0.4")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Verify benchmark scores")
    parser.add_argument("file_path", type=str, help="Path to the json file")
    args = parser.parse_args()
    verify_scores(args.file_path)
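The script assumes report.json has the nested shape it indexes into: a top-level "marker" key with per-file scores. A minimal illustrative report that would pass the check (scores are made up, not real benchmark output):

import json

# Illustrative scores; real values come from benchmark.py.
report = {
    "marker": {
        "files": {
            "multicolcnn.pdf": {"score": 0.55},
            "switch_trans.pdf": {"score": 0.62},
        }
    }
}
with open("report.json", "w") as f:
    json.dump(report, f, indent=4)

# verify_scores("report.json") now passes, since both scores exceed 0.4.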
