Initial table integration
VikParuchuri committed Oct 15, 2024
1 parent 96379ed commit 04d308e
Showing 16 changed files with 452 additions and 996 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/tests.yml
@@ -29,10 +29,6 @@ jobs:
run: |
poetry run python benchmarks/overall.py benchmark_data/pdfs benchmark_data/references report.json
poetry run python scripts/verify_benchmark_scores.py report.json --type marker
-      - name: Run table benchmark
-        run: |
-          poetry run python benchmarks/table.py tables.json
-          poetry run python scripts/verify_benchmark_scores.py tables.json --type table
4 changes: 2 additions & 2 deletions CLA.md
@@ -1,6 +1,6 @@
Marker Contributor Agreement

-This Marker Contributor Agreement ("MCA") applies to any contribution that you make to any product or project managed by us (the "project"), and sets out the intellectual property rights you grant to us in the contributed materials. The term "us" shall mean Vikas Paruchuri. The term "you" shall mean the person or entity identified below.
+This Marker Contributor Agreement ("MCA") applies to any contribution that you make to any product or project managed by us (the "project"), and sets out the intellectual property rights you grant to us in the contributed materials. The term "us" shall mean Endless Labs, Inc. The term "you" shall mean the person or entity identified below.

If you agree to be bound by these terms, sign by writing "I have read the CLA document and I hereby sign the CLA" in response to the CLA bot Github comment. Read this agreement carefully before signing. These terms and conditions constitute a binding legal agreement.

@@ -20,5 +20,5 @@ If you or your affiliates institute patent litigation against any entity (includ
- each contribution that you submit is and shall be an original work of authorship and you can legally grant the rights set out in this MCA;
- to the best of your knowledge, each contribution will not violate any third party's copyrights, trademarks, patents, or other intellectual property rights; and
- each contribution shall be in compliance with U.S. export control laws and other applicable export and import laws.
-You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. Vikas Paruchuri may publicly disclose your participation in the project, including the fact that you have signed the MCA.
+You agree to notify us if you become aware of any circumstance which would make any of the foregoing representations inaccurate in any respect. Endless Labs, Inc. may publicly disclose your participation in the project, including the fact that you have signed the MCA.
6. This MCA is governed by the laws of the State of California and applicable U.S. Federal law. Any choice of law rules will not apply.
11 changes: 1 addition & 10 deletions README.md
@@ -42,7 +42,7 @@ See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instruc

I want marker to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage.

-The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).
+The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. You also must not be competitive with the [Datalab API](https://www.datalab.to/). If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).

# Hosted API

@@ -217,14 +217,6 @@ This will benchmark marker against other text extraction methods. It sets up ba

Omit `--nougat` to exclude nougat from the benchmark. I don't recommend running nougat on CPU, since it is very slow.

-### Table benchmark
-
-There is a benchmark for table parsing, which you can run with:
-
-```shell
-python benchmarks/table.py test_data/tables.json
-```

# Thanks

This work would not have been possible without amazing open source models and datasets, including (but not limited to):
@@ -233,6 +225,5 @@ This work would not have been possible without amazing open source models and da
- Texify
- Pypdfium2/pdfium
- DocLayNet from IBM
-- ByT5 from Google

Thank you to the authors of these models and datasets for making them available to the community!
77 changes: 0 additions & 77 deletions benchmarks/table.py

This file was deleted.

13 changes: 2 additions & 11 deletions marker/convert.py
@@ -20,7 +20,6 @@
from marker.cleaners.headers import filter_header_footer, filter_common_titles
from marker.equations.equations import replace_equations
from marker.pdf.utils import find_filetype
-from marker.postprocessors.editor import edit_full_text
from marker.cleaners.code import identify_code_blocks, indent_blocks
from marker.cleaners.bullets import replace_bullets
from marker.cleaners.headings import split_heading_blocks
@@ -83,7 +82,7 @@ def convert_single_pdf(
doc.del_page(0)

# Unpack models from list
-    texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst
+    texify_model, layout_model, order_model, detection_model, ocr_model, table_rec_model = model_lst

# Identify text lines on pages
surya_detection(doc, pages, detection_model, batch_multiplier=batch_multiplier)
@@ -123,7 +122,7 @@ def convert_single_pdf(
indent_blocks(pages)

# Fix table blocks
-    table_count = format_tables(pages)
+    table_count = format_tables(pages, doc, fname, detection_model, table_rec_model, ocr_model)
out_meta["block_stats"]["table"] = table_count

for page in pages:
@@ -160,14 +159,6 @@
# Replace bullet characters with a -
full_text = replace_bullets(full_text)

-    # Postprocess text with editor model
-    full_text, edit_stats = edit_full_text(
-        full_text,
-        edit_model,
-        batch_multiplier=batch_multiplier
-    )
-    flush_cuda_memory()
-    out_meta["postprocess_stats"] = {"edit": edit_stats}
doc_images = images_to_dict(pages)

return full_text, doc_images, out_meta
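The convert.py changes above pin down a new contract between `load_all_models` and `convert_single_pdf`: the editor model is dropped and a table recognition model is appended. A minimal sketch of the unpacking order (placeholder strings stand in for the real surya/texify model objects, and the commented `format_tables` call is illustrative only):

```python
# Hypothetical sketch of the six-element model list after this commit.
# Strings stand in for real model objects loaded by marker.models.
model_lst = ["texify", "layout", "order", "detection", "ocr", "table_rec"]

# Unpacking order must match load_all_models exactly, per the diff:
texify_model, layout_model, order_model, detection_model, ocr_model, table_rec_model = model_lst

# format_tables now takes the doc, filename, and three models (per the diff):
# table_count = format_tables(pages, doc, fname, detection_model, table_rec_model, ocr_model)
```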
34 changes: 19 additions & 15 deletions marker/models.py
@@ -2,7 +2,6 @@
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS


-from marker.postprocessors.editor import load_editing_model
from surya.model.detection.model import load_model as load_detection_model, load_processor as load_detection_processor
from texify.model.model import load_model as load_texify_model
from texify.model.processor import load_processor as load_texify_processor
@@ -11,15 +10,25 @@
from surya.model.recognition.processor import load_processor as load_recognition_processor
from surya.model.ordering.model import load_model as load_order_model
from surya.model.ordering.processor import load_processor as load_order_processor
+from surya.model.table_rec.model import load_model as load_table_model
+from surya.model.table_rec.processor import load_processor as load_table_processor


+def setup_table_rec_model(device=None, dtype=None):
+    if device:
+        table_model = load_table_model(device=device, dtype=dtype)
+    else:
+        table_model = load_table_model()
+    table_model.processor = load_table_processor()
+    return table_model


def setup_recognition_model(device=None, dtype=None):
if device:
rec_model = load_recognition_model(device=device, dtype=dtype)
else:
rec_model = load_recognition_model()
-    rec_processor = load_recognition_processor()
-    rec_model.processor = rec_processor
+    rec_model.processor = load_recognition_processor()
return rec_model


@@ -28,9 +37,7 @@ def setup_detection_model(device=None, dtype=None):
model = load_detection_model(device=device, dtype=dtype)
else:
model = load_detection_model()

-    processor = load_detection_processor()
-    model.processor = processor
+    model.processor = load_detection_processor()
return model


@@ -39,8 +46,7 @@ def setup_texify_model(device=None, dtype=None):
texify_model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=device, dtype=dtype)
else:
texify_model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=settings.TORCH_DEVICE_MODEL, dtype=settings.TEXIFY_DTYPE)
-    texify_processor = load_texify_processor()
-    texify_model.processor = texify_processor
+    texify_model.processor = load_texify_processor()
return texify_model


@@ -49,8 +55,7 @@ def setup_layout_model(device=None, dtype=None):
model = load_detection_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT, device=device, dtype=dtype)
else:
model = load_detection_model(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
-    processor = load_detection_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
-    model.processor = processor
+    model.processor = load_detection_processor(checkpoint=settings.LAYOUT_MODEL_CHECKPOINT)
return model


@@ -59,23 +64,22 @@ def setup_order_model(device=None, dtype=None):
model = load_order_model(device=device, dtype=dtype)
else:
model = load_order_model()
-    processor = load_order_processor()
-    model.processor = processor
+    model.processor = load_order_processor()
return model


-def load_all_models(device=None, dtype=None, force_load_ocr=False):
+def load_all_models(device=None, dtype=None):
if device is not None:
assert dtype is not None, "Must provide dtype if device is provided"

# langs is optional list of languages to prune from recognition MoE model
detection = setup_detection_model(device, dtype)
layout = setup_layout_model(device, dtype)
order = setup_order_model(device, dtype)
-    edit = load_editing_model(device, dtype)

# Only load recognition model if we'll need it for all pdfs
ocr = setup_recognition_model(device, dtype)
texify = setup_texify_model(device, dtype)
-    model_lst = [texify, layout, order, edit, detection, ocr]
+    table_model = setup_table_rec_model(device, dtype)
+    model_lst = [texify, layout, order, detection, ocr, table_model]
return model_lst
19 changes: 15 additions & 4 deletions marker/ocr/recognition.py
@@ -65,14 +65,23 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplie


def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[Page], batch_multiplier=1) -> List[Optional[Page]]:
+    # Slice images in higher resolution than detection happened in
    images = [render_image(doc[pnum], dpi=settings.SURYA_OCR_DPI) for pnum in page_idxs]
+    box_scale = settings.SURYA_OCR_DPI / settings.SURYA_DETECTOR_DPI

processor = rec_model.processor
selected_pages = [p for i, p in enumerate(pages) if i in page_idxs]

surya_langs = [langs] * len(page_idxs)
detection_results = [p.text_lines.bboxes for p in selected_pages]
polygons = [[b.polygon for b in bboxes] for bboxes in detection_results]

+    # Scale polygons to get correct image slices
+    for poly in polygons:
+        for p in poly:
+            for i in range(len(p)):
+                p[i] = [int(p[i][0] * box_scale), int(p[i][1] * box_scale)]

results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))

new_pages = []
@@ -81,14 +90,15 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
ocr_results = result.text_lines
blocks = []
for i, line in enumerate(ocr_results):
+            scaled_bbox = [b / box_scale for b in line.bbox]
block = Block(
-                bbox=line.bbox,
+                bbox=scaled_bbox,
pnum=page_idx,
lines=[Line(
-                    bbox=line.bbox,
+                    bbox=scaled_bbox,
spans=[Span(
text=line.text,
-                        bbox=line.bbox,
+                        bbox=scaled_bbox,
span_id=f"{page_idx}_{i}",
font="",
font_weight=0,
Expand All @@ -98,10 +108,11 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
)]
)
blocks.append(block)
+        scaled_image_bbox = [b / box_scale for b in result.image_bbox]
page = Page(
blocks=blocks,
pnum=page_idx,
-            bbox=result.image_bbox,
+            bbox=scaled_image_bbox,
rotation=0,
text_lines=text_lines,
ocr_method="surya"
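The recognition.py changes amount to a coordinate conversion: detection ran at a lower DPI than OCR, so detector-space polygons are scaled up before slicing the OCR-resolution image, and the OCR results are scaled back down so blocks line up with the page coordinates. A standalone sketch of that math (the DPI values here are placeholders for illustration; the real ones live in marker's settings):

```python
# Sketch of the DPI scaling added in surya_recognition.
# DPI values are assumed; marker reads them from its settings module.
SURYA_OCR_DPI = 192
SURYA_DETECTOR_DPI = 96
BOX_SCALE = SURYA_OCR_DPI / SURYA_DETECTOR_DPI  # 2.0 with these values

def scale_polygon_up(polygon):
    """Detector-space polygon -> OCR-image space, for slicing text lines."""
    return [[int(x * BOX_SCALE), int(y * BOX_SCALE)] for x, y in polygon]

def scale_bbox_down(bbox):
    """OCR-space bbox -> detector space, so blocks match page coordinates."""
    return [b / BOX_SCALE for b in bbox]
```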
2 changes: 1 addition & 1 deletion marker/pdf/extract_text.py
@@ -90,7 +90,7 @@ def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Opt

page_range = range(start_page, start_page + max_pages)

-    char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=True, workers=settings.PDFTEXT_CPU_WORKERS)
+    char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=False, workers=settings.PDFTEXT_CPU_WORKERS)
marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]

return marker_blocks, toc
