From 534b6b287b274557b1834512a61e77393f5f72af Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 24 Oct 2024 13:36:04 -0400 Subject: [PATCH] Fix table recognition --- marker/tables/table.py | 16 +++++++++++----- pyproject.toml | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/marker/tables/table.py b/marker/tables/table.py index 6a8e04f8..c5ab194c 100644 --- a/marker/tables/table.py +++ b/marker/tables/table.py @@ -31,12 +31,12 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname): table_counts = [] table_bboxes = [] img_sizes = [] + pnums = [] - for page in pages: - pnum = page.pnum + for page_idx, page in enumerate(pages): # The bbox for the entire table bbox = [b.bbox for b in page.layout.bboxes if b.label == "Table"] - highres_img = render_image(doc[pnum], dpi=settings.SURYA_TABLE_DPI) + highres_img = render_image(doc[page_idx], dpi=settings.SURYA_TABLE_DPI) page_table_imgs = [] page_bboxes = [] @@ -48,11 +48,13 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname): if len(bbox) == 0: table_counts.append(0) img_sizes.append(None) + pnums.append(page.pnum) continue # Number of tables per page table_counts.append(len(bbox)) img_sizes.append(highres_img.size) + pnums.append(page.pnum) for bb in bbox: highres_bb = rescale_bbox(page.layout.image_bbox, [0, 0, highres_img.size[0], highres_img.size[1]], bb) @@ -62,10 +64,14 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname): table_imgs.extend(page_table_imgs) table_bboxes.extend(page_bboxes) - table_idxs = [i for i, c in enumerate(table_counts) if c > 0] + # The page number in doc and in the original document are not the same + # Doc has had pages removed from the start to align to start_page + # This corrects for that + doc_idxs = [pnum for pnum, tc in zip(pnums, table_counts) if tc > 0] + table_idxs = [i for i, tc in enumerate(table_counts) if tc > 0] sel_text_lines = get_page_text_lines( fname, - table_idxs, + doc_idxs, [hr for i, hr in enumerate(img_sizes) if i in table_idxs], ) text_lines = [] diff --git a/pyproject.toml b/pyproject.toml index e61f01ed..ba6ba65e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "0.3.7" +version = "0.3.8" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md"