From b9c6f73bf2b45e587191ccfcf367be0e3427c025 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Fri, 25 Oct 2024 12:28:26 -0400 Subject: [PATCH] Flatten pdfs, fix page separators --- README.md | 2 +- marker/pdf/extract_text.py | 2 +- marker/postprocessors/markdown.py | 60 ++++++++++++++++++++----------- marker/schema/merged.py | 3 +- marker/settings.py | 3 +- marker/tables/table.py | 1 + poetry.lock | 8 ++--- pyproject.toml | 4 +-- 8 files changed, 53 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 3bf38247..9ece8ed8 100644 --- a/README.md +++ b/README.md @@ -212,7 +212,7 @@ Set `DEBUG=true` to save data to the `debug` subfolder in the marker root direct These settings can improve/change output quality: - `OCR_ALL_PAGES` will force OCR across the document. Many PDFs have bad text embedded due to older OCR engines being used. -- `PAGINATE_OUTPUT` will put a horizontal rule between pages. Default: False. +- `PAGINATE_OUTPUT` will put a horizontal rule between pages. Default: False. The horizontal rule will be `\n\n`, then `{PAGE_NUMBER}`, then 48 single dashes `-`, then `\n\n`. The separator can be configured via the `PAGE_SEPARATOR` setting. - `EXTRACT_IMAGES` will extract images and save separately. Default: True. - `BAD_SPAN_TYPES` specifies layout blocks to remove from the markdown output. diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py index 07719e95..e8c0955b 100644 --- a/marker/pdf/extract_text.py +++ b/marker/pdf/extract_text.py @@ -92,7 +92,7 @@ def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Opt page_range = range(start_page, start_page + max_pages) - char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=False, workers=settings.PDFTEXT_CPU_WORKERS) + char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=False, workers=settings.PDFTEXT_CPU_WORKERS, flatten_pdf=settings.FLATTEN_PDF) marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)] return marker_blocks, toc diff --git a/marker/postprocessors/markdown.py b/marker/postprocessors/markdown.py index 417b83fa..86f45b7d 100644 --- a/marker/postprocessors/markdown.py +++ b/marker/postprocessors/markdown.py @@ -64,11 +64,19 @@ def merge_spans(pages: List[Page]) -> List[List[MergedBlock]]: if len(block_lines) > 0: page_blocks.append(MergedBlock( lines=block_lines, - pnum=block.pnum, + pnum=page.pnum, bbox=block.bbox, block_type=block.block_type, heading_level=block.heading_level )) + if len(page_blocks) == 0: + page_blocks.append(MergedBlock( + lines=[], + pnum=page.pnum, + bbox=page.bbox, + block_type="Text", + heading_level=None + )) merged_blocks.append(page_blocks) return merged_blocks @@ -139,9 +147,6 @@ def block_separator(prev_block: FullyMergedBlock, block: FullyMergedBlock): if prev_block.block_type == "Text": sep = "\n\n" - if prev_block.page_end: - sep = settings.PAGE_SEPARATOR - return sep + block.text @@ -152,8 +157,30 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15): block_text = "" block_type = "" prev_heading_level = None + pnum = None for idx, page in enumerate(blocks): + # Insert pagination at every page boundary + if settings.PAGINATE_OUTPUT: + if block_text: + text_blocks.append( + FullyMergedBlock( + text=block_surround(block_text, prev_type, prev_heading_level), + block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE, + page_start=False, + pnum=pnum + ) + ) + block_text = "" + text_blocks.append( + FullyMergedBlock( + text="", + block_type="Text", + page_start=True, + pnum=page[0].pnum + ) + ) + for block in page: block_type = block.block_type if (block_type != prev_type and prev_type) or (block.heading_level != prev_heading_level and prev_heading_level): @@ -161,13 +188,15 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15): FullyMergedBlock( text=block_surround(block_text, prev_type, prev_heading_level), block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE, - page_end=False + page_start=False, + pnum=block.pnum ) ) block_text = "" prev_type = block_type prev_heading_level = block.heading_level + pnum = block.pnum # Join lines in the block together properly for i, line in enumerate(block.lines): line_height = line.bbox[3] - line.bbox[1] @@ -181,28 +210,17 @@ def merge_lines(blocks: List[List[MergedBlock]], max_block_gap=15): else: block_text = line.text - # Force blocks to end at page boundaries - if settings.PAGINATE_OUTPUT: - text_blocks.append( - FullyMergedBlock( - text=block_surround(block_text, prev_type, prev_heading_level), - block_type=prev_type if prev_type else settings.DEFAULT_BLOCK_TYPE, - page_end=True - ) - ) - block_text = "" - - # Append the final block text_blocks.append( FullyMergedBlock( text=block_surround(block_text, prev_type, prev_heading_level), block_type=block_type if block_type else settings.DEFAULT_BLOCK_TYPE, - page_end=False + page_start=False, + pnum=pnum ) ) - text_blocks = [block for block in text_blocks if block.text.strip()] + text_blocks = [block for block in text_blocks if (block.text.strip() or block.page_start)] return text_blocks @@ -210,7 +228,9 @@ def get_full_text(text_blocks): full_text = "" prev_block = None for block in text_blocks: - if prev_block: + if block.page_start: + full_text += "\n\n{" + str(block.pnum) + "}" + settings.PAGE_SEPARATOR + elif prev_block: full_text += block_separator(prev_block, block) else: full_text += block.text diff --git a/marker/schema/merged.py b/marker/schema/merged.py index 6462da64..8cbfd4f3 100644 --- a/marker/schema/merged.py +++ b/marker/schema/merged.py @@ -25,4 +25,5 @@ class MergedBlock(BboxElement): class FullyMergedBlock(BaseModel): text: str block_type: str - page_end: bool + page_start: bool + pnum: int | None diff --git a/marker/settings.py b/marker/settings.py index 7093853f..3777c058 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -14,6 +14,7 @@ class Settings(BaseSettings): EXTRACT_IMAGES: bool = True # Extract images from pdfs and save them PAGINATE_OUTPUT: bool = False # Paginate output markdown BASE_DIR: str = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + FLATTEN_PDF: bool = True # Pull form field values into the PDF before converting to markdown @computed_field @property @@ -88,7 +89,7 @@ def TORCH_DEVICE_MODEL(self) -> str: HEADING_DEFAULT_LEVEL: int = 2 # Output - PAGE_SEPARATOR: str = "\n\n" + "-" * 48 + "\n\n" + PAGE_SEPARATOR: str = "-" * 48 + "\n\n" # Debug DEBUG_DATA_FOLDER: str = os.path.join(BASE_DIR, "debug_data") diff --git a/marker/tables/table.py b/marker/tables/table.py index c5ab194c..c1c260fb 100644 --- a/marker/tables/table.py +++ b/marker/tables/table.py @@ -73,6 +73,7 @@ def get_table_boxes(pages: List[Page], doc: PdfDocument, fname): fname, doc_idxs, [hr for i, hr in enumerate(img_sizes) if i in table_idxs], + # Add flatten pdf here ) text_lines = [] out_img_sizes = [] diff --git a/poetry.lock b/poetry.lock index d4b2d2a6..e8677058 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4175,13 +4175,13 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[ [[package]] name = "surya-ocr" -version = "0.6.10" +version = "0.6.11" description = "OCR, layout, reading order, and table recognition in 90+ languages" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "surya_ocr-0.6.10-py3-none-any.whl", hash = "sha256:e22038d226d73bead781abda761a3813bacb1261f47996a00f5679686e86434e"}, - {file = "surya_ocr-0.6.10.tar.gz", hash = "sha256:7867dd02242a67e8d632d3f1343c62eb16e2068a98e36612dce1ec40065ff5b5"}, + {file = "surya_ocr-0.6.11-py3-none-any.whl", hash = "sha256:9b0c8638feda3f0f9db73a2ebceccdcbf5fdbb4cef0102b8d19837c455799347"}, + {file = "surya_ocr-0.6.11.tar.gz", hash = "sha256:d1415fcceae30cd44b08e8012d810efef51538dcf8d902fad035189b6219ee48"}, ] [package.dependencies] @@ -5076,4 +5076,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "380f95b398ed6864345aa1ed6d5357bfef0045cfee6bf32450d71e6e05ec079f" +content-hash = "e60697c44fdc30b1d5b48e5f4077ac4c65ff5844a49a4057f236dc6933a56dbb" diff --git a/pyproject.toml b/pyproject.toml index ba6ba65e..22705857 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "0.3.8" +version = "0.3.9" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md" @@ -32,7 +32,7 @@ tabulate = "^0.9.0" ftfy = "^6.1.1" texify = "^0.2.0" rapidfuzz = "^3.8.1" -surya-ocr = "^0.6.10" +surya-ocr = "^0.6.11" filetype = "^1.2.0" regex = "^2024.4.28" pdftext = "^0.3.17"