Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable setting start page #149

Merged
merged 1 commit into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,11 @@ def main():

mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
model_lst = load_all_models()

for model in model_lst:
if model.device.type == "mps":
raise ValueError("Cannot use MPS with torch multiprocessing share_memory. You have to use CUDA or CPU. Set the TORCH_DEVICE environment variable to change the device.")

if model:
model.share_memory()

Expand Down
3 changes: 2 additions & 1 deletion convert_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def main():
parser.add_argument("filename", help="PDF file to parse")
parser.add_argument("output", help="Output base folder path")
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
args = parser.parse_args()
Expand All @@ -24,7 +25,7 @@ def main():

fname = args.filename
model_lst = load_all_models()
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier)
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)

fname = os.path.basename(fname)
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
Expand Down
9 changes: 8 additions & 1 deletion marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def convert_single_pdf(
fname: str,
model_lst: List,
max_pages: int = None,
start_page: int = None,
metadata: Optional[Dict] = None,
langs: Optional[List[str]] = None,
batch_multiplier: int = 1
Expand Down Expand Up @@ -66,12 +67,18 @@ def convert_single_pdf(
doc,
fname,
max_pages=max_pages,
start_page=start_page
)
out_meta.update({
"toc": toc,
"pages": len(pages),
})

# Trim pages from doc to align with start page
if start_page:
for page_idx in range(start_page):
doc.del_page(0)

# Unpack models from list
texify_model, layout_model, order_model, edit_model, detection_model, ocr_model = model_lst

Expand Down Expand Up @@ -99,7 +106,7 @@ def convert_single_pdf(
annotate_block_types(pages)

# Dump debug data if flags are set
dump_bbox_debug_data(doc, pages)
dump_bbox_debug_data(doc, fname, pages)

# Find reading order for blocks
# Sort blocks by reading order
Expand Down
4 changes: 2 additions & 2 deletions marker/debug/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ def dump_equation_debug_data(doc, images, converted_spans):
json.dump(data_lines, f)


def dump_bbox_debug_data(doc, blocks: List[Page]):
def dump_bbox_debug_data(doc, fname, blocks: List[Page]):
if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
return

# Remove extension from doc name
doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
doc_base = fname.rsplit(".", 1)[0]

debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
debug_data = []
Expand Down
16 changes: 12 additions & 4 deletions marker/pdf/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,21 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
return out_page


def get_text_blocks(doc, fname, max_pages: Optional[int] = None) -> (List[Page], Dict):
def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
toc = get_toc(doc)

page_range = range(len(doc))
if start_page:
assert start_page < len(doc)
else:
start_page = 0

if max_pages:
range_end = min(max_pages, len(doc))
page_range = range(range_end)
if max_pages + start_page > len(doc):
max_pages = len(doc) - start_page
else:
max_pages = len(doc) - start_page

page_range = range(start_page, start_page + max_pages)

char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=True, workers=settings.PDFTEXT_CPU_WORKERS)
marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "0.2.11"
version = "0.2.12"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down
Loading