Skip to content

Commit

Permalink
Set start page
Browse files: browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 29, 2024
1 parent 5f2ca1a commit 131fbff
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 6 deletions.
3 changes: 3 additions & 0 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ def main():

mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work
model_lst = load_all_models()
if model_lst[0].device == "mps":
raise ValueError("Cannot use MPS with torch multiprocessing share_memory. You have to use CUDA or CPU.")

for model in model_lst:
if model:
model.share_memory()
Expand Down
3 changes: 2 additions & 1 deletion convert_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def main():
parser.add_argument("filename", help="PDF file to parse")
parser.add_argument("output", help="Output base folder path")
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at")
parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None)
parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes")
args = parser.parse_args()
Expand All @@ -24,7 +25,7 @@ def main():

fname = args.filename
model_lst = load_all_models()
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier)
full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page)

fname = os.path.basename(fname)
subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta)
Expand Down
2 changes: 2 additions & 0 deletions marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def convert_single_pdf(
fname: str,
model_lst: List,
max_pages: int = None,
start_page: int = None,
metadata: Optional[Dict] = None,
langs: Optional[List[str]] = None,
batch_multiplier: int = 1
Expand Down Expand Up @@ -66,6 +67,7 @@ def convert_single_pdf(
doc,
fname,
max_pages=max_pages,
start_page=start_page
)
out_meta.update({
"toc": toc,
Expand Down
16 changes: 12 additions & 4 deletions marker/pdf/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,21 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page:
return out_page


def get_text_blocks(doc, fname, max_pages: Optional[int] = None) -> (List[Page], Dict):
def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict):
toc = get_toc(doc)

page_range = range(len(doc))
if start_page:
assert start_page < len(doc)
else:
start_page = 0

if max_pages:
range_end = min(max_pages, len(doc))
page_range = range(range_end)
if max_pages + start_page > len(doc):
max_pages = len(doc) - start_page
else:
max_pages = len(doc) - start_page

page_range = range(start_page, start_page + max_pages)

char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=True, workers=settings.PDFTEXT_CPU_WORKERS)
marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "0.2.11"
version = "0.2.12"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit 131fbff

Please sign in to comment.