diff --git a/convert.py b/convert.py index 4d927456..9b6c499e 100755 --- a/convert.py +++ b/convert.py @@ -107,6 +107,9 @@ def main(): mp.set_start_method('spawn') # Required for CUDA, forkserver doesn't work model_lst = load_all_models() + if model_lst[0].device == "mps": + raise ValueError("Cannot use MPS with torch multiprocessing share_memory. You have to use CUDA or CPU.") + for model in model_lst: if model: model.share_memory() diff --git a/convert_single.py b/convert_single.py index 78ee9c5f..463feeb8 100755 --- a/convert_single.py +++ b/convert_single.py @@ -16,6 +16,7 @@ def main(): parser.add_argument("filename", help="PDF file to parse") parser.add_argument("output", help="Output base folder path") parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse") + parser.add_argument("--start_page", type=int, default=None, help="Page to start processing at") parser.add_argument("--langs", type=str, help="Languages to use for OCR, comma separated", default=None) parser.add_argument("--batch_multiplier", type=int, default=2, help="How much to increase batch sizes") args = parser.parse_args() @@ -24,7 +25,7 @@ def main(): fname = args.filename model_lst = load_all_models() - full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier) + full_text, images, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, langs=langs, batch_multiplier=args.batch_multiplier, start_page=args.start_page) fname = os.path.basename(fname) subfolder_path = save_markdown(args.output, fname, full_text, images, out_meta) diff --git a/marker/convert.py b/marker/convert.py index 5ebbf964..958a6bd2 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -34,6 +34,7 @@ def convert_single_pdf( fname: str, model_lst: List, max_pages: int = None, + start_page: int = None, metadata: Optional[Dict] = None, langs: Optional[List[str]] = None, batch_multiplier: int = 1 @@ -66,6 +67,7 @@ def convert_single_pdf( doc, fname, max_pages=max_pages, + start_page=start_page ) out_meta.update({ "toc": toc, diff --git a/marker/pdf/extract_text.py b/marker/pdf/extract_text.py index a28016db..937ded31 100644 --- a/marker/pdf/extract_text.py +++ b/marker/pdf/extract_text.py @@ -74,13 +74,21 @@ def pdftext_format_to_blocks(page, pnum: int) -> Page: return out_page -def get_text_blocks(doc, fname, max_pages: Optional[int] = None) -> (List[Page], Dict): +def get_text_blocks(doc, fname, max_pages: Optional[int] = None, start_page: Optional[int] = None) -> (List[Page], Dict): toc = get_toc(doc) - page_range = range(len(doc)) + if start_page: + assert start_page < len(doc) + else: + start_page = 0 + if max_pages: - range_end = min(max_pages, len(doc)) - page_range = range(range_end) + if max_pages + start_page > len(doc): + max_pages = len(doc) - start_page + else: + max_pages = len(doc) - start_page + + page_range = range(start_page, start_page + max_pages) char_blocks = dictionary_output(fname, page_range=page_range, keep_chars=True, workers=settings.PDFTEXT_CPU_WORKERS) marker_blocks = [pdftext_format_to_blocks(page, pnum) for pnum, page in enumerate(char_blocks)] diff --git a/pyproject.toml b/pyproject.toml index 1a4d964a..89a75a71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "0.2.11" +version = "0.2.12" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md"