From bc609246e50aab6163ed917cb2e7ef8fa0111a21 Mon Sep 17 00:00:00 2001
From: Ali Hamdi Ali Fadel
Date: Fri, 19 Jul 2024 15:17:46 +0000
Subject: [PATCH] Implement TahweelType.DIR CLI part and utilize writers

---
 tahweel/cli.py | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/tahweel/cli.py b/tahweel/cli.py
index af01347..967c933 100644
--- a/tahweel/cli.py
+++ b/tahweel/cli.py
@@ -4,24 +4,47 @@
 
 from tahweel import TahweelArgumentParser
 from tahweel.enums import TahweelType
-from tahweel.managers import PdfManager
+from tahweel.managers import PdfFileManager
 from tahweel.processors import GoogleDriveOcrProcessor
+from tahweel.writers import DocxWriter, TxtWriter
 
 
 def main() -> None:
   args = TahweelArgumentParser(underscores_to_dashes=True).parse_args()
+  processor = GoogleDriveOcrProcessor(args.service_account_credentials)
 
   prepare_package_dirs()
 
-  processor = GoogleDriveOcrProcessor(args.service_account_credentials)
+  match args.tahweel_type:
+    case TahweelType.FILE:
+      pdf_file_manager = PdfFileManager(args.file_or_dir_path, args.pdf2image_thread_count)
 
-  if args.tahweel_type == TahweelType.FILE:
-    pdf_manager = PdfManager(args.file_or_dir_path, args.pdf2image_thread_count)
-    pdf_manager.to_images()
+      process_file(args, processor, pdf_file_manager)
+    case TahweelType.DIR:
+      for pdf_file_path in args.file_or_dir_path.rglob('*.pdf'):
+        pdf_file_manager = PdfFileManager(pdf_file_path, args.pdf2image_thread_count)
 
-    for image_path in pdf_manager.images_paths:
-      print(processor.process(image_path))
+        process_file(args, processor, pdf_file_manager)
 
 
 def prepare_package_dirs() -> None:
   Path(platformdirs.user_cache_dir('Tahweel')).mkdir(parents=True, exist_ok=True)
+
+
+def process_file(
+  args: TahweelArgumentParser,
+  processor: GoogleDriveOcrProcessor,
+  pdf_file_manager: PdfFileManager,
+) -> None:
+  if pdf_file_manager.already_processed(args.tahweel_type, args.file_or_dir_path):
+    return
+
+  pdf_file_manager.to_images()
+
+  ocred_pages = list(map(processor.process, pdf_file_manager.images_paths))
+  ocred_pages = list(map(lambda text: text.replace('________________', ''), ocred_pages))
+  ocred_pages = list(map(lambda text: text.replace('', ''), ocred_pages))
+  ocred_pages = list(map(str.strip, ocred_pages))
+
+  TxtWriter(pdf_file_manager.txt_file_path(args.tahweel_type, args.file_or_dir_path)).write(ocred_pages)
+  DocxWriter(pdf_file_manager.docx_file_path(args.tahweel_type, args.file_or_dir_path)).write(ocred_pages)