Skip to content

Commit

Permalink
Implement TahweelType.DIR CLI part and utilize writers
Browse files Browse the repository at this point in the history
  • Loading branch information
AliOsm committed Jul 19, 2024
1 parent 55f02b0 commit bc60924
Showing 1 changed file with 30 additions and 7 deletions.
37 changes: 30 additions & 7 deletions tahweel/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,47 @@

from tahweel import TahweelArgumentParser
from tahweel.enums import TahweelType
from tahweel.managers import PdfManager
from tahweel.managers import PdfFileManager
from tahweel.processors import GoogleDriveOcrProcessor
from tahweel.writers import DocxWriter, TxtWriter


def main() -> None:
args = TahweelArgumentParser(underscores_to_dashes=True).parse_args()
processor = GoogleDriveOcrProcessor(args.service_account_credentials)

prepare_package_dirs()

processor = GoogleDriveOcrProcessor(args.service_account_credentials)
match args.tahweel_type:
case TahweelType.FILE:
pdf_file_manager = PdfFileManager(args.file_or_dir_path, args.pdf2image_thread_count)

if args.tahweel_type == TahweelType.FILE:
pdf_manager = PdfManager(args.file_or_dir_path, args.pdf2image_thread_count)
pdf_manager.to_images()
process_file(args, processor, pdf_file_manager)
case TahweelType.DIR:
for pdf_file_path in args.file_or_dir_path.rglob('*.pdf'):
pdf_file_manager = PdfFileManager(pdf_file_path, args.pdf2image_thread_count)

for image_path in pdf_manager.images_paths:
print(processor.process(image_path))
process_file(args, processor, pdf_file_manager)


def prepare_package_dirs() -> None:
    """Create the per-user 'Tahweel' cache directory if it is missing.

    Missing parent directories are created as well, and an already existing
    directory is not treated as an error.
    """
    cache_dir = Path(platformdirs.user_cache_dir('Tahweel'))
    cache_dir.mkdir(parents=True, exist_ok=True)


def _clean_ocred_page(text: str) -> str:
    """Return one OCRed page with export artifacts and outer whitespace removed."""
    # Drop the 16-underscore rule that shows up in the OCR output
    # (presumably a separator inserted by the Google Docs text export).
    text = text.replace('________________', '')
    # NOTE(review): the original literal here rendered as an empty string,
    # which makes ``replace`` a no-op. It is assumed to have been an invisible
    # U+FEFF byte-order mark; spelled as an escape so it is visible. Confirm.
    text = text.replace('\ufeff', '')
    return text.strip()


def process_file(
    args: TahweelArgumentParser,
    processor: GoogleDriveOcrProcessor,
    pdf_file_manager: PdfFileManager,
) -> None:
    """OCR a single PDF and write the result as TXT and DOCX files.

    Does nothing when ``pdf_file_manager`` reports the file as already
    processed. Otherwise the PDF is converted to images, every image is passed
    through ``processor.process``, the resulting page texts are cleaned, and
    the list of pages is handed to both writers.

    Args:
        args: Parsed CLI arguments; ``tahweel_type`` and ``file_or_dir_path``
            are forwarded to the manager to resolve output locations.
        processor: OCR processor whose ``process`` is called once per image.
        pdf_file_manager: Manager for the PDF being processed.
    """
    if pdf_file_manager.already_processed(args.tahweel_type, args.file_or_dir_path):
        return

    pdf_file_manager.to_images()

    ocred_pages = [
        _clean_ocred_page(processor.process(image_path))
        for image_path in pdf_file_manager.images_paths
    ]

    TxtWriter(pdf_file_manager.txt_file_path(args.tahweel_type, args.file_or_dir_path)).write(ocred_pages)
    DocxWriter(pdf_file_manager.docx_file_path(args.tahweel_type, args.file_or_dir_path)).write(ocred_pages)

0 comments on commit bc60924

Please sign in to comment.