From 0a2b205b4ba1a6be456368a15b38f4cf48cf3022 Mon Sep 17 00:00:00 2001 From: Ali Hamdi Ali Fadel Date: Sat, 20 Jul 2024 16:27:06 +0000 Subject: [PATCH] Add --txt-page-separator argument (closes #10) --- tahweel/cli.py | 6 +++++- tahweel/tahweel_argument_parser.py | 4 +++- tahweel/writers/txt_writer.py | 7 ++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tahweel/cli.py b/tahweel/cli.py index 6761b58..4a474cc 100644 --- a/tahweel/cli.py +++ b/tahweel/cli.py @@ -56,5 +56,9 @@ def process_file(args: TahweelArgumentParser, processor: GoogleDriveOcrProcessor content = list(map(lambda text: text.replace('\ufeff', ''), content)) content = list(map(str.strip, content)) - TxtWriter(file_manager.txt_file_path(args.tahweel_type, args.dir_output_type, args.file_or_dir_path)).write(content) + TxtWriter(file_manager.txt_file_path(args.tahweel_type, args.dir_output_type, args.file_or_dir_path)).write( + content, + args.txt_page_separator, + ) + DocxWriter(file_manager.docx_file_path(args.tahweel_type, args.dir_output_type, args.file_or_dir_path)).write(content) diff --git a/tahweel/tahweel_argument_parser.py b/tahweel/tahweel_argument_parser.py index b4c0150..5575957 100644 --- a/tahweel/tahweel_argument_parser.py +++ b/tahweel/tahweel_argument_parser.py @@ -9,7 +9,6 @@ class TahweelArgumentParser(Tap): file_or_dir_path: Path - """Path to the file or directory to be processed.""" service_account_credentials: Path """Path to the service account credentials JSON file.""" @@ -22,6 +21,9 @@ class TahweelArgumentParser(Tap): dir_output_type: DirOutputType = DirOutputType.TREE_TO_TREE + txt_page_separator: str = 'PAGE_SEPARATOR' + """Separator to use between pages in the output TXT file.""" + skip_output_check: bool = False """Use this flag in development only to skip the output check.""" diff --git a/tahweel/writers/txt_writer.py b/tahweel/writers/txt_writer.py index c83defa..d84e889 100644 --- a/tahweel/writers/txt_writer.py +++ b/tahweel/writers/txt_writer.py @@ -1,14 +1,11 @@ from pathlib import Path -PAGE_SEPARATOR = 'PAGE_SEPARATOR' - - class TxtWriter: def __init__(self, file_path: Path): self.file_path = file_path self.file_path.parent.mkdir(parents=True, exist_ok=True) - def write(self, texts: list[str]): - self.file_path.write_text(f'\n{PAGE_SEPARATOR}\n'.join(texts)) + def write(self, texts: list[str], page_separator: str): + self.file_path.write_text(f'\n{page_separator}\n'.join(texts))