Skip to content

Commit

Permalink
Add --txt-page-separator argument (closes #10)
Browse files Browse the repository at this point in the history
  • Loading branch information
AliOsm committed Jul 20, 2024
1 parent 93db9f1 commit 0a2b205
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 7 deletions.
6 changes: 5 additions & 1 deletion tahweel/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,9 @@ def process_file(args: TahweelArgumentParser, processor: GoogleDriveOcrProcessor
content = list(map(lambda text: text.replace('\ufeff', ''), content))
content = list(map(str.strip, content))

TxtWriter(file_manager.txt_file_path(args.tahweel_type, args.dir_output_type, args.file_or_dir_path)).write(content)
TxtWriter(file_manager.txt_file_path(args.tahweel_type, args.dir_output_type, args.file_or_dir_path)).write(
content,
args.txt_page_separator,
)

DocxWriter(file_manager.docx_file_path(args.tahweel_type, args.dir_output_type, args.file_or_dir_path)).write(content)
4 changes: 3 additions & 1 deletion tahweel/tahweel_argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

class TahweelArgumentParser(Tap):
file_or_dir_path: Path
"""Path to the file or directory to be processed."""

service_account_credentials: Path
"""Path to the service account credentials JSON file."""
Expand All @@ -22,6 +21,9 @@ class TahweelArgumentParser(Tap):

dir_output_type: DirOutputType = DirOutputType.TREE_TO_TREE

txt_page_separator: str = 'PAGE_SEPARATOR'
"""Separator to use between pages in the output TXT file."""

skip_output_check: bool = False
"""Use this flag in development only to skip the output check."""

Expand Down
7 changes: 2 additions & 5 deletions tahweel/writers/txt_writer.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
from pathlib import Path


PAGE_SEPARATOR = 'PAGE_SEPARATOR'


class TxtWriter:
    """Write a list of page texts to a single TXT file."""

    def __init__(self, file_path: Path):
        """Store the output path and make sure its parent directory exists.

        Args:
            file_path: Destination path of the TXT file.
        """
        self.file_path = file_path

        self.file_path.parent.mkdir(parents=True, exist_ok=True)

    def write(self, texts: list[str], page_separator: str = 'PAGE_SEPARATOR'):
        """Join the page texts with the separator and write them to the file.

        The separator is placed on its own line between consecutive pages.

        Args:
            texts: Page texts, in output order.
            page_separator: Separator written between pages. The default keeps
                the earlier ``write(texts)`` call form working with the same
                output as before.
        """
        # Pin UTF-8 so the output does not depend on the platform's locale
        # encoding, which can fail on non-ASCII page text.
        self.file_path.write_text(f'\n{page_separator}\n'.join(texts), encoding='utf-8')

0 comments on commit 0a2b205

Please sign in to comment.