Skip to content

Commit

Permalink
Remove retries from GoogleDriveOcrProcessor while downloading
Browse files Browse the repository at this point in the history
  • Loading branch information
AliOsm committed Jul 19, 2024
1 parent e71506f commit cf18ae2
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 21 deletions.
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,5 @@ repos:
rev: v1.10.0
hooks:
- id: mypy
additional_dependencies:
- google-api-python-client-stubs
29 changes: 28 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ google-auth-oauthlib = "^1.2.1"
pdf2image = "^1.17.0"
platformdirs = "^4.2.2"
tqdm = "^4.66.4"
google-api-python-client-stubs = "^1.26.0"

[tool.poetry.scripts]
tahweel = "tahweel.cli:main"
Expand Down
30 changes: 10 additions & 20 deletions tahweel/processors/google_drive_ocr_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,41 +15,31 @@ def __init__(self, service_account_credentials: Path):
cache_discovery=False,
)

def process(self, file_path: Path) -> str:
    """Run one file through Google Drive and return the exported text.

    The file is uploaded, its plain-text export is downloaded into memory,
    and the uploaded Drive file is deleted again.

    Args:
        file_path: Path of the local file to upload.

    Returns:
        The downloaded export decoded as UTF-8.

    Raises:
        Any exception raised by the upload, download or delete step; nothing
        is retried or swallowed here.
    """
    file_id = self._upload_file(file_path)

    try:
        download_buffer = self._download_file(file_id)
    finally:
        # Delete the uploaded file even when the download fails, otherwise
        # every failed run leaves a stray document behind in Drive.
        self._delete_file(file_id)

    return download_buffer.getvalue().decode('utf-8')

def _upload_file(self, file_path: Path) -> str:
    """Upload ``file_path`` to Drive and return the id of the created file.

    The create request names the Drive file after ``file_path.name``, sends
    the local content as ``image/jpeg`` and requests the
    ``application/vnd.google-apps.document`` MIME type for the created file.
    """
    files_resource = self._drive_service.files()
    create_request = files_resource.create(
        body={'name': file_path.name, 'mimeType': 'application/vnd.google-apps.document'},
        media_body=MediaFileUpload(file_path, mimetype='image/jpeg'),
    )
    response = create_request.execute()

    # NOTE(review): str() keeps the declared return type, but a response
    # without an 'id' key would be returned as the literal string 'None' --
    # confirm the API always includes the id.
    return str(response.get('id'))

def _download_file(self, file_id: str) -> BytesIO:
    """Export the Drive file ``file_id`` as ``text/plain`` into memory.

    Returns the in-memory buffer that the export was written to.
    """
    buffer = BytesIO()
    export_request = self._drive_service.files().export_media(
        fileId=file_id,
        mimeType='text/plain',
    )
    downloader = MediaIoBaseDownload(buffer, export_request)

    # Keep requesting chunks until the downloader reports completion; the
    # first element of each result is not needed here.
    finished = False
    while finished is False:
        _, finished = downloader.next_chunk()

    return buffer

def _delete_file(self, file_id: str) -> None:
    """Issue a Drive delete request for the file identified by ``file_id``."""
    delete_request = self._drive_service.files().delete(fileId=file_id)
    delete_request.execute()

0 comments on commit cf18ae2

Please sign in to comment.