diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dc49632..cfe606a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,3 +38,5 @@ repos: rev: v1.10.0 hooks: - id: mypy + additional_dependencies: + - google-api-python-client-stubs diff --git a/poetry.lock b/poetry.lock index 9580fa0..f0dc7d6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -222,6 +222,22 @@ google-auth-httplib2 = ">=0.2.0,<1.0.0" httplib2 = ">=0.19.0,<1.dev0" uritemplate = ">=3.0.1,<5" +[[package]] +name = "google-api-python-client-stubs" +version = "1.26.0" +description = "Type stubs for google-api-python-client" +optional = false +python-versions = "<4.0,>=3.7" +files = [ + {file = "google_api_python_client_stubs-1.26.0-py3-none-any.whl", hash = "sha256:0614b0cef5beac43e6ab02418f07e64ee66dc99ae4e377d54a155ac261533987"}, + {file = "google_api_python_client_stubs-1.26.0.tar.gz", hash = "sha256:f3b38b46f7b5cf4f6e7cc63ca554a2d23096d49c841f38b9ea553a5237074b56"}, +] + +[package.dependencies] +google-api-python-client = ">=2.130.0" +types-httplib2 = ">=0.22.0.2" +typing-extensions = ">=3.10.0" + [[package]] name = "google-auth" version = "2.32.0" @@ -757,6 +773,17 @@ typing-inspect = ">=0.7.1" dev = ["pydantic (>=2.5.0)", "typed-argument-parser[dev-no-pydantic]"] dev-no-pydantic = ["flake8", "pytest", "pytest-cov"] +[[package]] +name = "types-httplib2" +version = "0.22.0.20240310" +description = "Typing stubs for httplib2" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-httplib2-0.22.0.20240310.tar.gz", hash = "sha256:1eda99fea18ec8a1dc1a725ead35b889d0836fec1b11ae6f1fe05440724c1d15"}, + {file = "types_httplib2-0.22.0.20240310-py3-none-any.whl", hash = "sha256:8cd706fc81f0da32789a4373a28df6f39e9d5657d1281db4d2fd22ee29e83661"}, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -834,4 +861,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "868737ba3c89e70e67ad99fe745db9190526e50a2b754617ad327c2698e972e7" +content-hash = "adc0905c5ae1f46568caf581c192d765a14040609f73379e3b734bf96d106613" diff --git a/pyproject.toml b/pyproject.toml index df6c188..b052530 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ google-auth-oauthlib = "^1.2.1" pdf2image = "^1.17.0" platformdirs = "^4.2.2" tqdm = "^4.66.4" +google-api-python-client-stubs = "^1.26.0" [tool.poetry.scripts] tahweel = "tahweel.cli:main" diff --git a/tahweel/processors/google_drive_ocr_processor.py b/tahweel/processors/google_drive_ocr_processor.py index 11d7bd7..cb10f05 100644 --- a/tahweel/processors/google_drive_ocr_processor.py +++ b/tahweel/processors/google_drive_ocr_processor.py @@ -15,41 +15,31 @@ def __init__(self, service_account_credentials: Path): cache_discovery=False, ) - def process(self, file_path: Path, retries: int = 5) -> str: - download_buffer = None - - while download_buffer is None and retries > 0: - file_id = self._upload_file(file_path) - download_buffer = self._download_file(file_id) - self._delete_file(file_id) - retries -= 1 - - if download_buffer is None: - return '' + def process(self, file_path: Path) -> str: + file_id = self._upload_file(file_path) + download_buffer = self._download_file(file_id) + self._delete_file(file_id) return download_buffer.getvalue().decode('utf-8') def _upload_file(self, file_path: Path) -> str: - return self._drive_service.files().create( + return str(self._drive_service.files().create( body={'name': file_path.name, 'mimeType': 'application/vnd.google-apps.document'}, media_body=MediaFileUpload(file_path, mimetype='image/jpeg'), - ).execute().get('id') + ).execute().get('id')) - def _download_file(self, file_id: str) -> BytesIO | None: + def _download_file(self, file_id: str) -> BytesIO: download_buffer = BytesIO() download = MediaIoBaseDownload( download_buffer, self._drive_service.files().export_media(fileId=file_id, mimeType='text/plain'), ) - downloaded, status = False, False + downloaded = False while downloaded is False: - status, downloaded = download.next_chunk() - - if status: - return download_buffer + _, downloaded = download.next_chunk() - return None + return download_buffer def _delete_file(self, file_id: str) -> None: self._drive_service.files().delete(fileId=file_id).execute()