From d85df8213e9af67910bf65376004508f57200485 Mon Sep 17 00:00:00 2001 From: Nancy Hung Date: Mon, 28 Oct 2024 21:51:21 -0700 Subject: [PATCH 1/5] throw permission error if swallowed in mlflow exception stacktrace --- composer/utils/object_store/mlflow_object_store.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/composer/utils/object_store/mlflow_object_store.py b/composer/utils/object_store/mlflow_object_store.py index 26b4074b8d..311b225714 100644 --- a/composer/utils/object_store/mlflow_object_store.py +++ b/composer/utils/object_store/mlflow_object_store.py @@ -53,7 +53,6 @@ def _wrap_mlflow_exceptions(uri: str, e: Exception): retryable_server_codes = [ ErrorCode.Name(code) for code in [ DATA_LOSS, - INTERNAL_ERROR, INVALID_STATE, TEMPORARILY_UNAVAILABLE, DEADLINE_EXCEEDED, @@ -62,9 +61,19 @@ def _wrap_mlflow_exceptions(uri: str, e: Exception): retryable_client_codes = [ErrorCode.Name(code) for code in [ABORTED, REQUEST_LIMIT_EXCEEDED, RESOURCE_EXHAUSTED]] not_found_codes = [ErrorCode.Name(code) for code in [RESOURCE_DOES_NOT_EXIST, NOT_FOUND, ENDPOINT_NOT_FOUND]] + # MLflow wraps Azure data exceptions as INTERNAL_ERROR. Need to unwrap and check msg for the specific error. + non_retryable_internal_error_codes = [ + '401', + '403', + ] + if isinstance(e, MlflowException): error_code = e.error_code # pyright: ignore - if error_code in retryable_server_codes or error_code in retryable_client_codes: + if error_code == ErrorCode.Name(INTERNAL_ERROR): + error_message = e.message # pyright: ignore + if any(code in error_message for code in non_retryable_internal_error_codes): + raise PermissionError(error_message) + elif error_code in retryable_server_codes or error_code in retryable_client_codes: raise ObjectStoreTransientError(error_code) from e elif error_code in not_found_codes: raise FileNotFoundError(f'Object {uri} not found') from e From d3cf11731fefb23955b3bccb9b3d089a8844cf62 Mon Sep 17 00:00:00 2001 From: Nancy Hung Date: Tue, 29 Oct 2024 12:59:56 -0700 Subject: [PATCH 2/5] error message --- composer/utils/object_store/mlflow_object_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/utils/object_store/mlflow_object_store.py b/composer/utils/object_store/mlflow_object_store.py index 311b225714..2a1fef7771 100644 --- a/composer/utils/object_store/mlflow_object_store.py +++ b/composer/utils/object_store/mlflow_object_store.py @@ -72,7 +72,7 @@ def _wrap_mlflow_exceptions(uri: str, e: Exception): if error_code == ErrorCode.Name(INTERNAL_ERROR): error_message = e.message # pyright: ignore if any(code in error_message for code in non_retryable_internal_error_codes): - raise PermissionError(error_message) + raise PermissionError(f'Permission denied for object {uri} from the data provider. Details: {error_message}') from e elif error_code in retryable_server_codes or error_code in retryable_client_codes: raise ObjectStoreTransientError(error_code) from e elif error_code in not_found_codes: From ba1e2448ab869b325b44051a31c3eeafce1a50d2 Mon Sep 17 00:00:00 2001 From: Nancy Hung Date: Tue, 29 Oct 2024 16:10:03 -0700 Subject: [PATCH 3/5] Update composer/utils/object_store/mlflow_object_store.py Co-authored-by: Matthew Ding --- composer/utils/object_store/mlflow_object_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/utils/object_store/mlflow_object_store.py b/composer/utils/object_store/mlflow_object_store.py index 2a1fef7771..0a0092117c 100644 --- a/composer/utils/object_store/mlflow_object_store.py +++ b/composer/utils/object_store/mlflow_object_store.py @@ -71,7 +71,7 @@ def _wrap_mlflow_exceptions(uri: str, e: Exception): error_code = e.error_code # pyright: ignore if error_code == ErrorCode.Name(INTERNAL_ERROR): error_message = e.message # pyright: ignore - if any(code in error_message for code in non_retryable_internal_error_codes): + if any(f'{code} Client Error' in error_message for code in non_retryable_internal_error_codes): raise PermissionError(f'Permission denied for object {uri} from the data provider. Details: {error_message}') from e elif error_code in retryable_server_codes or error_code in retryable_client_codes: raise ObjectStoreTransientError(error_code) from e From b74a76111ca68955231f5bf9113380157dc9ba7e Mon Sep 17 00:00:00 2001 From: Nancy Hung Date: Tue, 29 Oct 2024 16:11:35 -0700 Subject: [PATCH 4/5] ok well we retry on 500 i guess --- composer/utils/object_store/mlflow_object_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/composer/utils/object_store/mlflow_object_store.py b/composer/utils/object_store/mlflow_object_store.py index 0a0092117c..0cb18ab339 100644 --- a/composer/utils/object_store/mlflow_object_store.py +++ b/composer/utils/object_store/mlflow_object_store.py @@ -53,6 +53,7 @@ def _wrap_mlflow_exceptions(uri: str, e: Exception): retryable_server_codes = [ ErrorCode.Name(code) for code in [ DATA_LOSS, + INTERNAL_ERROR, INVALID_STATE, TEMPORARILY_UNAVAILABLE, DEADLINE_EXCEEDED, From 3d308f8b1a0680486bb22792d50a4e8e1782d642 Mon Sep 17 00:00:00 2001 From: Nancy Hung Date: Wed, 30 Oct 2024 15:21:42 -0700 Subject: [PATCH 5/5] linter and pr feedback --- composer/utils/object_store/mlflow_object_store.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/composer/utils/object_store/mlflow_object_store.py b/composer/utils/object_store/mlflow_object_store.py index 0cb18ab339..6a6655fba8 100644 --- a/composer/utils/object_store/mlflow_object_store.py +++ b/composer/utils/object_store/mlflow_object_store.py @@ -63,7 +63,7 @@ def _wrap_mlflow_exceptions(uri: str, e: Exception): not_found_codes = [ErrorCode.Name(code) for code in [RESOURCE_DOES_NOT_EXIST, NOT_FOUND, ENDPOINT_NOT_FOUND]] # MLflow wraps Azure data exceptions as INTERNAL_ERROR. Need to unwrap and check msg for the specific error. - non_retryable_internal_error_codes = [ + permission_error_codes = [ '401', '403', ] @@ -71,9 +71,11 @@ def _wrap_mlflow_exceptions(uri: str, e: Exception): if isinstance(e, MlflowException): error_code = e.error_code # pyright: ignore if error_code == ErrorCode.Name(INTERNAL_ERROR): - error_message = e.message # pyright: ignore - if any(f'{code} Client Error' in error_message for code in non_retryable_internal_error_codes): - raise PermissionError(f'Permission denied for object {uri} from the data provider. Details: {error_message}') from e + error_message = e.message # pyright: ignore + if any(f'{code} Client Error' in error_message for code in permission_error_codes): + raise PermissionError( + f'Permission denied for object {uri} from the data provider. Details: {error_message}', + ) from e elif error_code in retryable_server_codes or error_code in retryable_client_codes: raise ObjectStoreTransientError(error_code) from e elif error_code in not_found_codes: