From 68680ad32756d41465c47214c37e087fb6f17876 Mon Sep 17 00:00:00 2001
From: Song Jiang
Date: Mon, 9 Sep 2024 18:39:30 -0700
Subject: [PATCH 1/2] Fix help message for auto-resume and max-retry options

---
 README.md                        | 4 ++--
 src/hyperpod_cli/commands/job.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 5cf6c55..bd6d9c1 100644
--- a/README.md
+++ b/README.md
@@ -137,8 +137,8 @@ hyperpod start-job --job-name <job-name> [--namespace <namespace>] [--job-kind <
 * `scheduler-type` (enum) - Optional. The scheduler type to use. Currently, only `Kueue` is supported.
 * `queue-name` (string) - Optional. The name of the queue to submit the job to, which is created by the cluster admin users in your AWS account.
 * `priority` (string) - Optional. The priority for the job, which needs to be created by the cluster admin users and match the name in the cluster.
-* `auto-resume` (bool) - Optional. If set to `true`, the job will automatically resume after a failure. Note that `auto-resume` currently only works in the `kubeflow` namespace or the namespace prefixed with `aws-hyperpod`. To enable `auto-resume`, you also should set `restart-policy` to `OnFailure`.
-* `max-retry` (int) - Optional. The maximum number of retries if `auto-resume` is `true`. If `auto-resume` is set to true and `max-retry` is not specified, the default value is 1.
+* `auto-resume` (bool) - Optional. The flag to enable HyperPod resilience job auto resume. If set to `true`, the job will automatically resume after a pod or node failure. To enable `auto-resume`, you should also set `restart-policy` to `OnFailure`.
+* `max-retry` (int) - Optional. The maximum number of retries for HyperPod resilience job auto resume. If `auto-resume` is set to `true` and `max-retry` is not specified, the default value is 1.
 * `restart-policy` (enum) - Optional. The PyTorchJob restart policy, which can be `Always`, `OnFailure`, `Never`, or `ExitCode`. The default is `OnFailure`. To enable `auto-resume`, `restart-policy` should be set to `OnFailure`.
 * `volumes` (list[string]) - Optional. Add a temp directory for containers to store data in the hosts.
 * `persistent-volume-claims` (list[string]) - Optional. The pre-created persistent volume claims (PVCs) that the data scientist can choose to mount to the containers. The cluster admin users should create PVCs and provide it to the data scientist users.
diff --git a/src/hyperpod_cli/commands/job.py b/src/hyperpod_cli/commands/job.py
index b40da41..c47447c 100644
--- a/src/hyperpod_cli/commands/job.py
+++ b/src/hyperpod_cli/commands/job.py
@@ -355,12 +355,12 @@ def cancel_job(
     "--auto-resume",
     type=click.BOOL,
     default=False,
-    help="Optional. If set to `true`, the job will automatically resume after a failure. Note that `auto-resume` currently only works in the `kubeflow` namespace or the namespace prefixed with `aws-hyperpod`. To enable `auto-resume`, you also should set `restart-policy` to `OnFailure`.",
+    help="Optional. The flag to enable HyperPod resilience job auto resume. If set to `true`, the job will automatically resume after a pod or node failure. To enable `auto-resume`, you should also set `restart-policy` to `OnFailure`.",
 )
 @click.option(
     "--max-retry",
     type=click.INT,
-    help="Optional. The maximum number of retries if `auto-resume` is `true`. If `auto-resume` is set to true and `max-retry` is not specified, the default value is 1.",
+    help="Optional. The maximum number of retries for HyperPod resilience job auto resume. If `auto-resume` is set to `true` and `max-retry` is not specified, the default value is 1.",
 )
 @click.option(
     "--restart-policy",

From 8b4e81cbc1fec953fed2d7a2f7961e0171ea32aa Mon Sep 17 00:00:00 2001
From: Song Jiang
Date: Wed, 11 Sep 2024 12:11:18 -0700
Subject: [PATCH 2/2] Add a debug log to show each node with labels

---
 src/hyperpod_cli/commands/cluster.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/hyperpod_cli/commands/cluster.py b/src/hyperpod_cli/commands/cluster.py
index 8af3143..ed62031 100644
--- a/src/hyperpod_cli/commands/cluster.py
+++ b/src/hyperpod_cli/commands/cluster.py
@@ -269,6 +269,7 @@ def _aggregate_nodes_info(
     for node in nodes:
         labels = node.metadata.labels
         node_name = node.metadata.name
+        logger.debug(f"node_name is {node_name} and labels are {labels}")
         instance_type = labels[INSTANCE_TYPE_LABEL]
         nodes_summary[instance_type]["total_nodes"] += 1
         if DEEP_HEALTH_CHECK_STATUS_LABEL in labels:
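
Notes:

For reviewers who want to see the options from PATCH 1/2 together in context,
a minimal invocation sketch. The job name and retry count are hypothetical,
and any other options start-job may require (for example, a container image)
are omitted:

    hyperpod start-job --job-name demo-job \
        --restart-policy OnFailure \
        --auto-resume true \
        --max-retry 3

With these flags, the PyTorchJob restart policy is OnFailure, and HyperPod
job auto resume will attempt up to 3 automatic resumes after a pod or node
failure.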