Skip to content

Commit

Permalink
Set grpc env for optimization. (#1308)
Browse files Browse the repository at this point in the history
* add grpc envs

* add grpc envs

* lint

* lint

* lint
  • Loading branch information
BalaBalaYi authored Oct 25, 2024
1 parent 0a77136 commit 53e10bd
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 8 deletions.
1 change: 1 addition & 0 deletions dlrover/python/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,7 @@ class NodeEnv(object):
RELAUNCHED_POD = "RELAUNCHED_POD"
DLROVER_MASTER_ADDR = "DLROVER_MASTER_ADDR"
GRPC_ENABLE_FORK = "GRPC_ENABLE_FORK_SUPPORT"
GRPC_POLL_STRATEGY = "GRPC_POLL_STRATEGY"
POD_NAME = "POD_NAME"
MONITOR_ENABLED = "MONITOR_ENABLED"
JOB_NAME = "ELASTIC_JOB_NAME"
Expand Down
10 changes: 5 additions & 5 deletions dlrover/python/elastic_agent/master_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
def retry_grpc_request(func):
def wrapper(self, *args, **kwargs):
retry = kwargs.get("retry", 10)
execption = None
exception = None
for i in range(retry):
try:
return func(self, *args, **kwargs)
Expand All @@ -39,11 +39,11 @@ def wrapper(self, *args, **kwargs):
logger.warning(
f"Retry {i} to {class_name}.{func_name} with failure",
)
execption = e
exception = e
time.sleep(5)
if execption:
logger.error(execption)
raise execption
if exception:
logger.error(exception)
raise exception

return wrapper

Expand Down
9 changes: 6 additions & 3 deletions dlrover/python/master/scaler/pod_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,9 +472,12 @@ def _create_pod(self, node: Node):
env.append(V1EnvVar(name=NodeEnv.JOB_NAME, value=self._job_name))
env.append(V1EnvVar(name=NodeEnv.JOB_UID, value=self._job_uid))

# A deadlock can happen when pthread_atfork handler is running.
# For detail https://chromium.googlesource.com/external/github.com/grpc/grpc/+/refs/tags/v1.19.0-pre1/doc/fork_support.md # noqa: E501
env.append(V1EnvVar(name=NodeEnv.GRPC_ENABLE_FORK, value="False"))
# These settings trade some performance overhead for greater stability
# in concurrent scenarios (requires grpcio>=1.58).
# Historical background: https://chromium.googlesource.com/external/
# github.com/grpc/grpc/+/refs/tags/v1.19.0-pre1/doc/fork_support.md
env.append(V1EnvVar(name=NodeEnv.GRPC_ENABLE_FORK, value="true"))
env.append(V1EnvVar(name=NodeEnv.GRPC_POLL_STRATEGY, value="poll"))

worker_num = self._config_worker_num
if worker_num == 0:
Expand Down
1 change: 1 addition & 0 deletions dlrover/python/tests/test_master_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def tearDown(self):
self._master.stop()

def test_open_channel(self):
self.assertEqual(self._master_client._timeout, 0.5)
self.assertEqual(self._master_client._timeout, 0.5)
self._master_client.close_channel()
self._master_client.open_channel()
Expand Down

0 comments on commit 53e10bd

Please sign in to comment.