Skip to content

Commit

Permalink
Add more logging and error handling
Browse files (browse the repository at this point in the history)
  • Loading branch information
majieyue committed Oct 1, 2024
1 parent e8086cd commit df662a4
Showing 1 changed file with 13 additions and 5 deletions.
18 changes: 13 additions & 5 deletions dlrover/python/elastic_agent/torch/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -784,11 +784,19 @@ def _stop_workers(
"""
if self._pcontext is not None:
for pid in self._pcontext.pids():
pp = psutil.Process(pid)
cp = pp.children()
for proc in cp:
logger.info(f"kill sub {proc.pid} of parent {pid}")
os.kill(proc.pid, signal.SIGKILL)
logger.info(f"kill process {pid} and its sub processes")
if pid == 0:
logger.info("skip invalid process 0")
continue
try:
pp = psutil.Process(pid)
cp = pp.children()
for proc in cp:
logger.info(f"kill sub {proc.pid} of parent {pid}")
os.kill(proc.pid, signal.SIGKILL)
except Exception as e:
logger.info(f"Error when kill {pid}: {str(e)}")

Check warning on line 798 in dlrover/python/elastic_agent/torch/training.py

View check run for this annotation

Codecov / codecov/patch

dlrover/python/elastic_agent/torch/training.py#L786-L798

Added lines #L786 - L798 were not covered by tests

self._shutdown(death_sig=signal.SIGKILL)
else:
if version_less_than_240():
Expand Down

0 comments on commit df662a4

Please sign in to comment.