From 056428128ad4f107b6638f5ede7ad083b0f20789 Mon Sep 17 00:00:00 2001 From: bsang Date: Thu, 12 Sep 2024 16:48:41 +0800 Subject: [PATCH 01/15] fix diagnosis configure bugs --- .../inferencechain/inference_chain.py | 1 + .../check_failure_node_operator.py | 6 ++++++ .../inferenceoperator/operator.py | 20 +++++++++++++++++++ .../diagnosis/diagnosis_agent.py | 13 +++++++----- .../python/elastic_agent/torch/training.py | 7 ++++--- 5 files changed, 39 insertions(+), 8 deletions(-) create mode 100644 dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py diff --git a/dlrover/python/diagnosis/inferencechain/inference_chain.py b/dlrover/python/diagnosis/inferencechain/inference_chain.py index 7a8450445..b5e9ab286 100644 --- a/dlrover/python/diagnosis/inferencechain/inference_chain.py +++ b/dlrover/python/diagnosis/inferencechain/inference_chain.py @@ -36,6 +36,7 @@ def __init__( self.operators = operators def infer(self) -> List[Inference]: + logger.info(f"Infer {self.inferences}") inferences = self.inferences while True: has_new_inference = False diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_failure_node_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_failure_node_operator.py index 9e07deb76..e272bbd1f 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_failure_node_operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_failure_node_operator.py @@ -53,6 +53,12 @@ def infer(self, inferences: List[Inference]) -> List[Inference]: or InferenceConfigKey.LOG_FILE not in inferences[0].configs or InferenceConfigKey.ERRORS not in inferences[0].configs ): + if len(inferences) > 0 and inferences[0].configs: + logger.error( + f"invalid configurations: {inferences[0].configs}" + ) + else: + logger.error("no configurations found") return [ Inference( name=InferenceName.NODE, diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py 
b/dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py new file mode 100644 index 000000000..ad0933aac --- /dev/null +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py @@ -0,0 +1,20 @@ +# Copyright 2024 The DLRover Authors. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_failure_node_operator import ( # noqa: E501 + CheckFailureNodeOperator, +) + + +def get_training_failure_operators(): + return [CheckFailureNodeOperator()] diff --git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py index bf2915102..5bdaf84dc 100644 --- a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py +++ b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py @@ -35,8 +35,8 @@ from dlrover.python.diagnosis.inferencechain.inference_chain import ( InferenceChain, ) -from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_failure_node_operator import ( # noqa: E501 - CheckFailureNodeOperator, +from dlrover.python.diagnosis.inferencechain.inferenceoperator.operator import ( # noqa: E501 + get_training_failure_operators, ) from dlrover.python.elastic_agent.master_client import MasterClient @@ -47,8 +47,11 @@ def __init__(self, training_log_file: str, errors: str): self._training_log_file = training_log_file self._errors = errors - def get_training_failure_operators(self): - 
return [CheckFailureNodeOperator()] + logger.info( + "Initializing diagnosis agent with\n" + f"training_log_file: {self._training_log_file}\n" + f"errors: {self._errors}" + ) def diagnose_training_failure(self, worker_context: WorkerContext) -> str: self._report_failure_to_master( @@ -64,7 +67,7 @@ def diagnose_training_failure(self, worker_context: WorkerContext) -> str: InferenceConfigKey.ERRORS: self._errors, }, ) - ic = InferenceChain([inference], self.get_training_failure_operators()) + ic = InferenceChain([inference], get_training_failure_operators()) infer_results = ic.infer() failure_inf = Inference( name=InferenceName.NODE, diff --git a/dlrover/python/elastic_agent/torch/training.py b/dlrover/python/elastic_agent/torch/training.py index 965372532..86b09eefc 100644 --- a/dlrover/python/elastic_agent/torch/training.py +++ b/dlrover/python/elastic_agent/torch/training.py @@ -167,6 +167,8 @@ def set_node_unit(self, node_unit): self.rdzv_configs["node_unit"] = node_unit def auto_configure_params(self): + self.training_log_file = os.getenv(NodeEnv.TRAINING_LOG_FILE, "") + self.failure_node_errors = os.getenv(NodeEnv.FAILURE_NODE_ERRORS, "") device = "" if torch.cuda.is_available(): device = torch.cuda.get_device_name() @@ -183,9 +185,6 @@ def auto_configure_params(self): if self.min_nodes >= 4: self.network_check = True - self.training_log_file = os.getenv(NodeEnv.TRAINING_LOG_FILE, "") - self.failure_node_errors = os.getenv(NodeEnv.FAILURE_NODE_ERRORS, "") - class MasterRendezvousHandler(RendezvousHandler): """The rendezvous handler completes rendezvous by connecting @@ -872,6 +871,8 @@ def launch_agent( f" monitor_interval : {config.monitor_interval}\n" f" log_dir : {config.log_dir}\n" f" metrics_cfg : {config.metrics_cfg}\n" + f" training_log : {config.training_log_file}\n" + f" failure_errors : {config.failure_node_errors}\n" ) _set_paral_config() From f4647e80b84e6bafb10ea4695d23b56a60164760 Mon Sep 17 00:00:00 2001 From: bsang Date: Thu, 12 Sep 2024 
17:15:25 +0800 Subject: [PATCH 02/15] make diagnosis as singleton --- dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py | 3 ++- dlrover/python/tests/test_diagnosis_agent.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py index 5bdaf84dc..64e294b18 100644 --- a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py +++ b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py @@ -20,6 +20,7 @@ from dlrover.python.common.constants import TrainingExceptionLevel from dlrover.python.common.error import ProcessError from dlrover.python.common.log import default_logger as logger +from dlrover.python.common.singleton import Singleton from dlrover.python.common.worker import WorkerContext from dlrover.python.diagnosis.common.constants import ( DiagnoseAction, @@ -41,7 +42,7 @@ from dlrover.python.elastic_agent.master_client import MasterClient -class DiagnosisAgent: +class DiagnosisAgent(Singleton): def __init__(self, training_log_file: str, errors: str): self._client = MasterClient.singleton_instance() self._training_log_file = training_log_file diff --git a/dlrover/python/tests/test_diagnosis_agent.py b/dlrover/python/tests/test_diagnosis_agent.py index 10e8198f0..bdce91d15 100644 --- a/dlrover/python/tests/test_diagnosis_agent.py +++ b/dlrover/python/tests/test_diagnosis_agent.py @@ -56,7 +56,7 @@ def test_diagnose_training(self): file_path = os.path.join(path, file) errors = "error code is 11111" - agent = DiagnosisAgent(file_path, errors) + agent = DiagnosisAgent.singleton_instance(file_path, errors) spec = _create_worker_spec( node_rank=0, From d58f07d2b0a7454e71aeaeedc90b4e888d60d8c0 Mon Sep 17 00:00:00 2001 From: Ma Jie Yue Date: Thu, 19 Sep 2024 14:45:59 +0800 Subject: [PATCH 03/15] add RELEASES.md document --- RELEASES.md | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) 
create mode 100644 RELEASES.md diff --git a/RELEASES.md b/RELEASES.md new file mode 100644 index 000000000..681edc354 --- /dev/null +++ b/RELEASES.md @@ -0,0 +1,95 @@ +# Release Notes + +The DLRover project follows the semantic versioning scheme and maintains a separate branch for each minor version. The main branch always represents the next upcoming minor or major version. + +For the latest news about DLRover, see the following link: https://github.com/intelligent-machine-learning/dlrover?tab=readme-ov-file#latest-news= + +## Release 0.3.7 on May 13 + +Features: +* Flash Checkpoint supports deleting old checkpoints. + +BugFix: +* Save/load the non-params-related variables of dist optimizer in Megatron-LM models. +* The agent waits for the asynchronous checkpoint saving to finish before exiting. + +## Release 0.3.6 on Apr 24 + +Features: +* Flash checkpoint provides FlashCkptTrainer to support HuggingFace transformers.Trainer. +* Flash checkpoint supports loading the checkpoint of Megatron-LM from the memory. +* Flash Checkpoint supports saving and loading FSDP checkpoint with full state dict. +* Job master can sort the node ranks by the access switches of the node. + +BugFix: +* Fix the segmentation fault when restarting the training process. + +## Release 0.3.5 on Mar 29 + +Features: +* Flash checkpoint supports saving and loading Megatron-LM MOE models. #1042 +* APIs to extend the module to check the node with different chips. #1023 +* Automatically mark the node as unschedulable if the node fails. #1025 + +BugFix: +* Fix the DDP example of mnist to save and load checkpoint. #1051 +* Fix the checkpoint name of DDP. #1034 + +## Release 0.3.4 on Feb 21 + +Features: +* Flash checkpoint enables saving and loading Megatron-LM models from multiple ranks in parallel. +* `dlrover-run --auto-config` automatically configures the number of nodes and the number of processes per node. +* Users can customize the APIs of storage to save the checkpoint into different file systems.
+* Deletion strategy to clean the old checkpoint files. + +BugFix: +* The shared memory does not exist if the size of the checkpoint changes. + +## Release 0.3.3 on Jan 25 + +Features: +* Support Python > 3.10. +* Support restarting the training process on Ascend NPU. +* Support asynchronously saving the checkpoint of the distributed optimizer of Megatron-LM to the storage. + +BugFix: +* Fix the checkpoint shard inconsistency of all ranks. +* Fix the bug in asynchronously saving the Megatron-LM checkpoint of a job with multiple GPUs on multiple nodes. +* Fix the bug in loading the Megatron-LM checkpoint. + +## Release 0.3.1 on Jan 10 + +Features: +* Users can use flash checkpoint using torchrun or python -m torch.distributed.launch. + +BugFix: +* The dlrover master cannot print the error message of the faulty node in a kubeflow/PytorchJob. + +## Release 0.3.0 on Jan 3 + +Features: +* Flash Checkpoint to asynchronously persist checkpoint to storage. +* Flash Checkpoint recovers failure in memory. +* Flash Checkpoint supports DDP/FSDP/DeepSpeed/Megatron. +* Node detection supports NPU. + +Examples: +* The example of training nanoGPT using DeepSpeed. +* The example to save/load sharding FSDP checkpoint. + + +## Release 0.2.2 on Nov 21, 2023 + +Features: +* dlrover-run can run on any distributed job with the NODE_RANK and DLROVER_MASTER_ADDR in the environment. +* DLRover can asynchronously save the checkpoint into the storage, which only blocks the training for a short time. + +BugFix: +* Fix the bug in loading the FSDP checkpoint. + +## Release 0.2.1 on Oct 11, 2023 + +* Autotuning batch size without restarting the job. +* Automatically detect the straggler (slow worker).
+* TFPlus: TFPlus 0.1.0 has been released, see detail in https://github.com/intelligent-machine-learning/dlrover/tree/master/tfplus From 28c873a36c9d5a5520f980cd7a86fc6b29ef7aa5 Mon Sep 17 00:00:00 2001 From: Ma Jie Yue Date: Thu, 19 Sep 2024 15:38:58 +0800 Subject: [PATCH 04/15] add CODE_OF_CONDUCT.md file --- CODE_OF_CONDUCT.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..9e191ade7 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq From 7f85f62d8566dd2262b3b079875cb60d7996b7c8 Mon Sep 17 00:00:00 2001 From: Ma JieYue Date: Thu, 19 Sep 2024 15:49:14 +0800 Subject: [PATCH 05/15] Delete CODE_OF_CONDUCT.md --- CODE_OF_CONDUCT.md | 76 ---------------------------------------------- 1 file changed, 76 deletions(-) delete mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index 9e191ade7..000000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,76 +0,0 @@ -# Code of Conduct - -## Our Pledge - -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to make participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of experience, education, socio-economic status, nationality, personal -appearance, race, religion, or sexual identity and orientation. 
- -## Our Standards - -Examples of behavior that contributes to creating a positive environment -include: - -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery and unwelcome sexual attention or -advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic -address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a -professional setting - -## Our Responsibilities - -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. - -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. - -## Scope - -This Code of Conduct applies within all project spaces, and it also applies when -an individual is representing the project or its community in public spaces. -Examples of representing a project or community include using an official -project e-mail address, posting via an official social media account, or acting -as an appointed representative at an online or offline event. Representation of -a project may be further defined and clarified by project maintainers. 
- -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at . All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. -Further details of specific enforcement policies may be posted separately. - -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. - -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html - -[homepage]: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq From 1acf15f548564cbe53174de8b9040d19dc40a96f Mon Sep 17 00:00:00 2001 From: Ma Jie Yue Date: Thu, 19 Sep 2024 15:54:29 +0800 Subject: [PATCH 06/15] add CODE_OF_CONDUCT.md file --- CODE_OF_CONDUCT.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..9e191ade7 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq From 2f64834e3e587f096cb1e6e703d5db5caf6ca1ed Mon Sep 17 00:00:00 2001 From: Ma Jie Yue Date: Thu, 19 Sep 2024 16:23:36 +0800 Subject: [PATCH 07/15] add CONTRIBUTING.md file --- CONTRIBUTING.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..0eea4cb67 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,5 @@ +# Contributing + +## This repo is part of [DLRover](https://github.com/intelligent-machine-learning/dlrover) + +For details on how to contribute to DLRover, please see the main [contributing document](https://github.com/intelligent-machine-learning/dlrover/blob/master/docs/developer_guide.md). 
From bd4b530b303e35cc7ccbf7a38121d220acf526c4 Mon Sep 17 00:00:00 2001 From: Ma Jie Yue Date: Thu, 19 Sep 2024 16:51:37 +0800 Subject: [PATCH 08/15] add MAINTERNERS.md --- MAINTAINERS.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 MAINTAINERS.md diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 000000000..4d018d0a8 --- /dev/null +++ b/MAINTAINERS.md @@ -0,0 +1,18 @@ + +- [Overview](#overview) +- [Current Maintainers](#current-maintainers) +- [Emeritus](#emeritus) + +## Overview + +This document contains a list of maintainers in this repo. If you're interested in contributing, and becoming a maintainer, see [CONTRIBUTING](CONTRIBUTING.md). + +## Current Maintainers + +| Maintainer | GitHub ID | Affiliation | +| ------------------ | --------------------------------------------------------- | ----------- | +| Tianyi Chen | [BalaBalaYi](https://github.com/BalaBalaYi) | AntGroup | +| Bo Sang | [samplise](https://github.com/samplise) | AntGroup | +| Qinlong Wang | [workingloong](https://github.com/workingloong) | ByteDance | + +## Emeritus From 7426fdd39d4b0f57b3563dbd8132372a205ca1da Mon Sep 17 00:00:00 2001 From: Ma JieYue Date: Thu, 19 Sep 2024 16:58:47 +0800 Subject: [PATCH 09/15] Delete MAINTAINERS.md --- MAINTAINERS.md | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 MAINTAINERS.md diff --git a/MAINTAINERS.md b/MAINTAINERS.md deleted file mode 100644 index 4d018d0a8..000000000 --- a/MAINTAINERS.md +++ /dev/null @@ -1,18 +0,0 @@ - -- [Overview](#overview) -- [Current Maintainers](#current-maintainers) -- [Emeritus](#emeritus) - -## Overview - -This document contains a list of maintainers in this repo. If you're interested in contributing, and becoming a maintainer, see [CONTRIBUTING](CONTRIBUTING.md). 
- -## Current Maintainers - -| Maintainer | GitHub ID | Affiliation | -| ------------------ | --------------------------------------------------------- | ----------- | -| Tianyi Chen | [BalaBalaYi](https://github.com/BalaBalaYi) | AntGroup | -| Bo Sang | [samplise](https://github.com/samplise) | AntGroup | -| Qinlong Wang | [workingloong](https://github.com/workingloong) | ByteDance | - -## Emeritus From 23ea711c45b8d1994df5e29e82187925fc62f10d Mon Sep 17 00:00:00 2001 From: Ma Jie Yue Date: Thu, 19 Sep 2024 17:03:59 +0800 Subject: [PATCH 10/15] add CODEOWNERS file --- CODEOWNERS | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 CODEOWNERS diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 000000000..32c4998d7 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,8 @@ +# root directory +* @nash635 @workingloong @samplise @BalaBalaYi + +# dlrover directory +/dlrover/ @workingloong @samplise @BalaBalaYi @majieyue + +# atorch directory +/atorch/ @skydoorkai @adamantboy @hxdtest From a975bd5b356bcd20b2fb252263c919964e58df63 Mon Sep 17 00:00:00 2001 From: Ma JieYue Date: Thu, 19 Sep 2024 17:52:54 +0800 Subject: [PATCH 11/15] Delete CODEOWNERS --- CODEOWNERS | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 CODEOWNERS diff --git a/CODEOWNERS b/CODEOWNERS deleted file mode 100644 index 32c4998d7..000000000 --- a/CODEOWNERS +++ /dev/null @@ -1,8 +0,0 @@ -# root directory -* @nash635 @workingloong @samplise @BalaBalaYi - -# dlrover directory -/dlrover/ @workingloong @samplise @BalaBalaYi @majieyue - -# atorch directory -/atorch/ @skydoorkai @adamantboy @hxdtest From 7301088d9cb02fe46b790440845c5a81fc45b310 Mon Sep 17 00:00:00 2001 From: Ma Jie Yue Date: Thu, 19 Sep 2024 17:56:53 +0800 Subject: [PATCH 12/15] add MAINTAINERS.md, refer to CODEOWNERS --- MAINTAINERS.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 MAINTAINERS.md diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 000000000..14a294efa --- /dev/null +++ 
b/MAINTAINERS.md @@ -0,0 +1,3 @@ +# Maintainers + +For details please see [CODEOWNERS](https://github.com/majieyue/dlrover/blob/lfai-20240919/.github/CODEOWNERS). From 97f39dcfac68898fe00059c56c24f2713287b6e9 Mon Sep 17 00:00:00 2001 From: Tianyi Chen Date: Mon, 23 Sep 2024 10:55:43 +0800 Subject: [PATCH 13/15] Optimize logging (#1276) * optimize logging * fix ut * lint --- .../python/master/elastic_training/rdzv_manager.py | 10 +++++++--- dlrover/python/master/node/dist_job_manager.py | 10 ++++++++-- dlrover/python/master/node/worker.py | 9 ++++++--- dlrover/python/tests/test_rdzv_manager.py | 11 ++++++----- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/dlrover/python/master/elastic_training/rdzv_manager.py b/dlrover/python/master/elastic_training/rdzv_manager.py index 378aaf0f4..db3ef9afd 100644 --- a/dlrover/python/master/elastic_training/rdzv_manager.py +++ b/dlrover/python/master/elastic_training/rdzv_manager.py @@ -200,11 +200,15 @@ def _get_lacking_ranks(self) -> List[int]: """ lacking_ranks: List[int] = [] - if self._rdzv_params is None or self._rdzv_params.min_nodes <= 0: + if ( + self._rdzv_params is None + or self._rdzv_params.min_nodes <= 0 + or self._rdzv_params.max_nodes <= 0 + ): return lacking_ranks - min_required = self._rdzv_params.min_nodes - min_ranks = set([i for i in range(min_required)]) + max_required = self._rdzv_params.max_nodes + min_ranks = set([i for i in range(max_required)]) if self._waiting_nodes: waiting_ranks = set(self._waiting_nodes.keys()) else: diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py index 09ee564d2..f5c7c416b 100644 --- a/dlrover/python/master/node/dist_job_manager.py +++ b/dlrover/python/master/node/dist_job_manager.py @@ -134,9 +134,9 @@ def __init__( raise ValueError( f"Distribution strategy {job_args.distribution_strategy} " "is not supported. You can specify it with " - "ParameterServerStrategy/AllreduceStrategy." 
+ "ParameterServerStrategy/AllReduceStrategy." ) - logger.info("New job optimizer : %s", self._job_optimizer.__class__) + logger.info(f"New job optimizer: {self._job_optimizer.__class__}") worker_restart_count = node_restart_count.get(NodeType.WORKER, 0) ps_restart_count = node_restart_count.get(NodeType.PS, 0) @@ -150,6 +150,12 @@ def __init__( self._ps_relaunch_max_num = min( ps_restart_count, _MAX_POD_RELAUNCH_COUNT ) + logger.info( + f"Worker relaunch number: {self._relaunch_on_worker_failure}; " + f"PS relaunch number: {self._ps_relaunch_max_num}; " + f"Critical worker index: {self._critical_worker_index}." + ) + self._node_event_callbacks: List[NodeEventCallback] = [] # Protects followed variables, which are accessed from event_cb. diff --git a/dlrover/python/master/node/worker.py b/dlrover/python/master/node/worker.py index 1b4d2c5ab..2654aa2c6 100644 --- a/dlrover/python/master/node/worker.py +++ b/dlrover/python/master/node/worker.py @@ -368,12 +368,12 @@ def is_training_hang_by_pending(self, total_node_num) -> bool: pending_nodes ): logger.debug( - "Skip for no required nodes info " "and not all nodes pending." + "Skip for no required nodes info and not all nodes pending." ) return False elif 0 < len(pending_nodes) == total_node_num: # all nodes pending - logger.debug(f"All nodes pending: {pending_nodes}.") + logger.info(f"All nodes pending: {pending_nodes}.") else: # partial nodes pending # with condition 1 + 2 @@ -404,7 +404,10 @@ def is_training_hang_by_pending(self, total_node_num) -> bool: if now - first_pending_node.create_time.timestamp() > timeout: logger.warning( f"Node {first_pending_node.name} " - f"exceeded pending timeout: {timeout}s." + f"exceeded pending timeout: {timeout}s, " + f"running nodes(size:{len(running_nodes)}): {running_nodes}, " + f"pending nodes(size:{len(pending_nodes)}): {pending_nodes}, " + f"min required nodes size: {self.get_min_nodes_required()}." 
) return True diff --git a/dlrover/python/tests/test_rdzv_manager.py b/dlrover/python/tests/test_rdzv_manager.py index f865acd83..b6c554fb7 100644 --- a/dlrover/python/tests/test_rdzv_manager.py +++ b/dlrover/python/tests/test_rdzv_manager.py @@ -172,23 +172,24 @@ def test_get_lacking_ranks(self): rdzv_manager = ElasticTrainingRendezvousManager(error_monitor) rdzv_manager._rdzv_params.min_nodes = 4 + rdzv_manager._rdzv_params.max_nodes = 4 rdzv_manager._waiting_nodes = {0: 0, 1: 1, 2: 2, 3: 3} self.assertEqual(rdzv_manager._get_lacking_ranks(), []) - rdzv_manager._rdzv_params.min_nodes = 5 + rdzv_manager._rdzv_params.max_nodes = 5 self.assertEqual(rdzv_manager._get_lacking_ranks(), [4]) - rdzv_manager._rdzv_params.min_nodes = 3 + rdzv_manager._rdzv_params.max_nodes = 3 self.assertEqual(rdzv_manager._get_lacking_ranks(), []) - rdzv_manager._rdzv_params.min_nodes = 6 + rdzv_manager._rdzv_params.max_nodes = 6 self.assertEqual(rdzv_manager._get_lacking_ranks(), [4, 5]) - rdzv_manager._rdzv_params.min_nodes = 4 + rdzv_manager._rdzv_params.max_nodes = 4 rdzv_manager._waiting_nodes = {} self.assertEqual(rdzv_manager._get_lacking_ranks(), [0, 1, 2, 3]) - rdzv_manager._rdzv_params.min_nodes = 0 + rdzv_manager._rdzv_params.max_nodes = 0 self.assertEqual(rdzv_manager._get_lacking_ranks(), []) def test_multi_updating_waiting_nodes(self): From 78ffde986329e75d4a93e2e2e453e550b6bf3df7 Mon Sep 17 00:00:00 2001 From: Tianyi Chen Date: Tue, 24 Sep 2024 17:09:17 +0800 Subject: [PATCH 14/15] enlarge timeout (#1278) --- dlrover/python/elastic_agent/monitor/resource.py | 2 +- dlrover/python/master/node/dist_job_manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dlrover/python/elastic_agent/monitor/resource.py b/dlrover/python/elastic_agent/monitor/resource.py index ca78c6f50..d3b4a71c3 100644 --- a/dlrover/python/elastic_agent/monitor/resource.py +++ b/dlrover/python/elastic_agent/monitor/resource.py @@ -117,7 +117,7 @@ def start(self): # The first time 
called cpu_percent will return a meaningless 0.0 # value which we are supposed to ignore. So, here we call it at - # the begining of monitor and the next value is valid. + # the beginning of monitor and the next value is valid. get_process_cpu_percent() def stop(self): diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py index f5c7c416b..c3e37a881 100644 --- a/dlrover/python/master/node/dist_job_manager.py +++ b/dlrover/python/master/node/dist_job_manager.py @@ -455,7 +455,7 @@ def _monitor_node_heart_beat(self): logger.warning(detail_trace_back) time.sleep(15) - def _get_dead_node_event(self, window_interval=600) -> List[NodeEvent]: + def _get_dead_node_event(self, window_interval=900) -> List[NodeEvent]: now = time.time() dead_events: List[NodeEvent] = [] logger.debug(f"Current job nodes are: {self._job_nodes}.") From e37646f74c6ff47966474aa2f165a6874b03b84b Mon Sep 17 00:00:00 2001 From: "chentianyi.cty" Date: Fri, 27 Sep 2024 15:18:14 +0800 Subject: [PATCH 15/15] upgrade version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 50109730a..f25af286a 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ setup( name="dlrover", - version="0.3.7rc0", + version="0.3.8", description="An Automatic Distributed Deep Learning Framework", long_description="DLRover helps model developers focus on model algorithm" " itself, without taking care of any engineering stuff,"