diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 000000000..9e191ade7
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at . All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 000000000..0eea4cb67
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,5 @@
+# Contributing
+
+## This repo is part of [DLRover](https://github.com/intelligent-machine-learning/dlrover)
+
+For details on how to contribute to DLRover, please see the main [contributing document](https://github.com/intelligent-machine-learning/dlrover/blob/master/docs/developer_guide.md).
diff --git a/MAINTAINERS.md b/MAINTAINERS.md
new file mode 100644
index 000000000..14a294efa
--- /dev/null
+++ b/MAINTAINERS.md
@@ -0,0 +1,3 @@
+# Maintainers
+
+For details, please see [CODEOWNERS](https://github.com/majieyue/dlrover/blob/lfai-20240919/.github/CODEOWNERS).
diff --git a/RELEASES.md b/RELEASES.md
new file mode 100644
index 000000000..681edc354
--- /dev/null
+++ b/RELEASES.md
@@ -0,0 +1,95 @@
+# Release Notes
+
+The DLRover project follows the semantic versioning scheme and maintains a separate branch for each minor version. The main branch always represents the next upcoming minor or major version.
+
+For the latest news about DLRover, check the following link: https://github.com/intelligent-machine-learning/dlrover?tab=readme-ov-file#latest-news=
+
+## Release 0.3.7 on May 13
+
+Features:
+* Flash Checkpoint supports deleting old checkpoints.
+
+BugFix:
+* Save/load the non-parameter-related variables of the distributed optimizer in Megatron-LM models.
+* The agent waits for the asynchronous checkpoint saving to finish before exiting.
+
+## Release 0.3.6 on Apr 24
+
+Features:
+* Flash checkpoint provides FlashCkptTrainer to support HuggingFace transformers.Trainer.
+* Flash checkpoint supports loading the checkpoint of Megatron-LM from memory.
+* Flash Checkpoint supports saving and loading FSDP checkpoints with a full state dict.
+* The job master can sort node ranks by the access switches of the nodes.
+
+BugFix:
+* Fix the segmentation fault when restarting the training process.
+
+## Release 0.3.5 on Mar 29
+
+Features:
+* Flash checkpoint supports saving and loading Megatron-LM MoE models. #1042
+* APIs to extend the module to check nodes with different chips. #1023
+* Automatically mark the node as unschedulable if the node fails. #1025
+
+BugFix:
+* Fix the DDP mnist example to save and load the checkpoint. #1051
+* Fix the checkpoint name of DDP. #1034
+
+## Release 0.3.4 on Feb 21
+
+Features:
+* Flash checkpoint enables saving and loading Megatron-LM models from multiple ranks in parallel.
+* dlrover-run --auto-config automatically configures the number of nodes and the number of processes per node.
+* Users can customize the storage APIs to save the checkpoint to different file systems.
+* A deletion strategy to clean up old checkpoint files.
+
+BugFix:
+* Fix the issue that the shared memory does not exist if the size of the checkpoint changes.
+
+## Release 0.3.3 on Jan 25
+
+Features:
+* Support Python > 3.10.
+* Support restarting the training process on Ascend NPU.
+* Support asynchronously saving the checkpoint of the distributed optimizer of Megatron-LM to storage.
+
+BugFix:
+* Fix the checkpoint shard inconsistency of all ranks.
+* Fix the bug in asynchronously saving the Megatron-LM checkpoint for jobs with multiple GPUs on multiple nodes.
+* Fix the bug in loading the Megatron-LM checkpoint.
+
+## Release 0.3.1 on Jan 10
+
+Feature:
+* Users can use flash checkpoint with torchrun or python -m torch.distributed.launch.
+
+BugFix:
+* Fix the issue that the dlrover master cannot print the error message of the faulty node in a kubeflow/PyTorchJob.
+
+## Release 0.3.0 on Jan 3
+
+Features:
+* Flash Checkpoint asynchronously persists checkpoints to storage.
+* Flash Checkpoint recovers from failures using the checkpoint in memory.
+* Flash Checkpoint supports DDP/FSDP/DeepSpeed/Megatron.
+* Node detection supports NPU.
+
+Examples:
+* An example of training nanoGPT using DeepSpeed.
+* An example of saving/loading a sharded FSDP checkpoint.
+
+
+## Release 0.2.2 on Nov 21, 2023
+
+Features:
+* dlrover-run can run any distributed job with NODE_RANK and DLROVER_MASTER_ADDR set in the environment.
+* DLRover can asynchronously save the checkpoint to storage, which only blocks training for a short time.
+
+BugFix:
+* Fix the bug in loading the FSDP checkpoint.
+
+## Release 0.2.1 on Oct 11, 2023
+
+* Autotuning the batch size without restarting the job.
+* Automatically detect the straggler (slow worker).
+* TFPlus: TFPlus 0.1.0 has been released; see details at https://github.com/intelligent-machine-learning/dlrover/tree/master/tfplus
diff --git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
index 9350c0645..4342fc809 100644
--- a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
+++ b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
@@ -20,6 +20,7 @@
 from dlrover.python.common.constants import TrainingExceptionLevel
 from dlrover.python.common.error import ProcessError
 from dlrover.python.common.log import default_logger as logger
+from dlrover.python.common.singleton import Singleton
 from dlrover.python.common.worker import WorkerContext
 from dlrover.python.diagnosis.common.constants import (
     DiagnoseAction,
@@ -41,7 +42,7 @@
 from dlrover.python.elastic_agent.master_client import MasterClient
 
 
-class DiagnosisAgent:
+class DiagnosisAgent(Singleton):
     def __init__(self, training_log_file: str, errors: str):
         self._client = MasterClient.singleton_instance()
         self._training_log_file = training_log_file
diff --git a/dlrover/python/elastic_agent/monitor/resource.py b/dlrover/python/elastic_agent/monitor/resource.py
index ca78c6f50..d3b4a71c3 100644
--- a/dlrover/python/elastic_agent/monitor/resource.py
+++ b/dlrover/python/elastic_agent/monitor/resource.py
@@ -117,7 +117,7 @@ def start(self):
 
         # The first time called cpu_percent will return a meaningless 0.0
         # value which we are supposed to ignore. So, here we call it at
-        # the begining of monitor and the next value is valid.
+        # the beginning of monitor and the next value is valid.
         get_process_cpu_percent()
 
     def stop(self):
diff --git a/dlrover/python/master/elastic_training/rdzv_manager.py b/dlrover/python/master/elastic_training/rdzv_manager.py
index 378aaf0f4..db3ef9afd 100644
--- a/dlrover/python/master/elastic_training/rdzv_manager.py
+++ b/dlrover/python/master/elastic_training/rdzv_manager.py
@@ -200,11 +200,15 @@ def _get_lacking_ranks(self) -> List[int]:
         """
         lacking_ranks: List[int] = []
 
-        if self._rdzv_params is None or self._rdzv_params.min_nodes <= 0:
+        if (
+            self._rdzv_params is None
+            or self._rdzv_params.min_nodes <= 0
+            or self._rdzv_params.max_nodes <= 0
+        ):
             return lacking_ranks
 
-        min_required = self._rdzv_params.min_nodes
-        min_ranks = set([i for i in range(min_required)])
+        max_required = self._rdzv_params.max_nodes
+        min_ranks = set([i for i in range(max_required)])
         if self._waiting_nodes:
             waiting_ranks = set(self._waiting_nodes.keys())
         else:
diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py
index 09ee564d2..c3e37a881 100644
--- a/dlrover/python/master/node/dist_job_manager.py
+++ b/dlrover/python/master/node/dist_job_manager.py
@@ -134,9 +134,9 @@ def __init__(
             raise ValueError(
                 f"Distribution strategy {job_args.distribution_strategy} "
                 "is not supported. You can specify it with "
-                "ParameterServerStrategy/AllreduceStrategy."
+                "ParameterServerStrategy/AllReduceStrategy."
             )
-        logger.info("New job optimizer : %s", self._job_optimizer.__class__)
+        logger.info(f"New job optimizer: {self._job_optimizer.__class__}")
 
         worker_restart_count = node_restart_count.get(NodeType.WORKER, 0)
         ps_restart_count = node_restart_count.get(NodeType.PS, 0)
@@ -150,6 +150,12 @@ def __init__(
         self._ps_relaunch_max_num = min(
             ps_restart_count, _MAX_POD_RELAUNCH_COUNT
         )
+        logger.info(
+            f"Worker relaunch number: {self._relaunch_on_worker_failure}; "
+            f"PS relaunch number: {self._ps_relaunch_max_num}; "
+            f"Critical worker index: {self._critical_worker_index}."
+        )
+
         self._node_event_callbacks: List[NodeEventCallback] = []
 
         # Protects followed variables, which are accessed from event_cb.
@@ -449,7 +455,7 @@ def _monitor_node_heart_beat(self):
                 logger.warning(detail_trace_back)
             time.sleep(15)
 
-    def _get_dead_node_event(self, window_interval=600) -> List[NodeEvent]:
+    def _get_dead_node_event(self, window_interval=900) -> List[NodeEvent]:
         now = time.time()
         dead_events: List[NodeEvent] = []
         logger.debug(f"Current job nodes are: {self._job_nodes}.")
diff --git a/dlrover/python/master/node/worker.py b/dlrover/python/master/node/worker.py
index 1b4d2c5ab..2654aa2c6 100644
--- a/dlrover/python/master/node/worker.py
+++ b/dlrover/python/master/node/worker.py
@@ -368,12 +368,12 @@ def is_training_hang_by_pending(self, total_node_num) -> bool:
             pending_nodes
         ):
             logger.debug(
-                "Skip for no required nodes info " "and not all nodes pending."
+                "Skip for no required nodes info and not all nodes pending."
             )
             return False
         elif 0 < len(pending_nodes) == total_node_num:
             # all nodes pending
-            logger.debug(f"All nodes pending: {pending_nodes}.")
+            logger.info(f"All nodes pending: {pending_nodes}.")
         else:
             # partial nodes pending
             # with condition 1 + 2
@@ -404,7 +404,10 @@ def is_training_hang_by_pending(self, total_node_num) -> bool:
 
             if now - first_pending_node.create_time.timestamp() > timeout:
                 logger.warning(
                     f"Node {first_pending_node.name} "
-                    f"exceeded pending timeout: {timeout}s."
+ f"exceeded pending timeout: {timeout}s, " + f"running nodes(size:{len(running_nodes)}): {running_nodes}, " + f"pending nodes(size:{len(pending_nodes)}): {pending_nodes}, " + f"min required nodes size: {self.get_min_nodes_required()}." ) return True diff --git a/dlrover/python/tests/test_diagnosis_agent.py b/dlrover/python/tests/test_diagnosis_agent.py index e9225d772..e3c3b1cba 100644 --- a/dlrover/python/tests/test_diagnosis_agent.py +++ b/dlrover/python/tests/test_diagnosis_agent.py @@ -60,7 +60,7 @@ def test_diagnose_training(self): file_path = os.path.join(path, file) errors = "error code is 11111" - agent = DiagnosisAgent(file_path, errors) + agent = DiagnosisAgent.singleton_instance(file_path, errors) spec = _create_worker_spec( node_rank=0, diff --git a/dlrover/python/tests/test_rdzv_manager.py b/dlrover/python/tests/test_rdzv_manager.py index f865acd83..b6c554fb7 100644 --- a/dlrover/python/tests/test_rdzv_manager.py +++ b/dlrover/python/tests/test_rdzv_manager.py @@ -172,23 +172,24 @@ def test_get_lacking_ranks(self): rdzv_manager = ElasticTrainingRendezvousManager(error_monitor) rdzv_manager._rdzv_params.min_nodes = 4 + rdzv_manager._rdzv_params.max_nodes = 4 rdzv_manager._waiting_nodes = {0: 0, 1: 1, 2: 2, 3: 3} self.assertEqual(rdzv_manager._get_lacking_ranks(), []) - rdzv_manager._rdzv_params.min_nodes = 5 + rdzv_manager._rdzv_params.max_nodes = 5 self.assertEqual(rdzv_manager._get_lacking_ranks(), [4]) - rdzv_manager._rdzv_params.min_nodes = 3 + rdzv_manager._rdzv_params.max_nodes = 3 self.assertEqual(rdzv_manager._get_lacking_ranks(), []) - rdzv_manager._rdzv_params.min_nodes = 6 + rdzv_manager._rdzv_params.max_nodes = 6 self.assertEqual(rdzv_manager._get_lacking_ranks(), [4, 5]) - rdzv_manager._rdzv_params.min_nodes = 4 + rdzv_manager._rdzv_params.max_nodes = 4 rdzv_manager._waiting_nodes = {} self.assertEqual(rdzv_manager._get_lacking_ranks(), [0, 1, 2, 3]) - rdzv_manager._rdzv_params.min_nodes = 0 + rdzv_manager._rdzv_params.max_nodes = 0 self.assertEqual(rdzv_manager._get_lacking_ranks(), []) def test_multi_updating_waiting_nodes(self): diff --git a/setup.py b/setup.py index 50109730a..f25af286a 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ setup( name="dlrover", - version="0.3.7rc0", + version="0.3.8", description="An Automatic Distributed Deep Learning Framework", long_description="DLRover helps model developers focus on model algorithm" " itself, without taking care of any engineering stuff,"