From 056428128ad4f107b6638f5ede7ad083b0f20789 Mon Sep 17 00:00:00 2001
From: bsang <b.sang@antgroup.com>
Date: Thu, 12 Sep 2024 16:48:41 +0800
Subject: [PATCH 01/15] fix diagnosis configure bugs

---
 .../inferencechain/inference_chain.py         |  1 +
 .../check_failure_node_operator.py            |  6 ++++++
 .../inferenceoperator/operator.py             | 20 +++++++++++++++++++
 .../diagnosis/diagnosis_agent.py              | 13 +++++++-----
 .../python/elastic_agent/torch/training.py    |  7 ++++---
 5 files changed, 39 insertions(+), 8 deletions(-)
 create mode 100644 dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py

diff --git a/dlrover/python/diagnosis/inferencechain/inference_chain.py b/dlrover/python/diagnosis/inferencechain/inference_chain.py
index 7a8450445..b5e9ab286 100644
--- a/dlrover/python/diagnosis/inferencechain/inference_chain.py
+++ b/dlrover/python/diagnosis/inferencechain/inference_chain.py
@@ -36,6 +36,7 @@ def __init__(
         self.operators = operators
 
     def infer(self) -> List[Inference]:
+        logger.info(f"Infer {self.inferences}")
         inferences = self.inferences
         while True:
             has_new_inference = False
diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_failure_node_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_failure_node_operator.py
index 9e07deb76..e272bbd1f 100644
--- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_failure_node_operator.py
+++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_failure_node_operator.py
@@ -53,6 +53,12 @@ def infer(self, inferences: List[Inference]) -> List[Inference]:
             or InferenceConfigKey.LOG_FILE not in inferences[0].configs
             or InferenceConfigKey.ERRORS not in inferences[0].configs
         ):
+            if len(inferences) > 0 and inferences[0].configs:
+                logger.error(
+                    f"invalid configurations: {inferences[0].configs}"
+                )
+            else:
+                logger.error("no configurations found")
             return [
                 Inference(
                     name=InferenceName.NODE,
diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py
new file mode 100644
index 000000000..ad0933aac
--- /dev/null
+++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py
@@ -0,0 +1,20 @@
+# Copyright 2024 The DLRover Authors. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_failure_node_operator import (  # noqa: E501
+    CheckFailureNodeOperator,
+)
+
+
+def get_training_failure_operators():
+    return [CheckFailureNodeOperator()]
diff --git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
index bf2915102..5bdaf84dc 100644
--- a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
+++ b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
@@ -35,8 +35,8 @@
 from dlrover.python.diagnosis.inferencechain.inference_chain import (
     InferenceChain,
 )
-from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_failure_node_operator import (  # noqa: E501
-    CheckFailureNodeOperator,
+from dlrover.python.diagnosis.inferencechain.inferenceoperator.operator import (  # noqa: E501
+    get_training_failure_operators,
 )
 from dlrover.python.elastic_agent.master_client import MasterClient
 
@@ -47,8 +47,11 @@ def __init__(self, training_log_file: str, errors: str):
         self._training_log_file = training_log_file
         self._errors = errors
 
-    def get_training_failure_operators(self):
-        return [CheckFailureNodeOperator()]
+        logger.info(
+            "Initializing diagnosis agent with\n"
+            f"training_log_file:    {self._training_log_file}\n"
+            f"errors:               {self._errors}"
+        )
 
     def diagnose_training_failure(self, worker_context: WorkerContext) -> str:
         self._report_failure_to_master(
@@ -64,7 +67,7 @@ def diagnose_training_failure(self, worker_context: WorkerContext) -> str:
                 InferenceConfigKey.ERRORS: self._errors,
             },
         )
-        ic = InferenceChain([inference], self.get_training_failure_operators())
+        ic = InferenceChain([inference], get_training_failure_operators())
         infer_results = ic.infer()
         failure_inf = Inference(
             name=InferenceName.NODE,
diff --git a/dlrover/python/elastic_agent/torch/training.py b/dlrover/python/elastic_agent/torch/training.py
index 965372532..86b09eefc 100644
--- a/dlrover/python/elastic_agent/torch/training.py
+++ b/dlrover/python/elastic_agent/torch/training.py
@@ -167,6 +167,8 @@ def set_node_unit(self, node_unit):
         self.rdzv_configs["node_unit"] = node_unit
 
     def auto_configure_params(self):
+        self.training_log_file = os.getenv(NodeEnv.TRAINING_LOG_FILE, "")
+        self.failure_node_errors = os.getenv(NodeEnv.FAILURE_NODE_ERRORS, "")
         device = ""
         if torch.cuda.is_available():
             device = torch.cuda.get_device_name()
@@ -183,9 +185,6 @@ def auto_configure_params(self):
         if self.min_nodes >= 4:
             self.network_check = True
 
-        self.training_log_file = os.getenv(NodeEnv.TRAINING_LOG_FILE, "")
-        self.failure_node_errors = os.getenv(NodeEnv.FAILURE_NODE_ERRORS, "")
-
 
 class MasterRendezvousHandler(RendezvousHandler):
     """The rendezvous handler completes rendezvous by connecting
@@ -872,6 +871,8 @@ def launch_agent(
         f"  monitor_interval : {config.monitor_interval}\n"
         f"  log_dir          : {config.log_dir}\n"
         f"  metrics_cfg      : {config.metrics_cfg}\n"
+        f"  training_log     : {config.training_log_file}\n"
+        f"  failure_errors   : {config.failure_node_errors}\n"
     )
 
     _set_paral_config()

From f4647e80b84e6bafb10ea4695d23b56a60164760 Mon Sep 17 00:00:00 2001
From: bsang <b.sang@antgroup.com>
Date: Thu, 12 Sep 2024 17:15:25 +0800
Subject: [PATCH 02/15] make diagnosis as singleton

---
 dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py | 3 ++-
 dlrover/python/tests/test_diagnosis_agent.py              | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
index 5bdaf84dc..64e294b18 100644
--- a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
+++ b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
@@ -20,6 +20,7 @@
 from dlrover.python.common.constants import TrainingExceptionLevel
 from dlrover.python.common.error import ProcessError
 from dlrover.python.common.log import default_logger as logger
+from dlrover.python.common.singleton import Singleton
 from dlrover.python.common.worker import WorkerContext
 from dlrover.python.diagnosis.common.constants import (
     DiagnoseAction,
@@ -41,7 +42,7 @@
 from dlrover.python.elastic_agent.master_client import MasterClient
 
 
-class DiagnosisAgent:
+class DiagnosisAgent(Singleton):
     def __init__(self, training_log_file: str, errors: str):
         self._client = MasterClient.singleton_instance()
         self._training_log_file = training_log_file
diff --git a/dlrover/python/tests/test_diagnosis_agent.py b/dlrover/python/tests/test_diagnosis_agent.py
index 10e8198f0..bdce91d15 100644
--- a/dlrover/python/tests/test_diagnosis_agent.py
+++ b/dlrover/python/tests/test_diagnosis_agent.py
@@ -56,7 +56,7 @@ def test_diagnose_training(self):
         file_path = os.path.join(path, file)
 
         errors = "error code is 11111"
-        agent = DiagnosisAgent(file_path, errors)
+        agent = DiagnosisAgent.singleton_instance(file_path, errors)
 
         spec = _create_worker_spec(
             node_rank=0,

From d58f07d2b0a7454e71aeaeedc90b4e888d60d8c0 Mon Sep 17 00:00:00 2001
From: Ma Jie Yue <majieyue@gmail.com>
Date: Thu, 19 Sep 2024 14:45:59 +0800
Subject: [PATCH 03/15] add RELEASES.md document

---
 RELEASES.md | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 RELEASES.md

diff --git a/RELEASES.md b/RELEASES.md
new file mode 100644
index 000000000..681edc354
--- /dev/null
+++ b/RELEASES.md
@@ -0,0 +1,95 @@
+# Release Notes
+
+The DLRover project follows the semantic versioning scheme and maintains a separate branch for each minor version. The main branch always represents the next upcoming minor or major version.
+
+For the latest news about DLRover, see the following link: https://github.com/intelligent-machine-learning/dlrover?tab=readme-ov-file#latest-news=
+
+## Release 0.3.7 on May 13
+
+Features:
+* Flash Checkpoint supports deleting old checkpoints.
+
+BugFix:
+* Save/load the non-params-related variables of dist optimizer in Megatron-LM models.
+* The agent waits for async saving checkpoint finishes before exiting.
+
+## Release 0.3.6 on Apr 24
+
+Features:
+* Flash checkpoint provides FlashCkptTrainer to support HuggingFace transformers.Trainer.
+* Flash checkpoint supports loading the checkpoint of Megatron-LM from the memory.
+* Flash Checkpoint supports saving and loading FSDP checkpoint with full state dict.
+* Job master can sort the node ranks by the access switches of the node.
+
+BugFix:
+* Fix the segment fault when restarting the training process.
+
+## Release 0.3.5 on Mar 29
+
+Features:
+* Flash checkpoint supports saving and loading Megatron-LM MOE models. #1042
+* APIs to extend the module to check the node with different chips. #1023
+* Automatically mark the node as unschedulable if the node fails. #1025
+
+BugFix:
+* Fix the DDP example of mnist to save and load checkpoint. #1051
+* Fix the checkpoint name of DDP. #1034
+
+## Release 0.3.4 on Feb 21
+
+Features:
+* Flash checkpoint enables saving and loading Megatron-LM models from multiple ranks in parallel.
+* dlrover-run --auto-config Automatically configure the number of nodes and the number of processes per node.
+* Users can customize the APIs of storage to save the checkpoint into different file systems.
+* Deletion strategy to clean the old checkpoint files.
+
+BugFix:
+* The shared memory does not exist if the size of the checkpoint changes.
+
+## Release 0.3.3 on Jan 25
+
+Features:
+* Support Python > 3.10.
+* Support restarting the training process on Ascend NPU.
+* Support asynchronously saving the checkpoint of the distributed optimizer of Megatron-LM to the storage.
+
+BugFix:
+* Fix the checkpoint shard inconsistency of all ranks.
+* Fix the bug to asynchronously save the Megatron-LM checkpoint of the job with multi-GPUs on multi-nodes.
+* Fix the bug to load the Megatron-LM checkpoint.
+
+## Release 0.3.1 on Jan 10
+
+Feature:
+* Users can use flash checkpoint using torchrun or python -m torch.distributed.launch.
+
+Bugfix:
+* The dlrover master cannot print the error message of the fault node in a kubeflow/PytorchJob.
+
+## Release 0.3.0 on Jan 3
+
+Features:
+* Flash Checkpoint to asynchronously persist checkpoint to storage.
+* Flash Checkpoint recovers failure in memory.
+* Flash Checkpoint supports DDP/FSDP/DeepSpeed/Megatron.
+* Node detection supports NPU.
+
+Examples:
+* The example of training nanoGPT using DeepSpeed.
+* The example to save/load sharding FSDP checkpoint.
+
+
+## Release 0.2.2 on Nov 21, 2023
+
+Features:
+* dlrover-run can run on any distributed jobs with the NODE_RANK and DLROVER_MASTER_ADDR in the environment.
+* DLRover can asynchronously save the checkpoint into the storage, which only blocks the training for a short time.
+
+BugFix:
+* Fix the bug to load the FSDP checkpoint.
+
+## Release 0.2.1 on Oct 11, 2023
+
+* Autotuning batch size without restarting the job.
+* Automatically detect the straggler (slow worker).
+* TFPlus: TFPlus 0.1.0 has been released, see detail in https://github.com/intelligent-machine-learning/dlrover/tree/master/tfplus

From 28c873a36c9d5a5520f980cd7a86fc6b29ef7aa5 Mon Sep 17 00:00:00 2001
From: Ma Jie Yue <majieyue@gmail.com>
Date: Thu, 19 Sep 2024 15:38:58 +0800
Subject: [PATCH 04/15] add CODE_OF_CONDUCT.md file

---
 CODE_OF_CONDUCT.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 CODE_OF_CONDUCT.md

diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 000000000..9e191ade7
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at <jieyue.majy@antgroup.com>. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq

From 7f85f62d8566dd2262b3b079875cb60d7996b7c8 Mon Sep 17 00:00:00 2001
From: Ma JieYue <majieyue@gmail.com>
Date: Thu, 19 Sep 2024 15:49:14 +0800
Subject: [PATCH 05/15] Delete CODE_OF_CONDUCT.md

---
 CODE_OF_CONDUCT.md | 76 ----------------------------------------------
 1 file changed, 76 deletions(-)
 delete mode 100644 CODE_OF_CONDUCT.md

diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
deleted file mode 100644
index 9e191ade7..000000000
--- a/CODE_OF_CONDUCT.md
+++ /dev/null
@@ -1,76 +0,0 @@
-# Code of Conduct
-
-## Our Pledge
-
-In the interest of fostering an open and welcoming environment, we as
-contributors and maintainers pledge to make participation in our project and
-our community a harassment-free experience for everyone, regardless of age, body
-size, disability, ethnicity, sex characteristics, gender identity and expression,
-level of experience, education, socio-economic status, nationality, personal
-appearance, race, religion, or sexual identity and orientation.
-
-## Our Standards
-
-Examples of behavior that contributes to creating a positive environment
-include:
-
-* Using welcoming and inclusive language
-* Being respectful of differing viewpoints and experiences
-* Gracefully accepting constructive criticism
-* Focusing on what is best for the community
-* Showing empathy towards other community members
-
-Examples of unacceptable behavior by participants include:
-
-* The use of sexualized language or imagery and unwelcome sexual attention or
-advances
-* Trolling, insulting/derogatory comments, and personal or political attacks
-* Public or private harassment
-* Publishing others' private information, such as a physical or electronic
-address, without explicit permission
-* Other conduct which could reasonably be considered inappropriate in a
-professional setting
-
-## Our Responsibilities
-
-Project maintainers are responsible for clarifying the standards of acceptable
-behavior and are expected to take appropriate and fair corrective action in
-response to any instances of unacceptable behavior.
-
-Project maintainers have the right and responsibility to remove, edit, or
-reject comments, commits, code, wiki edits, issues, and other contributions
-that are not aligned to this Code of Conduct, or to ban temporarily or
-permanently any contributor for other behaviors that they deem inappropriate,
-threatening, offensive, or harmful.
-
-## Scope
-
-This Code of Conduct applies within all project spaces, and it also applies when
-an individual is representing the project or its community in public spaces.
-Examples of representing a project or community include using an official
-project e-mail address, posting via an official social media account, or acting
-as an appointed representative at an online or offline event. Representation of
-a project may be further defined and clarified by project maintainers.
-
-## Enforcement
-
-Instances of abusive, harassing, or otherwise unacceptable behavior may be
-reported by contacting the project team at <jieyue.majy@antgroup.com>. All
-complaints will be reviewed and investigated and will result in a response that
-is deemed necessary and appropriate to the circumstances. The project team is
-obligated to maintain confidentiality with regard to the reporter of an incident.
-Further details of specific enforcement policies may be posted separately.
-
-Project maintainers who do not follow or enforce the Code of Conduct in good
-faith may face temporary or permanent repercussions as determined by other
-members of the project's leadership.
-
-## Attribution
-
-This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
-available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
-
-[homepage]: https://www.contributor-covenant.org
-
-For answers to common questions about this code of conduct, see
-https://www.contributor-covenant.org/faq

From 1acf15f548564cbe53174de8b9040d19dc40a96f Mon Sep 17 00:00:00 2001
From: Ma Jie Yue <majieyue@gmail.com>
Date: Thu, 19 Sep 2024 15:54:29 +0800
Subject: [PATCH 06/15] add CODE_OF_CONDUCT.md file

---
 CODE_OF_CONDUCT.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 CODE_OF_CONDUCT.md

diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 000000000..9e191ade7
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to make participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at <jieyue.majy@antgroup.com>. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq

From 2f64834e3e587f096cb1e6e703d5db5caf6ca1ed Mon Sep 17 00:00:00 2001
From: Ma Jie Yue <majieyue@gmail.com>
Date: Thu, 19 Sep 2024 16:23:36 +0800
Subject: [PATCH 07/15] add CONTRIBUTING.md file

---
 CONTRIBUTING.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 CONTRIBUTING.md

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 000000000..0eea4cb67
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,5 @@
+# Contributing
+
+## This repo is part of [DLRover](https://github.com/intelligent-machine-learning/dlrover)
+
+For details on how to contribute to DLRover, please see the main [contributing document](https://github.com/intelligent-machine-learning/dlrover/blob/master/docs/developer_guide.md).

From bd4b530b303e35cc7ccbf7a38121d220acf526c4 Mon Sep 17 00:00:00 2001
From: Ma Jie Yue <majieyue@gmail.com>
Date: Thu, 19 Sep 2024 16:51:37 +0800
Subject: [PATCH 08/15] add MAINTERNERS.md

---
 MAINTAINERS.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 MAINTAINERS.md

diff --git a/MAINTAINERS.md b/MAINTAINERS.md
new file mode 100644
index 000000000..4d018d0a8
--- /dev/null
+++ b/MAINTAINERS.md
@@ -0,0 +1,18 @@
+
+- [Overview](#overview)
+- [Current Maintainers](#current-maintainers)
+- [Emeritus](#emeritus)
+
+## Overview
+
+This document contains a list of maintainers in this repo. If you're interested in contributing, and becoming a maintainer, see [CONTRIBUTING](CONTRIBUTING.md).
+
+## Current Maintainers
+
+| Maintainer         | GitHub ID                                                 | Affiliation |
+| ------------------ | --------------------------------------------------------- | ----------- |
+| Tianyi Chen        | [BalaBalaYi](https://github.com/BalaBalaYi)               | AntGroup    |
+| Bo Sang            | [samplise](https://github.com/samplise)                   | AntGroup    |
+| Qinlong Wang       | [workingloong](https://github.com/workingloong)           | ByteDance   |
+
+## Emeritus

From 7426fdd39d4b0f57b3563dbd8132372a205ca1da Mon Sep 17 00:00:00 2001
From: Ma JieYue <majieyue@gmail.com>
Date: Thu, 19 Sep 2024 16:58:47 +0800
Subject: [PATCH 09/15] Delete MAINTAINERS.md

---
 MAINTAINERS.md | 18 ------------------
 1 file changed, 18 deletions(-)
 delete mode 100644 MAINTAINERS.md

diff --git a/MAINTAINERS.md b/MAINTAINERS.md
deleted file mode 100644
index 4d018d0a8..000000000
--- a/MAINTAINERS.md
+++ /dev/null
@@ -1,18 +0,0 @@
-
-- [Overview](#overview)
-- [Current Maintainers](#current-maintainers)
-- [Emeritus](#emeritus)
-
-## Overview
-
-This document contains a list of maintainers in this repo. If you're interested in contributing, and becoming a maintainer, see [CONTRIBUTING](CONTRIBUTING.md).
-
-## Current Maintainers
-
-| Maintainer         | GitHub ID                                                 | Affiliation |
-| ------------------ | --------------------------------------------------------- | ----------- |
-| Tianyi Chen        | [BalaBalaYi](https://github.com/BalaBalaYi)               | AntGroup    |
-| Bo Sang            | [samplise](https://github.com/samplise)                   | AntGroup    |
-| Qinlong Wang       | [workingloong](https://github.com/workingloong)           | ByteDance   |
-
-## Emeritus

From 23ea711c45b8d1994df5e29e82187925fc62f10d Mon Sep 17 00:00:00 2001
From: Ma Jie Yue <majieyue@gmail.com>
Date: Thu, 19 Sep 2024 17:03:59 +0800
Subject: [PATCH 10/15] add CODEOWNERS file

---
 CODEOWNERS | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 CODEOWNERS

diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 000000000..32c4998d7
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,8 @@
+# root directory
+*       @nash635 @workingloong @samplise @BalaBalaYi
+
+# dlrover directory
+/dlrover/   @workingloong @samplise @BalaBalaYi @majieyue
+
+# atorch directory
+/atorch/   @skydoorkai  @adamantboy @hxdtest

From a975bd5b356bcd20b2fb252263c919964e58df63 Mon Sep 17 00:00:00 2001
From: Ma JieYue <majieyue@gmail.com>
Date: Thu, 19 Sep 2024 17:52:54 +0800
Subject: [PATCH 11/15] Delete CODEOWNERS

---
 CODEOWNERS | 8 --------
 1 file changed, 8 deletions(-)
 delete mode 100644 CODEOWNERS

diff --git a/CODEOWNERS b/CODEOWNERS
deleted file mode 100644
index 32c4998d7..000000000
--- a/CODEOWNERS
+++ /dev/null
@@ -1,8 +0,0 @@
-# root directory
-*       @nash635 @workingloong @samplise @BalaBalaYi
-
-# dlrover directory
-/dlrover/   @workingloong @samplise @BalaBalaYi @majieyue
-
-# atorch directory
-/atorch/   @skydoorkai  @adamantboy @hxdtest

From 7301088d9cb02fe46b790440845c5a81fc45b310 Mon Sep 17 00:00:00 2001
From: Ma Jie Yue <majieyue@gmail.com>
Date: Thu, 19 Sep 2024 17:56:53 +0800
Subject: [PATCH 12/15] add MAINTAINERS.md, refer to CODEOWNERS

---
 MAINTAINERS.md | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 MAINTAINERS.md

diff --git a/MAINTAINERS.md b/MAINTAINERS.md
new file mode 100644
index 000000000..14a294efa
--- /dev/null
+++ b/MAINTAINERS.md
@@ -0,0 +1,3 @@
+# Maintainers
+
+For details please see [CODEOWNERS](https://github.com/majieyue/dlrover/blob/lfai-20240919/.github/CODEOWNERS).

From 97f39dcfac68898fe00059c56c24f2713287b6e9 Mon Sep 17 00:00:00 2001
From: Tianyi Chen <chentianyi.cty@antfin.com>
Date: Mon, 23 Sep 2024 10:55:43 +0800
Subject: [PATCH 13/15] Optimize logging (#1276)

* optimize logging

* fix ut

* lint
---
 .../python/master/elastic_training/rdzv_manager.py    | 10 +++++++---
 dlrover/python/master/node/dist_job_manager.py        | 10 ++++++++--
 dlrover/python/master/node/worker.py                  |  9 ++++++---
 dlrover/python/tests/test_rdzv_manager.py             | 11 ++++++-----
 4 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/dlrover/python/master/elastic_training/rdzv_manager.py b/dlrover/python/master/elastic_training/rdzv_manager.py
index 378aaf0f4..db3ef9afd 100644
--- a/dlrover/python/master/elastic_training/rdzv_manager.py
+++ b/dlrover/python/master/elastic_training/rdzv_manager.py
@@ -200,11 +200,15 @@ def _get_lacking_ranks(self) -> List[int]:
         """
 
         lacking_ranks: List[int] = []
-        if self._rdzv_params is None or self._rdzv_params.min_nodes <= 0:
+        if (
+            self._rdzv_params is None
+            or self._rdzv_params.min_nodes <= 0
+            or self._rdzv_params.max_nodes <= 0
+        ):
             return lacking_ranks
 
-        min_required = self._rdzv_params.min_nodes
-        min_ranks = set([i for i in range(min_required)])
+        max_required = self._rdzv_params.max_nodes
+        min_ranks = set([i for i in range(max_required)])
         if self._waiting_nodes:
             waiting_ranks = set(self._waiting_nodes.keys())
         else:
diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py
index 09ee564d2..f5c7c416b 100644
--- a/dlrover/python/master/node/dist_job_manager.py
+++ b/dlrover/python/master/node/dist_job_manager.py
@@ -134,9 +134,9 @@ def __init__(
             raise ValueError(
                 f"Distribution strategy {job_args.distribution_strategy} "
                 "is not supported. You can specify it with "
-                "ParameterServerStrategy/AllreduceStrategy."
+                "ParameterServerStrategy/AllReduceStrategy."
             )
-        logger.info("New job optimizer : %s", self._job_optimizer.__class__)
+        logger.info(f"New job optimizer: {self._job_optimizer.__class__}")
 
         worker_restart_count = node_restart_count.get(NodeType.WORKER, 0)
         ps_restart_count = node_restart_count.get(NodeType.PS, 0)
@@ -150,6 +150,12 @@ def __init__(
         self._ps_relaunch_max_num = min(
             ps_restart_count, _MAX_POD_RELAUNCH_COUNT
         )
+        logger.info(
+            f"Worker relaunch number: {self._relaunch_on_worker_failure}; "
+            f"PS relaunch number: {self._ps_relaunch_max_num}; "
+            f"Critical worker index: {self._critical_worker_index}."
+        )
+
         self._node_event_callbacks: List[NodeEventCallback] = []
 
         # Protects followed variables, which are accessed from event_cb.
diff --git a/dlrover/python/master/node/worker.py b/dlrover/python/master/node/worker.py
index 1b4d2c5ab..2654aa2c6 100644
--- a/dlrover/python/master/node/worker.py
+++ b/dlrover/python/master/node/worker.py
@@ -368,12 +368,12 @@ def is_training_hang_by_pending(self, total_node_num) -> bool:
             pending_nodes
         ):
             logger.debug(
-                "Skip for no required nodes info " "and not all nodes pending."
+                "Skip for no required nodes info and not all nodes pending."
             )
             return False
         elif 0 < len(pending_nodes) == total_node_num:
             # all nodes pending
-            logger.debug(f"All nodes pending: {pending_nodes}.")
+            logger.info(f"All nodes pending: {pending_nodes}.")
         else:
             # partial nodes pending
             # with condition 1 + 2
@@ -404,7 +404,10 @@ def is_training_hang_by_pending(self, total_node_num) -> bool:
         if now - first_pending_node.create_time.timestamp() > timeout:
             logger.warning(
                 f"Node {first_pending_node.name} "
-                f"exceeded pending timeout: {timeout}s."
+                f"exceeded pending timeout: {timeout}s, "
+                f"running nodes(size:{len(running_nodes)}): {running_nodes}, "
+                f"pending nodes(size:{len(pending_nodes)}): {pending_nodes}, "
+                f"min required nodes size: {self.get_min_nodes_required()}."
             )
             return True
 
diff --git a/dlrover/python/tests/test_rdzv_manager.py b/dlrover/python/tests/test_rdzv_manager.py
index f865acd83..b6c554fb7 100644
--- a/dlrover/python/tests/test_rdzv_manager.py
+++ b/dlrover/python/tests/test_rdzv_manager.py
@@ -172,23 +172,24 @@ def test_get_lacking_ranks(self):
         rdzv_manager = ElasticTrainingRendezvousManager(error_monitor)
 
         rdzv_manager._rdzv_params.min_nodes = 4
+        rdzv_manager._rdzv_params.max_nodes = 4
         rdzv_manager._waiting_nodes = {0: 0, 1: 1, 2: 2, 3: 3}
         self.assertEqual(rdzv_manager._get_lacking_ranks(), [])
 
-        rdzv_manager._rdzv_params.min_nodes = 5
+        rdzv_manager._rdzv_params.max_nodes = 5
         self.assertEqual(rdzv_manager._get_lacking_ranks(), [4])
 
-        rdzv_manager._rdzv_params.min_nodes = 3
+        rdzv_manager._rdzv_params.max_nodes = 3
         self.assertEqual(rdzv_manager._get_lacking_ranks(), [])
 
-        rdzv_manager._rdzv_params.min_nodes = 6
+        rdzv_manager._rdzv_params.max_nodes = 6
         self.assertEqual(rdzv_manager._get_lacking_ranks(), [4, 5])
 
-        rdzv_manager._rdzv_params.min_nodes = 4
+        rdzv_manager._rdzv_params.max_nodes = 4
         rdzv_manager._waiting_nodes = {}
         self.assertEqual(rdzv_manager._get_lacking_ranks(), [0, 1, 2, 3])
 
-        rdzv_manager._rdzv_params.min_nodes = 0
+        rdzv_manager._rdzv_params.max_nodes = 0
         self.assertEqual(rdzv_manager._get_lacking_ranks(), [])
 
     def test_multi_updating_waiting_nodes(self):

From 78ffde986329e75d4a93e2e2e453e550b6bf3df7 Mon Sep 17 00:00:00 2001
From: Tianyi Chen <chentianyi.cty@antfin.com>
Date: Tue, 24 Sep 2024 17:09:17 +0800
Subject: [PATCH 14/15] enlarge timeout (#1278)

---
 dlrover/python/elastic_agent/monitor/resource.py | 2 +-
 dlrover/python/master/node/dist_job_manager.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dlrover/python/elastic_agent/monitor/resource.py b/dlrover/python/elastic_agent/monitor/resource.py
index ca78c6f50..d3b4a71c3 100644
--- a/dlrover/python/elastic_agent/monitor/resource.py
+++ b/dlrover/python/elastic_agent/monitor/resource.py
@@ -117,7 +117,7 @@ def start(self):
 
         # The first time called cpu_percent will return a meaningless 0.0
         # value which we are supposed to ignore. So, here we call it at
-        # the begining of monitor and the next value is valid.
+        # the beginning of monitor and the next value is valid.
         get_process_cpu_percent()
 
     def stop(self):
diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py
index f5c7c416b..c3e37a881 100644
--- a/dlrover/python/master/node/dist_job_manager.py
+++ b/dlrover/python/master/node/dist_job_manager.py
@@ -455,7 +455,7 @@ def _monitor_node_heart_beat(self):
                     logger.warning(detail_trace_back)
             time.sleep(15)
 
-    def _get_dead_node_event(self, window_interval=600) -> List[NodeEvent]:
+    def _get_dead_node_event(self, window_interval=900) -> List[NodeEvent]:
         now = time.time()
         dead_events: List[NodeEvent] = []
         logger.debug(f"Current job nodes are: {self._job_nodes}.")

From e37646f74c6ff47966474aa2f165a6874b03b84b Mon Sep 17 00:00:00 2001
From: "chentianyi.cty" <robert37@sina.com>
Date: Fri, 27 Sep 2024 15:18:14 +0800
Subject: [PATCH 15/15] upgrade version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 50109730a..f25af286a 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@
 
 setup(
     name="dlrover",
-    version="0.3.7rc0",
+    version="0.3.8",
     description="An Automatic Distributed Deep Learning Framework",
     long_description="DLRover helps model developers focus on model algorithm"
     " itself, without taking care of any engineering stuff,"