intelligent-machine-learning · samplise · Oct 10, 2024 · Sep 20, 2024 · Sep 30, 2024 · Sep 30, 2024
@@ -489,18 +489,10 @@ class NodeCheckpointState(Message):
 
 
 @dataclass
-class DiagnosisTrainingLog(Message):
-    timestamp: int = 0
-
-
-@dataclass
-class DiagnosisCudaLog(Message):
-    timestamp: int = 0
-
-
-@dataclass
-class DiagnosisChipMetrics(Message):
-    timestamp: int = 0
+class DiagnosisReportData(Message):
+    data_cls: str = ""
+    data_content: str = ""
+    node_rank: int = -1
 
 
 @dataclass

@@ -12,12 +12,27 @@
 # limitations under the License.
 
 
+class EnvConfigKey(object):
+    XPU_TIMER_PORT = "XPU_TIMER_PORT"
+
+
 class InferenceConfigKey(object):
     LOG_FILE = "log_file"
     ERRORS = "errors"
 
 
-class DiagnoseAction(object):
+class DiagnosisConstant(object):
+    MASTER_DIAGNOSIS_OBSERVING_INTERVAL_SECS = 180
+    AGENT_PERIODICALLY_DIAGNOSIS_INTERVAL_SECS = 60
+
+
+class DiagnosisDataType(object):
+    GENERIC = "GENERIC"
+    TRAINING_LOG = "TRAINING_LOG"
+    XPU_TIMER_METRIC = "XPU_TIMER_METRIC"
+
+
+class DiagnosisAction(object):
     NO_ACTION = "no_action"
     RESTART_WORKER = "restart_worker"
     RELAUNCH_WORKER = "relaunch_worker"
@@ -11,69 +11,168 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from abc import ABCMeta, abstractmethod
+import json
+from abc import ABCMeta
 from datetime import datetime
-from typing import List, Optional
+from typing import List
 
-
-class DiagnosisDataType:
-    CUDALOG = "cuda_log"
-    TRAININGLOG = "training_log"
-    CHIPMETRICES = "chip_metrics"
+from dlrover.python.common import env_utils
+from dlrover.python.diagnosis.common.constants import DiagnosisDataType
 
 
 class DiagnosisData(metaclass=ABCMeta):
-    def __init__(self):
-        pass
-
-    @abstractmethod
-    def get_timestamp(self) -> float:
-        pass
-
-    @abstractmethod
-    def get_type(self) -> str:
-        pass
-
-
-class CudaLog(DiagnosisData):
-    def __init__(self, timestamp: int):
+    """
+    Basic definition of diagnosis data.
+
+    Args:
+        timestamp (datetime): Timestamp of diagnosis data.
+        data_type (str): Type of metric. Defaults to "GENERIC".
+        data_content (str): Content of the metric. Defaults to "".
+        node_id (int): Node ID. Defaults to -1.
+        node_type (str): Node type. Defaults to "".
+        node_rank (int): Node rank. Defaults to -1.
+    """
+
+    def __init__(
+        self,
+        timestamp: int = 0,
+        data_type: str = DiagnosisDataType.GENERIC,
+        data_content: str = "",
+        node_id: int = -1,
+        node_type: str = "",
+        node_rank: int = -1,
+    ):
         if timestamp == 0:
-            self.timestamp = int(round(datetime.now().timestamp()))
+            self._timestamp = int(round(datetime.now().timestamp()))
         else:
-            self.timestamp = timestamp
-
-    def get_timestamp(self) -> int:
-        return self.timestamp
-
-    def get_type(self) -> str:
-        return DiagnosisDataType.CUDALOG
+            self._timestamp = timestamp
+        self._data_type = data_type
+        self._data_content = data_content
+        self._node_id = node_id
+        self._node_type = node_type
+        self._node_rank = node_rank
+
+    @property
+    def data_type(self) -> str:
+        return self._data_type
+
+    @property
+    def timestamp(self) -> int:
+        return self._timestamp
+
+    @property
+    def data_content(self) -> str:
+        return self._data_content
+
+    @property
+    def node_id(self):
+        return self._node_id
+
+    @property
+    def node_type(self):
+        return self._node_type
+
+    @property
+    def node_rank(self):
+        return self._node_rank
+
+    def to_json(self):
+        data = {k.lstrip("_"): v for k, v in self.__dict__.items()}
+        return json.dumps(data)
+
+    @classmethod
+    def from_json(cls, json_data):
+        return cls(**json.loads(json_data))
+
+    def is_from_worker(self):
+        return self._node_id != -1
+
+
+class WorkerTrainingMetric(DiagnosisData):
+    """
+    Diagnosis data for worker training metric.
+
+    Args:
+        timestamp (datetime): Timestamp of diagnosis data.
+        data_type (str): Type of metric. Defaults to "GENERIC".
+        data_content (str): Content of the metric. Defaults to "".
+        node_id (int): Node ID. Defaults to -1.
+        node_type (str): Node type. Defaults to "".
+        node_rank (int): Node rank. Defaults to -1.
+        is_final_result (bool, optional): Whether the metric is final result.
+            Defaults to False.
+        need_report (bool, optional): Whether the metric needs report.
+            Defaults to False.
+    """
+
+    def __init__(
+        self,
+        timestamp: int = 0,
+        data_type: str = DiagnosisDataType.GENERIC,
+        data_content: str = "",
+        node_id=env_utils.get_node_id(),
+        node_type=env_utils.get_node_type(),
+        node_rank=env_utils.get_node_rank(),
+        is_final_result=False,
+        need_report=False,
+    ):
+        super(WorkerTrainingMetric, self).__init__(
+            timestamp, data_type, data_content, node_id, node_type, node_rank
+        )
+        self._is_final_result = is_final_result
+        self._need_report = need_report
+
+    @property
+    def is_final_result(self):
+        return self._is_final_result
+
+    @property
+    def need_report(self):
+        return self._need_report
+
+    def is_resolvable(self):
+        if self.data_type == DiagnosisDataType.XPU_TIMER_METRIC:
+            return True
+        # TODO: add more resolvable metric type later
+        return False
 
 
 class TrainingLog(DiagnosisData):
-    def __init__(self, timestamp: int = 0, logs: List[str] = None):
-        super().__init__()
-        if timestamp == 0:
-            self.timestamp = int(round(datetime.now().timestamp()))
+    """
+    Worker's training log.
+
+    Args:
+        timestamp (datetime): Timestamp of diagnosis data.
+        logs (list): Log content in list format.
+        node_id (int): Node ID. Defaults to -1.
+        node_type (str): Node type. Defaults to "".
+        node_rank (int): Node rank. Defaults to -1.
+    """
+
+    def __init__(
+        self,
+        timestamp: int = 0,
+        logs: List[str] = None,
+        node_id=env_utils.get_node_id(),
+        node_type=env_utils.get_node_type(),
+        node_rank=env_utils.get_node_rank(),
+    ):
+        if logs is None:
+            data_content = ""
         else:
-            self.timestamp = timestamp
-        self.logs: Optional[List[str]] = logs
-
-    def get_timestamp(self) -> int:
-        return self.timestamp
-
-    def get_type(self) -> str:
-        return DiagnosisDataType.TRAININGLOG
-
-
-class ChipMetrics(DiagnosisData):
-    def __init__(self, timestamp: int):
-        if timestamp == 0:
-            self.timestamp = int(round(datetime.now().timestamp()))
-        else:
-            self.timestamp = timestamp
-
-    def get_timestamp(self) -> int:
-        return self.timestamp
-
-    def get_type(self) -> str:
-        return DiagnosisDataType.CHIPMETRICES
+            data_content = "\n".join(logs)
+
+        super().__init__(
+            timestamp,
+            DiagnosisDataType.TRAINING_LOG,
+            data_content,
+            node_id,
+            node_type,
+            node_rank,
+        )
+
+    @property
+    def logs(self) -> List[str]:
+        if not self.data_content:
+            return []
+        return [line for line in self.data_content.splitlines()]
@@ -62,6 +62,10 @@ def infer(self, inferences: List[Inference]) -> List[Inference]:
     def is_compatible(self, inference: Inference) -> bool:
         pass
 
+    @property
+    def data_manager(self):
+        return self._data_manager
+
 
 def is_same_inference(inference1: Inference, inference2: Inference) -> bool:
     if (

@@ -31,8 +31,11 @@
 
     @abstractmethod
     def collect_data(self) -> object:
+        """The implementation of data collector."""
         pass
 
     @abstractmethod
-    def to_collect_data(self) -> bool:
-        pass
+    def is_enabled(self) -> bool:
+        """Whether the collector is enabled."""
+
+        return True
@@ -44,5 +44,5 @@ def collect_data(self) -> TrainingLog:
         training_log = TrainingLog(logs=logs)
         return training_log
 
-    def to_collect_data(self) -> bool:
+    def is_enabled(self) -> bool:
         return True