diff --git a/dlrover/python/common/grpc.py b/dlrover/python/common/grpc.py index f5562283d..7220230d1 100644 --- a/dlrover/python/common/grpc.py +++ b/dlrover/python/common/grpc.py @@ -489,18 +489,10 @@ class NodeCheckpointState(Message): @dataclass -class DiagnosisTrainingLog(Message): - timestamp: int = 0 - - -@dataclass -class DiagnosisCudaLog(Message): - timestamp: int = 0 - - -@dataclass -class DiagnosisChipMetrics(Message): - timestamp: int = 0 +class DiagnosisReportData(Message): + data_cls: str = "" + data_content: str = "" + node_rank: int = -1 @dataclass diff --git a/dlrover/python/diagnosis/common/constants.py b/dlrover/python/diagnosis/common/constants.py index 8e0444e77..601bb730d 100644 --- a/dlrover/python/diagnosis/common/constants.py +++ b/dlrover/python/diagnosis/common/constants.py @@ -12,12 +12,27 @@ # limitations under the License. +class EnvConfigKey(object): + XPU_TIMER_PORT = "XPU_TIMER_PORT" + + class InferenceConfigKey(object): LOG_FILE = "log_file" ERRORS = "errors" -class DiagnoseAction(object): +class DiagnosisConstant(object): + MASTER_DIAGNOSIS_OBSERVING_INTERVAL_SECS = 180 + AGENT_PERIODICALLY_DIAGNOSIS_INTERVAL_SECS = 60 + + +class DiagnosisDataType(object): + GENERIC = "GENERIC" + TRAINING_LOG = "TRAINING_LOG" + XPU_TIMER_METRIC = "XPU_TIMER_METRIC" + + +class DiagnosisAction(object): NO_ACTION = "no_action" RESTART_WORKER = "restart_worker" RELAUNCH_WORKER = "relaunch_worker" diff --git a/dlrover/python/diagnosis/common/diagnosis_data.py b/dlrover/python/diagnosis/common/diagnosis_data.py index 18f69df86..61c502e3b 100644 --- a/dlrover/python/diagnosis/common/diagnosis_data.py +++ b/dlrover/python/diagnosis/common/diagnosis_data.py @@ -11,69 +11,168 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import ABCMeta, abstractmethod +import json +from abc import ABCMeta from datetime import datetime -from typing import List, Optional +from typing import List - -class DiagnosisDataType: - CUDALOG = "cuda_log" - TRAININGLOG = "training_log" - CHIPMETRICES = "chip_metrics" +from dlrover.python.common import env_utils +from dlrover.python.diagnosis.common.constants import DiagnosisDataType class DiagnosisData(metaclass=ABCMeta): - def __init__(self): - pass - - @abstractmethod - def get_timestamp(self) -> float: - pass - - @abstractmethod - def get_type(self) -> str: - pass - - -class CudaLog(DiagnosisData): - def __init__(self, timestamp: int): + """ + Basic definition of diagnosis data. + + Args: + timestamp (datetime): Timestamp of diagnosis data. + data_type (str): Type of metric. Defaults to "GENERIC". + data_content (str): Content of the metric. Defaults to "". + node_id (int): Node ID. Defaults to -1. + node_type (str): Node type. Defaults to "". + node_rank (int): Node rank. Defaults to -1. 
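+
+    Example (illustrative round-trip sketch only, using the base class
+    directly for brevity; the master rebuilds reported data with
+    ``from_json``, so a serialized instance should round-trip)::
+
+        data = DiagnosisData(data_type="GENERIC", data_content="foo")
+        restored = DiagnosisData.from_json(data.to_json())
+        assert restored.data_content == "foo"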
+ """ + + def __init__( + self, + timestamp: int = 0, + data_type: str = DiagnosisDataType.GENERIC, + data_content: str = "", + node_id: int = -1, + node_type: str = "", + node_rank: int = -1, + ): if timestamp == 0: - self.timestamp = int(round(datetime.now().timestamp())) + self._timestamp = int(round(datetime.now().timestamp())) else: - self.timestamp = timestamp - - def get_timestamp(self) -> int: - return self.timestamp - - def get_type(self) -> str: - return DiagnosisDataType.CUDALOG + self._timestamp = timestamp + self._data_type = data_type + self._data_content = data_content + self._node_id = node_id + self._node_type = node_type + self._node_rank = node_rank + + @property + def data_type(self) -> str: + return self._data_type + + @property + def timestamp(self) -> int: + return self._timestamp + + @property + def data_content(self) -> str: + return self._data_content + + @property + def node_id(self): + return self._node_id + + @property + def node_type(self): + return self._node_type + + @property + def node_rank(self): + return self._node_rank + + def to_json(self): + data = {k.lstrip("_"): v for k, v in self.__dict__.items()} + return json.dumps(data) + + @classmethod + def from_json(cls, json_data): + return cls(**json.loads(json_data)) + + def is_from_worker(self): + return self._node_id != -1 + + +class WorkerTrainingMetric(DiagnosisData): + """ + Diagnosis data for worker training metric. + + Args: + timestamp (datetime): Timestamp of diagnosis data. + data_type (str): Type of metric. Defaults to "GENERIC". + data_content (str): Content of the metric. Defaults to "". + node_id (int): Node ID. Defaults to -1. + node_type (str): Node type. Defaults to "". + node_rank (int): Node rank. Defaults to -1. + is_final_result (bool, optional): Whether the metric is final result. + Defaults to False. + need_report (bool, optional): Whether the metric needs report. + Defaults to False. + """ + + def __init__( + self, + timestamp: int = 0, + data_type: str = DiagnosisDataType.GENERIC, + data_content: str = "", + node_id=env_utils.get_node_id(), + node_type=env_utils.get_node_type(), + node_rank=env_utils.get_node_rank(), + is_final_result=False, + need_report=False, + ): + super(WorkerTrainingMetric, self).__init__( + timestamp, data_type, data_content, node_id, node_type, node_rank + ) + self._is_final_result = is_final_result + self._need_report = need_report + + @property + def is_final_result(self): + return self._is_final_result + + @property + def need_report(self): + return self._need_report + + def is_resolvable(self): + if self.data_type == DiagnosisDataType.XPU_TIMER_METRIC: + return True + # TODO: add more resolvable metric type later + return False class TrainingLog(DiagnosisData): - def __init__(self, timestamp: int = 0, logs: List[str] = None): - super().__init__() - if timestamp == 0: - self.timestamp = int(round(datetime.now().timestamp())) + """ + Worker's training log. + + Args: + timestamp (datetime): Timestamp of diagnosis data. + logs (list): Log content in list format. + node_id (int): Node ID. Defaults to -1. + node_type (str): Node type. Defaults to "". + node_rank (int): Node rank. Defaults to -1. 
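+
+    Example (illustrative sketch; log lines are joined into
+    ``data_content`` and recovered through the ``logs`` property)::
+
+        training_log = TrainingLog(logs=["step 1 ok", "step 2 ok"])
+        assert training_log.logs == ["step 1 ok", "step 2 ok"]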
+ """ + + def __init__( + self, + timestamp: int = 0, + logs: List[str] = None, + node_id=env_utils.get_node_id(), + node_type=env_utils.get_node_type(), + node_rank=env_utils.get_node_rank(), + ): + if logs is None: + data_content = "" else: - self.timestamp = timestamp - self.logs: Optional[List[str]] = logs - - def get_timestamp(self) -> int: - return self.timestamp - - def get_type(self) -> str: - return DiagnosisDataType.TRAININGLOG - - -class ChipMetrics(DiagnosisData): - def __init__(self, timestamp: int): - if timestamp == 0: - self.timestamp = int(round(datetime.now().timestamp())) - else: - self.timestamp = timestamp - - def get_timestamp(self) -> int: - return self.timestamp - - def get_type(self) -> str: - return DiagnosisDataType.CHIPMETRICES + data_content = "\n".join(logs) + + super().__init__( + timestamp, + DiagnosisDataType.TRAINING_LOG, + data_content, + node_id, + node_type, + node_rank, + ) + + @property + def logs(self) -> List[str]: + if not self.data_content: + return [] + return [line for line in self.data_content.splitlines()] diff --git a/dlrover/python/diagnosis/common/inference_chain.py b/dlrover/python/diagnosis/common/inference_chain.py index 46d74e807..8587b9507 100644 --- a/dlrover/python/diagnosis/common/inference_chain.py +++ b/dlrover/python/diagnosis/common/inference_chain.py @@ -62,6 +62,10 @@ def infer(self, inferences: List[Inference]) -> List[Inference]: def is_compatible(self, inference: Inference) -> bool: pass + @property + def data_manager(self): + return self._data_manager + def is_same_inference(inference1: Inference, inference2: Inference) -> bool: if ( diff --git a/dlrover/python/diagnosis/datacollector/cuda_log_collector.py b/dlrover/python/diagnosis/datacollector/cuda_log_collector.py deleted file mode 100644 index 3b51f018c..000000000 --- a/dlrover/python/diagnosis/datacollector/cuda_log_collector.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2024 The DLRover Authors. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dlrover.python.diagnosis.common.diagnosis_data import CudaLog -from dlrover.python.diagnosis.datacollector.data_collector import DataCollector - - -class CudaLogCollector(DataCollector): - """ - CudaLogCollector collects cuda runtime logs - """ - - def __init__(self, *args, **kwargs): - super().__init__() - pass - - def collect_data(self) -> object: - log = CudaLog(0) - return log - - def to_collect_data(self) -> bool: - return True diff --git a/dlrover/python/diagnosis/datacollector/data_collector.py b/dlrover/python/diagnosis/datacollector/data_collector.py index 9ff2dbaa6..faff4cd8f 100644 --- a/dlrover/python/diagnosis/datacollector/data_collector.py +++ b/dlrover/python/diagnosis/datacollector/data_collector.py @@ -31,8 +31,11 @@ def __init__(self): @abstractmethod def collect_data(self) -> object: + """The implementation of data collector.""" pass @abstractmethod - def to_collect_data(self) -> bool: - pass + def is_enabled(self) -> bool: + """Whether the collector is enabled.""" + + return True diff --git a/dlrover/python/diagnosis/datacollector/metrics_collector.py b/dlrover/python/diagnosis/datacollector/metrics_collector.py deleted file mode 100644 index f5d5c194b..000000000 --- a/dlrover/python/diagnosis/datacollector/metrics_collector.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2024 The DLRover Authors. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dlrover.python.diagnosis.common.diagnosis_data import ChipMetrics -from dlrover.python.diagnosis.datacollector.data_collector import DataCollector - - -class MetricsCollector(DataCollector): - def __init__(self, *args, **kwargs): - """ - MetricsCollector collects GPU metrics - """ - pass - - def collect_data(self) -> object: - chip_metrics = ChipMetrics(0) - return chip_metrics - - def to_collect_data(self) -> bool: - return True diff --git a/dlrover/python/diagnosis/datacollector/training_log_collector.py b/dlrover/python/diagnosis/datacollector/training_log_collector.py index df24ba9e3..a4f5556a7 100644 --- a/dlrover/python/diagnosis/datacollector/training_log_collector.py +++ b/dlrover/python/diagnosis/datacollector/training_log_collector.py @@ -44,5 +44,5 @@ def collect_data(self) -> TrainingLog: training_log = TrainingLog(logs=logs) return training_log - def to_collect_data(self) -> bool: + def is_enabled(self) -> bool: return True diff --git a/dlrover/python/diagnosis/datacollector/xpu_timer_metric_collector.py b/dlrover/python/diagnosis/datacollector/xpu_timer_metric_collector.py new file mode 100644 index 000000000..441f342b8 --- /dev/null +++ b/dlrover/python/diagnosis/datacollector/xpu_timer_metric_collector.py @@ -0,0 +1,66 @@ +# Copyright 2024 The DLRover Authors. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests + +from dlrover.python.common import env_utils +from dlrover.python.common.log import default_logger as logger +from dlrover.python.diagnosis.common.constants import EnvConfigKey +from dlrover.python.diagnosis.datacollector.data_collector import DataCollector + + +class XpuTimerMetricsCollector(DataCollector): + def __init__(self): + """ + MetricsCollector collects GPU metrics from xpu-timer. + """ + super().__init__() + self._metric_port = env_utils.get_env(EnvConfigKey.XPU_TIMER_PORT) + if self._metric_port: + self._metric_endpoint = ( + "http://127.0.0.1:" + self._metric_port + "/metrics" + ) + else: + self._metric_endpoint = None + + def collect_data(self) -> str: + if not self.is_enabled(): + return "" + + try: + response = requests.get(self._metric_endpoint) + response.raise_for_status() + + # data preprocessing + return self._preprocess_metrics(response.text) + except requests.exceptions.RequestException as e: + logger.warning( + "Error fetching metrics from " + f"xpu-timer: {self._metric_endpoint}, error: {e}" + ) + return "" + + def _preprocess_metrics(self, metric_str): + try: + metric_list = [ + line + for line in metric_str.splitlines() + if not line.startswith("#") and not line.startswith("exposer") + ] + return "\n".join(metric_list) + except Exception as e: + logger.warning(f"Error preprocessing metrics from xpu-timer: {e}") + return "" + + def is_enabled(self) -> bool: + return self._metric_endpoint is not None diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py index 65ae16c69..1bcfd0c55 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py @@ -13,6 +13,8 @@ from typing import List +from dlrover.python.diagnosis.common.constants import DiagnosisDataType +from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData from dlrover.python.diagnosis.common.inference_chain import ( Inference, InferenceAttribute, @@ -21,6 +23,8 @@ InferenceOperator, ) +HANG_METRIC_PREFIX = "XPU_TIMER_COMMON_HANG" + class CheckTrainingHangOperator(InferenceOperator): """ @@ -42,10 +46,48 @@ def is_compatible(self, inference: Inference) -> bool: return False def infer(self, inferences: List[Inference]) -> List[Inference]: + if not self.data_manager: + return [ + Inference( + name=InferenceName.TRAINING, + attribution=InferenceAttribute.NOT, + description=InferenceDescription.HANG, + ) + ] + + diagnosis_data = self._data_manager.get_data( + DiagnosisDataType.XPU_TIMER_METRIC + ) + + if diagnosis_data and self.is_hang(diagnosis_data): + return [ + Inference( + name=InferenceName.TRAINING, + attribution=InferenceAttribute.IS, + description=InferenceDescription.HANG, + ) + ] + return [ Inference( - name=InferenceName.END, - attribution="", - description="", + name=InferenceName.TRAINING, + attribution=InferenceAttribute.NOT, + description=InferenceDescription.HANG, ) ] + + def 
is_hang(self, diagnosis_data: List[DiagnosisData]): + hang_metric = [] + if not diagnosis_data: + return False + + for data in diagnosis_data: + each_metric = [ + line + for line in data.data_content.splitlines() + if line.startswith(HANG_METRIC_PREFIX) + ] + hang_metric.append(each_metric) + + # TODO: implement the judgement + return False diff --git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py index 4342fc809..b2ca6dbc8 100644 --- a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py +++ b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py @@ -12,20 +12,26 @@ # limitations under the License. import json +import threading +import time from datetime import datetime from typing import Dict from torch.distributed.elastic.multiprocessing.errors import ProcessFailure +from dlrover.python.common import env_utils from dlrover.python.common.constants import TrainingExceptionLevel from dlrover.python.common.error import ProcessError from dlrover.python.common.log import default_logger as logger from dlrover.python.common.singleton import Singleton from dlrover.python.common.worker import WorkerContext from dlrover.python.diagnosis.common.constants import ( - DiagnoseAction, + DiagnosisAction, + DiagnosisConstant, + DiagnosisDataType, InferenceConfigKey, ) +from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric from dlrover.python.diagnosis.common.inference_chain import ( Inference, InferenceAttribute, @@ -33,6 +39,9 @@ InferenceName, is_inference_included, ) +from dlrover.python.diagnosis.datacollector.xpu_timer_metric_collector import ( + XpuTimerMetricsCollector, +) from dlrover.python.diagnosis.inferencechain.inference_chain import ( InferenceChain, ) @@ -47,6 +56,10 @@ def __init__(self, training_log_file: str, errors: str): self._client = MasterClient.singleton_instance() self._training_log_file = training_log_file self._errors = errors + self._xpu_timer_metric_collector = XpuTimerMetricsCollector() + self._stopped = False + + self.start() logger.info( "Initializing diagnosis agent with\n" @@ -54,6 +67,42 @@ def __init__(self, training_log_file: str, errors: str): f"errors: {self._errors}" ) + def start(self): + self._stopped = False + + # start a async thread to diagnose periodically + thread = threading.Thread( + target=self._periodically_diagnosis, + name="periodically_diagnosis", + daemon=True, + ) + thread.start() + + def stop(self): + self._stopped = True + + def _periodically_diagnosis(self): + logger.info("Start periodically diagnosis...") + while True: + if self._stopped: + logger.info("Stop periodically diagnosis.") + break + + xpu_timer_metric = self._xpu_timer_metric_collector.collect_data() + if xpu_timer_metric: + agent_xpu_metric = WorkerTrainingMetric( + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=xpu_timer_metric, + node_id=env_utils.get_node_id(), + node_type=env_utils.get_node_type(), + node_rank=env_utils.get_node_rank(), + ) + self._report_metric_to_master(agent_xpu_metric) + + time.sleep( + DiagnosisConstant.AGENT_PERIODICALLY_DIAGNOSIS_INTERVAL_SECS + ) + def diagnose_training_failure(self, worker_context: WorkerContext) -> str: self._report_failure_to_master( worker_context.run_result.failures, worker_context.restart_count @@ -86,7 +135,7 @@ def diagnose_training_failure(self, worker_context: WorkerContext) -> str: f"{worker_context.worker_spec.max_restarts} " f"attempts left; will restart worker group." 
) - return DiagnoseAction.RESTART_WORKER + return DiagnosisAction.RESTART_WORKER else: logger.info( f"[{worker_context.worker_spec.role}] Worker group " @@ -95,7 +144,7 @@ def diagnose_training_failure(self, worker_context: WorkerContext) -> str: f"no attempts({worker_context.worker_spec.max_restarts}) " "left; will relaunch." ) - return DiagnoseAction.RELAUNCH_WORKER + return DiagnosisAction.RELAUNCH_WORKER def _report_failure_to_master( self, failures: Dict[int, ProcessFailure], restart_count: int @@ -115,3 +164,6 @@ def _report_failure_to_master( restart_count, TrainingExceptionLevel.PROCESS_ERROR, ) + + def _report_metric_to_master(self, agent_metric: WorkerTrainingMetric): + self._client.report_diagnosis_agent_metrics(agent_metric) diff --git a/dlrover/python/elastic_agent/master_client.py b/dlrover/python/elastic_agent/master_client.py index 9388c4783..e02554018 100644 --- a/dlrover/python/elastic_agent/master_client.py +++ b/dlrover/python/elastic_agent/master_client.py @@ -23,11 +23,7 @@ from dlrover.python.common.constants import NetworkFailureReason, NodeEnv from dlrover.python.common.log import default_logger as logger from dlrover.python.common.singleton import Singleton -from dlrover.python.diagnosis.common.diagnosis_data import ( - ChipMetrics, - CudaLog, - TrainingLog, -) +from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData def retry_grpc_request(func): @@ -386,16 +382,12 @@ def report_failures(self, error_data, restart_count=-1, level=""): def report_paral_config(self, config: grpc.ParallelConfig): self._report(config) - def report_diagnosis_training_log(self, training_log: TrainingLog): - message = grpc.DiagnosisTrainingLog(training_log.timestamp) - self._report(message) - - def report_diagnosis_chip_metrics(self, chip_metrics: ChipMetrics): - message = grpc.DiagnosisChipMetrics(chip_metrics.timestamp) - self._report(message) - - def report_diagnosis_cuda_log(self, cuda_log: CudaLog): - message = grpc.DiagnosisCudaLog(cuda_log.timestamp) + def report_diagnosis_agent_metrics(self, data: DiagnosisData): + message = grpc.DiagnosisReportData( + data.__class__.__name__, + data.to_json(), + data.node_rank, + ) self._report(message) def get_paral_config(self) -> grpc.ParallelConfig: diff --git a/dlrover/python/elastic_agent/torch/training.py b/dlrover/python/elastic_agent/torch/training.py index afbbacde8..508ed96d4 100644 --- a/dlrover/python/elastic_agent/torch/training.py +++ b/dlrover/python/elastic_agent/torch/training.py @@ -88,7 +88,7 @@ ) from dlrover.python.common.log import default_logger as logger from dlrover.python.common.worker import WorkerContext -from dlrover.python.diagnosis.common.constants import DiagnoseAction +from dlrover.python.diagnosis.common.constants import DiagnosisAction from dlrover.python.elastic_agent.config.paral_config_tuner import ( ParalConfigTuner, ) @@ -869,9 +869,9 @@ def _invoke_run(self, role: str = DEFAULT_ROLE) -> RunResult: except Exception as e: logger.warning(f"Failed to diagnose errors: {e}") if self._remaining_failovers > 0: - action = DiagnoseAction.RESTART_WORKER + action = DiagnosisAction.RESTART_WORKER else: - action = DiagnoseAction.RELAUNCH_WORKER + action = DiagnosisAction.RELAUNCH_WORKER self._process_diagnose_action(action) if self._worker_group.state == WorkerState.FAILED: return run_result @@ -884,10 +884,10 @@ def _invoke_run(self, role: str = DEFAULT_ROLE) -> RunResult: raise Exception(f"[{role}] worker group in {state.name} state") def _process_diagnose_action(self, action: str): - if action == 
DiagnoseAction.RESTART_WORKER: + if action == DiagnosisAction.RESTART_WORKER: self._remaining_failovers -= 1 self._restart_workers(self._worker_group) - elif action == DiagnoseAction.RELAUNCH_WORKER: + elif action == DiagnosisAction.RELAUNCH_WORKER: self._stop_workers(self._worker_group) self._worker_group.state = WorkerState.FAILED diff --git a/dlrover/python/master/diagnosis/diagnosis.py b/dlrover/python/master/diagnosis/diagnosis.py index 6835e4d63..17dd073ea 100644 --- a/dlrover/python/master/diagnosis/diagnosis.py +++ b/dlrover/python/master/diagnosis/diagnosis.py @@ -17,6 +17,7 @@ from typing import Dict, List from dlrover.python.common.log import default_logger as logger +from dlrover.python.diagnosis.common.constants import DiagnosisConstant from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData from dlrover.python.diagnosis.common.inference_chain import ( InferenceAttribute, @@ -45,8 +46,8 @@ def __init__(self): self._data_manager: DiagnosisDataManager = DiagnosisDataManager(600) self._diagnostician: Diagnostician = Diagnostician(self._data_manager) - def collect_diagnosis_data(self, data_type: str, data: DiagnosisData): - self._data_manager.store_data(data_type, data) + def collect_diagnosis_data(self, data: DiagnosisData): + self._data_manager.store_data(data) def pre_check(self): logger.info("Start Diagnosis Manager to pre-check training...") @@ -100,7 +101,9 @@ def _diagnose_failures(self): root_causes = self._diagnostician.diagnose_failure(problem) for root_cause in root_causes: logger.info(f"identify root cause: {root_cause}") - time.sleep(180) + time.sleep( + DiagnosisConstant.MASTER_DIAGNOSIS_OBSERVING_INTERVAL_SECS + ) class DiagnosisDataManager: @@ -108,9 +111,10 @@ def __init__(self, expire_time_period): self.diagnosis_data: Dict[str, List[DiagnosisData]] = {} self.expire_time_period = expire_time_period - def store_data(self, data_type: str, data: DiagnosisData): + def store_data(self, data: DiagnosisData): + data_type = data.data_type if data_type not in self.diagnosis_data: - logger.warning(f"{data_type} is not found in the store") + logger.debug(f"{data_type} is not found in the store") self.diagnosis_data[data_type] = [] self.diagnosis_data[data_type].append(data) self._clean_diagnosis_data(data_type) @@ -127,7 +131,7 @@ def _clean_diagnosis_data(self, data_type: str): data = self.diagnosis_data[data_type] n = 0 for d in data: - if has_expired(d.get_timestamp(), self.expire_time_period): + if has_expired(d.timestamp, self.expire_time_period): n = n + 1 else: break diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py index f5b0350c6..6cc00cea2 100644 --- a/dlrover/python/master/node/dist_job_manager.py +++ b/dlrover/python/master/node/dist_job_manager.py @@ -424,9 +424,6 @@ def _monitor_nodes(self): try: nodes = self._node_watcher.list() self._process_list_nodes(nodes) - if self._stopped: - logger.info("Stop processing node events") - break for event in self._node_watcher.watch(): try: self._process_event(event) diff --git a/dlrover/python/master/servicer.py b/dlrover/python/master/servicer.py index ed9bb323b..d52898a97 100644 --- a/dlrover/python/master/servicer.py +++ b/dlrover/python/master/servicer.py @@ -14,7 +14,7 @@ import threading import time from concurrent import futures -from typing import Dict, List +from typing import Dict, List, Optional import grpc as grpc_lib @@ -32,12 +32,7 @@ ) from dlrover.python.common.global_context import Context from dlrover.python.common.log import 
default_logger as logger -from dlrover.python.diagnosis.common.diagnosis_data import ( - ChipMetrics, - CudaLog, - DiagnosisDataType, - TrainingLog, -) +from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData from dlrover.python.master.diagnosis.diagnosis import DiagnosisManager from dlrover.python.master.elastic_training.kv_store_service import ( KVStoreService, @@ -359,12 +354,8 @@ def report(self, request, _): success = self._report_heartbeat(node_type, node_id, message) elif isinstance(message, grpc.NodeCheckpointState): success = self._sync_checkpoint(node_type, node_id, message) - elif isinstance(message, grpc.DiagnosisChipMetrics): - success = self._report_chip_metrics(node_type, node_id, message) - elif isinstance(message, grpc.DiagnosisCudaLog): - success = self._report_cuda_log(node_type, node_id, message) - elif isinstance(message, grpc.DiagnosisTrainingLog): - success = self._report_training_log(node_type, node_id, message) + elif isinstance(message, grpc.DiagnosisReportData): + success = self._report_worker_diagnosis_data(message) response.success = success return response @@ -620,34 +611,17 @@ def _sync_checkpoint( rdzv_manager = self._rdzv_managers[RendezvousName.ELASTIC_TRAINING] return rdzv_manager.sync_ckpt_nodes(node_id, message.step) - def _report_chip_metrics( - self, node_type, node_id, message: grpc.DiagnosisChipMetrics - ): - if self._diagnosis_manager: - data = ChipMetrics(message.timestamp) - self._diagnosis_manager.collect_diagnosis_data( - DiagnosisDataType.CHIPMETRICES, data - ) - return True - - def _report_training_log( - self, node_type, node_id, message: grpc.DiagnosisTrainingLog - ): - if self._diagnosis_manager: - data = TrainingLog(message.timestamp) - self._diagnosis_manager.collect_diagnosis_data( - DiagnosisDataType.TRAININGLOG, data - ) - return True - - def _report_cuda_log( - self, node_type, node_id, message: grpc.DiagnosisCudaLog - ): + def _report_worker_diagnosis_data(self, message: grpc.DiagnosisReportData): if self._diagnosis_manager: - data = CudaLog(message.timestamp) - self._diagnosis_manager.collect_diagnosis_data( - DiagnosisDataType.CUDALOG, data - ) + data_cls: Optional[DiagnosisData] = globals().get(message.data_cls) + if data_cls is None: + logger.warning( + "Invalid diagnosis report " + f"data type: {message.data_cls}" + ) + return False + data_obj = data_cls.from_json(message.data_content) + self._diagnosis_manager.collect_diagnosis_data(data_obj) return True def _sync_training_ports( diff --git a/dlrover/python/master/watcher/k8s_watcher.py b/dlrover/python/master/watcher/k8s_watcher.py index 7ffc1dede..fb321ce19 100644 --- a/dlrover/python/master/watcher/k8s_watcher.py +++ b/dlrover/python/master/watcher/k8s_watcher.py @@ -165,10 +165,16 @@ class PodWatcher(NodeWatcher): """PodWatcher monitors all Pods of a k8s Job.""" def __init__(self, job_name, namespace): + super().__init__(job_name) self._job_name = job_name self._namespace = namespace self._k8s_client = k8sClient.singleton_instance(namespace) self._job_selector = ElasticJobLabel.JOB_KEY + "=" + self._job_name + logger.info( + f"Initialize PodWatcher with " + f"namespace: {self._namespace}, " + f"job-selector: {self._job_selector}" + ) def watch(self): resource_version = None diff --git a/dlrover/python/tests/data/xpu_timer_metrics b/dlrover/python/tests/data/xpu_timer_metrics new file mode 100644 index 000000000..0e646c2ab --- /dev/null +++ b/dlrover/python/tests/data/xpu_timer_metrics @@ -0,0 +1,193 @@ +# HELP exposer_transferred_bytes_total Transferred 
bytes to metrics services +# TYPE exposer_transferred_bytes_total counter +exposer_transferred_bytes_total 12375203 +# HELP exposer_scrapes_total Number of times metrics were scraped +# TYPE exposer_scrapes_total counter +exposer_scrapes_total 6174 +# HELP exposer_request_latencies Latencies of serving scrape requests, in microseconds +# TYPE exposer_request_latencies summary +exposer_request_latencies_count 6174 +exposer_request_latencies_sum 6043374 +exposer_request_latencies{quantile="0.5"} 888 +exposer_request_latencies{quantile="0.9"} 888 +exposer_request_latencies{quantile="0.99"} 888 +# TYPE XPU_TIMER_MM_KERNEL_AVG_LATENCY gauge +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 19063 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3719 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78409 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 3673 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 3682 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 7500 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23150 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 3675 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3662 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 7267 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 3678 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 7082 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 7076 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 8104 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 7587 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 3690 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 7732 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3511 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 18365 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 11184 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 17503 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23387 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 6777 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23613 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 79162 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 3694 +# TYPE XPU_TIMER_MM_KERNEL_MAX_LATENCY gauge +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4035 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3990 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72937 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4085 +# TYPE XPU_TIMER_MM_KERNEL_P99_LATENCY gauge +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4025 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3989 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72832 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4084 +# TYPE XPU_TIMER_MM_KERNEL_MIN_LATENCY gauge +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1183 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1257 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 77810 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 2995 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 2992 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1040 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 2991 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1037 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1058 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1049 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1057 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 2997 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1181 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1200 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1205 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1180 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1042 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1182
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 77646
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 2991
+# TYPE XPU_TIMER_MM_KERNEL_FLOPS gauge
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 341.3345944233395
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 333.0872448878719
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 297.7779568156592
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 412.980679183614
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 412.7927898846224
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 371.6654952875318
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 333.9413147465587
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 412.6426014946334
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 414.2250512081811
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 365.3253468575526
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 414.2250512081811
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 366.8107237284316
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 366.655433637526
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 372.6319021584654
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 376.5763187831131
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 412.4764489189885
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 380.7275537933207
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 414.4304717174849
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 340.2784849108911
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 342.0186789595024
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 339.7178943261944
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 333.5362100380105
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 368.8886616386404
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 330.9267993677032
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 293.8160541599364
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 410.5513810707692
+# TYPE XPU_TIMER_COMMON_HANG gauge
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0
+# TYPE XPU_TIMER_COMMON_START_DUMP gauge
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0
+# TYPE XPU_TIMER_COMMON_END_DUMP gauge
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0
+# TYPE XPU_TIMER_COMMON_POOL_QUEUE_SIZE gauge
+XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23
+XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 23
+XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 23
+XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 23
+XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 23
+XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23
+XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23
+XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 23
+# TYPE XPU_TIMER_COMMON_WORK_QUEUE_SIZE gauge
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 14
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 14
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 14
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 14
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 14
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 14
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 14
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 14
diff --git a/dlrover/python/tests/test_diagnosis.py b/dlrover/python/tests/test_diagnosis.py
index 5f86a3b7f..501dd1d08 100644
--- a/dlrover/python/tests/test_diagnosis.py
+++ b/dlrover/python/tests/test_diagnosis.py
@@ -14,7 +14,8 @@
 import time
 import unittest

-from dlrover.python.diagnosis.common.diagnosis_data import CudaLog
+from dlrover.python.diagnosis.common.constants import DiagnosisDataType
+from dlrover.python.diagnosis.common.diagnosis_data import TrainingLog
 from dlrover.python.master.diagnosis.diagnosis import DiagnosisDataManager


@@ -27,20 +28,19 @@ def tearDown(self):

     def test_data_manager(self):
         mgr = DiagnosisDataManager(5)
-        data_type = "type"
-        log1 = CudaLog(0)
-        mgr.store_data(data_type, log1)
+        log1 = TrainingLog(0)
+        mgr.store_data(log1)

         time.sleep(1)
-        log2 = CudaLog(0)
-        mgr.store_data(data_type, log2)
+        log2 = TrainingLog(0)
+        mgr.store_data(log2)

-        logs = mgr.get_data(data_type)
+        logs = mgr.get_data(DiagnosisDataType.TRAINING_LOG)
         self.assertEqual(len(logs), 2)

         time.sleep(6)
-        log3 = CudaLog(0)
-        mgr.store_data(data_type, log3)
-        logs = mgr.get_data(data_type)
+        log3 = TrainingLog(0)
+        mgr.store_data(log3)
+        logs = mgr.get_data(DiagnosisDataType.TRAINING_LOG)
         self.assertEqual(len(logs), 1)
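Note on the test above: `DiagnosisDataManager.store_data` now takes only the data object, and `get_data` is keyed by `DiagnosisDataType`. The manager's implementation is not part of this patch; the sketch below only illustrates the behavior the test exercises, assuming data is bucketed by `data_type` and entries older than the configured window (5 seconds in the test) are dropped on read.

```python
import time
from collections import defaultdict
from typing import Dict, List

from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData


class DiagnosisDataManagerSketch:
    """Illustrative only; not the dlrover implementation."""

    def __init__(self, expire_time_period: int):
        self._expire_time_period = expire_time_period
        self._data: Dict[str, List[DiagnosisData]] = defaultdict(list)

    def store_data(self, data: DiagnosisData):
        # The data object carries its own type, so no separate type argument.
        self._data[data.data_type].append(data)

    def get_data(self, data_type: str) -> List[DiagnosisData]:
        # Drop entries older than the expiry window, then return the rest.
        now = time.time()
        fresh = [
            d
            for d in self._data[data_type]
            if now - d.timestamp < self._expire_time_period
        ]
        self._data[data_type] = fresh
        return fresh
```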
diff --git a/dlrover/python/tests/test_diagnosis_agent.py b/dlrover/python/tests/test_diagnosis_agent.py
index e3c3b1cba..83cd5be83 100644
--- a/dlrover/python/tests/test_diagnosis_agent.py
+++ b/dlrover/python/tests/test_diagnosis_agent.py
@@ -18,12 +18,21 @@
 from torch.distributed.elastic.agent.server.api import RunResult, WorkerState
 from torch.distributed.launcher.api import LaunchConfig

-from dlrover.python.common.constants import RendezvousName
+from dlrover.python.common import env_utils
+from dlrover.python.common.constants import NodeEnv, NodeType, RendezvousName
 from dlrover.python.common.worker import WorkerContext
-from dlrover.python.diagnosis.common.constants import DiagnoseAction
+from dlrover.python.diagnosis.common.constants import (
+    DiagnosisAction,
+    DiagnosisDataType,
+    EnvConfigKey,
+)
+from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric
 from dlrover.python.diagnosis.datacollector.training_log_collector import (
     TrainingLogCollector,
 )
+from dlrover.python.diagnosis.datacollector.xpu_timer_metric_collector import (
+    XpuTimerMetricsCollector,
+)
 from dlrover.python.elastic_agent.diagnosis.diagnosis_agent import (
     DiagnosisAgent,
 )
@@ -52,7 +61,7 @@ def setUp(self):
         self.config = ElasticLaunchConfig(**launch_config.__dict__)

     def tearDown(self):
-        pass
+        os.environ.clear()

     def test_diagnose_training(self):
         file = "data/training.log"
@@ -84,21 +93,21 @@ def test_diagnose_training(self):
         )

         action = agent.diagnose_training_failure(wc)
-        self.assertEqual(action, DiagnoseAction.RESTART_WORKER)
+        self.assertEqual(action, DiagnosisAction.RESTART_WORKER)

         agent._errors = "error code is 507035"
         action = agent.diagnose_training_failure(wc)
-        self.assertEqual(action, DiagnoseAction.RELAUNCH_WORKER)
+        self.assertEqual(action, DiagnosisAction.RELAUNCH_WORKER)

         agent._errors = "error code is 11111"
         wc.remaining_failovers = 0
         action = agent.diagnose_training_failure(wc)
-        self.assertEqual(action, DiagnoseAction.RELAUNCH_WORKER)
+        self.assertEqual(action, DiagnosisAction.RELAUNCH_WORKER)

         agent._errors = " #"
         wc.remaining_failovers = 2
         action = agent.diagnose_training_failure(wc)
-        self.assertEqual(action, DiagnoseAction.RESTART_WORKER)
+        self.assertEqual(action, DiagnosisAction.RESTART_WORKER)

     @patch(
         "dlrover.python.diagnosis.datacollector.training_log_collector"
@@ -113,10 +122,80 @@ def test_log_collect(self, mock_file_util):
         training_log_collector = TrainingLogCollector(
             log_file="test", n_line=3
         )
+        self.assertTrue(training_log_collector.is_enabled())
         result = training_log_collector.collect_data()
         self.assertTrue("test0" not in result.logs)
         self.assertTrue("test1" in result.logs)

+    def test_xpu_timer_metric_collect(self):
+        collector = XpuTimerMetricsCollector()
+        self.assertFalse(collector.is_enabled())
+
+        env_utils.set_env(EnvConfigKey.XPU_TIMER_PORT, 18889)
+        collector = XpuTimerMetricsCollector()
+        self.assertTrue(collector.is_enabled())
+
+        self.assertEqual(collector.collect_data(), "")
+
+        file = "data/xpu_timer_metrics"
+        file_path = os.path.join(os.path.dirname(__file__), file)
+        with open(file_path, "r", encoding="utf-8") as file:
+            test_metrics = file.read()
+        result = collector._preprocess_metrics(test_metrics)
+        self.assertTrue(result)
+        if "#" in result or "exposer" in result:
+            self.fail()
+
+        env_utils.set_env(NodeEnv.NODE_ID, 1)
+        env_utils.set_env(NodeEnv.NODE_TYPE, NodeType.WORKER)
+        env_utils.set_env(NodeEnv.NODE_RANK, 1)
+        agent_xpu_metric = WorkerTrainingMetric(
+            data_type=DiagnosisDataType.XPU_TIMER_METRIC,
+            data_content=result,
+            node_id=env_utils.get_node_id(),
+            node_type=env_utils.get_node_type(),
+            node_rank=env_utils.get_node_rank(),
+        )
+        self.assertEqual(
+            agent_xpu_metric.data_type,
+            DiagnosisDataType.XPU_TIMER_METRIC,
+        )
+        self.assertEqual(agent_xpu_metric.data_content, result)
+        self.assertEqual(agent_xpu_metric.node_id, 1)
+        self.assertEqual(agent_xpu_metric.node_type, NodeType.WORKER)
+        self.assertEqual(agent_xpu_metric.node_rank, 1)
+        self.assertTrue(agent_xpu_metric.timestamp > 0)
+
+    def test_worker_training_metric(self):
+        test = WorkerTrainingMetric(
+            data_content="test123",
+            node_id=env_utils.get_node_id(),
+            node_type=env_utils.get_node_type(),
+            node_rank=env_utils.get_node_rank(),
+            is_final_result=True,
+        )
+
+        test_str = test.to_json()
+        self.assertTrue('"data_content": "test123"' in test_str)
+
+        test_new = WorkerTrainingMetric.from_json(test_str)
+        self.assertEqual(test_new.timestamp, test.timestamp)
+        self.assertEqual(test_new.data_content, test.data_content)
+        self.assertEqual(test_new.data_type, test.data_type)
+        self.assertEqual(test_new.is_final_result, test.is_final_result)
+
+        test_new = globals().get("WorkerTrainingMetric").from_json(test_str)
+        self.assertEqual(test_new.timestamp, test.timestamp)
+        self.assertEqual(test_new.data_content, test.data_content)
+        self.assertEqual(test_new.data_type, test.data_type)
+        self.assertEqual(test_new.is_final_result, test.is_final_result)
+
+        test_new = globals().get(test.__class__.__name__).from_json(test_str)
+        self.assertEqual(test_new.timestamp, test.timestamp)
+        self.assertEqual(test_new.data_content, test.data_content)
+        self.assertEqual(test_new.data_type, test.data_type)
+        self.assertEqual(test_new.is_final_result, test.is_final_result)
+

 if __name__ == "__main__":
     unittest.main()
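`test_xpu_timer_metric_collect` pins down the contract of `XpuTimerMetricsCollector._preprocess_metrics` without showing its implementation: the result must be non-empty and must contain neither Prometheus comment lines (`#`) nor the exposer's own bookkeeping series. A minimal sketch that satisfies that contract, using a hypothetical helper name, could look like this:

```python
def preprocess_xpu_timer_metrics(raw_metrics: str) -> str:
    """Hypothetical filter mirroring what the test asserts about
    XpuTimerMetricsCollector._preprocess_metrics."""
    kept = []
    for line in raw_metrics.splitlines():
        line = line.strip()
        if not line:
            continue
        # Drop "# TYPE ..." / "# HELP ..." comment lines and the
        # exposer_* series emitted by the metrics endpoint itself.
        if line.startswith("#") or line.startswith("exposer"):
            continue
        kept.append(line)
    return "\n".join(kept)
```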
diff --git a/dlrover/python/tests/test_elastic_training_agent.py b/dlrover/python/tests/test_elastic_training_agent.py
index 267acf627..afcd03ffb 100644
--- a/dlrover/python/tests/test_elastic_training_agent.py
+++ b/dlrover/python/tests/test_elastic_training_agent.py
@@ -32,6 +32,7 @@
     Accelerators,
     AscendConstants,
     ConfigPath,
+    NodeEnv,
     RendezvousName,
 )
 from dlrover.python.common.storage import PosixDiskStorage
@@ -266,8 +267,8 @@ def setUp(self) -> None:
             rdzv_handler=self.rdzv_handler,
             max_restarts=self.config.max_restarts,
             monitor_interval=self.config.monitor_interval,
-            redirects=self.config.redirects,
-            tee=self.config.tee,
+            # redirects=self.config.redirects,
+            # tee=self.config.tee,
             master_addr=master_addr,
             local_addr=self.config.local_addr,
         )
@@ -305,6 +306,7 @@ def test_failure_ending_after_training(self):
         self.assertEqual(run_result.state, WorkerState.SUCCEEDED)

     def test_report_resource_with_step(self):
+        os.environ[NodeEnv.MONITOR_ENABLED] = "true"
         with tempfile.TemporaryDirectory() as tmpdirname:
             config_file = os.path.join(tmpdirname, "runtime_metrics.json")
             monitor = TorchTrainingMonitor(config_file)
diff --git a/dlrover/python/tests/test_env_utils.py b/dlrover/python/tests/test_env_utils.py
index 7d10d5c43..a4d783b15 100644
--- a/dlrover/python/tests/test_env_utils.py
+++ b/dlrover/python/tests/test_env_utils.py
@@ -27,6 +27,7 @@ def tearDown(self):
         os.environ.pop("LOCAL_WORLD_SIZE", None)

     def test_get_env(self):
+        os.environ[NodeEnv.WORKER_RANK] = "0"
         node_rank = env_utils.get_node_rank()
         self.assertEqual(node_rank, 0)
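The `test_get_env` change seeds `WORKER_RANK` before calling `env_utils.get_node_rank()`, which implies the helper resolves the rank from the environment rather than hard-coding a default. The exact lookup order is not visible in this patch; the snippet below is one plausible reading, and the key names and fallback order are assumptions.

```python
import os


def get_node_rank_sketch() -> int:
    """Hypothetical fallback chain for env_utils.get_node_rank()."""
    for key in ("NODE_RANK", "WORKER_RANK"):
        value = os.getenv(key)
        if value is not None:
            return int(value)
    return 0
```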
diff --git a/dlrover/python/tests/test_inference_chain.py b/dlrover/python/tests/test_inference_chain.py
index 015ff91b8..5a5124997 100644
--- a/dlrover/python/tests/test_inference_chain.py
+++ b/dlrover/python/tests/test_inference_chain.py
@@ -40,7 +40,7 @@ def setUp(self):
     def tearDown(self):
         pass

-    def test_CheckTrainingHangOperator(self):
+    def test_check_training_hang_operator(self):
         operator = CheckTrainingHangOperator(None)
         inf = Inference(
             name=InferenceName.TRAINING,
@@ -50,9 +50,16 @@
         self.assertTrue(operator.is_compatible(inf))

         results = operator.infer([inf])
-        self.assertEqual(results[0].name, InferenceName.END)
+        self.assertEqual(
+            results[0],
+            Inference(
+                name=InferenceName.TRAINING,
+                attribution=InferenceAttribute.NOT,
+                description=InferenceDescription.HANG,
+            ),
+        )

-    def test_CheckFailureNodeOperator(self):
+    def test_check_failure_node_operator(self):
         file = "data/training.log"
         path = os.path.dirname(__file__)
         file_path = os.path.join(path, file)
@@ -96,7 +103,7 @@ def test_CheckFailureNodeOperator(self):
         )
         self.assertTrue(is_same_inference(results[0], not_failure_inf))

-    def test_InferenceChain(self):
+    def test_inference_chain(self):
         file = "data/training.log"
         path = os.path.dirname(__file__)
         file_path = os.path.join(path, file)
diff --git a/docker/ci.dockerfile b/docker/ci.dockerfile
index 90c08cd56..34b89a126 100644
--- a/docker/ci.dockerfile
+++ b/docker/ci.dockerfile
@@ -16,6 +16,8 @@ RUN apt-get update && apt-get install -y \
     g++ \
     wget \
     cmake \
+    vim \
+    net-tools \
     ca-certificates \
     shellcheck \
     clang-format > /dev/null && \
diff --git a/docker/master.dockerfile b/docker/master.dockerfile
index 151827c38..2b9ca83f3 100644
--- a/docker/master.dockerfile
+++ b/docker/master.dockerfile
@@ -9,7 +9,7 @@ RUN pip install pyparsing -i https://pypi.org/simple

 RUN apt-get -qq update && apt-get install -y iputils-ping vim gdb

-ENV VERSION="0.3.6rc0"
+ENV VERSION="0.3.8"
 COPY --from=builder /dlrover/dist/dlrover-${VERSION}-py3-none-any.whl /
 RUN pip install /dlrover-${VERSION}-py3-none-any.whl[k8s] --extra-index-url=https://pypi.org/simple && rm -f /*.whl
 RUN unset VERSION
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 53ef6b5c2..59a71842f 100644
--- a/setup.py
+++ b/setup.py
@@ -19,6 +19,7 @@
         "pynvml",
         "urllib3<1.27,>=1.21.1",
         "deprecated",
+        "requests",
     ]
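The new `requests` dependency lines up with the HTTP-based metric collection tested above: when `EnvConfigKey.XPU_TIMER_PORT` is set, the agent-side collector can scrape the xpu-timer exposer over HTTP. The endpoint path and host in the sketch below are assumptions for illustration; returning `""` on failure mirrors the test's expectation when nothing is listening on the configured port.

```python
import os

import requests

from dlrover.python.diagnosis.common.constants import EnvConfigKey


def fetch_xpu_timer_metrics() -> str:
    """Hypothetical scrape of the xpu-timer Prometheus endpoint."""
    port = os.getenv(EnvConfigKey.XPU_TIMER_PORT)
    if not port:
        return ""
    url = f"http://localhost:{port}/metrics"  # path and host are assumed
    try:
        response = requests.get(url, timeout=3)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        # Degrade to an empty payload, as the unit test expects when
        # no exposer is listening on the configured port.
        return ""
```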