
Mlflow benchmark profiler update #38

Status: Open. Wants to merge 41 commits into base branch `develop`.
Changes shown from 35 of 41 commits.
6507424
fix: saving frequency bug for inference checkpoints
anaprietonem Aug 20, 2024
7d2d620
Merge branch 'develop' into 257-bug-inference-checkpoints-saving-freq…
anaprietonem Aug 20, 2024
0027046
chore: update CHANGELOG
anaprietonem Aug 20, 2024
8cf698b
feat: add anemoi profiler with mlflow compatibility
anaprietonem Aug 20, 2024
d647bf9
fix: format error
anaprietonem Aug 20, 2024
352cd29
fix: removed atos path from noteook and fixed update_paths function
anaprietonem Aug 23, 2024
c7ab208
add hta functionality in documentation
anaprietonem Oct 7, 2024
ebe33bd
updating docs for profiler
anaprietonem Oct 7, 2024
9c67f3e
update profiler docs
anaprietonem Oct 7, 2024
2bcf957
update profiler docs
anaprietonem Oct 7, 2024
2e6a168
update profiler docs
anaprietonem Oct 7, 2024
29232ce
update profiler docs
anaprietonem Oct 7, 2024
c646e38
update profiler docs
anaprietonem Oct 7, 2024
4d9610b
update profiler docs
anaprietonem Oct 7, 2024
0a4070c
update profiler docs
anaprietonem Oct 7, 2024
45e7a7b
update profiler docs
anaprietonem Oct 7, 2024
3cea9d9
update profiler docs
anaprietonem Oct 7, 2024
3c2f2d9
update profiler docs
anaprietonem Oct 7, 2024
b8fcf99
update profiler docs
anaprietonem Oct 7, 2024
80e5522
Merge branch 'develop' into mlflow_benchmark_profiler_update
anaprietonem Oct 7, 2024
990aea9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 7, 2024
5aeeca4
fixing pre-commits on docs
anaprietonem Oct 7, 2024
b85eac2
fix pre-commit docs
anaprietonem Oct 7, 2024
ef54ffb
fix pre-commit docs
anaprietonem Oct 7, 2024
56e222f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 7, 2024
4aa225a
minor updates
anaprietonem Oct 7, 2024
81b57d8
Merge branch 'mlflow_benchmark_profiler_update' of github.com:ecmwf/a…
anaprietonem Oct 7, 2024
86e58ba
added docs for anemoi profiler
anaprietonem Oct 7, 2024
e943782
add section about profiling in overview
anaprietonem Oct 8, 2024
e177bd6
add section about profiling in overview
anaprietonem Oct 8, 2024
328ca19
add comment to avoid confussion with profiler for troubleshooting
anaprietonem Oct 8, 2024
702287e
added note about limit batches
anaprietonem Oct 9, 2024
36dc645
Merge branch 'develop' into mlflow_benchmark_profiler_update
anaprietonem Oct 24, 2024
a7280ab
updated changelog
anaprietonem Oct 24, 2024
05289e4
making sure anemoi-training profiler commands works in interactive gp…
anaprietonem Oct 25, 2024
df76686
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 25, 2024
977c3e4
update docs
anaprietonem Oct 25, 2024
d71d7c1
Merge branch 'mlflow_benchmark_profiler_update' of github.com:ecmwf/a…
anaprietonem Oct 25, 2024
442dd9a
removed comment based on refactor callbacks PR
anaprietonem Oct 25, 2024
60368ae
adapted patchedProfile to not break HTA
anaprietonem Oct 25, 2024
9c50023
avoid code duplication in commands and fix copyright notice
anaprietonem Oct 25, 2024
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,11 @@ Keep it human-readable, your future self will thank you!

## [Unreleased](https://github.com/ecmwf/anemoi-training/compare/0.2.1...HEAD)


### Added
- Feat: Anemoi Profiler compatible with MLflow, using the PyTorch (Kineto) Profiler for the memory report [#38](https://github.com/ecmwf/anemoi-training/pull/38/)


## [0.2.1 - Bugfix: resuming mlflow runs](https://github.com/ecmwf/anemoi-training/compare/0.2.0...0.2.1) - 2024-10-24

### Added
@@ -54,6 +59,7 @@ Keep it human-readable, your future self will thank you!
- Feature: `AnemoiMlflowClient`, an mlflow client with authentication support [#86](https://github.com/ecmwf/anemoi-training/pull/86)
- Long Rollout Plots


### Fixed

- Fix `TypeError` raised when trying to JSON serialise `datetime.timedelta` object - [#43](https://github.com/ecmwf/anemoi-training/pull/43)
Binary file added docs/images/profiler/anemoi_profiler_config.png
Binary file added docs/images/profiler/example_memory_report.png
Binary file added docs/images/profiler/example_memory_timeline.png
Binary file added docs/images/profiler/example_model_summary.png
Binary file added docs/images/profiler/example_model_summary_2.png
Binary file added docs/images/profiler/example_system_report.png
Binary file added docs/images/profiler/example_time_report.png
Binary file added docs/images/profiler/idle_time_breakdown.png
Binary file added docs/images/profiler/kernel_breakdown_dfs.png
Binary file added docs/images/profiler/kernel_breakdown_plots.png
Binary file added docs/images/profiler/memory_snapshot_output.png
Binary file added docs/images/profiler/temporal_breakdown.png
1 change: 1 addition & 0 deletions docs/index.rst
@@ -43,6 +43,7 @@ This package provides the *Anemoi* training functionality.
   user-guide/training
   user-guide/models
   user-guide/tracking
   user-guide/benchmarking
   user-guide/distributed
   user-guide/debugging

12 changes: 12 additions & 0 deletions docs/overview.rst
@@ -91,6 +91,18 @@ and resolve issues during the training process, including:
- Debug configurations for quick error identification
- Guidance on isolating and addressing common problems

8. Benchmarking and HPC Profiling
=================================

Anemoi Training offers tools and configurations to support benchmarking
and High-Performance Computing (HPC) profiling, allowing users to
optimize training performance. This includes:

- Benchmarking configurations for evaluating training efficiency across
  different hardware setups.
- Profiling tools for monitoring resource utilization (CPU, GPU,
  memory) and identifying performance bottlenecks.

**************************
Components and Structure
**************************
746 changes: 746 additions & 0 deletions docs/user-guide/benchmarking.rst

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions pyproject.toml
@@ -76,6 +76,13 @@ optional-dependencies.docs = [
  "sphinx-argparse",
  "sphinx-rtd-theme",
]
optional-dependencies.profile = [
  "holistictraceanalysis>=0.2",
  "pandas>=1.3.2",
  "rich>=13.6",
  "tabulate>=0.9",
]

optional-dependencies.tests = [ "hypothesis", "pytest", "pytest-mock" ]

urls.Changelog = "https://github.com/ecmwf/anemoi-training/CHANGELOG.md"
@@ -86,6 +93,7 @@ urls.Repository = "https://github.com/ecmwf/anemoi-training/"
# command for interactive DDP (not supposed to be used directly)
# the dot is intentional, so it doesn't trigger autocomplete
scripts.".anemoi-training-train" = "anemoi.training.commands.train:main"
scripts.".anemoi-training-profiler" = "anemoi.training.commands.profiler:main"

# Add subcommand in the `commands` directory
scripts.anemoi-training = "anemoi.training.__main__:main"
85 changes: 85 additions & 0 deletions src/anemoi/training/commands/profiler.py
@@ -0,0 +1,85 @@
# (C) Copyright 2024 ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

from __future__ import annotations

import logging
import os
import sys
from pathlib import Path
from typing import TYPE_CHECKING

from anemoi.training.commands import Command

if TYPE_CHECKING:
import argparse

LOGGER = logging.getLogger(__name__)


class Profiler(Command):
    """Commands to profile Anemoi models."""

    accept_unknown_args = True

    @staticmethod
    def add_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        return parser

    def run(self, args: list[str], unknown_args: list[str] | None = None) -> None:
        # This will be picked up by the logger
        os.environ["ANEMOI_PROFILER_CMD"] = f"{sys.argv[0]} {args.command}"
Review comment: gmertes (Member), Oct 25, 2024:
Does this need to be a specific one for the profiler? I think we can just reuse the ANEMOI_TRAINING_CMD env var?
The "training" in that name doesn't need to refer to "train". It could just be "the command that anemoi-training was run with".

Reply: anaprietonem (Contributor, Author), Oct 25, 2024:
Yes, I wanted to check that! I first opted to have both of them just to verify it was working fine, which it does! Right now there is also quite a bit of repeated code across the profiler and train commands, so I was thinking I could inherit from Train for the Profiler command to avoid repeating _merge_sysargv and the other functions. Setting the command as an env variable could even go into a small function, so if I inherit I don't need to write it again. What do you think? (I have not looked closely at the details of the Command class, so I would like to check whether inheritance would be okay or is not advised in this case.)
        # Merge the known subcommands with a non-whitespace character for hydra
        new_sysargv = self._merge_sysargv(args)

        # Add the unknown arguments (belonging to hydra) to sys.argv
        if unknown_args is not None:
            sys.argv = [new_sysargv, *unknown_args]
        else:
            sys.argv = [new_sysargv]

        # Import and run the profiler command
        LOGGER.info("Running anemoi profiling command with overrides: %s", sys.argv[1:])
        main()

    def _merge_sysargv(self, args: argparse.Namespace) -> str:
        """Merge the sys.argv with the known subcommands to pass to hydra.

        Parameters
        ----------
        args : argparse.Namespace
            args from the command line

        Returns
        -------
        str
            Modified sys.argv as string
        """
        argv = Path(sys.argv[0])

        # this will turn "/env/bin/anemoi-training train" into "/env/bin/.anemoi-training-train"
        # the dot at the beginning is intentional to not interfere with autocomplete
        modified_sysargv = argv.with_name(f".{argv.name}-{args.command}")

        if hasattr(args, "subcommand"):
            # Path does not support "+=" with a string, so extend the file name instead
            modified_sysargv = modified_sysargv.with_name(f"{modified_sysargv.name}-{args.subcommand}")
        return str(modified_sysargv)


def main() -> None:
    # Use the environment variable to check if main is being called from the subcommand, not from the ddp entrypoint
    if not os.environ.get("ANEMOI_PROFILER_CMD"):
        error = "This entrypoint should not be called directly. Use `anemoi-training profiler` instead."
        raise RuntimeError(error)

    from anemoi.training.train.profiler import main as anemoi_profiler

    anemoi_profiler()


command = Profiler
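The name mangling done by `_merge_sysargv` can be reproduced in isolation. This is a minimal sketch, not the packaged command class; the helper name and standalone signature are illustrative:

```python
from pathlib import Path


def merge_sysargv(executable: str, command: str, subcommand=None) -> str:
    # Mirrors Profiler._merge_sysargv: "/env/bin/anemoi-training" plus the
    # subcommand "profiler" becomes "/env/bin/.anemoi-training-profiler".
    # The leading dot keeps the hidden entrypoint out of shell autocomplete.
    argv = Path(executable)
    modified = argv.with_name(f".{argv.name}-{command}")
    if subcommand is not None:
        # Path objects do not support "+=" with a string, so extend the name
        modified = modified.with_name(f"{modified.name}-{subcommand}")
    return str(modified)


print(merge_sysargv("/env/bin/anemoi-training", "profiler"))
# /env/bin/.anemoi-training-profiler
```

This matches the hidden `scripts.".anemoi-training-profiler"` entry added to pyproject.toml above, which is why the dotted name must be constructed exactly.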
22 changes: 22 additions & 0 deletions src/anemoi/training/config/diagnostics/eval_rollout.yaml
@@ -57,6 +57,28 @@ debug:
  # remember to also activate the tensorboard logger (below)
  profiler: False

# Use anemoi-profile to profile the training process
benchmark_profiler:
  memory:
    enabled: True
    steps: 5 # wait warmup steps and then do steps (too many steps would lead to a big file)
    warmup: 2
    extra_plots: False
    trace_rank0_only: False # set to True to profile rank 0 only; reads SLURM_PROC_ID, so it won't work when not running via Slurm
  time:
    enabled: True
    verbose: False # if True, output every action the profiler captures; otherwise output a subset defined in PROFILER_ACTIONS at the top of aifs/diagnostics/profiler.py
  speed:
    enabled: True
  system:
    enabled: True
  model_summary:
    enabled: True
  snapshot:
    enabled: True
    steps: 4 # wait warmup steps and then do steps
    warmup: 0

checkpoint:
  every_n_minutes:
    save_frequency: 30 # Approximate, as this is checked at the end of training steps
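The snapshot `steps` and `warmup` values interact with the dataloader batch size: the snapshot callback asserts that both the warmup and the total step count are multiples of the training batch size. A standalone sketch of that check (a hypothetical helper mirroring the callback's assertions, not the real API):

```python
def validate_snapshot_schedule(steps: int, warmup: int, batch_size: int) -> int:
    """Validate a snapshot schedule the way the snapshot callback does.

    Returns the global step at which recording stops (warmup + steps).
    """
    total = steps + warmup  # be consistent with the profiler scheduler
    if total % batch_size != 0:
        raise ValueError("Snapshot steps is not a multiple of batch size")
    if warmup % batch_size != 0:
        raise ValueError("Snapshot warmup steps is not a multiple of batch size")
    return total


# With the defaults above (steps: 4, warmup: 0) and a training batch size of 2:
print(validate_snapshot_schedule(4, 0, 2))  # 4
```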
2 changes: 2 additions & 0 deletions src/anemoi/training/config/training/default.yaml
@@ -19,6 +19,8 @@ multistep_input: 2
# the effective batch size becomes num-devices * batch_size * k
accum_grad_batches: 1

num_sanity_val_steps: 6

# clip gradients, 0 : don't clip, default algorithm: norm, alternative: value
gradient_clip:
  val: 32.
66 changes: 66 additions & 0 deletions src/anemoi/training/diagnostics/callbacks/__init__.py
@@ -37,6 +37,7 @@
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.utilities.types import STEP_OUTPUT

from anemoi.training.diagnostics.plots import init_plot_settings
from anemoi.training.diagnostics.plots import plot_graph_features
@@ -885,6 +886,71 @@ def on_load_checkpoint(
        pl_module.hparams["metadata"]["parent_uuid"] = checkpoint["hyper_parameters"]["metadata"]["uuid"]


class MemorySnapshotRecorder(Callback):
    """Record memory snapshot using torch.cuda._record_memory_history()."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dirpath = Path(self.config.hardware.paths.profiler)

        self.warmup = self.config.diagnostics.benchmark_profiler.snapshot.warmup
        if not self.warmup:
            self.warmup = 0
        self.num_steps = (
            self.config.diagnostics.benchmark_profiler.snapshot.steps + self.warmup
        )  # be consistent with the profiler scheduler
        self.status = False

        assert (
            self.num_steps % self.config.dataloader.batch_size.training == 0
        ), "Snapshot steps is not a multiple of batch size"
        assert (
            self.warmup % self.config.dataloader.batch_size.training == 0
        ), "Snapshot warmup steps is not a multiple of batch size"

    @rank_zero_only
    def _start_snapshot_recording(self):
        LOGGER.info("Starting snapshot record_memory_history")
        torch.cuda.memory._record_memory_history()
        self.status = True

    @rank_zero_only
    def _save_snapshot(self):
        self.memory_snapshot_fname = self.dirpath / "memory_snapshot.pickle"
        try:
            LOGGER.info("Saving memory snapshot to %s", self.memory_snapshot_fname)
            torch.cuda.memory._dump_snapshot(f"{self.memory_snapshot_fname}")
        except Exception:
            LOGGER.exception("Failed to capture memory snapshot")

    @rank_zero_only
    def stop_record_memory_history(self) -> None:
        LOGGER.info("Stopping snapshot record_memory_history")
        torch.cuda.memory._record_memory_history(enabled=None)

    def on_train_batch_start(
        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", batch: Any, batch_idx: int
    ) -> None:
        if trainer.global_step == self.warmup:
            self._start_snapshot_recording()

    def on_train_batch_end(
        self,
        trainer: "pl.Trainer",
        pl_module: "pl.LightningModule",
        outputs: STEP_OUTPUT,
        batch: Any,
        batch_idx: int,
    ) -> None:
        if trainer.global_step == self.num_steps:
            if self.status:
                self._save_snapshot()
                self.stop_record_memory_history()
            else:
                LOGGER.info("Snapshot recording was not started, so no snapshot was saved")


class AnemoiCheckpoint(ModelCheckpoint):
"""A checkpoint callback that saves the model after every validation epoch."""

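The start/stop triggers of the snapshot callback can be illustrated without Lightning or CUDA. This is a sketch of the scheduling logic only; `SnapshotSchedule` is a hypothetical stand-in, and the real callback calls `torch.cuda.memory._record_memory_history` and `_dump_snapshot` at these points:

```python
class SnapshotSchedule:
    """Sketch of MemorySnapshotRecorder's triggers: recording starts at
    global step == warmup and the snapshot is saved at warmup + steps."""

    def __init__(self, steps: int, warmup: int):
        self.warmup = warmup
        self.num_steps = steps + warmup
        self.recording = False
        self.saved = False

    def on_batch_start(self, global_step: int) -> None:
        if global_step == self.warmup:
            self.recording = True  # would call _record_memory_history()

    def on_batch_end(self, global_step: int) -> None:
        if global_step == self.num_steps and self.recording:
            self.saved = True  # would call _dump_snapshot(...)
            self.recording = False  # would call _record_memory_history(enabled=None)


sched = SnapshotSchedule(steps=4, warmup=2)
for step in range(8):
    sched.on_batch_start(step)
    sched.on_batch_end(step)
print(sched.saved)  # True
```

A run that stops before `warmup + steps` global steps never reaches the save trigger, which is why the docs warn about `limit_batches` interacting with the snapshot settings.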
4 changes: 3 additions & 1 deletion src/anemoi/training/diagnostics/mlflow/logger.py
@@ -377,7 +377,9 @@ def _get_mlflow_run_params(
        tags = {"projectName": project_name}

        # create a tag with the command used to run the script
        command = os.environ.get("ANEMOI_TRAINING_CMD") or os.environ.get(
            "ANEMOI_PROFILER_CMD", sys.argv[0],
        )
        tags["command"] = command.split("/")[-1]  # get the python script name
        tags["mlflow.source.name"] = command
        if len(sys.argv) > 1:
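The intended lookup order for the command tag (the training command first, then the profiler command, then the script path) can be sketched as a chained lookup. This is a hypothetical helper illustrating the fallback semantics, not the logger's actual code:

```python
def resolve_command_tag(environ: dict, default: str) -> str:
    # First non-empty value wins; missing or empty env vars fall through.
    # Note that `get(key, default)` with a non-empty default would short-circuit
    # the chain, so the defaults are applied only at the end.
    return (
        environ.get("ANEMOI_TRAINING_CMD")
        or environ.get("ANEMOI_PROFILER_CMD")
        or default
    )


print(resolve_command_tag({"ANEMOI_PROFILER_CMD": "anemoi-training profiler"}, "train.py"))
# anemoi-training profiler
```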