From e4974e1c69dc218953fbf8d311d9d2f8c3205e62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jarl=20Sondre=20S=C3=A6ther?= Date: Tue, 29 Oct 2024 10:25:35 +0100 Subject: [PATCH] add backup to gpu monitoring --- src/itwinai/cli.py | 40 +++++++++++++++++++----- src/itwinai/torch/monitoring/plotting.py | 11 +++++++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/itwinai/cli.py b/src/itwinai/cli.py index 1769fc42..2a04bc04 100644 --- a/src/itwinai/cli.py +++ b/src/itwinai/cli.py @@ -24,10 +24,13 @@ def generate_gpu_energy_plot( log_dir: str = "utilization_logs", pattern_str: str = r"dataframe_(?:\w+)_(?:\d+)\.csv$", output_file: str = "plots/gpu_energy_plot.png", - backup_dir: Optional[str] = "backup/" + do_backup: bool = True, + backup_dir: Optional[str] = "scalability_backups/", + experiment_name: Optional[str] = None, + run_name: Optional[str] = None, ) -> None: """Generate a GPU energy plot showing the expenditure for each combination of - strategy and number of GPUs in Watt hours. Backs up the data used to create the + strategy and number of GPUs in Watt hours. Backs up the data used to create the plot if ``backup_dir`` is not None Args: @@ -37,10 +40,11 @@ def generate_gpu_energy_plot( Defaults to ``dataframe_(?:\\w+)_(?:\\d+)\\.csv$``. output_file: The path to where the resulting plot should be saved. Defaults to ``plots/gpu_energy_plot.png``. - backup_dir: The path to where the data used to produce the plot should be - saved. + backup_dir: The path to where the data used to produce the plot should be + saved. 
""" + from uuid import uuid4 import matplotlib.pyplot as plt from itwinai.torch.monitoring.plotting import gpu_energy_plot, read_energy_df @@ -59,6 +63,20 @@ def generate_gpu_energy_plot( plt.savefig(output_path) print(f"\nSaved GPU energy plot at '{output_path.resolve()}'.") + if not do_backup: + return + + if experiment_name is None: + experiment_name = "exp_" + uuid4().hex[:6] + if run_name is None: + run_name = "run_" + uuid4().hex[:6] + + backup_path = Path(backup_dir) / experiment_name / run_name / "gpu_energy.csv" + # to_csv does not create missing folders, so make the run directory first + backup_path.parent.mkdir(parents=True, exist_ok=True) + gpu_utilization_df.to_csv(backup_path, index=False) + print(f"Storing backup file at '{backup_path.resolve()}'.") + @app.command() def generate_communication_plot( @@ -148,8 +166,8 @@ def scalability_report( pattern: Annotated[ str, typer.Option(help="Python pattern matching names of CSVs in sub-folders.") ], - log_dir: Annotated[ - str, typer.Option(help="Directory location for the data files to read") + log_dir: Annotated[ + str, typer.Option(help="Directory location for the data files to read") ], plot_title: Annotated[Optional[str], typer.Option(help=("Plot name."))] = None, # skip_id: Annotated[Optional[int], typer.Option(help=("Skip epoch ID."))] = None, @@ -170,12 +188,18 @@ def scalability_report( """ # TODO: add max depth and path different from CWD - from itwinai.scalability import read_scalability_files, archive_data, create_relative_plot, create_absolute_plot + from itwinai.scalability import ( + read_scalability_files, + archive_data, + create_relative_plot, + create_absolute_plot, + ) + log_dir_path = Path(log_dir) combined_df, csv_files = read_scalability_files( pattern=pattern, log_dir=log_dir_path - ) + ) print("Merged CSV:") print(combined_df) diff --git a/src/itwinai/torch/monitoring/plotting.py b/src/itwinai/torch/monitoring/plotting.py index 3d86a838..fc8ef512 100644 --- a/src/itwinai/torch/monitoring/plotting.py +++ b/src/itwinai/torch/monitoring/plotting.py @@ -127,3 +127,14 @@ def 
gpu_energy_plot(gpu_utilization_df: pd.DataFrame) -> Tuple[Figure, Axes]: def backup_data(file_paths: List): pass + + + + + + + + + + +