Skip to content

Commit

Permalink
add backup to gpu monitoring
Browse files Browse the repository at this point in the history
  • Loading branch information
jarlsondre committed Oct 29, 2024
1 parent dedafe7 commit e4974e1
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 8 deletions.
40 changes: 32 additions & 8 deletions src/itwinai/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,13 @@ def generate_gpu_energy_plot(
log_dir: str = "utilization_logs",
pattern_str: str = r"dataframe_(?:\w+)_(?:\d+)\.csv$",
output_file: str = "plots/gpu_energy_plot.png",
backup_dir: Optional[str] = "backup/"
do_backup: bool = True,
backup_dir: Optional[str] = "scalability_backups/",
experiment_name: Optional[str] = None,
run_name: Optional[str] = None,
) -> None:
"""Generate a GPU energy plot showing the expenditure for each combination of
strategy and number of GPUs in Watt hours. Backs up the data used to create the
strategy and number of GPUs in Watt hours. Backs up the data used to create the
plot if ``backup_dir`` is not None
Args:
Expand All @@ -37,10 +40,11 @@ def generate_gpu_energy_plot(
Defaults to ``dataframe_(?:\\w+)_(?:\\d+)\\.csv$``.
output_file: The path to where the resulting plot should be saved. Defaults to
``plots/gpu_energy_plot.png``.
backup_dir: The path to where the data used to produce the plot should be
saved.
backup_dir: The path to where the data used to produce the plot should be
saved.
"""
import uuid
import matplotlib.pyplot as plt
from itwinai.torch.monitoring.plotting import gpu_energy_plot, read_energy_df

Expand All @@ -59,6 +63,20 @@ def generate_gpu_energy_plot(
plt.savefig(output_path)
print(f"\nSaved GPU energy plot at '{output_path.resolve()}'.")

if not do_backup:
return

if experiment_name is None:
random_id = uuid4()
experiment_name = "exp_" + random_id[:6]
if run_name is None:
random_id = uuid4()
run_name = "run_" + random_id[:6]

backup_path = Path(backup_dir) / experiment_name / run_name / "gpu_energy.csv"
gpu_utilization_df.to_csv(backup_path, index=False)
print(f"Storing backup file at '{backup_path.resolve()}'.")


@app.command()
def generate_communication_plot(
Expand Down Expand Up @@ -148,8 +166,8 @@ def scalability_report(
pattern: Annotated[
str, typer.Option(help="Python pattern matching names of CSVs in sub-folders.")
],
log_dir: Annotated[
str, typer.Option(help="Directory location for the data files to read")
log_dir: Annotated[
str, typer.Option(help="Directory location for the data files to read")
],
plot_title: Annotated[Optional[str], typer.Option(help=("Plot name."))] = None,
# skip_id: Annotated[Optional[int], typer.Option(help=("Skip epoch ID."))] = None,
Expand All @@ -170,12 +188,18 @@ def scalability_report(
"""
# TODO: add max depth and path different from CWD

from itwinai.scalability import read_scalability_files, archive_data, create_relative_plot, create_absolute_plot
from itwinai.scalability import (
read_scalability_files,
archive_data,
create_relative_plot,
create_absolute_plot,
)

log_dir_path = Path(log_dir)

combined_df, csv_files = read_scalability_files(
pattern=pattern, log_dir=log_dir_path
)
)
print("Merged CSV:")
print(combined_df)

Expand Down
11 changes: 11 additions & 0 deletions src/itwinai/torch/monitoring/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,14 @@ def gpu_energy_plot(gpu_utilization_df: pd.DataFrame) -> Tuple[Figure, Axes]:

def backup_data(file_paths: List) -> None:
    """Placeholder for backing up data files; currently does nothing.

    Args:
        file_paths: Presumably the paths of the files to back up. The
            argument is not used yet. TODO: confirm the intended element
            type once this is implemented.
    """
    pass











0 comments on commit e4974e1

Please sign in to comment.