interTwin-eu · jarlsondre · Nov 7, 2024 · Oct 24, 2024 · Oct 28, 2024 · Oct 22, 2024
diff --git a/README.md b/README.md
@@ -325,6 +325,50 @@ For instance, to run the test suite on your laptop user:
 make test
 ```
 
+## Working with Docker containers
+
+This section is intended for the developers of itwinai and outlines the practices
+used to manage container images through GitHub Container Registry (GHCR).
+
+### Terminology Recap
+
+Our container images follow the convention:
+
+```text
+ghcr.io/intertwin-eu/IMAGE_NAME:TAG
+```
+
+For example, in `ghcr.io/intertwin-eu/itwinai:0.2.2-torch2.6-jammy`:
+
+- `IMAGE_NAME` is `itwinai`
+- `TAG` is `0.2.2-torch2.6-jammy`
+
+The `TAG` follows the convention:
+
+```text
+X.Y.Z-[torch|tf]x.y-distro
+```
+
+Where:
+
+- `X.Y.Z` is the **itwinai version**
+- `x.y` is the **version of the ML framework** (e.g., PyTorch or TensorFlow)
+- `distro` is the OS distro in the container (e.g., Ubuntu Jammy)
+
+### Image Names and Their Purpose
+
+We use different image names to group similar images under the same namespace:
+
+- **`itwinai`**: Production images. These should be well-maintained and orderly.
+- **`itwinai-dev`**: Development images. Tags can vary and may include random
+  hashes.
+- **`itwinai-cvmfs`**: Images that need to be made available through CVMFS.
+
+> [!WARNING]
+> It is very important to keep the number of tags for `itwinai-cvmfs` as low
+> as possible. Tags should only be created under this namespace when strictly
+> necessary. Otherwise, this could cause issues for the converter.
+
 <!--
 ### Micromamba installation (deprecated)
 

diff --git a/env-files/torch/createEnvVega.sh b/env-files/torch/createEnvVega.sh
@@ -13,13 +13,17 @@ ml Python
 ml CMake/3.24.3-GCCcore-11.3.0
 ml mpi4py
 ml OpenMPI
-ml CUDA/11.7
+#ml CUDA/11.7
+ml CUDA/12.3
 ml GCCcore/11.3.0
-ml NCCL/2.12.12-GCCcore-11.3.0-CUDA-11.7.0
-ml cuDNN
+#ml NCCL/2.12.12-GCCcore-11.3.0-CUDA-11.7.0
+ml NCCL
+ml cuDNN/8.9.7.29-CUDA-12.3.0
+
+# You should have CUDA 12.3 now
 
 
 # Create and install torch env
 export ENV_NAME=".venv-pytorch"
-export PIP_INDEX_TORCH_CUDA="https://download.pytorch.org/whl/cu118"
-bash env-files/torch/generic_torch.sh
+export PIP_INDEX_TORCH_CUDA="https://download.pytorch.org/whl/cu121"
+bash env-files/torch/generic_torch.sh
diff --git a/src/itwinai/cli.py b/src/itwinai/cli.py
@@ -19,19 +19,63 @@
 app = typer.Typer(pretty_exceptions_enable=False)
 
 
+@app.command()
+def generate_gpu_energy_plot(
+    log_dir: str = "scalability_metrics/gpu_energy_data",
+    pattern: str = r"gpu_energy_data.*\.csv$",
+    output_file: str = "plots/gpu_energy_plot.png",
+) -> None:
+    """Generate a GPU energy plot showing the expenditure for each combination of
+    strategy and number of GPUs in Watt hours.
+
+    Args:
+        log_dir: The directory where the csv logs are stored. Defaults to
+            ``utilization_logs``.
+        pattern: A regex pattern to recognize the file names in the 'log_dir' folder.
+            Defaults to ``dataframe_(?:\\w+)_(?:\\d+)\\.csv$``. Set it to 'None' to
+            make it None. In this case, it will match all files in the given folder.
+        output_file: The path to where the resulting plot should be saved. Defaults to
+            ``plots/gpu_energy_plot.png``.
+
+    """
+    import matplotlib.pyplot as plt
+
+    from itwinai.torch.monitoring.plotting import gpu_energy_plot, read_energy_df
+
+    log_dir_path = Path(log_dir)
+    if not log_dir_path.exists():
+        raise ValueError(
+            f"The provided log_dir, '{log_dir_path.resolve()}', does not exist."
+        )
+
+    if pattern.lower() == "none":
+        pattern = None
+
+    gpu_utilization_df = read_energy_df(pattern=pattern, log_dir=log_dir_path)
+    gpu_energy_plot(gpu_utilization_df=gpu_utilization_df)
+
+    output_path = Path(output_file)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    plt.savefig(output_path)
+    print(f"\nSaved GPU energy plot at '{output_path.resolve()}'.")
+
+
 @app.command()
 def generate_communication_plot(
-    log_dir: str = "profiling_logs",
-    pattern: str = r"profile_(\w+)_(\d+)_(\d+)\.csv$",
-    output_file: str = "plots/comm_plot.png",
+    log_dir: str = "scalability_metrics/communication_data",
+    pattern: str = r"(.+)_(\d+)_(\d+)\.csv$",
+    output_file: str = "plots/communication_plot.png",
 ) -> None:
     """Generate stacked plot showing computation vs. communication fraction. Stores it
+    to output_file.
 
     Args:
-        log_dir: The directory where the csv logs are stored. Defauls to
+        log_dir: The directory where the csv logs are stored. Defaults to
             ``profiling_logs``.
         pattern: A regex pattern to recognize the file names in the 'log_dir' folder.
-            Defaults to ``profile_(\\w+)_(\\d+)_(\\d+)\\.csv$``.
+            Defaults to ``(.+)_(\\d+)_(\\d+)\\.csv$``. Pass the string 'None' to
+            disable the pattern; in that case, all files in the given folder match.
         output_file: The path to where the resulting plot should be saved. Defaults to
             ``plots/comm_plot.png``.
     """
@@ -45,13 +89,17 @@ def generate_communication_plot(
 
     log_dir_path = Path(log_dir)
     if not log_dir_path.exists():
-        raise IOError(
+        raise ValueError(
             f"The directory '{log_dir_path.resolve()}' does not exist, so could not"
             f"extract profiling logs. Make sure you are running this command in the "
-            f"same directory as the logging dir."
+            f"same directory as the logging dir or are passing a sufficient relative"
+            f"path."
         )
 
-    df = create_combined_comm_overhead_df(logs_dir=log_dir_path, pattern=pattern)
+    if pattern.lower() == "none":
+        pattern = None
+
+    df = create_combined_comm_overhead_df(log_dir=log_dir_path, pattern=pattern)
     values = get_comp_fraction_full_array(df, print_table=True)
 
     strategies = sorted(df["strategy"].unique())
@@ -67,7 +115,7 @@ def generate_communication_plot(
     output_path.parent.mkdir(parents=True, exist_ok=True)
 
     plt.savefig(output_path)
-    print(f"\nSaved computation vs. communication plot at '{output_path.resolve()}'")
+    print(f"\nSaved computation vs. communication plot at '{output_path.resolve()}'.")
 
 
 @app.command()