interTwin-eu · jarlsondre · Nov 7, 2024 · Oct 24, 2024 · Oct 28, 2024 · Oct 22, 2024
diff --git a/src/itwinai/torch/distributed.py b/src/itwinai/torch/distributed.py
@@ -37,6 +37,9 @@ class TorchDistributedStrategy(DistributedStrategy):
     #: Defaults to False.
     is_initialized: bool = False
 
+    #: Name of the strategy, used e.g. for logging purposes.
+    name: str
+
     @property
     def is_main_worker(self) -> bool:
         """Checks if local worker has global rank equal to zero.
@@ -381,6 +384,7 @@ class TorchDDPStrategy(TorchDistributedStrategy):
     def __init__(self, backend: Literal["nccl", "gloo", "mpi"]) -> None:
         super().__init__()
         self.backend = backend
+        self.name = "torch-ddp"  # strategy label used for logging/monitoring
 
     def init(self) -> None:
         """Initializes the distributed process group and the distributed
@@ -595,6 +599,7 @@ class DeepSpeedStrategy(TorchDistributedStrategy):
     def __init__(self, backend: Literal["nccl", "gloo", "mpi"]) -> None:
         super().__init__()
         self.backend = backend
+        self.name = "deepspeed"  # strategy label used for logging/monitoring
 
     def init(self) -> None:
         """Initializes the distributed process group and the distributed
@@ -779,6 +784,10 @@ def gather(self, tensor: torch.Tensor, dst_rank: int = 0) -> Optional[List]:
 class HorovodStrategy(TorchDistributedStrategy):
     """Horovod distributed strategy class."""
 
+    def __init__(self) -> None:
+        super().__init__()
+        self.name = "horovod"  # strategy label used for logging/monitoring
+
     def init(self) -> None:
         """Initializes the Horovod distributed backend.
 
@@ -965,6 +974,10 @@ class NonDistributedStrategy(TorchDistributedStrategy):
     #: Defaults to False.
     is_distributed: bool = False
 
+    def __init__(self) -> None:
+        super().__init__()
+        self.name = "non-distributed"  # strategy label used for logging/monitoring
+
     def init(self) -> None:
         """If CUDA is available set CUDA device, and do nothing more.
 

diff --git a/src/itwinai/torch/monitoring/monitoring.py b/src/itwinai/torch/monitoring/monitoring.py
@@ -114,23 +114,12 @@ def measured_method(self: TorchTrainer, *args, **kwargs) -> Any:
         warmup_time = 5
 
         strategy = self.strategy
-        strategy.init()
-
-        if isinstance(strategy, NonDistributedStrategy):
-            strategy_name = "non-dist"
-        elif isinstance(strategy, TorchDDPStrategy):
-            strategy_name = "ddp"
-        elif isinstance(strategy, DeepSpeedStrategy):
-            strategy_name = "deepspeed"
-        elif isinstance(strategy, HorovodStrategy):
-            strategy_name = "horovod"
-        else:
-            strategy_name = "unk"
+        strategy_name = strategy.name
 
         local_rank = strategy.local_rank()
         global_rank = strategy.global_rank()
         num_global_gpus = strategy.global_world_size()
-        num_local_gpus = torch.cuda.device_count()
+        num_local_gpus = strategy.local_world_size()
         node_idx = global_rank // num_local_gpus
 
         output_path = Path(
@@ -185,7 +174,6 @@ def measured_method(self: TorchTrainer, *args, **kwargs) -> Any:
         if strategy.is_main_worker:
             write_logs_to_file(global_utilization_log, output_path)
 
-        strategy.clean_up()
         return result
 
     return measured_method
diff --git a/src/itwinai/torch/trainer.py b/src/itwinai/torch/trainer.py
@@ -375,7 +375,7 @@ def execute(
 
         if self.logger:
             self.logger.destroy_logger_context()
-        # self.strategy.clean_up()
+        self.strategy.clean_up()
         return train_dataset, validation_dataset, test_dataset, self.model
 
     def _set_epoch_dataloaders(self, epoch: int):

diff --git a/use-cases/eurac/trainer.py b/use-cases/eurac/trainer.py
@@ -94,7 +94,7 @@ def __init__(
 
     @suppress_workers_print
     # @profile_torch_trainer
-    @measure_gpu_utilization
+    # @measure_gpu_utilization
     def execute(
         self,
         train_dataset: Dataset,
@@ -148,6 +148,7 @@ def set_epoch(self, epoch: int):
             self.train_loader.sampler.set_epoch(epoch)
             self.val_loader.sampler.set_epoch(epoch)
 
+    @measure_gpu_utilization
     def train(self):
         """Override version of hython to support distributed strategy."""
         # Tracking epoch times for scaling test