diff --git a/.dockerignore b/.dockerignore
index 697f33f2..d6dc083a 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -12,6 +12,10 @@ CHANGELOG
 # Docs
 docs
 
+# interLink pods
+**/interLink
+**/interlink
+
 # Data
 **/MNIST
 **/*-predictions/
diff --git a/.github/linters/.jscpd.json b/.github/linters/.jscpd.json
index 1a035770..8a003c54 100644
--- a/.github/linters/.jscpd.json
+++ b/.github/linters/.jscpd.json
@@ -1,7 +1,6 @@
 {
     "threshold": 2.0,
     "ignore": [
-        "**/itwinai/loggers.py",
-        "**/itwinai/torch/engine.py"
+        "**/itwinai/loggers.py"
     ]
 }
\ No newline at end of file
diff --git a/.github/workflows/workflows-dt.yml b/.github/workflows/pytest.yml
similarity index 88%
rename from .github/workflows/workflows-dt.yml
rename to .github/workflows/pytest.yml
index 53a72e43..ecee2bc1 100644
--- a/.github/workflows/workflows-dt.yml
+++ b/.github/workflows/pytest.yml
@@ -1,10 +1,12 @@
 ---
-name: Test workflows
+name: Unit and integration tests
 
 on:
   pull_request:
     branches: [main, dev]
 
+# TODO: use container and set custom TORCH_ENV and TF_ENV env variables
+
 jobs:
   test-itwinai:
     name: Test itwinai with pytest
diff --git a/.gitignore b/.gitignore
index dd495607..74cf514d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,9 @@
 mnist-sample-data/
 exp_data/
 
+# Kubernetes
+secret*.yaml
+
 # Custom envs
 .venv*
 envAI_*
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6f581e8c..08d06d81 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -11,6 +11,7 @@
         "Convolutional",
         "cuda",
         "dataloaders",
+        "dataloading",
         "fromlist",
         "hyperparameters",
         "hyperparams",
diff --git a/README.md b/README.md
index dc9a60dc..ce8b6684 100644
--- a/README.md
+++ b/README.md
@@ -96,7 +96,35 @@ pip install -e .[dev]
 
 #### Test with `pytest`
 
-To run tests on itwinai package:
+Do this only if you are a developer wanting to test your code with pytest.
+
+First, create the virtual environments for both torch and tensorflow.
+For instance, you can use:
+
+```bash
+make torch-cpu
+make tf-2.13-cpu
+```
+
+To select the names of the torch and tf environments, set the following
+environment variables. They allow you to run the tests in environments with
+custom names other than `.venv-pytorch` and `.venv-tf`.
+
+```bash
+export TORCH_ENV="my_torch_env"
+export TF_ENV="my_tf_env"
+```
+
+Functional tests (marked with `pytest.mark.functional`) are executed under
+`/tmp/pytest` to guarantee they run in a clean environment.
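The `functional` marker and the `/tmp/pytest` isolation mentioned above come from the repository's test suite rather than from this diff. A minimal sketch of how such a marker can be wired up, with hypothetical fixture and test names, might look like this:

```python
# Hypothetical sketch of a test carrying the `functional` marker; the fixture
# and test names are illustrative, and the marker is assumed to be registered
# in the project's pytest configuration.
import os

import pytest


@pytest.fixture
def clean_workdir(tmp_path_factory, monkeypatch):
    """Run the test from a throwaway directory under pytest's tmp root."""
    workdir = tmp_path_factory.mktemp("functional")
    monkeypatch.chdir(workdir)
    return workdir


@pytest.mark.functional
def test_runs_in_clean_directory(clean_workdir):
    # Selected by `pytest -m "functional"`; starts from an empty directory.
    assert os.listdir(os.getcwd()) == []
```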
+ +To run functional tests use: + +```bash +pytest -v tests/ -m "functional" +``` + +To run all tests on itwinai package: ```bash # Activate env diff --git a/docs/conf.py b/docs/conf.py index f4c9b297..a06c3011 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,7 +21,8 @@ sys.path.insert(0, os.path.abspath('../')) project = 'itwinai' -copyright = '2024, Matteo Bunino, Alexander Zoechbauer, Kalliopi Tsolaki, Rakesh Sarma on behalf of CERN & JSC' +copyright = ('2024, Matteo Bunino, Alexander Zoechbauer, ' + 'Kalliopi Tsolaki, Rakesh Sarma on behalf of CERN & JSC') author = 'Matteo Bunino, Alexander Zoechbauer, Kalliopi Tsolaki' # version = '0.0' # short version # release = '0.0.2' # full version @@ -43,7 +44,9 @@ def get_git_tag(): try: - return subprocess.check_output(['git', 'describe', '--tags', '--abbrev=0']).decode('utf-8').strip() + return subprocess.check_output( + ['git', 'describe', '--tags', '--abbrev=0'] + ).decode('utf-8').strip() except subprocess.CalledProcessError: return 'unknown' diff --git a/env-files/tensorflow/createEnvJSCTF.sh b/env-files/tensorflow/createEnvJSCTF.sh index 8838347c..377940d4 100644 --- a/env-files/tensorflow/createEnvJSCTF.sh +++ b/env-files/tensorflow/createEnvJSCTF.sh @@ -104,5 +104,8 @@ if [ "$cont1" = true ] ; then pip3 install -r reqs_TF.txt --ignore-installed fi +# Install itwinai +pip install --upgrade pip +pip install -e .[dev] # eof diff --git a/src/itwinai/cli.py b/src/itwinai/cli.py index 275d853a..6c27d069 100644 --- a/src/itwinai/cli.py +++ b/src/itwinai/cli.py @@ -16,7 +16,7 @@ import typer -app = typer.Typer() +app = typer.Typer(pretty_exceptions_enable=False) @app.command() @@ -27,9 +27,6 @@ def scalability_report( plot_title: Annotated[Optional[str], typer.Option( help=("Plot name.") )] = None, - logy: Annotated[bool, typer.Option( - help=("Log scale on y axis.") - )] = False, skip_id: Annotated[Optional[int], typer.Option( help=("Skip epoch ID.") )] = None, @@ -43,15 +40,17 @@ def scalability_report( Example: >>> itwinai scalability-report --pattern="^epoch.+\\.csv$" --skip-id 0 \\ - >>> --plot-title "Some title" --logy --archive archive_name + >>> --plot-title "Some title" --archive archive_name """ # TODO: add max depth and path different from CWD import os import re + import glob import shutil import pandas as pd + import matplotlib import matplotlib.pyplot as plt - # import numpy as np + import numpy as np regex = re.compile(r'{}'.format(pattern)) combined_df = pd.DataFrame() @@ -83,7 +82,13 @@ def scalability_report( if plot_title is not None: fig.suptitle(plot_title) - for name in set(avg_times.name.values): + sp_up_ax.set_yscale("log") + sp_up_ax.set_xscale("log") + + markers = iter("ov^s*dXpD.+12348") + + series_names = sorted(set(avg_times.name.values)) + for name in series_names: df = avg_times[avg_times.name == name].drop(columns='name') # Debug @@ -104,32 +109,27 @@ def scalability_report( df["Efficiency"] = df["Threadscaled Sim. Time / s"].iloc[0] / \ df["Threadscaled Sim. 
Time / s"] - # Plot - # when lines are very close to each other - if logy: - sp_up_ax.semilogy( - df["NGPUs"].values, df["Speedup"].values, - marker='*', lw=1.0, label=name) - else: - sp_up_ax.plot( - df["NGPUs"].values, df["Speedup"].values, - marker='*', lw=1.0, label=name) - - if logy: - sp_up_ax.semilogy(df["NGPUs"].values, df["Speedup - ideal"].values, - ls='dashed', lw=1.0, c='k', label="ideal") - else: - sp_up_ax.plot(df["NGPUs"].values, df["Speedup - ideal"].values, - ls='dashed', lw=1.0, c='k', label="ideal") + sp_up_ax.plot( + df["NGPUs"].values, df["Speedup"].values, + marker=next(markers), lw=1.0, label=name, alpha=0.7) + + sp_up_ax.plot(df["NGPUs"].values, df["Speedup - ideal"].values, + ls='dashed', lw=1.0, c='k', label="ideal") sp_up_ax.legend(ncol=1) sp_up_ax.set_xticks(df["NGPUs"].values) - # sp_up_ax.set_yticks( - # np.arange(1, np.max(df["Speedup - ideal"].values) + 2, 1)) + sp_up_ax.get_xaxis().set_major_formatter( + matplotlib.ticker.ScalarFormatter()) sp_up_ax.set_ylabel('Speedup') sp_up_ax.set_xlabel('NGPUs (4 per node)') sp_up_ax.grid() + + # Sort legend + handles, labels = sp_up_ax.get_legend_handles_labels() + order = np.argsort(labels) + plt.legend([handles[idx] for idx in order], [labels[idx] for idx in order]) + plot_png = f"scaling_plot_{plot_title}.png" plt.tight_layout() plt.savefig(plot_png, bbox_inches='tight', format='png', dpi=300) @@ -151,6 +151,18 @@ def scalability_report( os.path.basename(csvfile))) shutil.copyfile(plot_png, os.path.join(archive, plot_png)) avg_times.to_csv(os.path.join(archive, "avg_times.csv"), index=False) + print("Archived AVG epoch times CSV") + + # Copy SLURM logs: *.err *.out files + if os.path.exists('logs_slurm'): + print("Archived SLURM logs") + shutil.copytree('logs_slurm', os.path.join(archive, 'logs_slurm')) + # Copy other SLURM logs + for ext in ['*.out', '*.err']: + for file in glob.glob(ext): + shutil.copyfile(file, os.path.join(archive, file)) + + # Create archive archive_name = shutil.make_archive( base_name=archive, # archive file name format='gztar', @@ -170,6 +182,11 @@ def exec_pipeline( help=("Key in the configuration file identifying " "the pipeline object to execute.") )] = "pipeline", + steps: Annotated[Optional[str], typer.Option( + help=("Run only some steps of the pipeline. 
Accepted values are " + "indices, python slices (e.g., 0:3 or 2:10:100), and " + "string names of steps.") + )] = None, print_config: Annotated[bool, typer.Option( help=("Print config to be executed after overrides.") )] = False, @@ -195,11 +212,14 @@ def exec_pipeline( # to find the local python files imported from the pipeline file import os import sys + import re + from .utils import str_to_slice sys.path.append(os.path.dirname(config)) sys.path.append(os.getcwd()) # Parse and execute pipeline from itwinai.parser import ConfigParser + overrides_list = overrides_list if overrides_list is not None else [] overrides = { k: v for k, v in map(lambda x: (x.split('=')[0], x.split('=')[1]), overrides_list) @@ -213,8 +233,18 @@ def exec_pipeline( print("#="*50) print() pipeline = parser.parse_pipeline(pipeline_nested_key=pipe_key) + if steps: + if not re.match(r"\d+(:\d+)?(:\d+)?", steps): + print(f"Looking for step name '{steps}'") + else: + steps = str_to_slice(steps) + pipeline = pipeline[steps] pipeline.execute() + # Cleanup PYTHONPATH + sys.path.pop() + sys.path.pop() + @app.command() def mlflow_ui( diff --git a/src/itwinai/cluster.py b/src/itwinai/cluster.py deleted file mode 100644 index 7b9f57e0..00000000 --- a/src/itwinai/cluster.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Cluster environments where to run AI workflows.""" - -from __future__ import annotations -from abc import ABCMeta, abstractmethod -import os -from contextlib import contextmanager - - -def setup_for_distributed(is_main): - """ - This function disables printing when not in master process - """ - import builtins as __builtin__ - builtin_print = __builtin__.print - - def print(*args, **kwself): - force = kwself.pop('force', False) - if is_main or force: - builtin_print(*args, **kwself) - - __builtin__.print = print - - -def handle_sigusr1(signum, frame): - os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') - exit() - - -def handle_sigterm(signum, frame): - pass - - -class ClusterEnvironment(metaclass=ABCMeta): - port: int = -1 - ngpus_per_node: int = -1 - global_world_size: int = -1 - global_rank: int = -1 - local_world_size: int = -1 - local_rank: int = -1 - rnd_seed: int = None - distributed: bool = False - # This flag tells whether the user wants to use the GPU(s) - use_cuda: bool = False - - @property - def backend(self) -> str: - return self._backend - - @backend.setter - def backend(self, backend_name: str) -> None: - self._set_backend(backend_name) - - def _set_backend(self, backend_name: str) -> None: - # Override to implement sanitization - self._backend = backend_name - - @abstractmethod - def is_main_worker(self) -> bool: - """Tells if the current process is the main/master process.""" - pass - - @abstractmethod - def is_cuda_available(self) -> bool: - pass - - @abstractmethod - @contextmanager - def init_dist_gpu(self, *args, **kwargs): - pass - - def cleanup_resources(self): - pass diff --git a/src/itwinai/components.py b/src/itwinai/components.py index 1f41bacd..eca2e570 100644 --- a/src/itwinai/components.py +++ b/src/itwinai/components.py @@ -216,14 +216,6 @@ def execute( validation dataset, test dataset, trained model. 
""" - @abstractmethod - def save_state(self): - pass - - @abstractmethod - def load_state(self): - pass - class Predictor(BaseComponent): """Applies a pre-trained machine learning model to unseen data.""" diff --git a/src/itwinai/loggers.py b/src/itwinai/loggers.py index d5ed0008..7f86ffcb 100644 --- a/src/itwinai/loggers.py +++ b/src/itwinai/loggers.py @@ -4,13 +4,12 @@ import csv from abc import ABCMeta, abstractmethod from contextlib import contextmanager -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Literal import pickle import pathlib import wandb import mlflow -# import mlflow.keras BASE_EXP_NAME: str = 'unk_experiment' @@ -38,12 +37,12 @@ class Logger(LogMixin, metaclass=ABCMeta): """ savedir: str = None supported_types: List[str] # Supported logging 'kinds' - _log_freq: Union[int, str] + _log_freq: Union[int, Literal['epoch', 'batch']] def __init__( self, savedir: str = 'mllogs', - log_freq: Union[int, str] = 'epoch' + log_freq: Union[int, Literal['epoch', 'batch']] = 'epoch' ) -> None: self.savedir = savedir self.log_freq = log_freq @@ -120,7 +119,7 @@ class ConsoleLogger(Logger): def __init__( self, savedir: str = 'mllogs', - log_freq: Union[int, str] = 'epoch' + log_freq: Union[int, Literal['epoch', 'batch']] = 'epoch' ) -> None: savedir = os.path.join(savedir, 'simple-logger') super().__init__(savedir=savedir, log_freq=log_freq) @@ -190,7 +189,7 @@ def __init__( experiment_name: str = BASE_EXP_NAME, tracking_uri: Optional[str] = None, run_description: Optional[str] = None, - log_freq: Union[int, str] = 'epoch' + log_freq: Union[int, Literal['epoch', 'batch']] = 'epoch' ): savedir = os.path.join(savedir, 'mlflow') super().__init__(savedir=savedir, log_freq=log_freq) @@ -203,7 +202,7 @@ def __init__( saved_abs_path = os.path.abspath(self.savedir) self.tracking_uri = pathlib.Path(saved_abs_path).as_uri() # self.tracking_uri = "file://" + self.savedir - print(f'MLFLOW URI: {self.tracking_uri}') + # print(f'MLFLOW URI: {self.tracking_uri}') # TODO: for pytorch lightning: # mlflow.pytorch.autolog() @@ -317,7 +316,7 @@ def __init__( self, savedir: str = 'mllogs', project_name: str = BASE_EXP_NAME, - log_freq: Union[int, str] = 'epoch' + log_freq: Union[int, Literal['epoch', 'batch']] = 'epoch' ) -> None: savedir = os.path.join(savedir, 'wandb') super().__init__(savedir=savedir, log_freq=log_freq) @@ -376,7 +375,7 @@ class TensorBoardLogger(Logger): def __init__( self, savedir: str = 'mllogs', - log_freq: Union[int, str] = 'epoch' + log_freq: Union[int, Literal['epoch', 'batch']] = 'epoch' ) -> None: savedir = os.path.join(savedir, 'tensorboard') super().__init__(savedir=savedir, log_freq=log_freq) @@ -425,7 +424,7 @@ def __init__( self, loggers: List[Logger] ) -> None: - super().__init__(savedir='/.tmp_mllogs_LoggersCollection', log_freq=0) + super().__init__(savedir='/.tmp_mllogs_LoggersCollection', log_freq=1) self.loggers = loggers def should_log(self, batch_idx: int = None) -> bool: @@ -450,6 +449,18 @@ def log( **kwargs ) + def create_logger_context(self): + for logger in self.loggers: + logger.create_logger_context() + + def destroy_logger_context(self): + for logger in self.loggers: + logger.destroy_logger_context() + + def save_hyperparameters(self, params: Dict[str, Any]) -> None: + for logger in self.loggers: + logger.save_hyperparameters(params=params) + class EpochTimeTracker: def __init__(self, series_name: str, csv_file: str) -> None: diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py index 
0001627b..254e91a9 100644 --- a/src/itwinai/parser.py +++ b/src/itwinai/parser.py @@ -76,14 +76,11 @@ class ConfigParser: >>> init_args: >>> save_path: .tmp/ >>> - >>> - class_path: itwinai.torch.trainer.TorchTrainerMG + >>> - class_path: itwinai.torch.trainer.TorchTrainer >>> init_args: >>> model: >>> class_path: model.Net - >>> loss: - >>> class_path: torch.nn.NLLLoss - >>> init_args: - >>> reduction: mean + >>> >>> from itwinai.parser import ConfigParser >>> >>> parser = ConfigParser( @@ -244,241 +241,3 @@ def __init__( "-c", "--config", action=ActionConfigFile, help="Path to a configuration file in json or yaml format." ) - - -# class ConfigParser2: -# """ -# Deprecated: this pipeline structure does not allow for -# nested pipelines. However, it is more readable and the linking -# from name to step data could be achieved with OmegaConf. This -# could be reused in the future: left as example. - -# Parses a configuration file, merging the steps into -# the pipeline and returning a pipeline object. -# It also provides functionalities for dynamic override -# of fields by means of nested key notation. - -# Example: - -# >>> # pipeline.yaml -# >>> pipeline: -# >>> class_path: itwinai.pipeline.Pipeline -# >>> steps: [server, client] -# >>> -# >>> server: -# >>> class_path: mycode.ServerOptions -# >>> init_args: -# >>> host: localhost -# >>> port: 80 -# >>> -# >>> client: -# >>> class_path: mycode.ClientOptions -# >>> init_args: -# >>> url: http://${server.init_args.host}:${server.init_args.port}/ - -# >>> from itwinai.parser import ConfigParser2 -# >>> -# >>> parser = ConfigParser2( -# >>> config='pipeline.yaml', -# >>> override_keys={ -# >>> 'server.init_args.port': 777 -# >>> } -# >>> ) -# >>> pipeline = parser.parse_pipeline() -# >>> print(pipeline) -# >>> print(pipeline.steps) -# >>> print(pipeline.steps['server'].port) -# >>> -# >>> server = parser.parse_step('server') -# >>> print(server) -# >>> print(server.port) -# """ - -# config: Dict -# pipeline: Pipeline - -# def __init__( -# self, -# config: Union[str, Dict], -# override_keys: Optional[Dict[str, Any]] = None -# ) -> None: -# self.config = config -# self.override_keys = override_keys -# if isinstance(self.config, str): -# self.config = load_yaml(self.config) -# self._dynamic_override_keys() -# self._omegaconf_interpolate() - -# def _dynamic_override_keys(self): -# if self.override_keys is not None: -# for key_chain, value in self.override_keys.items(): -# add_replace_field(self.config, key_chain, value) - -# def _omegaconf_interpolate(self) -> None: -# """Performs variable interpolation with OmegaConf on internal -# configuration file. -# """ -# conf = OmegaConf.create(self.config) -# self.config = OmegaConf.to_container(conf, resolve=True) - -# def parse_pipeline( -# self, -# pipeline_nested_key: str = "pipeline", -# verbose: bool = False -# ) -> Pipeline: -# """Merges steps into pipeline and parses it. - -# Args: -# pipeline_nested_key (str, optional): nested key in the -# configuration file identifying the pipeline object. -# Defaults to "pipeline". -# verbose (bool): if True, prints the assembled pipeline -# to console formatted as JSON. - -# Returns: -# Pipeline: instantiated pipeline. 
-# """ -# pipe_parser = JAPArgumentParser() -# pipe_parser.add_subclass_arguments(Pipeline, pipeline_nested_key) -# pipe_dict = self.config[pipeline_nested_key] - -# # Pop steps list from pipeline dictionary -# steps_list = pipe_dict['steps'] -# del pipe_dict['steps'] - -# # Link steps with respective dictionaries -# if not pipe_dict.get('init_args'): -# pipe_dict['init_args'] = {} -# steps_dict = pipe_dict['init_args']['steps'] = {} -# for step_name in steps_list: -# steps_dict[step_name] = self.config[step_name] -# pipe_dict = {pipeline_nested_key: pipe_dict} - -# if verbose: -# print("Assembled pipeline:") -# print(json.dumps(pipe_dict, indent=4)) - -# # Parse pipeline dict once merged with steps -# conf = pipe_parser.parse_object(pipe_dict) -# pipe = pipe_parser.instantiate_classes(conf) -# self.pipeline = pipe[pipeline_nested_key] -# return self.pipeline - -# def parse_step( -# self, -# step_name: str, -# verbose: bool = False -# ) -> BaseComponent: -# step_dict_config = self.config[step_name] - -# if verbose: -# print(f"STEP '{step_name}' CONFIG:") -# print(json.dumps(step_dict_config, indent=4)) - -# # Wrap config under "step" field and parse it -# step_dict_config = {'step': step_dict_config} -# step_parser = JAPArgumentParser() -# step_parser.add_subclass_arguments(BaseComponent, "step") -# parsed_namespace = step_parser.parse_object(step_dict_config) -# return step_parser.instantiate_classes(parsed_namespace)["step"] - - -# class ItwinaiCLI2: -# """ -# Deprecated: the dynamic override does not work with nested parameters -# and may be confusing. - -# CLI tool for executing a configuration file, with dynamic -# override of fields and variable interpolation with Omegaconf. - -# Example: - -# >>> # train.py -# >>> from itwinai.parser import ItwinaiCLI -# >>> cli = ItwinaiCLI() -# >>> cli.pipeline.execute() - -# >>> # pipeline.yaml -# >>> pipeline: -# >>> class_path: itwinai.pipeline.Pipeline -# >>> steps: [server, client] -# >>> -# >>> server: -# >>> class_path: mycode.ServerOptions -# >>> init_args: -# >>> host: localhost -# >>> port: 80 -# >>> -# >>> client: -# >>> class_path: mycode.ClientOptions -# >>> init_args: -# >>> url: http://${server.init_args.host}:${server.init_args.port}/ - -# From command line: - -# >>> python train.py --config itwinai-conf.yaml --help -# >>> python train.py --config itwinai-conf.yaml -# >>> python train.py --config itwinai-conf.yaml --server.port 8080 -# """ -# _parser: JAPArgumentParser -# _config: Dict -# pipeline: Pipeline - -# def __init__( -# self, -# pipeline_nested_key: str = "pipeline", -# parser_mode: str = "omegaconf" -# ) -> None: -# self.pipeline_nested_key = pipeline_nested_key -# self.parser_mode = parser_mode -# self._init_parser() -# self._parser.add_argument(f"--{self.pipeline_nested_key}", type=dict) -# self._add_steps_arguments() -# self._config = self._parser.parse_args() - -# # Merge steps into pipeline and parse it -# del self._config['config'] -# pipe_parser = ConfigParser2(config=self._config.as_dict()) -# self.pipeline = pipe_parser.parse_pipeline( -# pipeline_nested_key=self.pipeline_nested_key -# ) - -# def _init_parser(self): -# self._parser = JAPArgumentParser(parser_mode=self.parser_mode) -# self._parser.add_argument( -# "-c", "--config", action=ActionConfigFile, -# required=True, -# help="Path to a configuration file in json or yaml format." -# ) - -# def _add_steps_arguments(self): -# """Pre-parses the configuration file, dynamically adding all the -# component classes under 'steps' as arguments of the parser. 
-# """ -# if "--config" not in sys.argv: -# raise ValueError( -# "--config parameter has to be specified with a " -# "valid path to a configuration file." -# ) -# config_path = sys.argv.index("--config") + 1 -# config_path = sys.argv[config_path] -# config = load_yaml(config_path) - -# # Add steps to parser -# steps = filter( -# lambda itm: itm[0] != self.pipeline_nested_key, -# config.items() -# ) -# steps = { -# step_name: step_data['class_path'] -# for step_name, step_data in steps -# } - -# for st_nested_key, step_class_str in steps.items(): -# step_class = dynamically_import_class(step_class_str) -# self._add_step_arguments( -# step_class=step_class, nested_key=st_nested_key) - -# def _add_step_arguments(self, step_class, nested_key): -# self._parser.add_subclass_arguments( -# baseclass=step_class, nested_key=nested_key) diff --git a/src/itwinai/tensorflow/distributed.py b/src/itwinai/tensorflow/distributed.py index e6c5f28a..64945ca8 100644 --- a/src/itwinai/tensorflow/distributed.py +++ b/src/itwinai/tensorflow/distributed.py @@ -1,17 +1,23 @@ -import tensorflow as tf import os +import tensorflow as tf +import tensorflow.distribute as dist def get_strategy(): """Strategy for distributed TensorFlow training""" - cluster_resolver = tf.distribute.cluster_resolver.SlurmClusterResolver( + if not os.environ.get('SLURM_JOB_ID'): + # TODO: improve + print('not in SLURM env!') + tf_dist_strategy = dist.MirroredStrategy() + return tf_dist_strategy, tf_dist_strategy.num_replicas_in_sync + cluster_resolver = dist.cluster_resolver.SlurmClusterResolver( port_base=12345) - implementation = tf.distribute.experimental.CommunicationImplementation.NCCL - communication_options = tf.distribute.experimental.CommunicationOptions( + implementation = dist.experimental.CommunicationImplementation.NCCL + communication_options = dist.experimental.CommunicationOptions( implementation=implementation) # declare distribution strategy - tf_dist_strategy = tf.distribute.MultiWorkerMirroredStrategy( + tf_dist_strategy = dist.MultiWorkerMirroredStrategy( cluster_resolver=cluster_resolver, communication_options=communication_options ) diff --git a/src/itwinai/tensorflow/trainer.py b/src/itwinai/tensorflow/trainer.py index d8c40012..51bfb97c 100644 --- a/src/itwinai/tensorflow/trainer.py +++ b/src/itwinai/tensorflow/trainer.py @@ -28,12 +28,19 @@ def instance_from_dict(obj_dict: Any) -> Any: return obj_dict +# TODO: the TF trainer is incomplete: +# - strategy is not received from constructor argument: if not needed, +# remove it +# - dataset is not distributed +# - much commented code that has to be removed or included + + class TensorflowTrainer(Trainer): def __init__( self, epochs, - train_dataset, - validation_dataset, + # train_dataset, + # validation_dataset, batch_size, callbacks, model_dict: Dict, @@ -61,14 +68,14 @@ def __init__( # get total number of workers print("Number of devices: {}".format(n_devices)) # distribute datasets among MirroredStrategy's replicas - dist_train_dataset = ( - tf_dist_strategy.experimental_distribute_dataset( - train_dataset - )) - dist_validation_dataset = ( - tf_dist_strategy.experimental_distribute_dataset( - validation_dataset - )) + # dist_train_dataset = ( + # tf_dist_strategy.experimental_distribute_dataset( + # train_dataset + # )) + # dist_validation_dataset = ( + # tf_dist_strategy.experimental_distribute_dataset( + # validation_dataset + # )) with self.strategy.scope(): # TODO: move loss, optimizer and metrics instantiation under # here diff --git 
a/src/itwinai/torch/cluster.py b/src/itwinai/torch/cluster.py deleted file mode 100644 index aece16e2..00000000 --- a/src/itwinai/torch/cluster.py +++ /dev/null @@ -1,225 +0,0 @@ -"""Cluster environments where to run AI workflows. Partially adapted from: -https://github.com/facebookresearch/detr/blob/master/util/misc.py and -https://github.com/ramyamounir/Template/blob/main/lib/utils/distributed.py -""" - -from __future__ import annotations -from typing import Optional -import os -import signal -import subprocess -from pathlib import Path -from contextlib import contextmanager - -import numpy as np - -import torch -import torch.distributed as dist -import torch.backends.cudnn as cudnn - -from ..cluster import ( - ClusterEnvironment, - setup_for_distributed, - handle_sigusr1, - handle_sigterm -) -from .types import TorchDistributedBackend as BackendT - - -def fix_random_seeds(seed=31): - """ - Fix random seeds. - """ - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - np.random.seed(seed) - - -class TorchCluster(ClusterEnvironment): - def __init__(self) -> None: - super().__init__() - - def _set_backend(self, backend_name: str) -> None: - if backend_name not in BackendT: - raise ValueError( - "Unrecognized 'backend' field. Allowed values " - f"are: {BackendT.list()}. Received '{backend_name}'") - self._backend = backend_name - - def is_cuda_available(self) -> bool: - return self.use_cuda and torch.cuda.is_available() - - def is_main_worker(self) -> bool: - """Checks if the current process is the main/master process - in the whole job. - """ - return self.global_rank == 0 - - def cleanup_resources(self): - dist.barrier() - dist.destroy_process_group() - - -class LocalCluster(TorchCluster): - """Simple single node cluster with optional access to multiple GPUs.""" - - def __init__( - self, - backend: Optional[str] = None, - gpus: Optional[str] = '', - port: int = 49153, - rnd_seed: Optional[int] = 42 - ) -> None: - """Initialize local cluster for multi-GPU access. - - Args: - backend (Optional[str], optional): supported PyTorch backends. - If None, workload is not distributed. Defaults to None. - gpus (Optional[str], optional): list of visible GPU devices - (e.g., '1,2,3'). If empty string uses all available GPUs. - If None, CPU is used. Defaults to ''. - port (int, optional): TCP port used by the master process. - Defaults to 49153. - rnd_seed (Optional[int], optional): random seed to be setup after - all processes are setup. Defaults to 42. - """ - super().__init__() - self.backend = backend - self.gpus = gpus - self.port = port - self.dist_url = f'tcp://127.0.0.1:{self.port}' - self.rnd_seed = rnd_seed - - if self.gpus != '' and self.gpus is not None: - # Restrict the number of GPUs visible according to user needs - os.environ['CUDA_VISIBLE_DEVICES'] = self.gpus - - self.ngpus_per_node = torch.cuda.device_count() - self.global_rank = 0 - self.global_world_size = self.ngpus_per_node - - print(f"{self.ngpus_per_node} GPUs are available.") - self.distributed = True - # This flag tells whether the user wants to use the GPU(s) - self.use_cuda = ( - self.gpus is not None # GPU is not manually disabled - and torch.cuda.device_count() >= 1 # At least one GPU is selected - ) - if self.backend is None or self.ngpus_per_node <= 1: - print("Distributed has been disabled.") - self.distributed = False - self.dist_url = None - self.global_world_size = 1 - self.global_rank = 0 - if not self.is_cuda_available(): - print("CUDA disabled... 
Running on single CPU.") - self.use_cuda = False - self.distributed = False - self.dist_url = None - self.global_world_size = 1 - self.global_rank = 0 - - # Since single node case - self.local_world_size = self.global_world_size - - @contextmanager - def init_dist_gpu(self, worker_id) -> torch.device: - if self.distributed: - torch.cuda.set_device(worker_id) - self.global_rank += worker_id - # print(f'GLOBAL RANK: {self.global_rank}') - # Since single node case - self.local_rank = self.global_rank - # Simplification: worker ID mapped to GPU ID - self.gpu_id = worker_id - - try: - dist.init_process_group( - backend=self.backend, - init_method=self.dist_url, - world_size=self.global_world_size, - rank=self.global_rank - ) - fix_random_seeds(self.rnd_seed) - torch.cuda.set_device(self.gpu_id) - cudnn.benchmark = True - dist.barrier() - - setup_for_distributed(self.is_main_worker()) - print("SETUP DISTRIBUTED COMPLETE") - yield torch.device('cuda', worker_id) - finally: - self.cleanup_resources() - else: - # Distributed is disabled - # Since single node case - self.global_rank = 0 - self.local_rank = self.global_rank - if self.use_cuda: - torch.cuda.set_device(worker_id) - yield torch.device('cuda', worker_id) - else: - yield torch.device('cpu') - - -class SLURMCluster(TorchCluster): - """SLURM cluster with access to multi-node multi-GPU.""" - - def __init__( - self, - port: int = 49153, - backend: str = 'gloo', - rnd_seed: Optional[int] = 42 - ) -> None: - super().__init__() - self.port = port - self.backend = backend - self.rnd_seed = rnd_seed - if 'SLURM_JOB_ID' not in os.environ: - raise RuntimeError( - "'SLURM_JOB_ID' environment variable is not set. " - "Perhaps you are not running in a slurm cluster?" - ) - - self.ngpus_per_node = torch.cuda.device_count() - - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - - # find a common host name on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - self.dist_url = f'tcp://{host_name}:{self.port}' - - # distributed parameters - self.global_rank = int(os.getenv('SLURM_NODEID')) * self.ngpus_per_node - self.global_world_size = int( - os.getenv('SLURM_NNODES')) * self.ngpus_per_node - - @contextmanager - def init_dist_gpu(self): - import submitit - try: - job_env = submitit.JobEnvironment() - self.output_dir = Path( - str(self.output_dir).replace("%j", str(job_env.job_id))) - self.gpu = job_env.local_rank - self.global_rank = job_env.global_rank - - dist.init_process_group( - backend=self.backend, - init_method=self.dist_url, - world_size=self.global_world_size, - rank=self.global_rank - ) - fix_random_seeds(self.rnd_seed) - torch.cuda.set_device(self.gpu) - cudnn.benchmark = True - dist.barrier() - - setup_for_distributed(self.is_main_worker()) - yield - finally: - self.cleanup_resources() diff --git a/src/itwinai/torch/distributed.py b/src/itwinai/torch/distributed.py index 34174346..3bb48647 100644 --- a/src/itwinai/torch/distributed.py +++ b/src/itwinai/torch/distributed.py @@ -1,5 +1,5 @@ import abc -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional, Tuple, Union, Iterable from pathlib import Path import json import os @@ -12,18 +12,47 @@ import torch.optim as optim from torch.optim.lr_scheduler import _LRScheduler as LRScheduler from torch.optim.optimizer import Optimizer +from torch.utils.data import 
Dataset, Sampler, DistributedSampler, DataLoader +from torch.utils.data.dataloader import T_co, _worker_init_fn_t, _collate_fn_t from ..distributed import DistributedStrategy +from .types import UninitializedStrategyError, DistributedStrategyError + + +def distributed_resources_available() -> bool: + """Check if the current execution environment + has (enough) GPUs available to allow for distributed ML. + + Returns: + bool: env can support distributed ML. + """ + if torch.cuda.is_available() and torch.cuda.device_count() > 1: + return True + return False class TorchDistributedStrategy(DistributedStrategy): """Abstract class to define the distributed backend methods for PyTorch models. """ + is_distributed: bool = True + is_initialized: bool = False + + @property + def is_main_worker(self) -> bool: + """Checks if local worker has global rank equal to zero. + + Returns: + bool: True if main worker. + """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + return self.global_rank() == 0 + @abc.abstractmethod def init(self) -> None: """Initializes the chosen distributed backend""" - # @abc.abstractmethod # def distributed_engine( # self, model: nn.Module, optimizer: Optimizer, @@ -39,7 +68,7 @@ def distributed( """Setup model, optimizer and scheduler for distributed.""" @abc.abstractmethod - def dist_gwsize(self) -> int: + def global_world_size(self) -> int: """Returns the total number of processes (global world size). Returns: @@ -47,7 +76,7 @@ def dist_gwsize(self) -> int: """ @abc.abstractmethod - def dist_lwsize(self) -> int: + def local_world_size(self) -> int: """Returns the number of local workers available on a node (local world size). Usually it is equal to the number of available GPUs. @@ -57,7 +86,7 @@ def dist_lwsize(self) -> int: """ @abc.abstractmethod - def dist_grank(self) -> int: + def global_rank(self) -> int: """Returns the global rank of the current process. Rank ranges from 0 to world_size. @@ -66,28 +95,182 @@ def dist_grank(self) -> int: """ @abc.abstractmethod - def dist_lrank(self) -> int: + def local_rank(self) -> int: """Returns the local rank of the current process. Returns: int: local rank. """ - def is_main_worker(self) -> bool: - """Checks if local worker has global rank equal to zero. - - Returns: - bool: True if main worker. - """ - return self.dist_grank() == 0 - - def dist_device(self) -> str: + def device(self) -> str: """Device used by local worker. Returns: str: torch device in the form 'cuda:N'. """ - return f"cuda:{self.dist_lrank()}" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + return f"cuda:{self.local_rank()}" + + def create_dataloader( + self, dataset: Dataset[T_co], batch_size: Optional[int] = 1, + shuffle: Optional[bool] = None, + sampler: Union[Sampler, Iterable, None] = None, + batch_sampler: Union[Sampler[List], Iterable[List], None] = None, + num_workers: int = 0, collate_fn: Optional[_collate_fn_t] = None, + pin_memory: bool = False, drop_last: bool = False, + timeout: float = 0, + worker_init_fn: Optional[_worker_init_fn_t] = None, + multiprocessing_context=None, generator=None, + *, prefetch_factor: Optional[int] = None, + persistent_workers: bool = False, + pin_memory_device: str = "" + ): + """Create a distributed DataLoader by using ``DistributedSampler`` as + random sampler. + + Args: + dataset (Dataset): dataset from which to load the data. 
+ batch_size (int, optional): how many samples per batch to load + (default: ``1``). + shuffle (bool, optional): set to ``True`` to have the data + reshuffled at every epoch (default: ``False``). + sampler (Sampler or Iterable, optional): defines the strategy to + draw + samples from the dataset. Can be any ``Iterable`` with + ``__len__`` + implemented. If specified, :attr:`shuffle` must not be + specified. + batch_sampler (Sampler or Iterable, optional): like + :attr:`sampler`, but + returns a batch of indices at a time. Mutually exclusive with + :attr:`batch_size`, :attr:`shuffle`, :attr:`sampler`, + and :attr:`drop_last`. + num_workers (int, optional): how many subprocesses to use for data + loading. ``0`` means that the data will be loaded in the main + process. (default: ``0``) + collate_fn (Callable, optional): merges a list of samples to form a + mini-batch of Tensor(s). Used when using batched loading from + a map-style dataset. + pin_memory (bool, optional): If ``True``, the data loader will + copy Tensors + into device/CUDA pinned memory before returning them. If your + data elements + are a custom type, or your :attr:`collate_fn` returns a batch + that is a custom type, + see the example below. + drop_last (bool, optional): set to ``True`` to drop the last + incomplete batch, + if the dataset size is not divisible by the batch size. + If ``False`` and + the size of dataset is not divisible by the batch size, then + the last batch + will be smaller. (default: ``False``) + timeout (numeric, optional): if positive, the timeout value for + collecting a batch + from workers. Should always be non-negative. (default: ``0``) + worker_init_fn (Callable, optional): If not ``None``, + this will be called on each + worker subprocess with the worker id (an int in + ``[0, num_workers - 1]``) as + input, after seeding and before data loading. + (default: ``None``) + multiprocessing_context (str or + multiprocessing.context.BaseContext, optional): If + ``None``, the default `multiprocessing context`_ of + your operating system will + be used. (default: ``None``) + generator (torch.Generator, optional): If not ``None``, + this RNG will be used + by RandomSampler to generate random indexes and + multiprocessing to generate + ``base_seed`` for workers. (default: ``None``) + prefetch_factor (int, optional, keyword-only arg): Number of + batches loaded + in advance by each worker. ``2`` means there will be a total of + 2 * num_workers batches prefetched across all workers. + (default value depends + on the set value for num_workers. If value of num_workers=0 + default is ``None``. + Otherwise, if value of ``num_workers > 0`` default is ``2``). + persistent_workers (bool, optional): If ``True``, the data loader + will not shut down + the worker processes after a dataset has been consumed once. + This allows to + maintain the workers `Dataset` instances alive. + (default: ``False``) + pin_memory_device (str, optional): the device to + :attr:`pin_memory` to if ``pin_memory`` is ``True``. + + + .. warning:: If the ``spawn`` start method is used, + :attr:`worker_init_fn` + cannot be an unpicklable object, e.g., a lambda function. + See :ref:`multiprocessing-best-practices` on more + details related to multiprocessing in PyTorch. + + .. warning:: ``len(dataloader)`` heuristic is based on the length of + the sampler used. 
+ When :attr:`dataset` is an + :class:`~torch.utils.data.IterableDataset`, + it instead returns an estimate based on + ``len(dataset) / batch_size``, with proper + rounding depending on :attr:`drop_last`, regardless + of multi-process loading + configurations. This represents the best guess PyTorch + can make because PyTorch + trusts user :attr:`dataset` code in correctly handling + multi-process + loading to avoid duplicate data. + + However, if sharding results in multiple workers having + incomplete last batches, + this estimate can still be inaccurate, because (1) an + otherwise complete batch can + be broken into multiple ones and (2) more than one batch + worth of samples can be + dropped when :attr:`drop_last` is set. Unfortunately, + PyTorch can not detect such cases in general. + + See `Dataset Types`_ for more details on these two + types of datasets and how + :class:`~torch.utils.data.IterableDataset` interacts with + `Multi-process data loading`_. + + .. warning:: See :ref:`reproducibility`, and + :ref:`dataloader-workers-random-seed`, and + :ref:`data-loading-randomness` notes for random + seed related questions. + + .. _multiprocessing context: + https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods + """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + + if self.is_distributed: + if sampler is not None: + raise RuntimeError( + "User-provided sampler is not supported." + ) + sampler = DistributedSampler( + dataset, num_replicas=self.global_world_size(), + rank=self.global_rank(), + shuffle=shuffle + ) + # shuffle and batch_sampler must be unset + return DataLoader( + dataset=dataset, batch_size=batch_size, sampler=sampler, + num_workers=num_workers, collate_fn=collate_fn, + pin_memory=pin_memory, drop_last=drop_last, timeout=timeout, + worker_init_fn=worker_init_fn, + multiprocessing_context=multiprocessing_context, + generator=generator, prefetch_factor=prefetch_factor, + persistent_workers=persistent_workers, + pin_memory_device=pin_memory_device + ) @abc.abstractmethod def clean_up(self) -> None: @@ -105,8 +288,8 @@ def par_allgather_obj(self, obj: Any) -> List[Any]: """ -class DDPDistributedStrategy(TorchDistributedStrategy): - """PyTorch DDP distributed strategy class. +class TorchDDPStrategy(TorchDistributedStrategy): + """PyTorch ``DistributedDataParallel`` distributed strategy class. Args: backend (str): Name of the communication backend to employ. @@ -121,12 +304,21 @@ def __init__(self, backend: str) -> None: def init(self) -> None: """Initializes the distributed process group and the distributed package. + + Raises: + RuntimeError: when there are not (enough) GPUs available. + DistributedStrategyError: when trying to initialize a strategy + already initialized. """ - if torch.cuda.is_available() and torch.cuda.device_count() > 1: - dist.init_process_group(backend=self.backend) - else: - print("WARNING: trying to run distributed on insufficient" - " resources. 
Skipping distributed process group setup.") + if not distributed_resources_available(): + raise RuntimeError( + "Trying to run distributed on insufficient resources.") + if self.is_initialized: + raise DistributedStrategyError("Strategy was already initialized") + dist.init_process_group(backend=self.backend) + self.is_initialized = True + + torch.cuda.device(self.local_rank()) # def distributed_engine( # self, model: nn.Module, optimizer: Optimizer, @@ -158,55 +350,73 @@ def distributed( **kwargs ) -> Tuple[nn.Module, Optimizer, Optional[LRScheduler]]: """Setup model, optimizer and scheduler for distributed.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") if torch.cuda.is_available(): # device = self.dist_lrank() - model = model.to(self.dist_device()) + model = model.to(self.device()) dist_model = torch.nn.parallel.DistributedDataParallel( model, - device_ids=[self.dist_device()], - output_device=self.dist_device() + device_ids=[self.device()], + output_device=self.device() ) else: dist_model = model return dist_model, optimizer, lr_scheduler - def dist_gwsize(self) -> int: + def global_world_size(self) -> int: """Returns the total number of processes (global world size). Returns: int: global world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_world_size() - def dist_lwsize(self) -> int: + def local_world_size(self) -> int: """Returns the local number of workers available per node, which is usually the number of GPUs available. Returns: int: local world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return torch.cuda.device_count() - def dist_grank(self) -> int: + def global_rank(self) -> int: """Returns the global rank of the current process, where rank ranges from 0 to world_size. Returns: int: global rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_rank() - def dist_lrank(self) -> int: + def local_rank(self) -> int: """Returns the local rank of the current process. Returns: int: local rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_rank() % torch.cuda.device_count() def clean_up(self) -> None: """Destroys the current process group.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") if torch.cuda.is_available(): dist.barrier() dist.destroy_process_group() @@ -221,12 +431,15 @@ def par_allgather_obj(self, obj: Any) -> List[Any]: Returns: List[Any]: List of gathered objects. """ - res = [None] * self.dist_gwsize() + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + res = [None] * self.global_world_size() dist.all_gather_object(res, obj) return res -class DSDistributedStrategy(TorchDistributedStrategy): +class DeepSpeedStrategy(TorchDistributedStrategy): """DeepSpeed distributed strategy class. Args: @@ -256,7 +469,19 @@ def _load_config(self, ds_config) -> None: def init(self) -> None: """Initializes the distributed process group and the distributed package. + + Raises: + RuntimeError: when there are not (enough) GPUs available. 
+ DistributedStrategyError: when trying to initialize a strategy + already initialized. """ + if not distributed_resources_available(): + raise RuntimeError( + "Trying to run distributed on insufficient resources.") + + if self.is_initialized: + raise DistributedStrategyError("Strategy was already initialized") + # https://github.com/Lightning-AI/pytorch-lightning/issues/13567 ompi_lrank = os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] = os.environ.get( @@ -264,6 +489,9 @@ def init(self) -> None: # https://deepspeed.readthedocs.io/en/latest/initialize.html#training-initialization deepspeed.init_distributed(dist_backend=self.backend) + self.is_initialized = True + + torch.cuda.device(self.local_rank()) def distributed( self, model: nn.Module, optimizer: Optional[Optimizer] = None, @@ -272,6 +500,10 @@ def distributed( **init_kwargs ) -> Tuple[nn.Module, Optimizer, Optional[LRScheduler]]: """Setup model, optimizer and scheduler for distributed.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + if init_kwargs.get("config"): self._load_config(init_kwargs.get("config")) # https://deepspeed.readthedocs.io/en/latest/initialize.html#training-initialization @@ -286,42 +518,57 @@ def distributed( ) return distrib_model, optimizer, lr_scheduler - def dist_gwsize(self) -> int: + def global_world_size(self) -> int: """Returns the total number of processes (global world size). Returns: int: global world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_world_size() - def dist_lwsize(self) -> int: + def local_world_size(self) -> int: """Returns the local number of workers available per node, which is usually the number of GPUs available. Returns: int: local world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return torch.cuda.device_count() - def dist_grank(self) -> int: + def global_rank(self) -> int: """Returns the global rank of the current process, where rank ranges from 0 to world_size. Returns: int: global rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_rank() - def dist_lrank(self) -> int: + def local_rank(self) -> int: """Returns the local rank of the current process. Returns: int: local rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_rank() % torch.cuda.device_count() def clean_up(self) -> None: """Destroys the current process group.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") deepspeed.sys.exit() def par_allgather_obj(self, obj: Any) -> list[Any]: @@ -334,18 +581,34 @@ def par_allgather_obj(self, obj: Any) -> list[Any]: Returns: List[Any]: List of gathered objects. """ - res = [None] * self.dist_gwsize() + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. 
Use the init method.") + res = [None] * self.global_world_size() dist.all_gather_object(res, obj) return res -class HVDDistributedStrategy(TorchDistributedStrategy): +class HorovodStrategy(TorchDistributedStrategy): """Horovod distributed strategy class.""" def init(self) -> None: - """Initializes the Horovod distributed backend.""" + """Initializes the Horovod distributed backend. + + Raises: + RuntimeError: when there are not (enough) GPUs available. + DistributedStrategyError: when trying to initialize a strategy + already initialized. + """ + if not distributed_resources_available(): + raise RuntimeError( + "Trying to run distributed on insufficient resources.") + if self.is_initialized: + raise DistributedStrategyError("Strategy was already initialized") hvd.init() - torch.cuda.set_device(hvd.local_rank()) + self.is_initialized = True + + torch.cuda.device(self.local_rank()) def distributed( self, model: nn.Module, optimizer: Optional[Optimizer] = None, @@ -353,8 +616,11 @@ def distributed( **optim_kwargs ) -> Tuple[nn.Module, Optimizer, Optional[LRScheduler]]: """Setup model, optimizer and scheduler for distributed.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") - model.to(self.dist_device()) + model.to(self.device()) # Scale learning rate # https://github.com/horovod/horovod/issues/1653#issuecomment-574764452 @@ -389,42 +655,57 @@ def _broadcast_params( hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=-0) - def dist_gwsize(self) -> int: + def global_world_size(self) -> int: """Returns the total number of processes (global world size). Returns: int: global world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return hvd.size() - def dist_lwsize(self) -> int: + def local_world_size(self) -> int: """Returns the local number of workers available per node, which is usually the number of GPUs available. Returns: int: local world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return hvd.local_size() - def dist_grank(self) -> int: + def global_rank(self) -> int: """Returns the global rank of the current process, where rank ranges from 0 to world_size. Returns: int: global rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return hvd.rank() - def dist_lrank(self) -> int: + def local_rank(self) -> int: """Returns the local rank of the current process. Returns: int: local rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return hvd.local_rank() def clean_up(self) -> None: """Shuts Horovod down.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") hvd.shutdown() def par_allgather_obj(self, obj: Any) -> list[Any]: @@ -437,484 +718,99 @@ def par_allgather_obj(self, obj: Any) -> list[Any]: Returns: list: gathered list with size(#worker). """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. 
Use the init method.") return hvd.allgather_object(obj) -# class TorchDistributedStrategy_old(DistributedStrategy): -# """Abstract class to define the distributed backend methods for -# PyTorch models. -# """ -# @abc.abstractmethod -# def init_backend(self) -> None: -# """Initializes the chosen distributed backend""" +class NonDistributedStrategy(TorchDistributedStrategy): + """Dummy class for non-distributed environments.""" -# @abc.abstractmethod -# def distribute_model(self, model: Any) -> Any: -# """Distributes a machine learning model. + is_distributed: bool = False -# Args: -# model (Any): a generic ML model to be distributed. + def init(self) -> None: + """If CUDA is available set CUDA device, and do nothing more. -# Returns: -# Any: distributed model instance. -# """ + Raises: + DistributedStrategyError: when trying to initialize a strategy + already initialized. + """ + if self.is_initialized: + raise DistributedStrategyError("Strategy was already initialized") + if torch.cuda.is_available(): + torch.cuda.device(self.local_rank()) + self.is_initialized = True -# @abc.abstractmethod -# def broadcast_params(self, model: Any, optimizer: Any) -> None: -# """Broadcasts variables from root rank to all other processes/ + def device(self) -> str: + """Device used by local worker. -# Args: -# model (Any): distributed model. -# optimizer (Any): optimizer. -# """ + Returns: + str: cpu device if CUDA is not available. + """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + if torch.cuda.is_available(): + return super().device() + return "cpu" + + def distributed( + self, model: nn.Module, optimizer: Optional[Optimizer] = None, + lr_scheduler: Optional[LRScheduler] = None, + **kwargs + ) -> Tuple[nn.Module, Optimizer, Optional[LRScheduler]]: + """Do nothing and return model, optimizer and scheduler.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + if torch.cuda.is_available(): + model = model.cuda() + return model, optimizer, lr_scheduler + + def global_world_size(self) -> int: + """Returns the total number of processes (global world size). + + Returns: + int: global world size. + """ + return 1 + + def local_world_size(self) -> int: + """Returns the local number of workers available per node, + which is usually the number of GPUs available. + + Returns: + int: local world size. + """ + return 1 + + def global_rank(self) -> int: + """Returns the global rank of the current process, where + rank ranges from 0 to world_size. + + Returns: + int: global rank. + """ + return 0 + + def local_rank(self) -> int: + """Returns the local rank of the current process. -# @abc.abstractmethod -# def distribute_optimizer(self, optimizer: Any, model: Any) -> Any: -# """Distribute optimizer. + Returns: + int: local rank. + """ + return 0 -# Args: -# optimizer (Any): optimizer. -# model (Any): distributed model. + def clean_up(self) -> None: + """Do nothing.""" -# Returns: -# Any: distributed optimizer. -# """ + def par_allgather_obj(self, obj: Any) -> list[Any]: + """Raise error as this operation is not available. -# @abc.abstractmethod -# def dist_gwsize(self) -> int: -# """Returns the total number of processes (global world size). - -# Returns: -# int: global world size. -# """ - -# @abc.abstractmethod -# def dist_lwsize(self) -> int: -# """Returns the number of local workers available on a node -# (local world size). 
-# Usually it is equal to the number of available GPUs. - -# Returns: -# int: local world size. -# """ - -# @abc.abstractmethod -# def dist_grank(self) -> int: -# """Returns the global rank of the current process. -# Rank ranges from 0 to world_size. - -# Returns: -# int: global rank. -# """ - -# @abc.abstractmethod -# def dist_lrank(self) -> int: -# """Returns the local rank of the current process. - -# Returns: -# int: local rank. -# """ - -# def is_main_worker(self) -> bool: -# """Checks if local worker has global rank equal to zero. - -# Returns: -# bool: True if main worker. -# """ -# return self.dist_grank() == 0 - -# def dist_device(self) -> str: -# """Device used by local worker. - -# Returns: -# str: torch device in the form 'cuda:N'. -# """ -# return f"cuda:{self.dist_lrank()}" - -# @abc.abstractmethod -# def clean_up(self) -> None: -# """Cleans up resources allocated by distributed strategy.""" - -# @abc.abstractmethod -# def par_allgather_obj(self, obj: Any) -> List[Any]: -# """Gathers any object from the whole group in a list -# (to all workers). - -# Args: -# obj (Any): object to gather from all workers. - -# Returns: -# List[Any]: list of objects gathered from all workers. -# """ - - -# class DDPDistributedStrategy_old(TorchDistributedStrategy_old): -# """PyTorch DDP distributed strategy class. - -# Args: -# backend (str): Name of the communication backend to employ. -# """ - -# backend: str - -# def __init__(self, backend: str) -> None: -# super().__init__() -# self.backend = backend - -# def init_backend(self) -> None: -# """Initializes the distributed process group and the distributed -# package. -# """ -# if torch.cuda.is_available(): -# dist.init_process_group(backend=self.backend) - -# def distribute_model(self, model: nn.Module) -> nn.Module: -# """Achieves data parallelism by synchronizing the gradients -# across each model replica located in each available -# computing device. - -# Args: -# model (nn.Module): ML model to be distributed. - -# Returns: -# nn.Module: Distributed model replicas across all devices. -# that are to be synchronized. -# """ -# if torch.cuda.is_available(): -# # device = self.dist_lrank() -# model = model.to(self.dist_device()) -# dist_model = torch.nn.parallel.DistributedDataParallel( -# model, -# device_ids=[self.dist_device()], -# output_device=self.dist_device() -# ) -# else: -# dist_model = model - -# return dist_model - -# def broadcast_params( -# self, -# model: nn.Module, -# optimizer: optim.Optimizer -# ) -> None: -# """Do nothing. Only applicable for Horovod. - -# Args: -# model (nn.Module): ML model -# optimizer (optim.Optimizer): Optimizer -# """ -# pass - -# def distribute_optimizer( -# self, -# optimizer: optim.Optimizer, -# model: nn.Module = None -# ) -> optim.Optimizer: -# """Returns the optimizer from argument. - -# Args: -# optimizer (optim.Optimizer): optimizer. -# model (nn.Module): ML model. Unused here. - -# Returns: -# optim.Optimizer: Distributed optimizer. -# """ -# return optimizer - -# def dist_gwsize(self) -> int: -# """Returns the total number of processes (global world size). - -# Returns: -# int: global world size. -# """ -# return dist.get_world_size() - -# def dist_lwsize(self) -> int: -# """Returns the local number of workers available per node, -# which is usually the number of GPUs available. - -# Returns: -# int: local world size. -# """ -# return torch.cuda.device_count() - -# def dist_grank(self) -> int: -# """Returns the global rank of the current process, where -# rank ranges from 0 to world_size. 
- -# Returns: -# int: global rank. -# """ -# return dist.get_rank() - -# def dist_lrank(self) -> int: -# """Returns the local rank of the current process. - -# Returns: -# int: local rank. -# """ -# return dist.get_rank() % torch.cuda.device_count() - -# def clean_up(self) -> None: -# """Destroys the current process group.""" -# if torch.cuda.is_available(): -# dist.barrier() -# dist.destroy_process_group() - -# def par_allgather_obj(self, obj: Any) -> List[Any]: -# """Gathers any object from the whole group -# in a list (to all workers). - -# Args: -# obj (Any): Object to gather from all workers. - -# Returns: -# List[Any]: List of gathered objects. -# """ -# res = [None] * self.dist_gwsize() -# dist.all_gather_object(res, obj) -# return res - - -# class DSDistributedStrategy_old(TorchDistributedStrategy_old): -# """DeepSpeed distributed strategy class. - -# Args: -# backend (str): Name of the communication backend to employ. -# config (Union[dict, Path, str]): DeepSpeed config. Either a -# dictionary or a path to a JSON file. -# """ - -# config: Dict = None -# backend: str - -# def __init__( -# self, -# backend: str, -# config: Union[Dict, Path, str] -# ) -> None: -# super().__init__() -# self.backend = backend -# self._load_config(config) - -# def _load_config(self, ds_config): -# if isinstance(ds_config, (str, Path)): -# with open(ds_config) as fp: -# self.config = json.load(fp) -# elif isinstance(ds_config, dict): -# self.config = ds_config -# else: -# raise ValueError("ds_config is not a dictionary not a path.") - -# def init_backend(self) -> None: -# """Initializes the distributed process group and the distributed -# package. -# """ -# deepspeed.init_distributed(dist_backend=self.backend) - -# def distribute_model(self, model: nn.Module) -> nn.Module: -# """Achieves data parallelism by synchronizing the gradients -# across each model replica located in each available -# computing device. - -# Args: -# model (nn.Module): ML model to be distributed. - -# Returns: -# nn.Module: Distributed model replicas across all devices -# that are to be synchronized. -# """ -# distrib_model, __, __, __ = deepspeed.initialize( -# model=model, -# model_parameters=model.parameters(), -# dist_init_required=True, -# config=self.config -# ) -# return distrib_model - -# def broadcast_params( -# self, model: nn.Module, optimizer: optim.Optimizer -# ) -> None: -# """Only applicable for Horovod. Does nothing. - -# Args: -# model (nn.Module): ML model. -# optimizer (optim.Optimizer): optimizer. -# """ -# pass - -# def distribute_optimizer( -# self, -# optimizer: optim.Optimizer, -# model: nn.Module = None -# ) -> optim.Optimizer: -# """Returns the optimizer from argument. - -# Args: -# optimizer (optim.Optimizer): torch optimizer. -# model (nn.Module): torch neural network. - -# Returns: -# optim.Optimizer: distributed optimizer. -# """ -# return optimizer - -# def dist_gwsize(self) -> int: -# """Returns the total number of processes (global world size). - -# Returns: -# int: global world size. -# """ -# return dist.get_world_size() - -# def dist_lwsize(self) -> int: -# """Returns the local number of workers available per node, -# which is usually the number of GPUs available. - -# Returns: -# int: local world size. -# """ -# return torch.cuda.device_count() - -# def dist_grank(self) -> int: -# """Returns the global rank of the current process, where -# rank ranges from 0 to world_size. - -# Returns: -# int: global rank. 
-# """ -# return dist.get_rank() - -# def dist_lrank(self) -> int: -# """Returns the local rank of the current process. - -# Returns: -# int: local rank. -# """ -# return dist.get_rank() % torch.cuda.device_count() - -# def clean_up(self) -> None: -# """Destroys the current process group.""" -# deepspeed.sys.exit() - -# def par_allgather_obj(self, obj: Any) -> list[Any]: -# """Gathers any object from the whole group -# in a list (to all workers). - -# Args: -# obj (Any): Object to gather from all workers. - -# Returns: -# List[Any]: List of gathered objects. -# """ -# res = [None] * self.dist_gwsize() -# dist.all_gather_object(res, obj) -# return res - - -# class HVDDistributedStrategy_old(TorchDistributedStrategy_old): -# """Horovod distributed strategy class.""" - -# def init_backend(self) -> None: -# """Initializes the Horovod distributed backend.""" -# hvd.init() - -# def distribute_model(self, model: nn.Module) -> nn.Module: -# """Only applicable for DDP and DeepSpeed. -# For Horovod, returns the same model passed as argument. - -# Args: -# model (nn.Module): ML model to be distributed. - -# Returns: -# nn.Module: ML model passed in the argument. -# """ -# return model - -# def broadcast_params( -# self, model: nn.Module, optimizer: optim.Optimizer -# ) -> None: -# """Broadcasts variables from root rank to all other processes. - -# Args: -# model (nn.Module): ML model that is to be broadcasted -# across processes. -# optimizer (optim.Optimizer): Optimizer that is to be broadcasted -# across processes. -# """ -# hvd.broadcast_parameters(model.state_dict(), root_rank=0) -# hvd.broadcast_optimizer_state(optimizer, root_rank=-0) - -# def distribute_optimizer( -# self, -# optimizer: optim.Optimizer, -# model: nn.Module -# ) -> optim.Optimizer: -# """Constructs a DistributedOptimizer, for computing single-process -# gradient values and applying gradient updates after the gradients -# have been combined across all the Horovod ranks. - -# Args: -# optimizer (optim.Optimizer): Optimizer to be distributed. -# model (nn.Module): ML model to be trained. - -# Returns: -# optim.Optimizer: Distributed optimizer across all ranks. -# """ -# distOptimizer = hvd.DistributedOptimizer( -# optimizer, -# named_parameters=model.named_parameters(), -# op=hvd.Average -# ) -# return distOptimizer - -# def dist_gwsize(self) -> int: -# """Returns the total number of processes (global world size). - -# Returns: -# int: global world size. -# """ -# return hvd.size() - -# def dist_lwsize(self) -> int: -# """Returns the local number of workers available per node, -# which is usually the number of GPUs available. - -# Returns: -# int: local world size. -# """ -# return hvd.local_size() - -# def dist_grank(self) -> int: -# """Returns the global rank of the current process, where -# rank ranges from 0 to world_size. - -# Returns: -# int: global rank. -# """ -# return hvd.rank() - -# def dist_lrank(self) -> int: -# """Returns the local rank of the current process. - -# Returns: -# int: local rank. -# """ -# return hvd.local_rank() - -# def clean_up(self) -> None: -# """Shuts Horovod down.""" -# hvd.shutdown() - -# def par_allgather_obj(self, obj: Any) -> list[Any]: -# """Gathers scalar objects across all workers to a -# list with size(#worker), uses horovod communicator - -# Args: -# obj (Any): object in a worker. - -# Returns: -# list: gathered list with size(#worker). -# """ -# return hvd.allgather_object(obj) + Args: + obj (Any): object in a worker. 
+ """ + raise RuntimeError( + f"{self.__class__.__name__} does not support this operation." + ) diff --git a/src/itwinai/torch/engine.py b/src/itwinai/torch/engine.py deleted file mode 100644 index 7084d6ec..00000000 --- a/src/itwinai/torch/engine.py +++ /dev/null @@ -1,276 +0,0 @@ -""" -Model engine which wraps a torch NN. Still under development. May be removed... -""" - -import abc -from typing import Any, Union, Optional, Callable - -from pydantic import BaseModel - -import torch -import torch.nn as nn -import torch.optim as optim -from torch.optim.lr_scheduler import _LRScheduler as LRScheduler -from torch.cuda import amp -from torch import autocast - - -class OptimizerConfig: - def __init__(self, optim_class, **kwargs) -> None: - self.optim_class = optim_class - self.kwargs = kwargs - - def to_optim(self, parameters) -> optim.Optimizer: - return self.optim_class(parameters, **self.kwargs) - - -class LRSchedulerConfig: - def __init__(self, scheduler_class, **kwargs) -> None: - self.scheduler_class = scheduler_class - self.kwargs = kwargs - - def to_scheduler(self, optim) -> LRScheduler: - return self.scheduler_class(optim, **self.kwargs) - - -class ModelEngineConfig(BaseModel): - mixed_precision: bool = False - - -class ModelEngine(abc.ABC): - """Wrapper around ML model, which abstracts from distributed and - mixed-precision models. - """ - - model: nn.Module - _model_parameters: Any - optimizer: optim.Optimizer - lr_scheduler: LRScheduler - # config: ModelEngineConfig - mixed_precision: bool = False - grad_scaler: amp.GradScaler = None - - def __init__( - self, - model: nn.Module, - # model_parameters: Any, - optimizer: Union[optim.Optimizer, OptimizerConfig], - lr_scheduler: Optional[Union[LRScheduler, LRSchedulerConfig]] = None, - mixed_precision: bool = False - # config: Optional[ModelEngineConfig] = None - ) -> None: - super().__init__() - self.model = model - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - # self._model_parameters = model_parameters - # if isinstance(optimizer, OptimizerConfig): - # self.optimizer = optimizer.to_optim(model_parameters) - # else: - # self.optimizer = optimizer - - # if isinstance(lr_scheduler, LRSchedulerConfig): - # self.lr_scheduler = lr_scheduler.to_scheduler(self.optimizer) - # else: - # self.lr_scheduler = lr_scheduler - - # if not config: - # self.config = ModelEngineConfig() - self.mixed_precision = mixed_precision - if mixed_precision: - self.grad_scaler = amp.GradScaler() - - def __call__(self, *args: Any, **kwds: Any) -> Any: - """Performs the forward operation.""" - # Wrapper of self.forward() - return self.forward(*args, **kwds) - - def forward(self, *args: Any, **kwds: Any) -> Any: - """Performs the forward operation.""" - return self.model(*args, **kwds) - - def train(self, mode: bool = True) -> nn.Module: - """Set model in training mode.""" - self.model.train(mode=mode) - return self.model - - def eval(self) -> nn.Module: - """Set model in inference mode.""" - self.model.eval() - return self.model - - def to(self, device) -> nn.Module: - """Move model to specified device.""" - self.model.to(device) - return self.model - - @abc.abstractmethod - def zero_grad(): - """Set gradients to zero for the optimizer.""" - - @abc.abstractmethod - def backward(self, loss_fn: Callable, *loss_args) -> torch.Tensor: - """Perform backward pass and return the loss. - - Args: - loss_fn (Callable): computes the loss. - *loss_args: are the arguments to be passed to ``loss_fn``. - - Returns: - torch.Tensor: computed loss. 
- """ - - @abc.abstractmethod - def optimizer_step(self): - """Perform optimizer step.""" - - @abc.abstractmethod - def lr_scheduler_step(self): - """Perform lr scheduler step, if present.""" - # This should be incorporated in the optim step: - # https://deepspeed.readthedocs.io/en/latest/schedulers.html - # scheduler is updated automatically at each training step - - @abc.abstractmethod - def save_checkpoint(self): - """Save checkpoint to persistent storage.""" - - -class DDPModelEngine(ModelEngine): - """Model engine for torch DDP distributed strategy.""" - - def forward(self, *args: Any, **kwds: Any) -> Any: - """Performs the forward operation.""" - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html - # Runs the forward pass with autocasting. - with autocast(device_type='cuda', dtype=torch.float16): - return self.model(*args, **kwds) - else: - return self.model(*args, **kwds) - - def zero_grad(self): - """Set gradients to zero for the optimizer.""" - self.optimizer.zero_grad() - - def backward(self, loss_fn: Callable, *loss_args) -> torch.Tensor: - """Perform backward pass and return the loss. - - Args: - loss_fn (Callable): computes the loss. - *loss_args: are the arguments to be passed to ``loss_fn``. - - Returns: - torch.Tensor: computed loss. - """ - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html - # Runs the forward pass with autocasting. - with autocast(device_type='cuda', dtype=torch.float16): - loss = loss_fn(*loss_args) - - # Scales loss. Calls backward() on scaled loss to create scaled - # gradients. - # Backward passes under autocast are not recommended. - # Backward ops run in the same dtype autocast chose for - # corresponding forward ops. - loss = self.grad_scaler.scale(loss) - else: - loss = loss_fn(*loss_args) - loss.backward() - return loss - - def optimizer_step(self): - """Perform optimizer step.""" - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html#typical-mixed-precision-training - # scaler.step() first unscales the gradients of the optimizer's - # assigned params. - # If these gradients do not contain infs or NaNs, optimizer.step() - # is then called, - # otherwise, optimizer.step() is skipped. - self.grad_scaler.step(self.optimizer) - - # Updates the scale for next iteration. - self.grad_scaler.update() - else: - self.optimizer.step() - - def lr_scheduler_step(self): - """Perform lr scheduler step, if present.""" - if self.lr_scheduler: - self.lr_scheduler.step() - - def save_checkpoint(self): - """Save checkpoint to persistent storage.""" - raise NotImplementedError - - -class DSModelEngine(ModelEngine): - """Model engine for DeeSpeed distributed strategy.""" - - def forward(self, *args: Any, **kwds: Any) -> Any: - """Performs the forward operation.""" - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html - # Runs the forward pass with autocasting. - with autocast(device_type='cuda', dtype=torch.float16): - return self.model(*args, **kwds) - else: - return self.model(*args, **kwds) - - def zero_grad(self): - """Set gradients to zero for the optimizer.""" - self.optimizer.zero_grad() - - def backward(self, loss_fn: Callable, *loss_args) -> torch.Tensor: - """Perform backward pass and return the loss. - - Args: - loss_fn (Callable): computes the loss. - *loss_args: are the arguments to be passed to ``loss_fn``. - - Returns: - torch.Tensor: computed loss. 
- """ - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html - # Runs the forward pass with autocasting. - with autocast(device_type='cuda', dtype=torch.float16): - loss = loss_fn(*loss_args) - - # Scales loss. Calls backward() on scaled loss to create scaled - # gradients. - # Backward passes under autocast are not recommended. - # Backward ops run in the same dtype autocast chose for - # corresponding forward ops. - loss = self.grad_scaler.scale(loss) - else: - loss = loss_fn(*loss_args) - loss.backward() - return loss - - def optimizer_step(self): - """Perform optimizer step.""" - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html#typical-mixed-precision-training - # scaler.step() first unscales the gradients of the optimizer's - # assigned params. - # If these gradients do not contain infs or NaNs, optimizer.step() - # is then called, - # otherwise, optimizer.step() is skipped. - self.grad_scaler.step(self.optimizer) - - # Updates the scale for next iteration. - self.grad_scaler.update() - else: - self.optimizer.step() - - def lr_scheduler_step(self): - """Perform lr scheduler step, if present.""" - if self.lr_scheduler: - self.lr_scheduler.step() - - def save_checkpoint(self): - """Save checkpoint to persistent storage.""" - raise NotImplementedError diff --git a/src/itwinai/torch/inference.py b/src/itwinai/torch/inference.py index 02882f06..bb9af300 100644 --- a/src/itwinai/torch/inference.py +++ b/src/itwinai/torch/inference.py @@ -6,8 +6,7 @@ from torch import nn from torch.utils.data import DataLoader, Dataset -from ..utils import dynamically_import_class -from .utils import clear_key +from ..utils import dynamically_import_class, clear_key from ..components import Predictor, monitor_exec from .types import TorchDistributedStrategy as StrategyT from .types import Metric, Batch diff --git a/src/itwinai/torch/mlflow.py b/src/itwinai/torch/mlflow.py index 18a014ff..36992393 100644 --- a/src/itwinai/torch/mlflow.py +++ b/src/itwinai/torch/mlflow.py @@ -16,6 +16,8 @@ def _get_mlflow_logger_conf(pl_config: Dict) -> Optional[Dict]: Optional[Dict]: if present, MLFLowLogger constructor arguments (under 'init_args' key). """ + if not pl_config['trainer'].get('logger'): + return None if isinstance(pl_config['trainer']['logger'], list): # If multiple loggers are provided for logger_conf in pl_config['trainer']['logger']: @@ -35,6 +37,7 @@ def _mlflow_log_pl_config(pl_config: Dict, local_yaml_path: str) -> None: def init_lightning_mlflow( pl_config: Dict, default_experiment_name: str = 'Default', + tmp_dir: str = '.tmp', **autolog_kwargs ) -> None: """Initialize mlflow for pytorch lightning, also setting up @@ -45,6 +48,7 @@ def init_lightning_mlflow( pl_config (Dict): pytorch lightning configuration loaded in memory. default_experiment_name (str, optional): used as experiment name if it is not given in the lightning conf. Defaults to 'Default'. + tmp_dir (str): where to temporarily store some artifacts. **autolog_kwargs (kwargs): args for mlflow.pytorch.autolog(...). 
""" mlflow_conf: Optional[Dict] = _get_mlflow_logger_conf(pl_config) @@ -63,12 +67,13 @@ def init_lightning_mlflow( mlflow.set_tracking_uri(tracking_uri) mlflow.set_experiment(experiment_name) mlflow.pytorch.autolog(**autolog_kwargs) - mlflow.start_run() + run = mlflow.start_run() + print(f"MLFlow's artifacts URI: {run.info.artifact_uri}") mlflow_conf['experiment_name'] = experiment_name mlflow_conf['run_id'] = mlflow.active_run().info.run_id - _mlflow_log_pl_config(pl_config, '.tmp/pl_config.yml') + _mlflow_log_pl_config(pl_config, os.path.join(tmp_dir, 'pl_config.yml')) def teardown_lightning_mlflow() -> None: diff --git a/src/itwinai/torch/reproducibility.py b/src/itwinai/torch/reproducibility.py new file mode 100644 index 00000000..1513c82a --- /dev/null +++ b/src/itwinai/torch/reproducibility.py @@ -0,0 +1,48 @@ +""" +This module provides the tools to support reproducible execution of +torch scripts. +""" + +from typing import Optional +import numpy as np +import random + +import torch + + +def seed_worker(worker_id): + """Seed DataLoader worker.""" + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + + +def set_seed( + rnd_seed: Optional[int], + deterministic_cudnn: bool = True +) -> torch.Generator: + """Set torch random seed and return a PRNG object. + + Args: + rnd_seed (Optional[int]): random seed. If None, the seed is not set. + deterministic_cudnn (bool): if True, sets + ``torch.backends.cudnn.benchmark = False``, which may affect + performances. + + Returns: + torch.Generator: PRNG object. + """ + g = torch.Generator() + if rnd_seed is not None: + # Deterministic execution + np.random.seed(rnd_seed) + random.seed(rnd_seed) + torch.manual_seed(rnd_seed) + g.manual_seed(rnd_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(rnd_seed) + torch.cuda.manual_seed_all(rnd_seed) + if deterministic_cudnn: + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + return g diff --git a/src/itwinai/torch/trainer.py b/src/itwinai/torch/trainer.py index f0ad1c03..4e7a108f 100644 --- a/src/itwinai/torch/trainer.py +++ b/src/itwinai/torch/trainer.py @@ -1,15 +1,12 @@ """Provides training logic for PyTorch models via Trainer classes.""" from typing import ( - Optional, Dict, Union, Tuple, Type, List, Any + Optional, Dict, Union, Tuple, List, Any, Literal ) -import time import os import sys -import numpy as np import torch -import torch.multiprocessing as mp from torch.utils.data import DataLoader, Dataset from torch.utils.data.distributed import DistributedSampler import torch.distributed as dist @@ -17,494 +14,319 @@ import torch.nn as nn from torch.optim.optimizer import Optimizer +import lightning as L +from lightning.pytorch.cli import LightningCLI + +import horovod.torch as hvd + from ..components import Trainer, monitor_exec -from .utils import seed_worker, par_allgather_obj, clear_key from .types import ( Batch, Loss, LrScheduler, Metric ) -from .types import TorchDistributedStrategy as StrategyT -from ..loggers import LogMixin, Logger, ConsoleLogger -from ..utils import dynamically_import_class -from ..cluster import ClusterEnvironment -# from .distributed import ( -# TorchDistributedStrategy, -# DDPDistributedStrategy, -# DSDistributedStrategy, -# HVDDistributedStrategy -# ) +from ..loggers import LogMixin, Logger +from .reproducibility import seed_worker, set_seed +from .distributed import ( + TorchDistributedStrategy, + TorchDDPStrategy, + HorovodStrategy, + DeepSpeedStrategy, + 
NonDistributedStrategy, + distributed_resources_available +) +from ..utils import load_yaml +from .mlflow import ( + init_lightning_mlflow, + teardown_lightning_mlflow +) -def preproc_dataloader(dataloader: DataLoader, gwsize, grank): - """Makes a Dataloader distributed.""" - sampler = DistributedSampler( - dataloader.dataset, - num_replicas=gwsize, - rank=grank, - shuffle=True - ) - # Recreate dataloader, with updated sampler - return DataLoader( - dataloader.dataset, - batch_size=dataloader.batch_size, - sampler=sampler, - num_workers=dataloader.num_workers, - collate_fn=dataloader.collate_fn, - pin_memory=dataloader.pin_memory, - drop_last=dataloader.drop_last, - timeout=dataloader.timeout, - worker_init_fn=seed_worker, # dataloader.worker_init_fn, - multiprocessing_context=dataloader.multiprocessing_context, - generator=dataloader.generator, - prefetch_factor=dataloader.prefetch_factor, - persistent_workers=dataloader.persistent_workers, - pin_memory_device=dataloader.pin_memory_device - ) +class Config: + def __init__(self, my_dict: Optional[Dict] = None): + my_dict = my_dict if my_dict is not None else {} + self.__dict__.update(my_dict) -def distributed(func): - """The decorated function must have a standard signature. - Its first arguments must be: - model, train_dataloader, validation_dataloader, device (in this order). +class TorchTrainer(Trainer, LogMixin): + """Trainer class for torch training algorithms. - Additional args or kwargs are allowed consistently with the signature - of the decorated function. + Args: + config (Dict): training configuration containing hyperparameters. + epochs (int): number of training epochs. + model (Optional[nn.Module], optional): model to train. + Defaults to None. + strategy (Literal["ddp", "deepspeed", + "horovod"], optional): distributed strategy. + Defaults to 'ddp'. + validation_every (Optional[int], optional): run a validation epoch + every ``validation_every`` epochs. Disabled if None. Defaults to 1. + test_every (Optional[int], optional): run a test epoch + every ``test_every`` epochs. Disabled if None. Defaults to None. + random_seed (Optional[int], optional): set random seed for + reproducibility. If None, the seed is not set. Defaults to None. + logger (Optional[Logger], optional): logger for ML tracking. + Defaults to None. + log_all_workers (bool, optional): if True, the ``log`` method is + called on all workers in the distributed context. Defaults to False. + metrics (Optional[Dict[str, Metric]], optional): map of torchmetrics + metrics. Defaults to None. + name (Optional[str], optional): trainer custom name. Defaults to None. 
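A hypothetical instantiation of the new `TorchTrainer`; the keys placed in `config` (lr, momentum, batch_size, num_workers, pin_memory) mirror the ones read later by the default `create_model_loss_optimizer` and `create_dataloaders`:

```python
import torch.nn as nn

from itwinai.torch.trainer import TorchTrainer

model = nn.Sequential(                       # toy classifier for NLL loss
    nn.Flatten(), nn.Linear(28 * 28, 10), nn.LogSoftmax(dim=1)
)
trainer = TorchTrainer(
    config={
        'lr': 1e-3,
        'momentum': 0.9,
        'batch_size': 32,
        'num_workers': 4,
        'pin_memory': True,
    },
    epochs=5,
    model=model,
    strategy='ddp',        # falls back to non-distributed mode if no cluster
    validation_every=1,
    random_seed=42,
)
```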
""" - def dist_train( - model, train_dataloader, validation_dataloader=None, device='cpu', - *args, **kwargs - ): - if torch.cuda.is_available(): - dist.init_process_group(backend='nccl') - - if torch.cuda.is_available(): - lwsize = torch.cuda.device_count() # local world size - per node - gwsize = dist.get_world_size() # global world size - per run - grank = dist.get_rank() # global rank - assign per run - lrank = dist.get_rank() % lwsize # local rank - assign per node - else: - gwsize = 1 - grank = 0 - lrank = 0 - - device = torch.device( - 'cuda' if torch.cuda.is_available() else 'cpu', lrank) - if torch.cuda.is_available(): - torch.cuda.set_device(lrank) - - model = model.to(device) - model = DDP(model, device_ids=[device], output_device=device) - - train_dataloader = preproc_dataloader(train_dataloader, gwsize, grank) - if validation_dataloader is not None: - validation_dataloader = preproc_dataloader( - validation_dataloader, gwsize, grank) - - try: - func(model, train_dataloader, validation_dataloader, device, - *args, **kwargs) - finally: - if torch.cuda.is_available(): - dist.barrier() - dist.destroy_process_group() - return dist_train + # TODO: + # - add checkpointing. + # - extract BaseTorchTrainer and extend it creating a set of trainer + # templates (e.g.. GAN, Classifier, Transformer) allowing scientists + # to reuse ML algos. + # - improve get from configuration object + _strategy: TorchDistributedStrategy = None -class TorchTrainerMG(Trainer, LogMixin): - """ - Torch trainer for optionally distributed data-parallel (DDP) workload. - Multi-GPU distribution. - - Args: - model (nn.Module): neural network instance. - loss (Loss): torch loss function instance. - optimizer_class (str): path to optimizer class - (e.g., 'torch.optim.SGD') - optimizer_kwargs (Optional[Dict], optional): optimizer constructor - arguments (except from parameters). Defaults to None. - lr_scheduler_class (Optional[str], optional): path to learning - rate scheduler class. Defaults to None. - lr_scheduler_kwargs (Optional[Dict], optional): constructor arguments - of the learning rate scheduler, except for the optimizer. - Defaults to None. - train_dataloader_class (str, optional): train dataloader class path. - Defaults to 'torch.utils.data.DataLoader'. - train_dataloader_kwargs (Optional[Dict], optional): constructor - arguments of the train dataloader, except for the dataset - instance. Defaults to None. - validation_dataloader_class (str, optional): validation dataloader - class path. Defaults to 'torch.utils.data.DataLoader'. - validation_dataloader_kwargs (Optional[Dict], optional): constructor - arguments of the validation dataloader, except for the dataset - instance. If None, it replicates `train_dataloader_kwargs`. - Defaults to None. - epochs (int, optional): number of training epochs. Defaults to 1. - strategy (Optional[TorchDistributedStrategy], optional): distributed - strategy. Defaults to StrategyT.NONE.value. - backend (TorchDistributedBackend, optional): computing backend. - Defaults to BackendT.NCCL.value. - shuffle_dataset (bool, optional): whether shuffle dataset before - sampling batches from dataloader. Defaults to False. - use_cuda (bool, optional): whether to use GPU. Defaults to True. - benchrun (bool, optional): sets up a debug run. Defaults to False. - testrun (bool, optional): deterministic training seeding everything. - Defaults to False. - seed (Optional[int], optional): random seed. Defaults to None. - logger (Optional[List[Logger]], optional): logger. Defaults to None. 
- checkpoint_every (int, optional): how often (epochs) to checkpoint the - best model. Defaults to 10. - cluster (Optional[ClusterEnvironment], optional): cluster environment - object describing the context in which the trainer is executed. - Defaults to None. - train_metrics (Optional[Dict[str, Metric]], optional): - list of metrics computed in the training step on the predictions. - It's a dictionary with the form - ``{'metric_unique_name': CallableMetric}``. Defaults to None. - validation_metrics (Optional[Dict[str, Metric]], optional): same - as ``training_metrics``. If not given, it mirrors the training - metrics. Defaults to None. - - Raises: - RuntimeError: When trying to use DDP without CUDA support. - NotImplementedError: when trying to use a strategy different from the - ones provided by TorchDistributedStrategy. - """ + train_dataloader: DataLoader = None + validation_dataloader: DataLoader = None + test_dataloader: DataLoader = None model: nn.Module = None loss: Loss = None optimizer: Optimizer = None - lr_scheduler = None - _strategy: StrategyT = StrategyT.NONE.value - train_dataset: Dataset - validation_dataset: Dataset - train_dataloader: DataLoader = None - validation_dataloader: DataLoader = None - epoch_idx: int = 0 + lr_scheduler: LrScheduler = None + + torch_rng: torch.Generator = None + logger: Logger = None train_glob_step: int = 0 validation_glob_step: int = 0 - train_metrics: Dict[str, Metric] - validation_metrics: Dict[str, Metric] + test_glob_step: int = 0 + metrics: Dict[str, Metric] def __init__( self, - model: nn.Module, - loss: Loss, - optimizer_class: str, - optimizer_kwargs: Optional[Dict] = None, - lr_scheduler_class: Optional[str] = None, - lr_scheduler_kwargs: Optional[Dict] = None, - train_dataloader_class: str = 'torch.utils.data.DataLoader', - train_dataloader_kwargs: Optional[Dict] = None, - validation_dataloader_class: str = 'torch.utils.data.DataLoader', - validation_dataloader_kwargs: Optional[Dict] = None, - epochs: int = 1, - strategy: str = StrategyT.NONE.value, - benchrun: bool = False, - testrun: bool = False, - seed: Optional[int] = None, - logger: Optional[List[Logger]] = None, - checkpoint_every: int = 10, - cluster: Optional[ClusterEnvironment] = None, - train_metrics: Optional[Dict[str, Metric]] = None, - validation_metrics: Optional[Dict[str, Metric]] = None + config: Dict, + epochs: int, + model: Optional[nn.Module] = None, + strategy: Literal["ddp", "deepspeed", "horovod"] = 'ddp', + validation_every: Optional[int] = 1, + test_every: Optional[int] = None, + random_seed: Optional[int] = None, + logger: Optional[Logger] = None, + log_all_workers: bool = False, + metrics: Optional[Dict[str, Metric]] = None, + name: Optional[str] = None ) -> None: - """Sets up the distributed backend and loggers. - Makes the model a DDP model. - """ - super().__init__() + super().__init__(name) self.save_parameters(**self.locals2params(locals())) - self.model = model - self.loss = loss + + # config is mean to store all hyperparameters, which can very from use + # case to use case + # and include learning_rate, batch_size.... 
+ self.config = Config(config) self.epochs = epochs - self.testrun = testrun - self.seed = seed + self.model = model self.strategy = strategy - self.benchrun = benchrun - self.cluster = cluster - # Checkpoint every n epochs - self.checkpoint_every = checkpoint_every - - # Train and validation dataloaders - self.train_dataloader_class = dynamically_import_class( - train_dataloader_class - ) - self.validation_dataloader_class = dynamically_import_class( - validation_dataloader_class - ) - train_dataloader_kwargs = ( - train_dataloader_kwargs - if train_dataloader_kwargs is not None else {} - ) - self.train_dataloader_kwargs = clear_key( - train_dataloader_kwargs, 'train_dataloader_kwargs', 'dataset' - ) - # If validation_dataloader_kwargs is not given, - # copy train_dataloader_kwargs - validation_dataloader_kwargs = ( - validation_dataloader_kwargs if validation_dataloader_kwargs - is not None else train_dataloader_kwargs - ) - self.validation_dataloader_kwargs = clear_key( - validation_dataloader_kwargs, 'validation_dataloader_kwargs', - 'dataset' - ) - - # Optimizer and scheduler - optim_class = dynamically_import_class(optimizer_class) - optimizer_kwargs = ( - optimizer_kwargs if optimizer_kwargs is not None else {} - ) - optimizer_kwargs = clear_key( - optimizer_kwargs, 'optimizer_kwargs', 'parameters' - ) - self.optimizer: Optimizer = optim_class( - self.model.parameters(), **optimizer_kwargs - ) - if lr_scheduler_class is not None: - scheduler_class = dynamically_import_class(lr_scheduler_class) - lr_scheduler_kwargs = ( - lr_scheduler_kwargs if lr_scheduler_kwargs is not None else {} - ) - lr_scheduler_kwargs = clear_key( - lr_scheduler_kwargs, 'lr_scheduler_kwargs', 'optimizer' - ) - self.lr_scheduler: LrScheduler = scheduler_class( - self.optimizer, **lr_scheduler_kwargs - ) - - # Loggers - self.logger = logger if logger is not None else ConsoleLogger() - - # Metrics - self.train_metrics = ( - {} if train_metrics is None else train_metrics - ) - self.validation_metrics = ( - self.train_metrics if validation_metrics is None - else validation_metrics - ) + self.validation_every = validation_every + self.test_every = test_every + self.random_seed = random_seed + self.logger = logger + self.log_all_workers = log_all_workers + self.metrics = metrics if metrics is not None else {} @property - def strategy(self) -> Optional[str]: + def strategy(self) -> TorchDistributedStrategy: return self._strategy @strategy.setter - def strategy(self, strategy_name) -> None: - if strategy_name not in StrategyT: - raise ValueError( - "Unrecognized 'strategy' field. Allowed values " - f"are: {StrategyT.list()}. 
Received '{strategy_name}'") - self._strategy = strategy_name + def strategy(self, strategy: Union[str, TorchDistributedStrategy]) -> None: + if isinstance(strategy, TorchDistributedStrategy): + self._strategy = strategy + else: + self._strategy = self._detect_strategy(strategy) @property - def global_step(self) -> int: - return self.train_glob_step + self.validation_glob_step + def device(self) -> str: + return self.strategy.device() + + def _detect_strategy(self, strategy: str) -> TorchDistributedStrategy: + if not distributed_resources_available(): + print("WARNING: falling back to non-distributed strategy.") + dist_str = NonDistributedStrategy() + elif strategy == 'ddp': + dist_str = TorchDDPStrategy(backend='nccl') + elif strategy == 'horovod': + dist_str = HorovodStrategy() + elif strategy == 'deepspeed': + dist_str = DeepSpeedStrategy(backend='nccl') + else: + raise NotImplementedError( + f"Strategy '{strategy}' is not recognized/implemented.") + return dist_str - def set_seed(self, seed: Optional[int] = None): - """Deterministic operations for reproducibility. - Sets the random seed. + def _init_distributed_strategy(self) -> None: + if not self.strategy.is_initialized: + self.strategy.init() - Args: - seed (Optional[int], optional): if not None, overrides - `self.seed`. Defaults to None. + def create_model_loss_optimizer(self) -> None: + """ + Instantiate a torch model, loss, optimizer, and LR scheduler using the + configuration provided in the Trainer constructor. + Generally a user-define method. """ - seed = seed if seed is not None else self.seed - np.random.seed(seed) - self.torch_rng = torch.Generator() - if seed is not None: - torch.manual_seed(seed) - self.torch_rng.manual_seed(seed) - if self.cluster.is_cuda_available(): - torch.cuda.manual_seed(seed) + ################################### + # Dear user, this is a method you # + # may be interested to override! # + ################################### + + if self.model is None: + # Model was not passed to the constructor. + # Create a model here + raise ValueError( + "self.model is None! Either pass it to the constructor or " + "override this method." 
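The `strategy` setter above accepts either a name or an already constructed strategy object; a small sketch of both options (the `gloo` backend and the toy model are assumptions for illustration):

```python
import torch.nn as nn

from itwinai.torch.distributed import TorchDDPStrategy
from itwinai.torch.trainer import TorchTrainer

model = nn.Linear(4, 2)  # toy model; hyperparameters omitted from config

# 1) By name: resolved via _detect_strategy, which falls back to
#    NonDistributedStrategy when no distributed resources are available.
trainer_by_name = TorchTrainer(config={}, epochs=1, model=model,
                               strategy='deepspeed')

# 2) By instance: used as-is.
trainer_by_instance = TorchTrainer(
    config={}, epochs=1, model=model,
    strategy=TorchDDPStrategy(backend='gloo'),
)
```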
+ ) - @monitor_exec - def execute( - self, - train_dataset: Dataset, - validation_dataset: Dataset, - model: nn.Module = None, - optimizer: Optimizer = None, - lr_scheduler: LrScheduler = None, - ) -> Any: - self.train_dataset = train_dataset - self.validation_dataset = validation_dataset - - # Update parameters passed for "interactive" use - if model is not None: - self.model = model - if optimizer is not None: - self.optimizer = optimizer - if lr_scheduler is not None: - self.lr_scheduler = lr_scheduler - - # Start training - if self.cluster.distributed: - # Make training distributed - result = mp.spawn(self._train, nprocs=self.cluster.ngpus_per_node) - else: - result = self._train(0) + # A simple NLLLoss + self.loss = nn.functional.nll_loss - # Return value compliant with Executable.execute format - return result + # TODO: improve robustness of getting from config + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr, + momentum=self.config.momentum + ) + # Create self.lr_scheduler if needed - def _train( - self, - worker_id: int - ): - # Each worker has a different deterministic seed - # Here, 'worker' = replica of the training function - worker_seed = ( - self.seed + worker_id if self.seed is not None else self.seed + # IMPORTANT: model, optimizer, and scheduler need to be distributed + + # First, define strategy-wise optional configurations + # TODO: improve robustness of getting from config + if isinstance(self.strategy, DeepSpeedStrategy): + # Batch size definition is not optional for DeepSpeedStrategy! + distribute_kwargs = dict( + config_params=dict( + train_micro_batch_size_per_gpu=self.config.batch_size + ) + ) + elif isinstance(self.strategy, HorovodStrategy): + distribute_kwargs = dict( + compression=( + hvd.Compression.fp16 if self.config.fp16_allreduce + else hvd.Compression.none + ), + op=hvd.Adasum if self.config.use_adasum else hvd.Average, + gradient_predivide_factor=self.config.gradient_predivide_factor + ) + else: + distribute_kwargs = {} + + # Distributed model, optimizer, and scheduler + ( + self.model, + self.optimizer, + self.lr_scheduler + ) = self.strategy.distributed( + self.model, self.optimizer, self.lr_scheduler, **distribute_kwargs ) - self.set_seed(worker_seed) - # Instantiate dataloaders - self.train_dataloader = self._instantiate_dataloader( - dataloader_class=self.train_dataloader_class, - dataset=self.train_dataset, - init_kwargs=self.train_dataloader_kwargs + def create_dataloaders( + self, + train_dataset: Dataset, + validation_dataset: Optional[Dataset] = None, + test_dataset: Optional[Dataset] = None + ) -> None: + """ + Create train, validation and test dataloaders using the + configuration provided in the Trainer constructor. + Generally a user-define method. + + Args: + train_dataset (Dataset): training dataset object. + validation_dataset (Optional[Dataset]): validation dataset object. + Default None. + test_dataset (Optional[Dataset]): test dataset object. + Default None. + """ + + ################################### + # Dear user, this is a method you # + # may be interested to override! 
# + ################################### + + # TODO: improve robustness of getting from config + self.train_dataloader = self.strategy.create_dataloader( + dataset=train_dataset, + batch_size=self.config.batch_size, + num_workers=self.config.num_workers, + pin_memory=self.config.pin_memory, + generator=self.torch_rng ) - if self.validation_dataset is not None: - self.validation_dataloader = self._instantiate_dataloader( - dataloader_class=self.validation_dataloader_class, - dataset=self.validation_dataset, - init_kwargs=self.validation_dataloader_kwargs + if validation_dataset is not None: + self.validation_dataloader = self.strategy.create_dataloader( + dataset=train_dataset, + batch_size=self.config.batch_size, + num_workers=self.config.num_workers, + pin_memory=self.config.pin_memory, + generator=self.torch_rng + ) + if test_dataset is not None: + self.test_dataloader = self.strategy.create_dataloader( + dataset=train_dataset, + batch_size=self.config.batch_size, + num_workers=self.config.num_workers, + pin_memory=self.config.pin_memory, + generator=self.torch_rng ) - # Launch actual training: - - # Single worker case - if not self.cluster.distributed: - with self.cluster.init_dist_gpu(worker_id) as device: - self.device: torch.device = device - self.model = self.model.to(self.device) - self.setup_logger() - self._setup_metrics() - try: - train_result = self.train() - except Exception as exc: - print(exc) - raise exc - finally: - print("INFO: Training ended") - self.destroy_logger() - train_result = None - return train_result - - # Init / connect to distributed backend - with self.cluster.init_dist_gpu(worker_id) as device: - self.device: torch.device = device - self._distribute_model() - self.setup_logger() - self._setup_metrics() - try: - train_result = self.train() - except Exception as exc: - print(exc) - raise exc - finally: - print("INFO: Training ended") - self.destroy_logger() - train_result = None - return train_result - - def _instantiate_dataloader( + def _setup_metrics(self): + """Move metrics to current device.""" + for m_name, metric in self.metrics.items(): + self.metrics[m_name] = metric.to(self.device) + + @monitor_exec + def execute( self, - dataloader_class: Type, - dataset: Dataset, - init_kwargs: Dict - ) -> DataLoader: - """Make dataloader distributed if using distributed training strategy. + train_dataset: Dataset, + validation_dataset: Dataset, + test_dataset: Dataset + ) -> Tuple[Dataset, Dataset, Dataset, Any]: + """Prepares distributed environment and data structures + for the actual training. Args: - dataloader_class (Type): some torch DataLoader type. - dataset (Dataset): torch dataset instance. - init_kwargs (Dict): constructor args. + train_dataset (Dataset): training dataset. + validation_dataset (Dataset): validation dataset. + test_dataset (Dataset): test dataset. + + Returns: + Tuple[Dataset, Dataset, Dataset, Any]: training dataset, + validation dataset, test dataset, trained model. 
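The two `create_*` hooks above are the intended extension points; a sketch of a subclass overriding both follows (the loss, Adam optimizer and dataloader kwargs are assumptions, with each split built from its own dataset):

```python
import torch
import torch.nn as nn

from itwinai.torch.trainer import TorchTrainer


class MyTrainer(TorchTrainer):

    def create_model_loss_optimizer(self) -> None:
        # Build loss/optimizer from self.config, then let the strategy
        # distribute model, optimizer and scheduler (extra kwargs omitted).
        self.loss = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(
            self.model.parameters(), lr=self.config.lr
        )
        self.model, self.optimizer, self.lr_scheduler = self.strategy.distributed(
            self.model, self.optimizer, self.lr_scheduler
        )

    def create_dataloaders(self, train_dataset, validation_dataset=None,
                           test_dataset=None) -> None:
        # Each split gets a dataloader built from its own dataset.
        self.train_dataloader = self.strategy.create_dataloader(
            dataset=train_dataset,
            batch_size=self.config.batch_size,
            generator=self.torch_rng,
        )
        if validation_dataset is not None:
            self.validation_dataloader = self.strategy.create_dataloader(
                dataset=validation_dataset,
                batch_size=self.config.batch_size,
                generator=self.torch_rng,
            )
        if test_dataset is not None:
            self.test_dataloader = self.strategy.create_dataloader(
                dataset=test_dataset,
                batch_size=self.config.batch_size,
                generator=self.torch_rng,
            )
```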
""" - init_kwargs['generator'] = init_kwargs.get( - 'generator', self.torch_rng - ) - init_kwargs['worker_init_fn'] = init_kwargs.get( - 'worker_init_fn', seed_worker + self.torch_rng = set_seed(self.random_seed) + self._init_distributed_strategy() + self._setup_metrics() + + self.create_dataloaders( + train_dataset=train_dataset, + validation_dataset=validation_dataset, + test_dataset=test_dataset ) + self.create_model_loss_optimizer() - if self.strategy == StrategyT.DDP.value and self.cluster.distributed: - sampler = DistributedSampler( - dataset=dataset, - num_replicas=self.cluster.global_world_size, - rank=self.cluster.global_rank, - shuffle=init_kwargs.get( - 'shuffle', False - ) - ) - # Overwrite existing sampler, if given. - # TODO: improve using wrapper: - # https://discuss.pytorch.org/t/how-to-use-my-own-sampler-when-i-already-use-distributedsampler/62143?page=2 - init_kwargs['sampler'] = sampler - if init_kwargs.get('shuffle') is not None: - # sampler option is mutually exclusive with shuffle - del init_kwargs['shuffle'] + if self.strategy.is_main_worker: + self.logger.create_logger_context() - return dataloader_class(dataset, **init_kwargs) + self.train() - def _setup_metrics(self): - for m_name, metric in self.train_metrics.items(): - self.train_metrics[m_name] = metric.to(self.device) - for m_name, metric in self.validation_metrics.items(): - self.validation_metrics[m_name] = metric.to(self.device) - - def _distribute_model(self): - if self.cluster.distributed: - # Distribute model - self.model = self.model.to(self.device) - if self.strategy == StrategyT.NONE.value: - print( - "WARNING: A GPU cluster is available but no distributed " - "strategy was given... Falling back to single worker...") - if not self.cluster.is_main_worker(): - # Use only GPU:0 for single worker - sys.exit(0) - elif self.strategy == StrategyT.DDP.value: - self.model = DDP( - self.model, - device_ids=[self.device.index], - output_device=self.device - ) - else: - raise NotImplementedError("Only DDP strategy is implemented.") - else: - raise RuntimeError( - "Trying to distribute a model when a " - "distributed cluster is not available." - ) + if self.strategy.is_main_worker: + self.logger.destroy_logger_context() + self.strategy.clean_up() + return train_dataset, validation_dataset, test_dataset, self.model - def setup_logger(self): - if self.cluster.is_main_worker(): - # Only setup loggers on main worker - if isinstance(self.logger, list): - for logger in self.logger: - logger.create_logger_context() - elif isinstance(self.logger, Logger): - self.logger.create_logger_context() - else: - raise TypeError( - "Unrecognized self.logger. Allowed types are 'list' and " - f"'Logger'. Received {type(self.logger)}" - ) - else: - self.logger = [] - - def destroy_logger(self): - if self.cluster.is_main_worker(): - if isinstance(self.logger, list): - for logger in self.logger: - logger.destroy_logger_context() - elif isinstance(self.logger, Logger): - self.logger.destroy_logger_context() - else: - raise TypeError( - "Unrecognized self.logger. Allowed types are 'list' and " - f"'Logger'. Received {type(self.logger)}" - ) + def _set_epoch_dataloaders(self, epoch: int): + """ + Sets epoch in the distributed sampler of a dataloader when using it. 
+ """ + if self.strategy.is_distributed: + self.train_dataloader.sampler.set_epoch(epoch) + if self.validation_dataloader is not None: + self.validation_dataloader.sampler.set_epoch(epoch) + if self.test_dataloader is not None: + self.test_dataloader.sampler.set_epoch(epoch) def log( self, @@ -513,39 +335,44 @@ def log( kind: str = 'metric', step: Optional[int] = None, batch_idx: Optional[int] = None, - every_worker: bool = False, **kwargs ) -> None: - if self.cluster.is_main_worker() or every_worker: - # Only log on main worker if not specified otherwise - if isinstance(self.logger, list): - for logger in self.logger: - logger.log( - item=item, - identifier=identifier, - kind=kind, - step=step, - batch_idx=batch_idx, - **kwargs - ) - elif isinstance(self.logger, Logger): - self.logger.log( - item=item, - identifier=identifier, - kind=kind, - step=step, - batch_idx=batch_idx, - **kwargs - ) - else: - raise TypeError( - "Unrecognized self.logger. Allowed types are 'list' and " - f"'Logger'. Received {type(self.logger)}" - ) + if self.logger and ( + self.strategy.is_main_worker or self.log_all_workers): + self.logger.log( + item=item, + identifier=identifier, + kind=kind, + step=step, + batch_idx=batch_idx, + **kwargs + ) + + def train(self): + """Trains a machine learning model. + Main training loop/logic. + + Args: + train_dataset (Dataset): training dataset. + validation_dataset (Dataset): validation dataset. + test_dataset (Dataset): test dataset. + + Returns: + Tuple[Dataset, Dataset, Dataset, Any]: training dataset, + validation dataset, test dataset, trained model. + """ + # start_time = time.perf_counter() + for epoch in range(self.epochs): + epoch_n = epoch + 1 + self._set_epoch_dataloaders(epoch) + self.train_epoch() + if self.validation_every and self.validation_every % epoch_n == 0: + self.validation_epoch() + if self.test_every and self.test_every % epoch_n == 0: + self.test_epoch() def compute_metrics( self, - metrics: Dict[str, Metric], true: Batch, pred: Batch, logger_step: int, @@ -566,7 +393,7 @@ def compute_metrics( Dict[str, Any]: metric values. 
""" m_values = {} - for m_name, metric in metrics.items(): + for m_name, metric in self.metrics.items(): # metric = metric.to(self.device) m_val = metric(pred, true).detach().cpu().numpy() self.log( @@ -596,7 +423,6 @@ def training_step( batch_idx=batch_idx ) metrics: Dict[str, Any] = self.compute_metrics( - metrics=self.train_metrics, true=y, pred=pred_y, logger_step=self.train_glob_step, @@ -612,8 +438,9 @@ def validation_step( ) -> Tuple[Loss, Dict[str, Any]]: x, y = batch x, y = x.to(self.device), y.to(self.device) - pred_y = self.model(x) - loss: Loss = self.loss(pred_y, y) + with torch.no_grad(): + pred_y = self.model(x) + loss: Loss = self.loss(pred_y, y) self.log( item=loss.item(), identifier='validation_loss', @@ -622,7 +449,6 @@ def validation_step( batch_idx=batch_idx ) metrics: Dict[str, Any] = self.compute_metrics( - metrics=self.validation_metrics, true=y, pred=pred_y, logger_step=self.validation_glob_step, @@ -631,7 +457,7 @@ def validation_step( ) return loss, metrics - def training_epoch(self) -> Loss: + def train_epoch(self) -> Loss: self.model.train() train_losses = [] for batch_idx, train_batch in enumerate(self.train_dataloader): @@ -684,264 +510,130 @@ def validation_epoch(self) -> Loss: ) return avg_loss - def train(self): + def test_epoch(self): + # TODO: implement test epoch + raise NotImplementedError() - if self.optimizer is None: - raise ValueError("Undefined optimizer!") - - if self.loss is None: - raise ValueError("Undefined loss function!") - - st = time.time() - - # Resume state - self.start_epoch = 1 - self.best_loss = np.Inf - self.load_state() - - # start training/testing loop - if self.cluster.is_main_worker(): - print(f'TIMER: broadcast: {time.time()-st}s') - print('DEBUG: start training') - print('-'*56) - - ############################## - # Start training: run epochs # - ############################## - - et = time.time() - for self.epoch_idx in range(self.start_epoch, self.epochs + 1): - lt = time.time() - - ####################################################### - # Perform one training epoch and one validation epoch # - ####################################################### - - if self.benchrun and self.epoch_idx == self.epochs: - # TODO: move profiler into cluster environment - # profiling (done on last epoch - slower!) 
- with torch.autograd.profiler.profile( - use_cuda=self.cluster.is_cuda_available(), - profile_memory=True - ) as prof: - train_loss = self.training_epoch() - else: - train_loss = self.training_epoch() - val_loss = self.validation_epoch() - - ##################################### - # Save checkpoint if model improved # - ##################################### - - ref_loss = val_loss if val_loss is not None else train_loss - is_best = ref_loss < self.best_loss - if (self.epoch_idx % self.checkpoint_every == 0 - and not self.benchrun): - self.save_state( - loss_val=ref_loss, - is_best=is_best - ) - self.best_loss = min(ref_loss, self.best_loss) - - ########################### - # End of epoch operations # - ########################### - - # save first epoch timer - if self.epoch_idx == self.start_epoch: - first_ep_t = time.time()-lt - - # Final epoch - if self.epoch_idx + 1 == self.epochs: - self.train_dataloader.last_epoch = True - self.validation_dataloader.last_epoch = True - - if self.cluster.is_main_worker(): - print(f'TIMER: epoch time: {time.time()-lt}s') - if self.benchrun and self.epoch_idx == self.epochs: - print('-'*56) - print('benchmark of last epoch:') - what1 = ( - 'cuda' if self.cluster.is_cuda_available() else 'cpu' - ) - print( - prof.key_averages().table( - sort_by='self_'+str(what1)+'_time_total' - ) - ) - - ########################## - # Training has completed # - ########################## - - # save final state - if not self.benchrun: - self.save_state( - loss_val=ref_loss, - is_best=is_best - ) - if self.cluster.is_cuda_available() and self.cluster.distributed: - dist.barrier() - - ######################## - # Print training stats # - ######################## - - if self.cluster.is_main_worker(): - print('-'*56) - print('training results:') - print(f'TIMER: first epoch time: {first_ep_t}s') - print(f'TIMER: last epoch time: {time.time()-lt}s') - print( - f'TIMER: average epoch time: {(time.time()-et)/self.epochs}s') - print(f'TIMER: total epoch time: {time.time()-et}s') - if self.epoch_idx > 1: - print( - f'TIMER: total epoch-1 time: {time.time()-et-first_ep_t}s' - ) - print( - 'TIMER: average epoch-1 time: ' - f'{(time.time()-et-first_ep_t)/(self.epochs-1)}s') - if self.benchrun: - print( - f'TIMER: total epoch-2 time: {lt-first_ep_t}s') - print('TIMER: average epoch-2 time: ' - f'{(lt-first_ep_t)/(self.epochs-2)}s') - mem = int(torch.cuda.memory_reserved( - self.cluster.local_rank)/1024/1024) - print( - f'memory req: {mem} MB' - if self.cluster.is_cuda_available() - and self.cluster.distributed else 'memory req: - MB' - ) - if self.cluster.is_cuda_available(): - print( - f'memory summary:\n {torch.cuda.memory_summary(0)}') - - if self.cluster.is_main_worker(): - print(f'TIMER: final time: {time.time()-st} s') - - def save_state(self, loss_val: Any, is_best: bool): - """Save training state.""" - res_name = 'checkpoint.pth.tar' - rt = time.time() - - if (self.cluster.is_cuda_available() and self.cluster.distributed): - # find if is_best happened in any worker - is_best_m = par_allgather_obj( - is_best, self.cluster.global_world_size - ) - if any(is_best_m): - # TODO: is this strategy really good? Checkpointing when - # at least one worker improves the loss on their local - # data split is prone to overfitting, especially when - # the dataset in unbalanced! 
- - # find which rank is_best happened - select first rank - # if multiple - best_rank = np.where(np.array(is_best_m))[0][0] - if self.cluster.global_rank == best_rank: - self._save_sate( - epoch=self.epoch_idx+1, - loss_val=loss_val, - save_path=res_name - ) - print( - f'DEBUG: state in {self.cluster.global_rank} is ' - f'saved on epoch:{self.epoch_idx} ' - f'in {time.time()-rt} s') - else: - self._save_sate( - epoch=self.epoch_idx+1, - loss_val=loss_val, - save_path=res_name - ) - print( - f'DEBUG: state in {self.cluster.global_rank} ' - f'is saved on epoch:{self.epoch_idx} in {time.time()-rt} s') - def _save_sate( +class TorchLightningTrainer(Trainer): + """Generic trainer for torch Lightning workflows. + + Args: + config (Union[Dict, str]): (path to a) Lightning configuration + https://pytorch-lightning.readthedocs.io/en/1.6.5/common/lightning_cli.html + mlflow_saved_model (str, optional): name of the model created in + MLFlow. Defaults to 'my_model'. + """ + + def __init__( self, - epoch: int, - loss_val: Any, - save_path: str + config: Union[Dict, str], + mlflow_saved_model: str = 'my_model' ): - """Save state on disk.""" - sched = ( - self.lr_scheduler.state_dict() - if self.lr_scheduler is not None else None + self.save_parameters(**self.locals2params(locals())) + super().__init__() + if isinstance(config, str) and os.path.isfile(config): + # Load from YAML + config = load_yaml(config) + self.conf = config + self.mlflow_saved_model = mlflow_saved_model + + @monitor_exec + def execute(self) -> Any: + init_lightning_mlflow( + self.conf, + tmp_dir='/tmp', + registered_model_name=self.mlflow_saved_model ) - state = { - 'epoch': epoch, - 'state_dict': self.model.state_dict(), - 'best_loss': loss_val, - 'optimizer': self.optimizer.state_dict(), - 'lr_scheduler': sched - } - self.log( - item=state, - identifier=save_path, - kind='torch', - epoch_step=self.epoch_idx, - batch_step=0 + old_argv = sys.argv + sys.argv = ['some_script_placeholder.py'] + cli = LightningCLI( + args=self.conf, + model_class=L.LightningModule, + datamodule_class=L.LightningDataModule, + run=False, + save_config_kwargs={ + "overwrite": True, + "config_filename": "pl-training.yml", + }, + subclass_mode_model=True, + subclass_mode_data=True, ) + sys.argv = old_argv + cli.trainer.fit(cli.model, datamodule=cli.datamodule) + teardown_lightning_mlflow() - def load_state(self): - """Load training state.""" - res_name = 'checkpoint.pth.tar' - if os.path.isfile(res_name) and not self.benchrun: - try: - if (self.cluster.is_cuda_available() - and self.cluster.distributed): - dist.barrier() - # Map model to be loaded to specified single gpu. 
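A hypothetical use of the new `TorchLightningTrainer`; the YAML path and the registered model name are placeholders, and the file is expected to follow the LightningCLI schema referenced in the docstring above:

```python
from itwinai.torch.trainer import TorchLightningTrainer

# The config can be a path to a YAML file or an equivalent dict already
# loaded in memory (trainer/model/data sections, LightningCLI style).
trainer = TorchLightningTrainer(
    config='lightning-config.yml',
    mlflow_saved_model='mnist-classifier',
)
trainer.execute()
```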
- # loc = ( - # {'cuda:%d' % 0: 'cuda:%d' % self.cluster.local_rank} - # if self.cluster.is_cuda_available() - # else {'cpu:%d' % 0: 'cpu:%d' % self.cluster.local_rank} - # ) - # checkpoint = torch.load(res_name, map_location=loc) - checkpoint = torch.load( - res_name, map_location=self.device - ) - else: - checkpoint = torch.load(res_name, map_location='cpu') - self.start_epoch = checkpoint['epoch'] - self.best_loss = checkpoint['best_loss'] - self.model.load_state_dict(checkpoint['state_dict']) - self.optimizer.load_state_dict(checkpoint['optimizer']) - if self.lr_scheduler is not None: - self.lr_scheduler.load_state_dict( - checkpoint['lr_scheduler'] - ) - if self.cluster.is_cuda_available(): - if self.cluster.is_main_worker(): - print( - f'WARNING: restarting from {self.start_epoch} ' - 'epoch') - else: - print( - f'WARNING: restarting from {self.start_epoch} epoch') - except Exception: - if self.cluster.is_cuda_available(): - if self.cluster.is_main_worker(): - print( - 'restart file cannot be loaded, restarting!') - else: - print( - 'WARNING: restart file cannot be loaded, restarting!') - - if self.start_epoch >= self.epochs + 1: - if self.cluster.is_cuda_available() and self.cluster.distributed: - if self.cluster.is_main_worker(): - print( - 'WARNING: given epochs are less than the ' - 'one in the restart file!') - print('WARNING: SYS.EXIT is issued') - sys.exit() - else: - print( - 'WARNING: given epochs are less than the ' - 'one in the restart file!') - print('WARNING: SYS.EXIT is issued') - sys.exit() + +def preproc_dataloader(dataloader: DataLoader, gwsize, grank): + """Makes a Dataloader distributed.""" + sampler = DistributedSampler( + dataloader.dataset, + num_replicas=gwsize, + rank=grank, + shuffle=True + ) + # Recreate dataloader, with updated sampler + return DataLoader( + dataloader.dataset, + batch_size=dataloader.batch_size, + sampler=sampler, + num_workers=dataloader.num_workers, + collate_fn=dataloader.collate_fn, + pin_memory=dataloader.pin_memory, + drop_last=dataloader.drop_last, + timeout=dataloader.timeout, + worker_init_fn=seed_worker, # dataloader.worker_init_fn, + multiprocessing_context=dataloader.multiprocessing_context, + generator=dataloader.generator, + prefetch_factor=dataloader.prefetch_factor, + persistent_workers=dataloader.persistent_workers, + pin_memory_device=dataloader.pin_memory_device + ) + + +def distributed(func): + """The decorated function must have a standard signature. + Its first arguments must be: + model, train_dataloader, validation_dataloader, device (in this order). + + Additional args or kwargs are allowed consistently with the signature + of the decorated function. 
+ """ + def dist_train( + model, train_dataloader, validation_dataloader=None, device='cpu', + *args, **kwargs + ): + if torch.cuda.is_available(): + dist.init_process_group(backend='nccl') + + if torch.cuda.is_available(): + lwsize = torch.cuda.device_count() # local world size - per node + gwsize = dist.get_world_size() # global world size - per run + grank = dist.get_rank() # global rank - assign per run + lrank = dist.get_rank() % lwsize # local rank - assign per node + else: + gwsize = 1 + grank = 0 + lrank = 0 + + device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu', lrank) + if torch.cuda.is_available(): + torch.cuda.set_device(lrank) + + model = model.to(device) + model = DDP(model, device_ids=[device], output_device=device) + + train_dataloader = preproc_dataloader(train_dataloader, gwsize, grank) + if validation_dataloader is not None: + validation_dataloader = preproc_dataloader( + validation_dataloader, gwsize, grank) + + try: + func(model, train_dataloader, validation_dataloader, device, + *args, **kwargs) + finally: + if torch.cuda.is_available(): + dist.barrier() + dist.destroy_process_group() + return dist_train diff --git a/src/itwinai/torch/types.py b/src/itwinai/torch/types.py index 614462ad..0b6f88ad 100644 --- a/src/itwinai/torch/types.py +++ b/src/itwinai/torch/types.py @@ -64,3 +64,11 @@ class TorchOptimizer(BaseEnum): """ SGD = 'SGD' ADAM = 'Adam' + + +class UninitializedStrategyError(Exception): + """Error raised when a strategy has not been initialized.""" + + +class DistributedStrategyError(Exception): + """Error raised when a strategy has already been initialized.""" diff --git a/src/itwinai/torch/utils.py b/src/itwinai/torch/utils.py deleted file mode 100644 index 99bcd246..00000000 --- a/src/itwinai/torch/utils.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import Hashable, Dict -import time -import numpy as np -import random - -import torch -import torch.distributed as dist - - -def save_state( - epoch, distrib_model, loss_val, optimizer, res_name, grank, gwsize, - is_best, distributed: bool = True -): - """Save training state""" - rt = time.time() - # find if is_best happened in any worker - if torch.cuda.is_available() and distributed: - is_best_m = par_allgather_obj(is_best, gwsize) - - if torch.cuda.is_available() and distributed: - if any(is_best_m): - # find which rank is_best happened - select first rank if multiple - is_best_rank = np.where(np.array(is_best_m))[0][0] - - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_loss': loss_val, - 'optimizer': optimizer.state_dict()} - - # write on worker with is_best - if grank == is_best_rank: - torch.save(state, './'+res_name) - print(f'DEBUG: state in {grank} is saved on ' - f'epoch:{epoch} in {time.time()-rt} s') - else: - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_loss': loss_val, - 'optimizer': optimizer.state_dict()} - - torch.save(state, './'+res_name) - print( - f'DEBUG: state in {grank} is saved on epoch:{epoch} ' - f'in {time.time()-rt} s') - - -def seed_worker(worker_id): - """deterministic dataloader""" - worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) - random.seed(worker_seed) - - -def par_allgather_obj(obj, gwsize): - """gathers any object from the whole group in a list (to all workers)""" - res = [None]*gwsize - dist.all_gather_object(res, obj, group=None) - # print(f'ALLGATHER: {res}') - return res - - -def clear_key( - my_dict: Dict, - 
dict_name: str, - key: Hashable, - complain: bool = True -) -> Dict: - """Remove key from dictionary if present and complain. - - Args: - my_dict (Dict): Dictionary. - dict_name (str): name of the dictionary. - key (Hashable): Key to remove. - """ - if key in my_dict: - if complain: - print( - f"Field '{key}' should not be present " - f"in dictionary '{dict_name}'" - ) - del my_dict[key] - return my_dict diff --git a/src/itwinai/utils.py b/src/itwinai/utils.py index 52279aeb..280de5d3 100644 --- a/src/itwinai/utils.py +++ b/src/itwinai/utils.py @@ -1,14 +1,11 @@ """ Utilities for itwinai package. """ -from typing import Dict, Type, Callable, Tuple -import os +from typing import Dict, Type, Callable, Tuple, Hashable import sys import inspect from collections.abc import MutableMapping import yaml -from omegaconf import OmegaConf -from omegaconf.dictconfig import DictConfig def load_yaml(path: str) -> Dict: @@ -32,32 +29,6 @@ def load_yaml(path: str) -> Dict: return loaded_config -def load_yaml_with_deps(path: str) -> DictConfig: - """ - Load YAML file with OmegaConf and merge it with its dependencies - specified in the `conf-dependencies` field. - Assume that the dependencies live in the same folder of the - YAML file which is importing them. - - Args: - path (str): path to YAML file. - - Raises: - exc: yaml.YAMLError for loading/parsing errors. - - Returns: - DictConfig: nested representation of parsed YAML file. - """ - yaml_conf = load_yaml(path) - use_case_dir = os.path.dirname(path) - deps = [] - if yaml_conf.get("conf-dependencies"): - for dependency in yaml_conf["conf-dependencies"]: - deps.append(load_yaml(os.path.join(use_case_dir, dependency))) - - return OmegaConf.merge(yaml_conf, *deps) - - def dynamically_import_class(name: str) -> Type: """ Dynamically import class by module path. @@ -115,18 +86,6 @@ def flatten_dict( return dict(items) -# Parse (part of) YAML loaded in memory -def parse_pipe_config(yaml_file, parser): - with open(yaml_file, "r", encoding="utf-8") as f: - try: - config = yaml.safe_load(f) - except yaml.YAMLError as exc: - print(exc) - raise exc - - return parser.parse_object(config) - - class SignatureInspector: """Provides the functionalities to inspect the signature of a function or a method. @@ -181,3 +140,42 @@ def max_params_num(self) -> int: if self.has_kwargs or self.has_varargs: return self.INFTY return len(self.func_params) + + +def str_to_slice(interval: str) -> slice: + import re + # TODO: add support for slices starting with empty index + # e.g., :20:3 + if not re.match(r"\d+(:\d+)?(:\d+)?", interval): + raise ValueError( + f"Received invalid interval for slice: '{interval}'" + ) + if ":" in interval: + return slice(*map( + lambda x: int(x.strip()) if x.strip() else None, + interval.split(':') + )) + return int(interval) + + +def clear_key( + my_dict: Dict, + dict_name: str, + key: Hashable, + complain: bool = True +) -> Dict: + """Remove key from dictionary if present and complain. + + Args: + my_dict (Dict): Dictionary. + dict_name (str): name of the dictionary. + key (Hashable): Key to remove. + """ + if key in my_dict: + if complain: + print( + f"Field '{key}' should not be present " + f"in dictionary '{dict_name}'" + ) + del my_dict[key] + return my_dict diff --git a/tests/components/test_components.py b/tests/components/test_components.py index 3ec55453..890188d7 100644 --- a/tests/components/test_components.py +++ b/tests/components/test_components.py @@ -74,11 +74,6 @@ class MyTrainer(Trainer): def execute(self): ... 
- def save_state(self): - ... - - def load_state(self): - ... comp = MyTrainer() with pytest.raises(SerializationError) as exc_info: dict_serializ = comp.to_dict() diff --git a/tests/test_cli.py b/tests/test_cli.py deleted file mode 100644 index 26b57cb0..00000000 --- a/tests/test_cli.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Test itwinai CLI. -""" - -import subprocess -import pytest - - -@pytest.mark.skip(reason="cli deprecated") -def test_datasets_viz(): - """ - Test visualization of use case's dataset registry. - """ - USE_CASE = "use-cases/mnist/" - subprocess.run( - f"itwinai datasets --use-case {USE_CASE}".split(), check=True) - - -@pytest.mark.skip(reason="cli deprecated") -def test_workflows_viz(): - """ - Test visualization of use case's workflows. - """ - USE_CASE = "./use-cases/mnist/" - subprocess.run( - f"itwinai workflows --use-case {USE_CASE}".split(), check=True) diff --git a/tests/use-cases/conftest.py b/tests/use-cases/conftest.py index d080e0a8..69229db6 100644 --- a/tests/use-cases/conftest.py +++ b/tests/use-cases/conftest.py @@ -2,9 +2,9 @@ from typing import Callable import pytest import subprocess +import random +import string -pytest.TORCH_PREFIX = './.venv-pytorch' -pytest.TF_PREFIX = './.venv-tf' FNAMES = [ 'pipeline.yaml', @@ -12,6 +12,52 @@ ] +def rnd_string(len: int = 26): + return ''.join(random.sample(string.ascii_lowercase, len)) + + +@pytest.fixture +def tmp_test_dir(): + root = '/tmp/pytest' + os.makedirs(root, exist_ok=True) + test_dir = os.path.join(root, rnd_string()) + while os.path.exists(test_dir): + test_dir = os.path.join(root, rnd_string()) + os.makedirs(test_dir, exist_ok=True) + + yield test_dir + + # Optional: remove dir here... + + +@pytest.fixture +def torch_env() -> str: + """ + Return absolute path to torch virtual environment parsing it + from environment variables, if provided, otherwise fall back + to ``./.venv-pytorch``. + """ + if os.environ.get('TORCH_ENV') is None: + env_p = './.venv-pytorch' + else: + env_p = os.environ.get('TORCH_ENV') + return os.path.abspath(env_p) + + +@pytest.fixture +def tf_env() -> str: + """ + Return absolute path to tensorflow virtual environment parsing it + from environment variables, if provided, otherwise fall back + to ``./.venv-tf``. + """ + if os.environ.get('TF_ENV') is None: + env_p = './.venv-tf' + else: + env_p = os.environ.get('TF_ENV') + return os.path.abspath(env_p) + + @pytest.fixture def check_folder_structure() -> Callable: """ @@ -31,7 +77,6 @@ def install_requirements() -> Callable: def _install_reqs(root: str, env_prefix: str): req_path = os.path.join(root, 'requirements.txt') if os.path.isfile(req_path): - cmd = (f"micromamba run -p {env_prefix} " - f"pip install -r {req_path}") + cmd = f"{env_prefix}/bin/pip install -r {req_path}" subprocess.run(cmd.split(), check=True) return _install_reqs diff --git a/tests/use-cases/test_3dgan.py b/tests/use-cases/test_3dgan.py index c57e21ff..7f4503e4 100644 --- a/tests/use-cases/test_3dgan.py +++ b/tests/use-cases/test_3dgan.py @@ -3,73 +3,63 @@ """ import pytest import subprocess -# from itwinai.utils import dynamically_import_class +import os CERN_PATH = "use-cases/3dgan" -CKPT_PATH = "3dgan-inference.pth" - - -@pytest.fixture(scope="module") -def fake_model_checkpoint() -> None: - """ - Create a dummy model checkpoint for inference. 
- """ - import sys - import torch - sys.path.append(CERN_PATH) - from model import ThreeDGAN - # ThreeDGAN = dynamically_import_class('model.ThreeDGAN') - net = ThreeDGAN() - torch.save(net, CKPT_PATH) +CKPT_NAME = "3dgan-inference.pth" +@pytest.mark.skip("deprecated") def test_structure_3dgan(check_folder_structure): """Test 3DGAN folder structure.""" check_folder_structure(CERN_PATH) @pytest.mark.functional -def test_3dgan_train(install_requirements): +def test_3dgan_train(torch_env, tmp_test_dir, install_requirements): """ Test 3DGAN torch lightning trainer by running it end-to-end. """ - install_requirements(CERN_PATH, pytest.TORCH_PREFIX) - # cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} python " - # f"{CERN_PATH}/train.py -p {CERN_PATH}/pipeline.yaml") - trainer_params = "pipeline.init_args.steps.training_step.init_args" - cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} itwinai exec-pipeline " - f"--config {CERN_PATH}/pipeline.yaml " - f'-o {trainer_params}.config.trainer.accelerator=cpu ' - f'-o {trainer_params}.config.trainer.strategy=auto ' + install_requirements(CERN_PATH, torch_env) + conf = os.path.join(os.path.abspath(CERN_PATH), 'config.yaml') + cmd = (f"{torch_env}/bin/itwinai exec-pipeline " + f"--config {conf} --pipe-key training_pipeline " + '-o hw_accelerators=auto ' + '-o distributed_strategy=auto ' ) - subprocess.run(cmd.split(), check=True) + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) @pytest.mark.functional -def test_3dgan_inference(install_requirements, fake_model_checkpoint): +def test_3dgan_inference( + torch_env, + tmp_test_dir, + install_requirements, + # fake_model_checkpoint +): """ Test 3DGAN torch lightning trainer by running it end-to-end. """ - install_requirements(CERN_PATH, pytest.TORCH_PREFIX) - # cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} python " - # f"{CERN_PATH}/train.py -p {CERN_PATH}/pipeline.yaml") - # cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} itwinai exec-pipeline " - # f"--config {CERN_PATH}/inference-pipeline.yaml") + install_requirements(CERN_PATH, torch_env) + + # Create fake inference dataset and checkpoint + exec = os.path.join(os.path.abspath(CERN_PATH), + 'create_inference_sample.py') + cmd = (f"{torch_env}/bin/python {exec} " + f"--root {tmp_test_dir} " + f"--ckpt-name {CKPT_NAME}") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) - getter_params = "pipeline.init_args.steps.dataloading_step.init_args" - trainer_params = "pipeline.init_args.steps.inference_step.init_args" - logger_params = trainer_params + ".config.trainer.logger.init_args" - data_params = trainer_params + ".config.data.init_args" - saver_params = "pipeline.init_args.steps.saver_step.init_args" + # Test inference + conf = os.path.join(os.path.abspath(CERN_PATH), 'config.yaml') cmd = ( - 'itwinai exec-pipeline ' - '--config use-cases/3dgan/inference-pipeline.yaml ' - f'-o {getter_params}.data_path=exp_data ' - f'-o {trainer_params}.model.init_args.model_uri={CKPT_PATH} ' - f'-o {trainer_params}.config.trainer.accelerator=cpu ' - f'-o {trainer_params}.config.trainer.strategy=auto ' - f'-o {logger_params}.save_dir=ml_logs/mlflow_logs ' - f'-o {data_params}.datapath=exp_data/*/*.h5 ' - f'-o {saver_params}.save_dir=3dgan-generated-data ' + f'{torch_env}/bin/itwinai exec-pipeline ' + f'--config {conf} --pipe-key inference_pipeline ' + '-o dataset_location=exp_data ' + f'-o inference_model_uri={CKPT_NAME} ' + '-o hw_accelerators=auto ' + '-o distributed_strategy=auto ' + '-o logs_dir=ml_logs/mlflow_logs ' + '-o 
inference_results_location=3dgan-generated-data ' ) - subprocess.run(cmd.split(), check=True) + subprocess.run(cmd.split(), check=True, cwd=CERN_PATH) diff --git a/tests/use-cases/test_cyclones.py b/tests/use-cases/test_cyclones.py index 1a5ebb3f..d6a1ea2c 100644 --- a/tests/use-cases/test_cyclones.py +++ b/tests/use-cases/test_cyclones.py @@ -7,10 +7,12 @@ import pytest import subprocess +import os CYCLONES_PATH = "use-cases/cyclones" +@pytest.mark.skip("deprecated") def test_structure_cyclones(check_folder_structure): """Test cyclones folder structure.""" check_folder_structure(CYCLONES_PATH) @@ -18,11 +20,14 @@ def test_structure_cyclones(check_folder_structure): @pytest.mark.functional @pytest.mark.memory_heavy -def test_cyclones_train_tf(install_requirements): +def test_cyclones_train_tf(tf_env, tmp_test_dir, install_requirements): """ Test Cyclones tensorflow trainer by running it end-to-end. """ - install_requirements(CYCLONES_PATH, pytest.TF_PREFIX) - cmd = (f"micromamba run -p {pytest.TF_PREFIX} python " - f"{CYCLONES_PATH}/train.py -p {CYCLONES_PATH}/pipeline.yaml") - subprocess.run(cmd.split(), check=True) + # TODO: create a small sample dataset for tests only + install_requirements(CYCLONES_PATH, tf_env) + pipe = os.path.join(os.path.abspath(CYCLONES_PATH), 'pipeline.yaml') + train = os.path.join(os.path.abspath(CYCLONES_PATH), 'train.py') + cmd = (f"{tf_env}/bin/python {train} " + f"-p {pipe}") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) diff --git a/tests/use-cases/test_mnist.py b/tests/use-cases/test_mnist.py index d32aab1c..1f18a8e6 100644 --- a/tests/use-cases/test_mnist.py +++ b/tests/use-cases/test_mnist.py @@ -7,72 +7,100 @@ import pytest import subprocess +import os +# from itwinai.cli import exec_pipeline TORCH_PATH = "use-cases/mnist/torch" LIGHTNING_PATH = "use-cases/mnist/torch-lightning" TF_PATH = "use-cases/mnist/tensorflow" +@pytest.mark.skip(reason="structure changed") def test_structure_mnist_torch(check_folder_structure): """Test MNIST folder structure for torch native trainer.""" check_folder_structure(TORCH_PATH) +@pytest.mark.skip(reason="structure changed") def test_structure_mnist_lightning(check_folder_structure): """Test MNIST folder structure for torch lightning trainer.""" check_folder_structure(LIGHTNING_PATH) +@pytest.mark.skip(reason="structure changed") def test_structure_mnist_tf(check_folder_structure): """Test MNIST folder structure for tensorflow trainer.""" check_folder_structure(TF_PATH) @pytest.mark.functional -def test_mnist_train_torch(install_requirements): +def test_mnist_train_torch(torch_env, tmp_test_dir, install_requirements): """ Test MNIST torch native trainer by running it end-to-end. + + To set the torch env path set the ``TORCH_ENV`` env variable: + + >>> export TORCH_ENV="my_env" """ - install_requirements(TORCH_PATH, pytest.TORCH_PREFIX) - cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} python " - f"{TORCH_PATH}/train.py -p {TORCH_PATH}/pipeline.yaml") - subprocess.run(cmd.split(), check=True) + install_requirements(TORCH_PATH, torch_env) + conf = os.path.join(os.path.abspath(TORCH_PATH), 'config.yaml') + cmd = (f"{torch_env}/bin/itwinai exec-pipeline " + f"--config {conf} --pipe-key training_pipeline") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) @pytest.mark.functional -def test_mnist_train_lightning(install_requirements): +def test_mnist_inference_torch(torch_env, tmp_test_dir, install_requirements): """ - Test MNIST torch lightning trainer by running it end-to-end. 
+ Test MNIST torch native inference by running it end-to-end. + + To set the torch env path set the ``TORCH_ENV`` env variable: + + >>> export TORCH_ENV="my_env" """ - install_requirements(TORCH_PATH, pytest.TORCH_PREFIX) - cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} python " - f"{LIGHTNING_PATH}/train.py -p {LIGHTNING_PATH}/pipeline.yaml") - subprocess.run(cmd.split(), check=True) + install_requirements(TORCH_PATH, torch_env) + + # Create fake inference dataset and checkpoint + exec = os.path.join(os.path.abspath(TORCH_PATH), + 'create_inference_sample.py') + cmd = (f"{torch_env}/bin/python {exec} " + f"--root {tmp_test_dir}") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) + + # Test inference + conf = os.path.join(os.path.abspath(TORCH_PATH), 'config.yaml') + cmd = (f"{torch_env}/bin/itwinai exec-pipeline " + f"--config {conf} --pipe-key inference_pipeline") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) @pytest.mark.functional -def test_mnist_train_tf(install_requirements): +def test_mnist_train_torch_lightning( + torch_env, + tmp_test_dir, + install_requirements +): """ - Test MNIST tensorflow trainer by running it end-to-end. + Test MNIST torch lightning trainer by running it end-to-end. + + To set the torch env path set the ``TORCH_ENV`` env variable: + + >>> export TORCH_ENV="my_env" """ - install_requirements(TF_PATH, pytest.TF_PREFIX) - cmd = (f"micromamba run -p {pytest.TF_PREFIX} python " - f"{TF_PATH}/train.py -p {TF_PATH}/pipeline.yaml") - subprocess.run(cmd.split(), check=True) + install_requirements(LIGHTNING_PATH, torch_env) + conf = os.path.join(os.path.abspath(LIGHTNING_PATH), 'config.yaml') + cmd = (f"{torch_env}/bin/itwinai exec-pipeline " + f"--config {conf} --pipe-key training_pipeline") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) -@pytest.mark.skip(reason="workflow changed. Left as example") -@pytest.mark.integration -def test_mnist_train_legacy(): +@pytest.mark.functional +def test_mnist_train_tf(tf_env, tmp_test_dir, install_requirements): """ - Test MNIST training workflow(s) by running it end-to-end. + Test MNIST tensorflow trainer by running it end-to-end. 
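+
+    To set the tensorflow env path set the ``TF_ENV`` env variable:
+
+    >>> export TF_ENV="my_env"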
""" - workflows = [ - "./use-cases/mnist/torch/workflows/training-workflow.yml", - "./use-cases/mnist/tensorflow/workflows/training-workflow.yml", - ] - - for workflow in workflows: - cmd = f"micromamba run -p ./.venv python run-workflow.py -f {workflow}" - subprocess.run(cmd.split(), check=True) - subprocess.run(cmd.split() + ["--cwl"], check=True) + install_requirements(TF_PATH, tf_env) + conf = os.path.join(os.path.abspath(TF_PATH), 'pipeline.yaml') + cmd = (f"{tf_env}/bin/itwinai exec-pipeline " + f"--config {conf} --pipe-key pipeline") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) diff --git a/tutorials/distributed-ml/torch-scaling-test/README.md b/tutorials/distributed-ml/torch-scaling-test/README.md index 74e316c0..1344504e 100644 --- a/tutorials/distributed-ml/torch-scaling-test/README.md +++ b/tutorials/distributed-ml/torch-scaling-test/README.md @@ -38,11 +38,16 @@ setting SLURM environment variables using the `--export` option: ```bash # Launch a distributed training setup with Torch DDP -DIST_MODE="ddp" -RUN_NAME="ddp-bl-imagenent" -TRAINING_CMD="ddp_trainer.py -c config/base.yaml -c config/ddp.yaml" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ - --job-name="$RUN_NAME" slurm.sh +export DIST_MODE="ddp" +export RUN_NAME="ddp-bl-imagenent" +export TRAINING_CMD="ddp_trainer.py -c config/base.yaml -c config/ddp.yaml" +export PYTHON_VENV="../../../envAI_hdfml" +export N=2 # Number of nodes +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + --nodes=$N slurm.sh ``` ## Run all training configurations diff --git a/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py b/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py index 54f64fef..0a25ae5b 100755 --- a/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py +++ b/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py @@ -18,8 +18,11 @@ from itwinai.parser import ArgumentParser as ItAIArgumentParser from itwinai.loggers import EpochTimeTracker +from itwinai.torch.reproducibility import ( + seed_worker, set_seed +) -from utils import seed_worker, imagenet_dataset, set_seed +from utils import imagenet_dataset def parse_params(): @@ -121,7 +124,7 @@ def main(): dist.init_process_group(backend=args.backend) # Set random seed for reproducibility - torch_prng = set_seed(args.rnd_seed, use_cuda) + torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False) if is_distributed: # get job rank info - rank==0 master gpu diff --git a/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py b/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py index 691712e8..e6022021 100644 --- a/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py +++ b/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py @@ -18,8 +18,11 @@ from itwinai.parser import ArgumentParser as ItAIArgumentParser from itwinai.loggers import EpochTimeTracker +from itwinai.torch.reproducibility import ( + seed_worker, set_seed +) -from utils import seed_worker, set_seed, imagenet_dataset +from utils import imagenet_dataset def parse_params(): @@ -124,7 +127,7 @@ def main(): deepspeed.init_distributed(dist_backend=args.backend) # Set random seed for reproducibility - torch_prng = set_seed(args.rnd_seed, use_cuda) + torch_prng = set_seed(args.rnd_seed, 
deterministic_cudnn=False) if is_distributed: # Get job rank info - rank==0 master gpu @@ -248,7 +251,7 @@ def main(): print('TIMER: epoch time:', timer()-lt, 's') epoch_time_tracker.add_epoch_time(epoch-1, timer()-lt) - if torch.cuda.is_available(): + if is_distributed: dist.barrier() if grank == 0: diff --git a/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py b/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py index 501b545c..a4c3eaa4 100755 --- a/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py +++ b/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py @@ -19,8 +19,11 @@ from itwinai.parser import ArgumentParser as ItAIArgumentParser from itwinai.loggers import EpochTimeTracker +from itwinai.torch.reproducibility import ( + seed_worker, set_seed +) -from utils import imagenet_dataset, seed_worker, set_seed +from utils import imagenet_dataset def parse_params(): @@ -129,7 +132,7 @@ def main(): hvd.init() # Set random seed for reproducibility - torch_prng = set_seed(args.rnd_seed, use_cuda) + torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False) # is_main_worker = True # if is_distributed and (hvd.rank() != 0 or hvd.local_rank() != 0): diff --git a/tutorials/distributed-ml/torch-scaling-test/img/report.png b/tutorials/distributed-ml/torch-scaling-test/img/report.png index 53bb708a..4e81996e 100644 Binary files a/tutorials/distributed-ml/torch-scaling-test/img/report.png and b/tutorials/distributed-ml/torch-scaling-test/img/report.png differ diff --git a/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py b/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py index a1eacc20..cded83af 100644 --- a/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py +++ b/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py @@ -21,14 +21,17 @@ from itwinai.torch.distributed import ( TorchDistributedStrategy, - DDPDistributedStrategy, - HVDDistributedStrategy, - DSDistributedStrategy, + TorchDDPStrategy, + HorovodStrategy, + DeepSpeedStrategy, ) from itwinai.parser import ArgumentParser as ItAIArgumentParser from itwinai.loggers import EpochTimeTracker +from itwinai.torch.reproducibility import ( + seed_worker, set_seed +) -from utils import seed_worker, imagenet_dataset, set_seed +from utils import imagenet_dataset def parse_params() -> argparse.Namespace: @@ -116,8 +119,8 @@ def train( model.train() t_list = [] loss_acc = 0 - gwsize = strategy.dist_gwsize() - if strategy.is_main_worker(): + gwsize = strategy.global_world_size() + if strategy.is_main_worker: print("\n") for batch_idx, (data, target) in enumerate(train_loader): t = timer() @@ -127,7 +130,7 @@ def train( loss = F.nll_loss(output, target) loss.backward() optimizer.step() - if (strategy.is_main_worker() and args.log_int > 0 + if (strategy.is_main_worker and args.log_int > 0 and batch_idx % args.log_int == 0): print( f'Train epoch: {epoch} ' @@ -136,7 +139,7 @@ def train( f'Loss: {loss.item():.6f}') t_list.append(timer() - t) loss_acc += loss.item() - if strategy.is_main_worker(): + if strategy.is_main_worker: print('TIMER: train time', sum(t_list) / len(t_list), 's') return loss_acc @@ -151,10 +154,10 @@ def main(): or not torch.cuda.device_count() > 1): raise RuntimeError('Resources unavailable') - strategy = DDPDistributedStrategy(backend=args.backend) + strategy = TorchDDPStrategy(backend=args.backend) distribute_kwargs = {} elif args.strategy == 'horovod': - strategy = HVDDistributedStrategy() + strategy = HorovodStrategy() 
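+        # Horovod-specific options: gradient compression, reduction op, pre-divide factor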
distribute_kwargs = dict( compression=( hvd.Compression.fp16 if args.fp16_allreduce @@ -164,7 +167,7 @@ def main(): gradient_predivide_factor=args.gradient_predivide_factor ) elif args.strategy == 'deepspeed': - strategy = DSDistributedStrategy(backend=args.backend) + strategy = DeepSpeedStrategy(backend=args.backend) distribute_kwargs = dict( config_params=dict(train_micro_batch_size_per_gpu=args.batch_size) ) @@ -182,19 +185,19 @@ def main(): # Limit # of CPU threads to be used per worker # torch.set_num_threads(1) - # start the timer for profiling + # Start the timer for profiling st = timer() # Set random seed for reproducibility - torch_prng = set_seed(args.rnd_seed, use_cuda) + torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False) - # get job rank info - rank==0 master gpu + # Get job rank info - rank==0 master gpu if is_distributed: # local world size - per node - lwsize = strategy.dist_lwsize() # local world size - per run - gwsize = strategy.dist_gwsize() # global world size - per run - grank = strategy.dist_grank() # global rank - assign per run - lrank = strategy.dist_lrank() # local rank - assign per node + lwsize = strategy.local_world_size() # local world size - per run + gwsize = strategy.global_world_size() # global world size - per run + grank = strategy.global_rank() # global rank - assign per run + lrank = strategy.local_rank() # local rank - assign per node else: # Use a single worker (either on GPU or CPU) lwsize = 1 @@ -202,7 +205,7 @@ def main(): grank = 0 lrank = 0 - if strategy.is_main_worker(): + if strategy.is_main_worker: print('TIMER: initialise:', timer()-st, 's') print('DEBUG: local ranks:', lwsize, '/ global ranks:', gwsize) print('DEBUG: sys.version:', sys.version) @@ -221,7 +224,7 @@ def main(): # Encapsulate the model on the GPU assigned to the current process device = torch.device( - strategy.dist_device() if use_cuda and torch.cuda.is_available() + strategy.device() if use_cuda else 'cpu') if use_cuda: torch.cuda.set_device(lrank) @@ -263,7 +266,7 @@ def main(): ) # Start training loop - if strategy.is_main_worker(): + if strategy.is_main_worker: print('TIMER: broadcast:', timer()-st, 's') print('\nDEBUG: start training') print('--------------------------------------------------------') @@ -302,11 +305,11 @@ def main(): if epoch + 1 == args.epochs: train_loader.last_epoch = True - if strategy.is_main_worker(): + if strategy.is_main_worker: print('TIMER: epoch time:', timer()-lt, 's') epoch_time_tracker.add_epoch_time(epoch-1, timer()-lt) - if strategy.is_main_worker(): + if strategy.is_main_worker: print('\n--------------------------------------------------------') print('DEBUG: training results:\n') print('TIMER: first epoch time:', first_ep_t, ' s') @@ -327,7 +330,7 @@ def main(): print(f'TIMER: final time: {timer()-st} s\n') time.sleep(1) - print(f" - TRAINING FINISHED") + print(f" - TRAINING FINISHED") # Clean-up if is_distributed: diff --git a/tutorials/distributed-ml/torch-scaling-test/runall.sh b/tutorials/distributed-ml/torch-scaling-test/runall.sh index 4f9efdcf..22958c16 100644 --- a/tutorials/distributed-ml/torch-scaling-test/runall.sh +++ b/tutorials/distributed-ml/torch-scaling-test/runall.sh @@ -15,47 +15,75 @@ else fi # Common options -CMD="--nodes=$N --time=$T --account=atmo-rep --partition=booster slurm.sh" -PYTHON_VENV="../../../envAI_juwels" +CMD="--nodes=$N --time=$T --account=intertwin --partition=batch slurm.sh" +PYTHON_VENV="../../../envAI_hdfml" echo "Distributing training over $N nodes. 
Timeout set to: $T" +# Clear SLURM logs (*.out and *.err files) rm -rf logs_slurm mkdir logs_slurm -rm *.out *.err *.csv #*checkpoint.pth.tar +rm -rf logs_torchrun + +# Clear scaling test logs +rm *.csv # *checkpoint.pth.tar # DDP baseline DIST_MODE="ddp" RUN_NAME="ddp-bl-imagenent" TRAINING_CMD="ddp_trainer.py -c config/base.yaml -c config/ddp.yaml" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD # DeepSpeed baseline DIST_MODE="deepspeed" RUN_NAME="deepspeed-bl-imagenent" TRAINING_CMD="deepspeed_trainer.py -c config/base.yaml -c config/deepspeed.yaml" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD # Horovod baseline DIST_MODE="horovod" RUN_NAME="horovod-bl-imagenent" TRAINING_CMD="horovod_trainer.py -c config/base.yaml -c config/horovod.yaml" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD # DDP itwinai DIST_MODE="ddp" RUN_NAME="ddp-itwinai-imagenent" TRAINING_CMD="itwinai_trainer.py -c config/base.yaml -c config/ddp.yaml -s ddp" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD # DeepSpeed itwinai DIST_MODE="deepspeed" RUN_NAME="deepspeed-itwinai-imagenent" TRAINING_CMD="itwinai_trainer.py -c config/base.yaml -c config/deepspeed.yaml -s deepspeed" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD # Horovod itwinai DIST_MODE="horovod" RUN_NAME="horovod-itwinai-imagenent" TRAINING_CMD="itwinai_trainer.py -c config/base.yaml -c config/horovod.yaml -s horovod" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD \ No newline at end of file +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + 
--output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-scaling-test/slurm.sh b/tutorials/distributed-ml/torch-scaling-test/slurm.sh index 93dd4349..c53e3da5 100644 --- a/tutorials/distributed-ml/torch-scaling-test/slurm.sh +++ b/tutorials/distributed-ml/torch-scaling-test/slurm.sh @@ -15,7 +15,7 @@ #SBATCH --partition=batch #SBATCH --nodes=2 #SBATCH --gpus-per-node=4 -#SBATCH --cpus-per-gpu=8 +#SBATCH --cpus-per-gpu=4 #SBATCH --exclusive # gres options have to be disabled for deepv @@ -72,13 +72,13 @@ else source $PYTHON_VENV/bin/activate fi +# Get GPUs info per node +srun --cpu-bind=none --ntasks-per-node=1 bash -c 'echo -e "NODE hostname: $(hostname)\n$(nvidia-smi)\n\n"' + # Launch training if [ "$DIST_MODE" == "ddp" ] ; then echo "DDP training: $TRAINING_CMD" srun --cpu-bind=none --ntasks-per-node=1 \ - --job-name="$RUN_NAME-n$SLURM_NNODES" \ - --output="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.out" \ - --error="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.err" \ bash -c "torchrun \ --log_dir='logs_torchrun' \ --nnodes=$SLURM_NNODES \ @@ -95,9 +95,6 @@ elif [ "$DIST_MODE" == "deepspeed" ] ; then export MASTER_PORT=29500 srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ - --job-name="$RUN_NAME-n$SLURM_NNODES" \ - --output="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.out" \ - --error="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.err" \ python -u $TRAINING_CMD --deepspeed # # Run with deepspeed launcher: set --ntasks-per-node=1 @@ -112,9 +109,6 @@ elif [ "$DIST_MODE" == "deepspeed" ] ; then elif [ "$DIST_MODE" == "horovod" ] ; then echo "HOROVOD training: $TRAINING_CMD" srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ - --job-name="$RUN_NAME-imagenet-n$SLURM_NNODES" \ - --output="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.out" \ - --error="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.err" \ python -u $TRAINING_CMD else >&2 echo "ERROR: unrecognized \$DIST_MODE env variable" diff --git a/tutorials/distributed-ml/torch-scaling-test/utils.py b/tutorials/distributed-ml/torch-scaling-test/utils.py index cbd6aace..a5dc591e 100644 --- a/tutorials/distributed-ml/torch-scaling-test/utils.py +++ b/tutorials/distributed-ml/torch-scaling-test/utils.py @@ -1,40 +1,6 @@ -from typing import Optional -import numpy as np -import random - -import torch from torchvision import datasets, transforms -def seed_worker(worker_id): - worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) - random.seed(worker_seed) - - -def set_seed(rnd_seed: Optional[int], use_cuda: bool) -> torch.Generator: - """Set torch random seed and return a PRNG object. - - Args: - rnd_seed (Optional[int]): random seed. If None, the seed is not set. - use_cuda (bool): whether GPU is available. - - Returns: - torch.Generator: PRNG object. 
- """ - g = torch.Generator() - if rnd_seed is not None: - # Deterministic execution - np.random.seed(rnd_seed) - random.seed(rnd_seed) - torch.manual_seed(rnd_seed) - g.manual_seed(rnd_seed) - if use_cuda: - torch.cuda.manual_seed(rnd_seed) - torch.cuda.manual_seed_all(rnd_seed) - return g - - def imagenet_dataset(data_root: str): """Create a torch dataset object for Imagenet.""" transform = transforms.Compose([ diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/README.md b/tutorials/distributed-ml/torch-tutorial-0-basics/README.md index 5ddcd635..43d42565 100644 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/README.md +++ b/tutorials/distributed-ml/torch-tutorial-0-basics/README.md @@ -23,19 +23,43 @@ should be used to run it: If you want to distribute the code in `train.py` with **torch DDP**, run from terminal: ```bash -sbatch ddp_slurm.sh +export DIST_MODE="ddp" +export RUN_NAME="ddp-itwinai" +export TRAINING_CMD="train.py -s ddp" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` If you want to distribute the code in `train.py` with **DeepSpeed**, run from terminal: ```bash -sbatch deepspeed_slurm.sh +export DIST_MODE="deepspeed" +export RUN_NAME="deepspeed-itwinai" +export TRAINING_CMD="train.py -s deepspeed" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` If you want to distribute the code in `train.py` with **Horovod**, run from terminal: ```bash -sbatch hvd_slurm.sh +export DIST_MODE="deepspeed" +export RUN_NAME="deepspeed-itwinai" +export TRAINING_CMD="train.py -s deepspeed" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` You can run all of them with: diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/ddp_slurm.sh b/tutorials/distributed-ml/torch-tutorial-0-basics/ddp_slurm.sh deleted file mode 100644 index 1b53f04c..00000000 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/ddp_slurm.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DDP_tutorial-0 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ddp.out -#SBATCH --error=job-ddp.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: 
SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch training -TRAINING_CMD="train.py -s ddp" - -srun --cpu-bind=none bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $TRAINING_CMD" - diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/deepspeed_slurm.sh b/tutorials/distributed-ml/torch-tutorial-0-basics/deepspeed_slurm.sh deleted file mode 100644 index b12009de..00000000 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/deepspeed_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DeepSpeed_tutorial-0 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ds.out -#SBATCH --error=job-ds.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=4 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set env vars -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i -export MASTER_ADDR -export MASTER_PORT=29500 - -TRAINING_CMD="train.py -s deepspeed" - -# Run without launcher: set --ntasks-per-node=NUM_GPUS -srun --cpu-bind=none python -u $TRAINING_CMD #--deepspeed - -# srun pwd - -# # Run with deepspeed launcher: set --ntasks-per-node=1 -# # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables -# export NCCL_IB_DISABLE=1 -# export NCCL_SOCKET_IFNAME=eth0 -# nodelist=$(scontrol show hostname $SLURM_NODELIST) -# echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile -# # Requires passwordless SSH access among compute node -# srun --cpu-bind=none deepspeed 
--hostfile=.hostfile $TRAINING_CMD --deepspeed -# rm .hostfile \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/hvd_slurm.sh b/tutorials/distributed-ml/torch-tutorial-0-basics/hvd_slurm.sh deleted file mode 100644 index a2a06e6c..00000000 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/hvd_slurm.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_HVD_tutorial-0 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-hvd.out -#SBATCH --error=job-hvd.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=8 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set vars -# export NCCL_DEBUG=INFO -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -TRAINING_CMD="train.py -s horovod" - -srun --cpu-bind=none python -u $TRAINING_CMD - diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/runall.sh b/tutorials/distributed-ml/torch-tutorial-0-basics/runall.sh index 17c0f190..48a8f1e0 100644 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/runall.sh +++ b/tutorials/distributed-ml/torch-tutorial-0-basics/runall.sh @@ -1,6 +1,39 @@ #!/bin/bash -# Run all versions of distributed ML -rm *.out *.err -echo "Torch DDP training: $(sbatch ddp_slurm.sh)" -echo "DeepSpeed training: $(sbatch deepspeed_slurm.sh)" -echo "Horovod training: $(sbatch hvd_slurm.sh)" \ No newline at end of file + +# Python virtual environment +PYTHON_VENV="../../../envAI_hdfml" + +# Clear SLURM logs (*.out and *.err files) +rm -rf logs_slurm +mkdir logs_slurm +rm -rf logs_torchrun + +# DDP itwinai +DIST_MODE="ddp" +RUN_NAME="ddp-itwinai" +TRAINING_CMD="train.py -s ddp" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# DeepSpeed itwinai +DIST_MODE="deepspeed" +RUN_NAME="deepspeed-itwinai" +TRAINING_CMD="train.py -s deepspeed" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# Horovod itwinai +DIST_MODE="horovod" +RUN_NAME="horovod-itwinai" 
+TRAINING_CMD="train.py -s horovod" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/slurm.sh b/tutorials/distributed-ml/torch-tutorial-0-basics/slurm.sh new file mode 100644 index 00000000..c53e3da5 --- /dev/null +++ b/tutorials/distributed-ml/torch-tutorial-0-basics/slurm.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# SLURM jobscript for JSC systems + +# Job configuration +#SBATCH --job-name=distributed_training +#SBATCH --account=intertwin +#SBATCH --mail-user= +#SBATCH --mail-type=ALL +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --time=00:30:00 + +# Resources allocation +#SBATCH --partition=batch +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=4 +#SBATCH --cpus-per-gpu=4 +#SBATCH --exclusive + +# gres options have to be disabled for deepv +#SBATCH --gres=gpu:4 + +# Load environment modules +ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py + +# Job info +echo "DEBUG: TIME: $(date)" +sysN="$(uname -n | cut -f2- -d.)" +sysN="${sysN%%[0-9]*}" +echo "Running on system: $sysN" +echo "DEBUG: EXECUTE: $EXEC" +echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" +echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" +echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" +echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" +echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" +echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" +echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" +echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" +echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" +if [ "$DEBUG" = true ] ; then + echo "DEBUG: NCCL_DEBUG=INFO" + export NCCL_DEBUG=INFO +fi +echo + +# Setup env for distributed ML +export CUDA_VISIBLE_DEVICES="0,1,2,3" +export OMP_NUM_THREADS=1 +if [ "$SLURM_CPUS_PER_GPU" -gt 0 ] ; then + export OMP_NUM_THREADS=$SLURM_CPUS_PER_GPU +fi + +# Env vairables check +if [ -z "$DIST_MODE" ]; then + >&2 echo "ERROR: env variable DIST_MODE is not set. Allowed values are 'horovod', 'ddp' or 'deepspeed'" + exit 1 +fi +if [ -z "$RUN_NAME" ]; then + >&2 echo "WARNING: env variable RUN_NAME is not set. It's a way to identify some specific run of an experiment." + RUN_NAME=$DIST_MODE +fi +if [ -z "$TRAINING_CMD" ]; then + >&2 echo "ERROR: env variable TRAINING_CMD is not set. It's the python command to execute." + exit 1 +fi +if [ -z "$PYTHON_VENV" ]; then + >&2 echo "WARNING: env variable PYTHON_VENV is not set. It's the path to a python virtual environment." 
+else + # Activate Python virtual env + source $PYTHON_VENV/bin/activate +fi + +# Get GPUs info per node +srun --cpu-bind=none --ntasks-per-node=1 bash -c 'echo -e "NODE hostname: $(hostname)\n$(nvidia-smi)\n\n"' + +# Launch training +if [ "$DIST_MODE" == "ddp" ] ; then + echo "DDP training: $TRAINING_CMD" + srun --cpu-bind=none --ntasks-per-node=1 \ + bash -c "torchrun \ + --log_dir='logs_torchrun' \ + --nnodes=$SLURM_NNODES \ + --nproc_per_node=$SLURM_GPUS_PER_NODE \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ + --rdzv_backend=c10d \ + --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ + $TRAINING_CMD" +elif [ "$DIST_MODE" == "deepspeed" ] ; then + echo "DEEPSPEED training: $TRAINING_CMD" + MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i + export MASTER_ADDR + export MASTER_PORT=29500 + + srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ + python -u $TRAINING_CMD --deepspeed + + # # Run with deepspeed launcher: set --ntasks-per-node=1 + # # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables + # export NCCL_IB_DISABLE=1 + # export NCCL_SOCKET_IFNAME=eth0 + # nodelist=$(scontrol show hostname $SLURM_NODELIST) + # echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile + # # Requires passwordless SSH access among compute node + # srun --cpu-bind=none deepspeed --hostfile=.hostfile $TRAINING_CMD --deepspeed + # rm .hostfile +elif [ "$DIST_MODE" == "horovod" ] ; then + echo "HOROVOD training: $TRAINING_CMD" + srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ + python -u $TRAINING_CMD +else + >&2 echo "ERROR: unrecognized \$DIST_MODE env variable" + exit 1 +fi + diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/train.py b/tutorials/distributed-ml/torch-tutorial-0-basics/train.py index 614b56e4..29c0d272 100644 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/train.py +++ b/tutorials/distributed-ml/torch-tutorial-0-basics/train.py @@ -2,19 +2,23 @@ Show how to use DDP, Horovod and DeepSpeed strategies interchangeably with an extremely simple neural network. """ -from typing import Any -import os +from typing import Dict import argparse +import time import torch from torch import nn -from torch.utils.data import DataLoader, Dataset, DistributedSampler +from torch.utils.data import Dataset + +import horovod.torch as hvd from itwinai.torch.distributed import ( + distributed_resources_available, TorchDistributedStrategy, - DDPDistributedStrategy, - HVDDistributedStrategy, - DSDistributedStrategy, + TorchDDPStrategy, + HorovodStrategy, + DeepSpeedStrategy, + NonDistributedStrategy ) @@ -29,6 +33,9 @@ def parse_args() -> argparse.Namespace: "--shuffle_dataloader", action=argparse.BooleanOptionalAction ) + parser.add_argument( + '--batch-size', type=int, default=10, + help='input batch size for training (default: 10)') # DeepSpeed: needs to be removed import deepspeed @@ -55,42 +62,31 @@ def __getitem__(self, index): return torch.rand(self.x_size), torch.rand(self.y_size) -def trainer_entrypoint_fn( - foo: Any, args: argparse.Namespace, strategy: TorchDistributedStrategy +def training_fn( + args: argparse.Namespace, + strategy: TorchDistributedStrategy, + distribute_kwargs: Dict ) -> int: - """Dummy training function. This emulates custom code developed - by some use case. 
- """ + """Dummy training function.""" strategy.init() - print(f"{foo}: {os.environ.get('RANK')} {os.environ.get('LOCAL_RANK')} " - f"{os.environ.get('MASTER_ADDR')} {os.environ.get('MASTER_PORT')}") # Local model model = nn.Linear(3, 4) optim = torch.optim.Adam(model.parameters(), lr=1e-3) loss_fn = nn.MSELoss() # Distributed model - deepspeed_config = dict(train_batch_size=32) - # 'config_params' key is ignored if strategy != DSDistributedStrategy model, optim, lr_sched = strategy.distributed( - model, optim, lr_scheduler=None, config_params=deepspeed_config + model, optim, lr_scheduler=None, **distribute_kwargs ) # Data train_set = UniformRndDataset(x_size=3, y_size=4) # Distributed dataloader - train_loader = DataLoader( - train_set, batch_size=10, num_workers=1, - sampler=DistributedSampler( - train_set, - num_replicas=strategy.dist_gwsize(), - rank=strategy.dist_grank(), - shuffle=args.shuffle_dataloader - ) - ) + train_loader = strategy.create_dataloader( + train_set, batch_size=args.batch_size, num_workers=1) # Device allocated for this worker - device = strategy.dist_device() + device = strategy.device() for epoch in range(2): for (x, y) in train_loader: @@ -107,7 +103,7 @@ def trainer_entrypoint_fn( optim.step() - if strategy.is_main_worker(): + if strategy.is_main_worker: print(f"Loss [epoch={epoch}]: {loss.item()}") print(f"NNLoss [epoch={epoch}]: {loss.item()}") @@ -115,7 +111,8 @@ def trainer_entrypoint_fn( if lr_sched: lr_sched.step() - print(f" - TRAINING FINISHED") + time.sleep(1) + print(f" - TRAINING FINISHED") strategy.clean_up() return 123 @@ -125,19 +122,27 @@ def trainer_entrypoint_fn( args = parse_args() # Instantiate Strategy - if args.strategy == 'ddp': - if (not torch.cuda.is_available() - or not torch.cuda.device_count() > 1): - raise RuntimeError('Resources unavailable') - - strategy = DDPDistributedStrategy(backend='nccl') + if not distributed_resources_available(): + print("WARNING: falling back to non-distributed strategy.") + strategy = NonDistributedStrategy() + distribute_kwargs = {} + elif args.strategy == 'ddp': + strategy = TorchDDPStrategy(backend='nccl') + distribute_kwargs = {} elif args.strategy == 'horovod': - strategy = HVDDistributedStrategy() + strategy = HorovodStrategy() + distribute_kwargs = dict( + compression=hvd.Compression.none, + op=hvd.Average, + gradient_predivide_factor=1.0 + ) elif args.strategy == 'deepspeed': - strategy = DSDistributedStrategy(backend='nccl') + strategy = DeepSpeedStrategy(backend='nccl') + distribute_kwargs = dict( + config_params=dict(train_micro_batch_size_per_gpu=args.batch_size) + ) else: raise NotImplementedError( f"Strategy {args.strategy} is not recognized/implemented.") - # Launch distributed training - trainer_entrypoint_fn("foobar", args, strategy) + training_fn(args, strategy, distribute_kwargs) diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/README.md b/tutorials/distributed-ml/torch-tutorial-1-mnist/README.md index 6f22d3ef..70178f0d 100644 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/README.md +++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/README.md @@ -33,19 +33,43 @@ should be used to run it: If you want to distribute the code in `train.py` with **torch DDP**, run from terminal: ```bash -sbatch ddp_slurm.sh +export DIST_MODE="ddp" +export RUN_NAME="ddp-itwinai" +export TRAINING_CMD="train.py -s ddp -c config.yaml" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch 
--export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` If you want to distribute the code in `train.py` with **DeepSpeed**, run from terminal: ```bash -sbatch deepspeed_slurm.sh +export DIST_MODE="deepspeed" +export RUN_NAME="deepspeed-itwinai" +export TRAINING_CMD="train.py -s deepspeed -c config.yaml" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` If you want to distribute the code in `train.py` with **Horovod**, run from terminal: ```bash -sbatch hvd_slurm.sh +export DIST_MODE="horovod" +export RUN_NAME="horovod-itwinai" +export TRAINING_CMD="train.py -s horovod -c config.yaml" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` You can run all of them with: diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml b/tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml index cb221dec..331d6d04 100644 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml +++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml @@ -1,26 +1,28 @@ -# I/O +# Data and logging data_dir: ./ +log_int: 10 +verbose: True restart_int: 10 download_only: False -verbose: True +dataset_replication: 10 +shuff: False +nworker: 4 # num workers dataloader +prefetch: 2 # Model batch_size: 64 epochs: 2 lr: 0.001 -concM: 100 momentum: 0.5 -shuff: False -# Debugging -testrun: False -nseed: 10 -log_int: 10 +# Reproducibility +rnd_seed: 10 # Distributed ML -backend: nccl -nworker: 4 # num workers dataloader -prefetch: 2 -no_cuda: False +backend: nccl # ignored when using Horovod +# Horovod: ignored when NOT using Horovod +fp16_allreduce: False +use_adasum: False +gradient_predivide_factor: 1.0 diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/ddp_slurm.sh b/tutorials/distributed-ml/torch-tutorial-1-mnist/ddp_slurm.sh deleted file mode 100644 index 3d5d4bb3..00000000 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/ddp_slurm.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DDP_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ddp.out -#SBATCH --error=job-ddp.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo 
"DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch training -TRAINING_CMD="train.py -s ddp -c config.yaml" - -srun --cpu-bind=none bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $TRAINING_CMD" - diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/deepspeed_slurm.sh b/tutorials/distributed-ml/torch-tutorial-1-mnist/deepspeed_slurm.sh deleted file mode 100644 index 8e5f7881..00000000 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/deepspeed_slurm.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DeepSpeed_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ds.out -#SBATCH --error=job-ds.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=4 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set env vars -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i -export MASTER_ADDR -export MASTER_PORT=29500 - -TRAINING_CMD="train.py -s deepspeed -c config.yaml" - -# Run without launcher: set --ntasks-per-node=NUM_GPUS -srun --cpu-bind=none python -u $TRAINING_CMD --deepspeed - -# # Run with deepspeed launcher: set --ntasks-per-node=1 -# # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables -# export NCCL_IB_DISABLE=1 -# export NCCL_SOCKET_IFNAME=eth0 -# nodelist=$(scontrol show hostname $SLURM_NODELIST) -# echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile -# # Requires passwordless SSH access among compute node -# srun --cpu-bind=none deepspeed --hostfile=.hostfile $TRAINING_CMD --deepspeed 
-# rm .hostfile - diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/hvd_slurm.sh b/tutorials/distributed-ml/torch-tutorial-1-mnist/hvd_slurm.sh deleted file mode 100644 index 3774b6e1..00000000 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/hvd_slurm.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_HVD_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-hvd.out -#SBATCH --error=job-hvd.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=8 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set vars -# export NCCL_DEBUG=INFO -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -TRAINING_CMD="train.py -s horovod -c config.yaml" - -srun --cpu-bind=none python -u $TRAINING_CMD - diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/runall.sh b/tutorials/distributed-ml/torch-tutorial-1-mnist/runall.sh index b1470d75..5a89b4fe 100644 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/runall.sh +++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/runall.sh @@ -1,6 +1,39 @@ #!/bin/bash -# Run all versions of distributed ML for MNIST -rm *checkpoint.pth.tar *.out *.err -echo "Torch DDP training: $(sbatch ddp_slurm.sh)" -echo "DeepSpeed training: $(sbatch deepspeed_slurm.sh)" -echo "Horovod training: $(sbatch hvd_slurm.sh)" \ No newline at end of file + +# Python virtual environment +PYTHON_VENV="../../../envAI_hdfml" + +# Clear SLURM logs (*.out and *.err files) +rm -rf logs_slurm +mkdir logs_slurm +rm -rf logs_torchrun + +# DDP itwinai +DIST_MODE="ddp" +RUN_NAME="ddp-itwinai" +TRAINING_CMD="train.py -s ddp -c config.yaml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# DeepSpeed itwinai +DIST_MODE="deepspeed" +RUN_NAME="deepspeed-itwinai" +TRAINING_CMD="train.py -s deepspeed -c config.yaml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# Horovod itwinai +DIST_MODE="horovod" +RUN_NAME="horovod-itwinai" 
+TRAINING_CMD="train.py -s horovod -c config.yaml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/slurm.sh b/tutorials/distributed-ml/torch-tutorial-1-mnist/slurm.sh new file mode 100644 index 00000000..3eef38ae --- /dev/null +++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/slurm.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +# SLURM jobscript for JSC systems + +# Job configuration +#SBATCH --job-name=distributed_training +#SBATCH --account=intertwin +#SBATCH --mail-user= +#SBATCH --mail-type=ALL +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --time=00:30:00 + +# Resources allocation +#SBATCH --partition=batch +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=4 +#SBATCH --cpus-per-gpu=4 +#SBATCH --exclusive + +# gres options have to be disabled for deepv +#SBATCH --gres=gpu:4 + +# Load environment modules +ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py + +# Job info +echo "DEBUG: TIME: $(date)" +sysN="$(uname -n | cut -f2- -d.)" +sysN="${sysN%%[0-9]*}" +echo "Running on system: $sysN" +echo "DEBUG: EXECUTE: $EXEC" +echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" +echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" +echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" +echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" +echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" +echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" +echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" +echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" +echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" +if [ "$DEBUG" = true ] ; then + echo "DEBUG: NCCL_DEBUG=INFO" + export NCCL_DEBUG=INFO +fi +echo + +# Setup env for distributed ML +export CUDA_VISIBLE_DEVICES="0,1,2,3" +export OMP_NUM_THREADS=1 +if [ "$SLURM_CPUS_PER_GPU" -gt 0 ] ; then + export OMP_NUM_THREADS=$SLURM_CPUS_PER_GPU +fi + +# Env vairables check +if [ -z "$DIST_MODE" ]; then + >&2 echo "ERROR: env variable DIST_MODE is not set. Allowed values are 'horovod', 'ddp' or 'deepspeed'" + exit 1 +fi +if [ -z "$RUN_NAME" ]; then + >&2 echo "WARNING: env variable RUN_NAME is not set. It's a way to identify some specific run of an experiment." + RUN_NAME=$DIST_MODE +fi +if [ -z "$TRAINING_CMD" ]; then + >&2 echo "ERROR: env variable TRAINING_CMD is not set. It's the python command to execute." + exit 1 +fi +if [ -z "$PYTHON_VENV" ]; then + >&2 echo "WARNING: env variable PYTHON_VENV is not set. It's the path to a python virtual environment." 
+else
+  # Activate Python virtual env
+  source $PYTHON_VENV/bin/activate
+fi
+
+# Get GPUs info per node
+srun --cpu-bind=none --ntasks-per-node=1 bash -c 'echo -e "NODE hostname: $(hostname)\n$(nvidia-smi)\n\n"'
+
+# Launch training
+if [ "$DIST_MODE" == "ddp" ] ; then
+  echo "DDP training: $TRAINING_CMD"
+  srun --cpu-bind=none --ntasks-per-node=1 \
+    bash -c "torchrun \
+    --log_dir='logs_torchrun' \
+    --nnodes=$SLURM_NNODES \
+    --nproc_per_node=$SLURM_GPUS_PER_NODE \
+    --rdzv_id=$SLURM_JOB_ID \
+    --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \
+    $TRAINING_CMD"
+elif [ "$DIST_MODE" == "deepspeed" ] ; then
+  echo "DEEPSPEED training: $TRAINING_CMD"
+  MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i
+  export MASTER_ADDR
+  export MASTER_PORT=29500
+
+  srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \
+    python -u $TRAINING_CMD --deepspeed
+
+  # # Run with deepspeed launcher: set --ntasks-per-node=1
+  # # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables
+  # export NCCL_IB_DISABLE=1
+  # export NCCL_SOCKET_IFNAME=eth0
+  # nodelist=$(scontrol show hostname $SLURM_NODELIST)
+  # echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile
+  # # Requires passwordless SSH access among compute nodes
+  # srun --cpu-bind=none deepspeed --hostfile=.hostfile $TRAINING_CMD --deepspeed
+  # rm .hostfile
+elif [ "$DIST_MODE" == "horovod" ] ; then
+  echo "HOROVOD training: $TRAINING_CMD"
+  srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \
+    python -u $TRAINING_CMD
+else
+  >&2 echo "ERROR: unrecognized \$DIST_MODE env variable"
+  exit 1
+fi
diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py b/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py
index 365a9048..809480dd 100644
--- a/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py
+++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py
@@ -1,34 +1,38 @@
 """
 Show how to use DDP, Horovod and DeepSpeed strategies interchangeably
-with a simple neural network trained on MNIST dataset, showing how
-to use checkpoints.
+with a simple neural network trained on the MNIST dataset.
""" -import os +from typing import Tuple import argparse import sys import time -import numpy as np -import random +from timeit import default_timer as timer import torch -import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F from torchvision import datasets, transforms -from torch.utils.data import DataLoader, DistributedSampler +from torch.utils.data import Dataset + +import horovod.torch as hvd import deepspeed from itwinai.torch.distributed import ( + distributed_resources_available, TorchDistributedStrategy, - DDPDistributedStrategy, - HVDDistributedStrategy, - DSDistributedStrategy, + TorchDDPStrategy, + HorovodStrategy, + DeepSpeedStrategy, + NonDistributedStrategy ) from itwinai.parser import ArgumentParser as ItAIArgumentParser +from itwinai.torch.reproducibility import ( + seed_worker, set_seed +) -def parse_args() -> argparse.Namespace: +def parse_params() -> argparse.Namespace: """ Parse CLI args, which can also be loaded from a configuration file using the --config flag: @@ -44,54 +48,59 @@ def parse_args() -> argparse.Namespace: default='ddp' ) - # IO parsers + # Data and logging parser.add_argument('--data-dir', default='./', help=('location of the training dataset in the local ' 'filesystem')) + parser.add_argument('--log-int', type=int, default=10, + help='log interval per training') + parser.add_argument('--verbose', + action=argparse.BooleanOptionalAction, + help='Print parsed arguments') parser.add_argument('--restart-int', type=int, default=10, help='restart interval per epoch (default: 10)') parser.add_argument('--download-only', action=argparse.BooleanOptionalAction, help='Download dataset and exit') - parser.add_argument('--verbose', - action=argparse.BooleanOptionalAction, - help='Print parsed arguments') + parser.add_argument('--dataset-replication', type=int, default=100, + help='concatenate MNIST to this factor (default: 100)') + parser.add_argument('--shuff', action='store_true', default=False, + help='shuffle dataset (default: False)') + parser.add_argument('--nworker', type=int, default=0, + help=('number of workers in DataLoader (default: 0 -' + ' only main)')) + parser.add_argument('--prefetch', type=int, default=2, + help='prefetch data in DataLoader (default: 2)') - # model parsers + # Model parser.add_argument('--batch-size', type=int, default=64, help='input batch size for training (default: 64)') parser.add_argument('--epochs', type=int, default=10, help='number of epochs to train (default: 10)') parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)') - parser.add_argument('--concM', type=int, default=100, - help='concatenate MNIST to this factor (default: 100)') parser.add_argument('--momentum', type=float, default=0.5, help='momentum in SGD optimizer (default: 0.5)') - parser.add_argument('--shuff', action='store_true', default=False, - help='shuffle dataset (default: False)') - # debug parsers - parser.add_argument('--testrun', action='store_true', default=False, - help='do a test run with seed (default: False)') - parser.add_argument('--nseed', type=int, default=0, + # Reproducibility + parser.add_argument('--rnd-seed', type=int, default=0, help='seed integer for reproducibility (default: 0)') - parser.add_argument('--log-int', type=int, default=10, - help='log interval per training') - # parallel parsers + # Distributed ML parser.add_argument('--backend', type=str, default='nccl', help='backend for parrallelisation (default: nccl)') - parser.add_argument('--nworker', type=int, 
default=0, - help=('number of workers in DataLoader (default: 0 -' - ' only main)')) - parser.add_argument('--prefetch', type=int, default=2, - help='prefetch data in DataLoader (default: 2)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables GPGPUs') parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') + # Horovod: ignored when not using Horovod + parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') + parser.add_argument('--use-adasum', action='store_true', default=False, + help='use adasum algorithm to do reduction') + parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, + help=('apply gradient pre-divide factor in optimizer ' + '(default: 1.0)')) + # DeepSpeed parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() @@ -127,7 +136,7 @@ def forward(self, x): def train( - model, device, train_loader, optimizer, epoch, + model, train_loader, optimizer, epoch, strategy: TorchDistributedStrategy, args ): """ @@ -136,108 +145,62 @@ def train( model.train() t_list = [] loss_acc = 0 - gwsize = strategy.dist_gwsize() - if strategy.is_main_worker(): + if strategy.is_main_worker: print("\n") for batch_idx, (data, target) in enumerate(train_loader): - t = time.perf_counter() - data, target = data.to(device), target.to(device) + t = timer() + data = data.to(strategy.device()) + target = target.to(strategy.device()) optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() optimizer.step() - if batch_idx % args.log_int == 0 and strategy.is_main_worker(): + if (strategy.is_main_worker and args.log_int > 0 + and batch_idx % args.log_int == 0): + dl_size = len(train_loader.dataset)//strategy.global_world_size() print( f'Train epoch: {epoch} ' - f'[{batch_idx * len(data)}/{len(train_loader.dataset)/gwsize} ' + f'[{batch_idx * len(data)}/{dl_size} ' f'({100.0 * batch_idx / len(train_loader):.0f}%)]\t\t' f'Loss: {loss.item():.6f}') - t_list.append(time.perf_counter() - t) + t_list.append(timer() - t) loss_acc += loss.item() - if strategy.is_main_worker(): + if strategy.is_main_worker: print('TIMER: train time', sum(t_list) / len(t_list), 's') return loss_acc -def test(model, device, test_loader, strategy: TorchDistributedStrategy): +def test(model, test_loader, strategy: TorchDistributedStrategy): """ Model validation. 
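+    Returns the test accuracy (%) estimated from this worker's shard of
+    the test set.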
""" model.eval() test_loss = 0 correct = 0 - gwsize = strategy.dist_gwsize() with torch.no_grad(): for data, target in test_loader: - data, target = data.to(device), target.to(device) + data = data.to(strategy.device()) + target = target.to(strategy.device()) output = model(data) - # sum up batch loss + # Sum up batch loss test_loss += F.nll_loss(output, target, reduction="sum").item() - # get the index of the max log-probability + # Get the index of the max log-probability pred = output.argmax(dim=1, keepdim=True) correct += pred.eq(target.view_as(pred)).sum().item() test_loss /= len(test_loader.dataset) - if strategy.is_main_worker(): + if strategy.is_main_worker: + dl_size = len(test_loader.dataset)//strategy.global_world_size() print( f'Test set: average loss: {test_loss:.4f}\t' - f'accurate samples: {correct}/{len(test_loader.dataset)/gwsize}') - acc_test = 100.0 * correct * gwsize / len(test_loader.dataset) + f'accurate samples: {correct}/{dl_size}') + acc_test = ( + 100.0 * correct * strategy.global_world_size() + / len(test_loader.dataset) + ) return acc_test -def save_state( - epoch, distrib_model, loss_acc, optimizer, - res_name, is_best, strategy: TorchDistributedStrategy -): - """ - Save training state. - """ - grank = strategy.dist_grank() - rt = time.time() - # find if is_best happened in any worker - if torch.cuda.is_available(): - is_best_m = strategy.par_allgather_obj(is_best) - - if torch.cuda.is_available(): - if any(is_best_m): - # find which rank is_best happened - select first rank if multiple - is_best_rank = np.where(np.array(is_best_m))[0][0] - - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_acc': loss_acc, - 'optimizer': optimizer.state_dict()} - - # write on worker with is_best - if grank == is_best_rank: - torch.save(state, './'+res_name) - print( - f'DEBUG: state in {grank} is saved on epoch:{epoch} ' - f'in {time.time()-rt} s') - else: - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_acc': loss_acc, - 'optimizer': optimizer.state_dict()} - - torch.save(state, './'+res_name) - print( - f'DEBUG: state in {grank} is saved on epoch:{epoch} in ' - f'{time.time()-rt} s') - - -def seed_worker(worker_id): - """ - Seed dataloader worker. - """ - worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) - random.seed(worker_seed) - - def download_mnist(): """ Use built-in torch datasets functions to pull MNIST dataset. @@ -257,212 +220,154 @@ def download_mnist(): ])) +def mnist_dataset(dataset_replication: int = 1) -> Tuple[Dataset, Dataset]: + """Load MNIST train and test datasets, replicating them. + + Args: + dataset_replication (int): dataset replication factor. Default 1. + + Returns: + Tuple[Dataset, Dataset]: train dataset and test dataset. 
+ """ + replicated_data = [ + datasets.MNIST(args.data_dir, train=True, download=False, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + for _ in range(dataset_replication) + ] + train_dataset = torch.utils.data.ConcatDataset(replicated_data) + + replicated_data = [ + datasets.MNIST(args.data_dir, train=False, download=False, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + for _ in range(dataset_replication) + ] + test_dataset = torch.utils.data.ConcatDataset(replicated_data) + return train_dataset, test_dataset + + if __name__ == "__main__": - args = parse_args() + args = parse_params() if args.download_only: - # Download datasets and exit + # Download datasets from a location with internet access and exit. + # This is convenient when submitting training jobs to + # a batch system where worker nodes have no internet + # access, like in some HPCs. download_mnist() sys.exit() # Instantiate Strategy - if args.strategy == 'ddp': - if (not torch.cuda.is_available() - or not torch.cuda.device_count() > 1): - raise RuntimeError('Resources unavailable') - - strategy = DDPDistributedStrategy(backend=args.backend) + if not distributed_resources_available(): + print("WARNING: falling back to non-distributed strategy.") + strategy = NonDistributedStrategy() + distribute_kwargs = {} + elif args.strategy == 'ddp': + strategy = TorchDDPStrategy(backend=args.backend) + distribute_kwargs = {} elif args.strategy == 'horovod': - strategy = HVDDistributedStrategy() + strategy = HorovodStrategy() + distribute_kwargs = dict( + compression=( + hvd.Compression.fp16 if args.fp16_allreduce + else hvd.Compression.none + ), + op=hvd.Adasum if args.use_adasum else hvd.Average, + gradient_predivide_factor=args.gradient_predivide_factor + ) elif args.strategy == 'deepspeed': - strategy = DSDistributedStrategy(backend=args.backend) + strategy = DeepSpeedStrategy(backend=args.backend) + distribute_kwargs = dict( + config_params=dict(train_micro_batch_size_per_gpu=args.batch_size) + ) else: raise NotImplementedError( f"Strategy {args.strategy} is not recognized/implemented.") - strategy.init() - - # check CUDA availability - args.cuda = not args.no_cuda and torch.cuda.is_available() - - # limit # of CPU threads to be used per worker - torch.set_num_threads(1) - - # get directory - program_dir = os.getcwd() - - # start the time.time for profiling - st = time.time() - # deterministic testrun - if args.testrun: - torch.manual_seed(args.nseed) - g = torch.Generator() - g.manual_seed(args.nseed) - - # get job rank info - rank==0 master gpu - if torch.cuda.is_available(): - # local world size - per node - lwsize = strategy.dist_lwsize() if args.cuda else 0 - gwsize = strategy.dist_gwsize() # global world size - per run - grank = strategy.dist_grank() # global rank - assign per run - lrank = strategy.dist_lrank() # local rank - assign per node - else: - gwsize = 1 - grank = 0 - - # some debug - if strategy.is_main_worker(): - print('TIMER: initialise:', time.time()-st, 's') - - # move the model on the GPU assigned to the current process - device = torch.device( - strategy.dist_device() if args.cuda and torch.cuda.is_available() - else 'cpu') - if args.cuda: - torch.cuda.set_device(lrank) - # deterministic testrun - if args.testrun: - torch.cuda.manual_seed(args.nseed) - - # read data - mnist_scale = args.concM - largeData = [] - for i in range(mnist_scale): - largeData.append( - 
datasets.MNIST(args.data_dir, train=True, download=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - ) - - # concat data - train_dataset = torch.utils.data.ConcatDataset(largeData) - - mnist_scale = args.concM - largeData = [] - for i in range(mnist_scale): - largeData.append( - datasets.MNIST(args.data_dir, train=False, download=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - ) + # Initialize strategy + strategy.init() - # concat data - test_dataset = torch.utils.data.ConcatDataset(largeData) - - # restricts data loading to a subset of the dataset exclusive to the - # current process - args.shuff = args.shuff and not args.testrun - if torch.cuda.is_available(): - train_sampler = DistributedSampler( - train_dataset, num_replicas=gwsize, rank=grank, shuffle=args.shuff) - test_sampler = DistributedSampler( - test_dataset, num_replicas=gwsize, rank=grank, shuffle=args.shuff) - # distribute dataset to workers - # persistent workers is not possible for nworker=0 - pers_w = True if args.nworker > 1 else False - - # deterministic testrun - the same dataset each run - kwargs = {'worker_init_fn': seed_worker, - 'generator': g} if args.testrun else {} - - if torch.cuda.is_available(): - train_loader = DataLoader( - train_dataset, batch_size=args.batch_size, - sampler=train_sampler, num_workers=args.nworker, pin_memory=True, - persistent_workers=pers_w, prefetch_factor=args.prefetch, **kwargs - ) - test_loader = DataLoader( - test_dataset, batch_size=args.batch_size, - sampler=test_sampler, num_workers=args.nworker, pin_memory=True, - persistent_workers=pers_w, prefetch_factor=args.prefetch, **kwargs - ) - else: - train_loader = DataLoader( - train_dataset, batch_size=args.batch_size) - test_loader = DataLoader( - test_dataset, batch_size=args.batch_size) + # Start the timer for profiling + st = timer() + + # Set random seed for reproducibility + torch_prng = set_seed(args.rnd_seed) + + if strategy.is_main_worker: + print('TIMER: initialise:', timer()-st, 's') + print('DEBUG: local ranks:', strategy.local_world_size(), + '/ global ranks:', strategy.global_world_size()) + print('DEBUG: sys.version:', sys.version) + print('DEBUG: args.data_dir:', args.data_dir) + print('DEBUG: args.log_int:', args.log_int) + print('DEBUG: args.nworker:', args.nworker) + print('DEBUG: args.prefetch:', args.prefetch) + print('DEBUG: args.batch_size:', args.batch_size) + print('DEBUG: args.epochs:', args.epochs) + print('DEBUG: args.lr:', args.lr) + print('DEBUG: args.momentum:', args.momentum) + print('DEBUG: args.shuff:', args.shuff) + print('DEBUG: args.rnd_seed:', args.rnd_seed) + print('DEBUG: args.backend:', args.backend) + + # Dataset + train_dataset, test_dataset = mnist_dataset(args.dataset_replication) + # Distributed dataloaders + train_loader = strategy.create_dataloader( + train_dataset, batch_size=args.batch_size, + num_workers=args.nworker, pin_memory=True, + persistent_workers=(args.nworker > 1), + prefetch_factor=args.prefetch, generator=torch_prng, + worker_init_fn=seed_worker + ) + test_loader = strategy.create_dataloader( + test_dataset, batch_size=args.batch_size, + num_workers=args.nworker, pin_memory=True, + persistent_workers=(args.nworker > 1), + prefetch_factor=args.prefetch, generator=torch_prng, + worker_init_fn=seed_worker + ) - if strategy.is_main_worker(): - print('TIMER: read and concat data:', time.time()-st, 's') + if strategy.is_main_worker: + 
print('TIMER: read and concat data:', timer()-st, 's') - # create CNN model - model = Net().to(device) + # Create CNN model + model = Net().to(strategy.device()) - # optimizer + # Optimizer optimizer = torch.optim.SGD( model.parameters(), lr=args.lr, momentum=args.momentum) - deepspeed_config = dict(train_batch_size=args.batch_size) - # 'config_params' key is ignored if strategy != DSDistributedStrategy - distrib_model, optimizer, _ = strategy.distributed( - model, optimizer, lr_scheduler=None, config_params=deepspeed_config + # Distributed model, optimizer, and scheduler + model, optimizer, _ = strategy.distributed( + model, optimizer, lr_scheduler=None, **distribute_kwargs ) - # resume state - start_epoch = 1 - best_acc = np.Inf - res_name = f'{args.strategy}-checkpoint.pth.tar' - if os.path.isfile(res_name): - try: - if torch.cuda.is_available(): - dist.barrier() - # Map model to be loaded to specified single gpu. - loc = {'cuda:%d' % 0: 'cuda:%d' % lrank} if args.cuda else { - 'cpu:%d' % 0: 'cpu:%d' % lrank} - checkpoint = torch.load( - program_dir+'/'+res_name, map_location=loc) - else: - checkpoint = torch.load(program_dir+'/'+res_name) - start_epoch = checkpoint['epoch'] - best_acc = checkpoint['best_acc'] - distrib_model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print(f'WARNING: restarting from {start_epoch} epoch') - else: - print(f'WARNING: restarting from {start_epoch} epoch') - except Exception: - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print('WARNING: restart file cannot be loaded, ' - 'restarting!') - else: - print('WARNING: restart file cannot be loaded, restarting!') - - if start_epoch > args.epochs: - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print('WARNING: given epochs are less than the one in the ' - 'restart file!\n' - 'WARNING: SYS.EXIT is issued') - - strategy.clean_up() - sys.exit() - else: - print('WARNING: given epochs are less than the one in ' - 'the restart file!\n' - 'WARNING: SYS.EXIT is issued') - sys.exit() - - # start trainin/testing loop - if strategy.is_main_worker(): - print('TIMER: broadcast:', time.time()-st, 's') + # Start training and test loop + if strategy.is_main_worker: + print('TIMER: broadcast:', timer()-st, 's') print('\nDEBUG: start training') print('--------------------------------------------------------') - et = time.time() + et = timer() + start_epoch = 1 for epoch in range(start_epoch, args.epochs + 1): - lt = time.time() - # training + lt = timer() + if strategy.is_distributed: + # Inform the sampler that a new epoch started: shuffle + # may be needed + train_loader.sampler.set_epoch(epoch) + test_loader.sampler.set_epoch(epoch) + + # Training loss_acc = train( - model=distrib_model, - device=device, + model=model, train_loader=train_loader, optimizer=optimizer, epoch=epoch, @@ -470,77 +375,52 @@ def download_mnist(): args=args ) - # testing + # Testing acc_test = test( - model=distrib_model, - device=device, + model=model, test_loader=test_loader, strategy=strategy ) - # save first epoch timer + # Save first epoch timer if epoch == start_epoch: - first_ep_t = time.time()-lt + first_ep_t = timer()-lt - # final epoch + # Final epoch if epoch + 1 == args.epochs: train_loader.last_epoch = True test_loader.last_epoch = True - if strategy.is_main_worker(): - print('TIMER: epoch time:', time.time()-lt, 's') + if strategy.is_main_worker: + print('TIMER: epoch time:', timer()-lt, 
's') print('DEBUG: accuracy:', acc_test, '%') - # save state if found a better state - is_best = loss_acc < best_acc - if epoch % args.restart_int == 0: - save_state( - epoch=epoch, - distrib_model=distrib_model, - loss_acc=loss_acc, - optimizer=optimizer, - res_name=res_name, - is_best=is_best, - strategy=strategy - ) - # reset best_acc - best_acc = min(loss_acc, best_acc) - - # finalise - # save final state - save_state( - epoch=epoch, - distrib_model=distrib_model, - loss_acc=loss_acc, - optimizer=optimizer, - res_name=res_name, - is_best=True, - strategy=strategy - ) - - # some debug - if strategy.is_main_worker(): + if strategy.is_main_worker: print('\n--------------------------------------------------------') print('DEBUG: training results:\n') print('TIMER: first epoch time:', first_ep_t, ' s') - print('TIMER: last epoch time:', time.time()-lt, ' s') - print('TIMER: average epoch time:', (time.time()-et)/args.epochs, ' s') - print('TIMER: total epoch time:', time.time()-et, ' s') + print('TIMER: last epoch time:', timer()-lt, ' s') + print('TIMER: average epoch time:', (timer()-et)/args.epochs, ' s') + print('TIMER: total epoch time:', timer()-et, ' s') if epoch > 1: print('TIMER: total epoch-1 time:', - time.time()-et-first_ep_t, ' s') + timer()-et-first_ep_t, ' s') print('TIMER: average epoch-1 time:', - (time.time()-et-first_ep_t)/(args.epochs-1), ' s') + (timer()-et-first_ep_t)/(args.epochs-1), ' s') print('DEBUG: last accuracy:', acc_test, '%') - print('DEBUG: memory req:', - int(torch.cuda.memory_reserved(lrank)/1024/1024), 'MB') \ - if args.cuda else 'DEBUG: memory req: - MB' - print('DEBUG: memory summary:\n\n', - torch.cuda.memory_summary(0)) if args.cuda else '' + if torch.cuda.is_available(): + print('DEBUG: memory req:', + int(torch.cuda.memory_reserved( + strategy.local_rank())/1024/1024), + 'MB') + print('DEBUG: memory summary:\n\n', + torch.cuda.memory_summary(0)) + + print(f'TIMER: final time: {timer()-st} s\n') - if strategy.is_main_worker(): - print(f'TIMER: final time: {time.time()-st} s\n') + time.sleep(1) + print(f" - TRAINING FINISHED") - print(f" - TRAINING FINISHED") + # Clean-up strategy.clean_up() sys.exit() diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/README.md b/tutorials/distributed-ml/torch-tutorial-2-imagenet/README.md deleted file mode 100644 index 780eb278..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/README.md +++ /dev/null @@ -1,47 +0,0 @@ -# Tutorial: distributed strategies for PyTorch model trained on MNIST dataset - -In this tutorial we show how to use torch `DistributedDataParallel` (DDP), Horovod and -DeepSpeed from the same client code. -Note that the environment is tested on the HDFML system at JSC. For other systems, -the module versions might need change accordingly. - -## Setup - -First, from the root of this repository, build the environment containing -pytorch, horovod and deepspeed. You can *try* with: - -```bash -# Creates a Python venv called envAI_hdfml -make torch-gpu-jsc -``` - -The Imagenet dataset is assumed to be already downloaded to some location. 
- -## Distributed training - -Each distributed strategy has its own SLURM job script, which -should be used to run it: - -If you want to distribute the code in `train.py` with **torch DDP**, run from terminal: - -```bash -sbatch ddp_slurm.sh -``` - -If you want to distribute the code in `train.py` with **DeepSpeed**, run from terminal: - -```bash -sbatch deepspeed_slurm.sh -``` - -If you want to distribute the code in `train.py` with **Horovod**, run from terminal: - -```bash -sbatch hvd_slurm.sh -``` - -You can run all of them with: - -```bash -bash runall.sh -``` diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/config.yaml b/tutorials/distributed-ml/torch-tutorial-2-imagenet/config.yaml deleted file mode 100644 index 2473d346..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/config.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# I/O -data_dir: /p/project/intertwin/datasets/Imagenet_sub/ImageNet_uncompressed/train/ #/p/project/intertwin/datasets/ImageNet_uncompressed/train -restart_int: 10 -verbose: True - -# Model -batch_size: 64 -epochs: 3 -lr: 0.001 -momentum: 0.5 -shuff: False -num_classes: 1000 - -# Debugging -testrun: False -nseed: 10 -log_int: 10 - -# Distributed ML -backend: nccl -nworker: 4 # num workers dataloader -prefetch: 2 -no_cuda: False - - diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/ddp_slurm.sh b/tutorials/distributed-ml/torch-tutorial-2-imagenet/ddp_slurm.sh deleted file mode 100644 index 4e9749c2..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/ddp_slurm.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DDP_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ddp.out -#SBATCH --error=job-ddp.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch training -TRAINING_CMD="train.py -s ddp -c config.yaml" - -srun --cpu-bind=none bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $TRAINING_CMD" - diff --git 
a/tutorials/distributed-ml/torch-tutorial-2-imagenet/deepspeed_slurm.sh b/tutorials/distributed-ml/torch-tutorial-2-imagenet/deepspeed_slurm.sh deleted file mode 100644 index 8f1c2d2d..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/deepspeed_slurm.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DeepSpeed_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ds.out -#SBATCH --error=job-ds.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=4 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set env vars -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i -export MASTER_ADDR -export MASTER_PORT=29500 - -TRAINING_CMD="train.py -s deepspeed -c config.yaml" - -# Run without launcher: set --ntasks-per-node=NUM_GPUS -srun --cpu-bind=none python -u $TRAINING_CMD --deepspeed - -# # Run with deepspeed launcher: set --ntasks-per-node=1 -# # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables -# export NCCL_IB_DISABLE=1 -# export NCCL_SOCKET_IFNAME=eth0 -# nodelist=$(scontrol show hostname $SLURM_NODELIST) -# echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile -# # Requires passwordless SSH access among compute node -# srun --cpu-bind=none deepspeed --hostfile=.hostfile $TRAINING_CMD --deepspeed -# rm .hostfile - diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/hvd_slurm.sh b/tutorials/distributed-ml/torch-tutorial-2-imagenet/hvd_slurm.sh deleted file mode 100644 index 69b9d51e..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/hvd_slurm.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_HVD_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-hvd.out -#SBATCH --error=job-hvd.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=8 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source 
../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set vars -# export NCCL_DEBUG=INFO -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -TRAINING_CMD="train.py -s horovod -c config.yaml" - -srun --cpu-bind=none python -u $TRAINING_CMD - diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/runall.sh b/tutorials/distributed-ml/torch-tutorial-2-imagenet/runall.sh deleted file mode 100644 index 21c02a22..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/runall.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -# Run all versions of distributed ML version -rm *checkpoint.pth.tar *.out *.err *.csv -echo "Torch DDP training: $(sbatch ddp_slurm.sh)" -echo "DeepSpeed training: $(sbatch deepspeed_slurm.sh)" -echo "Horovod training: $(sbatch hvd_slurm.sh)" \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/scaling-test.sh b/tutorials/distributed-ml/torch-tutorial-2-imagenet/scaling-test.sh deleted file mode 100644 index 275f7fb7..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/scaling-test.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -rm *checkpoint.pth.tar *.out *.err *.csv - -timeout="01:01:00" -for N in 1 2 4 8 -do - sbatch --job-name="DDP-imagenet-n$N" --nodes=$N --output="job-ddp-n$N.out" --error="job-ddp-n$N.err" --time=$timeout ddp_slurm.sh - sbatch --job-name="DS-imagenet-n$N" --nodes=$N --output="job-ds-n$N.out" --error="job-ds-n$N.err" --time=$timeout deepspeed_slurm.sh - sbatch --job-name="HVD-imagenet-n$N" --nodes=$N --output="job-hvd-n$N.out" --error="job-hvd-n$N.err" --time=$timeout hvd_slurm.sh -done \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/train.py b/tutorials/distributed-ml/torch-tutorial-2-imagenet/train.py deleted file mode 100644 index 6bd71214..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/train.py +++ /dev/null @@ -1,499 +0,0 @@ -""" -Show how to use DDP, Horovod and DeepSpeed strategies interchangeably -with a large neural network trained on Imagenet dataset, showing how -to use checkpoints. 
-""" -import os -import argparse -import sys -import time -import numpy as np -import random - -import torch -from torch import nn -import torch.distributed as dist -import torch.nn.functional as F -import torchvision -from torchvision import transforms -from torch.utils.data import DataLoader, DistributedSampler - -import deepspeed - -from itwinai.torch.distributed import ( - TorchDistributedStrategy, - DDPDistributedStrategy, - HVDDistributedStrategy, - DSDistributedStrategy, -) -from itwinai.parser import ArgumentParser as ItAIArgumentParser -from itwinai.loggers import EpochTimeTracker - - -def parse_args() -> argparse.Namespace: - """ - Parse CLI args, which can also be loaded from a configuration file - using the --config flag: - - >>> train.py --strategy ddp --config config.yaml - """ - parser = ItAIArgumentParser(description='PyTorch MNIST Example') - - # Distributed ML strategy - parser.add_argument( - "--strategy", "-s", type=str, - choices=['ddp', 'horovod', 'deepspeed'], - default='ddp' - ) - - # IO parsers - parser.add_argument('--data-dir', default='./', - help=('location of the training dataset in the local ' - 'filesystem')) - parser.add_argument('--restart-int', type=int, default=10, - help='restart interval per epoch (default: 10)') - parser.add_argument('--verbose', - action=argparse.BooleanOptionalAction, - help='Print parsed arguments') - - # model parsers - parser.add_argument('--batch-size', type=int, default=64, - help='input batch size for training (default: 64)') - parser.add_argument('--epochs', type=int, default=10, - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, - help='momentum in SGD optimizer (default: 0.5)') - parser.add_argument('--shuff', action='store_true', default=False, - help='shuffle dataset (default: False)') - parser.add_argument('--num-classes', type=int, default=1000, - help='number of classes in dataset') - - # debug parsers - parser.add_argument('--testrun', action='store_true', default=False, - help='do a test run with seed (default: False)') - parser.add_argument('--nseed', type=int, default=0, - help='seed integer for reproducibility (default: 0)') - parser.add_argument('--log-int', type=int, default=10, - help='log interval per training') - - # parallel parsers - parser.add_argument('--backend', type=str, default='nccl', - help='backend for parrallelisation (default: nccl)') - parser.add_argument('--nworker', type=int, default=0, - help=('number of workers in DataLoader (default: 0 -' - ' only main)')) - parser.add_argument('--prefetch', type=int, default=2, - help='prefetch data in DataLoader (default: 2)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables GPGPUs') - parser.add_argument('--local_rank', type=int, default=-1, - help='local rank passed from distributed launcher') - - # DeepSpeed - parser = deepspeed.add_config_arguments(parser) - args = parser.parse_args() - - if args.verbose: - args_list = [f"{key}: {val}" for key, val in args.items()] - print("PARSED ARGS:\n", '\n'.join(args_list)) - - return args - - -def train( - model, device, train_loader, optimizer, epoch, - strategy: TorchDistributedStrategy, args -): - """ - Training function, representing an epoch. 
- """ - model.train() - t_list = [] - loss_acc = 0 - gwsize = strategy.dist_gwsize() - if strategy.is_main_worker(): - print("\n") - for batch_idx, (data, target) in enumerate(train_loader): - t = time.perf_counter() - data, target = data.to(device), target.to(device) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % args.log_int == 0 and strategy.is_main_worker(): - print( - f'Train epoch: {epoch} ' - f'[{batch_idx * len(data)}/{len(train_loader.dataset)/gwsize} ' - f'({100.0 * batch_idx / len(train_loader):.0f}%)]\t\t' - f'Loss: {loss.item():.6f}') - t_list.append(time.perf_counter() - t) - loss_acc += loss.item() - if strategy.is_main_worker(): - print('TIMER: train time', sum(t_list) / len(t_list), 's') - return loss_acc - - -def test(model, device, test_loader, strategy: TorchDistributedStrategy): - """ - Model validation. - """ - model.eval() - test_loss = 0 - correct = 0 - gwsize = strategy.dist_gwsize() - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - # sum up batch loss - test_loss += F.nll_loss(output, target, reduction="sum").item() - # get the index of the max log-probability - pred = output.argmax(dim=1, keepdim=True) - correct += pred.eq(target.view_as(pred)).sum().item() - test_loss /= len(test_loader.dataset) - if strategy.is_main_worker(): - print( - f'Test set: average loss: {test_loss:.4f}\t' - f'accurate samples: {correct}/{len(test_loader.dataset)/gwsize}') - acc_test = 100.0 * correct * gwsize / len(test_loader.dataset) - return acc_test - - -def save_state( - epoch, distrib_model, loss_acc, optimizer, - res_name, is_best, strategy: TorchDistributedStrategy -): - """ - Save training state. - """ - grank = strategy.dist_grank() - rt = time.time() - # find if is_best happened in any worker - if torch.cuda.is_available(): - is_best_m = strategy.par_allgather_obj(is_best) - - if torch.cuda.is_available(): - if any(is_best_m): - # find which rank is_best happened - select first rank if multiple - is_best_rank = np.where(np.array(is_best_m))[0][0] - - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_acc': loss_acc, - 'optimizer': optimizer.state_dict()} - - # write on worker with is_best - if grank == is_best_rank: - torch.save(state, './'+res_name) - print( - f'DEBUG: state in {grank} is saved on epoch:{epoch} ' - f'in {time.time()-rt} s') - else: - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_acc': loss_acc, - 'optimizer': optimizer.state_dict()} - - torch.save(state, './'+res_name) - print( - f'DEBUG: state in {grank} is saved on epoch:{epoch} in ' - f'{time.time()-rt} s') - - -def seed_worker(worker_id): - """ - Seed dataloader worker. 
- """ - worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) - random.seed(worker_seed) - - -if __name__ == "__main__": - - args = parse_args() - - # Instantiate Strategy - if args.strategy == 'ddp': - if (not torch.cuda.is_available() - or not torch.cuda.device_count() > 1): - raise RuntimeError('Resources unavailable') - - strategy = DDPDistributedStrategy(backend=args.backend) - elif args.strategy == 'horovod': - strategy = HVDDistributedStrategy() - elif args.strategy == 'deepspeed': - strategy = DSDistributedStrategy(backend=args.backend) - else: - raise NotImplementedError( - f"Strategy {args.strategy} is not recognized/implemented.") - strategy.init() - - # check CUDA availability - args.cuda = not args.no_cuda and torch.cuda.is_available() - - # limit # of CPU threads to be used per worker - torch.set_num_threads(1) - - # get directory - program_dir = os.getcwd() - - # start the time.time for profiling - st = time.time() - - # deterministic testrun - if args.testrun: - torch.manual_seed(args.nseed) - g = torch.Generator() - g.manual_seed(args.nseed) - - # get job rank info - rank==0 master gpu - if torch.cuda.is_available(): - # local world size - per node - lwsize = strategy.dist_lwsize() if args.cuda else 0 - gwsize = strategy.dist_gwsize() # global world size - per run - grank = strategy.dist_grank() # global rank - assign per run - lrank = strategy.dist_lrank() # local rank - assign per node - else: - gwsize = 1 - grank = 0 - - # some debug - if strategy.is_main_worker(): - print('TIMER: initialise:', time.time()-st, 's') - - # move the model on the GPU assigned to the current process - device = torch.device( - strategy.dist_device() if args.cuda and torch.cuda.is_available() - else 'cpu') - if args.cuda: - torch.cuda.set_device(lrank) - # deterministic testrun - if args.testrun: - torch.cuda.manual_seed(args.nseed) - - # dataset - # Initialize transformations for data augmentation - transform = transforms.Compose([ - transforms.Resize(256), - transforms.RandomHorizontalFlip(), - transforms.RandomVerticalFlip(), - transforms.RandomRotation(degrees=45), - transforms.ColorJitter( - brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ]) - - # Load the ImageNet Object Localization Challenge dataset - train_dataset = torchvision.datasets.ImageFolder( - root=args.data_dir, - transform=transform - ) - # test_dataset = ... 
- - # restricts data loading to a subset of the dataset exclusive to the - # current process - args.shuff = args.shuff and not args.testrun - if torch.cuda.is_available(): - train_sampler = DistributedSampler( - train_dataset, num_replicas=gwsize, rank=grank, shuffle=args.shuff) - # test_sampler = DistributedSampler( - # test_dataset, num_replicas=gwsize, rank=grank, - # shuffle=args.shuff) - # distribute dataset to workers - # persistent workers is not possible for nworker=0 - pers_w = True if args.nworker > 1 else False - - # deterministic testrun - the same dataset each run - kwargs = {'worker_init_fn': seed_worker, - 'generator': g} if args.testrun else {} - - if torch.cuda.is_available(): - train_loader = DataLoader( - train_dataset, batch_size=args.batch_size, - sampler=train_sampler, num_workers=args.nworker, pin_memory=True, - persistent_workers=pers_w, prefetch_factor=args.prefetch, **kwargs - ) - # test_loader = DataLoader( - # test_dataset, batch_size=args.batch_size, - # sampler=test_sampler, num_workers=args.nworker, pin_memory=True, - # persistent_workers=pers_w, prefetch_factor=args.prefetch, - # **kwargs - # ) - else: - train_loader = DataLoader( - train_dataset, batch_size=args.batch_size) - # test_loader = DataLoader( - # test_dataset, batch_size=args.batch_size) - - if strategy.is_main_worker(): - print('TIMER: read and concat data:', time.time()-st, 's') - - # create CNN model: resnet 50, resnet101, resnet152 - model = torchvision.models.resnet152() - model.fc = nn.Linear(2048, args.num_classes) - - # optimizer - optimizer = torch.optim.SGD( - model.parameters(), lr=args.lr, momentum=args.momentum) - - deepspeed_config = dict(train_micro_batch_size_per_gpu=args.batch_size) - # 'config_params' key is ignored if strategy != DSDistributedStrategy - distrib_model, optimizer, _ = strategy.distributed( - model, optimizer, lr_scheduler=None, config_params=deepspeed_config - ) - - # resume state - start_epoch = 1 - best_acc = np.Inf - nnod = os.environ.get('SLURM_NNODES', 'unk') - res_name = f'{args.strategy}-{nnod}N-checkpoint.pth.tar' - if os.path.isfile(res_name): - try: - if torch.cuda.is_available(): - dist.barrier() - # Map model to be loaded to specified single gpu. 
- loc = {'cuda:%d' % 0: 'cuda:%d' % lrank} if args.cuda else { - 'cpu:%d' % 0: 'cpu:%d' % lrank} - checkpoint = torch.load( - program_dir+'/'+res_name, map_location=loc) - else: - checkpoint = torch.load(program_dir+'/'+res_name) - start_epoch = checkpoint['epoch'] - best_acc = checkpoint['best_acc'] - distrib_model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print(f'WARNING: restarting from {start_epoch} epoch') - else: - print(f'WARNING: restarting from {start_epoch} epoch') - except Exception: - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print('WARNING: restart file cannot be loaded, ' - 'restarting!') - else: - print('WARNING: restart file cannot be loaded, restarting!') - - if start_epoch > args.epochs: - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print('WARNING: given epochs are less than the one in the ' - 'restart file!\n' - 'WARNING: SYS.EXIT is issued') - - strategy.clean_up() - sys.exit() - else: - print('WARNING: given epochs are less than the one in ' - 'the restart file!\n' - 'WARNING: SYS.EXIT is issued') - sys.exit() - - # start trainin/testing loop - if strategy.is_main_worker(): - print('TIMER: broadcast:', time.time()-st, 's') - print('\nDEBUG: start training') - print('--------------------------------------------------------') - epoch_time_tracker = EpochTimeTracker(series_name=args.strategy) - - et = time.time() - for epoch in range(start_epoch, args.epochs + 1): - lt = time.time() - # training - loss_acc = train( - model=distrib_model, - device=device, - train_loader=train_loader, - optimizer=optimizer, - epoch=epoch, - strategy=strategy, - args=args - ) - - # # testing - # acc_test = test( - # model=distrib_model, - # device=device, - # test_loader=test_loader, - # strategy=strategy - # ) - - # save first epoch timer - if epoch == start_epoch: - first_ep_t = time.time()-lt - - # final epoch - if epoch + 1 == args.epochs: - train_loader.last_epoch = True - # test_loader.last_epoch = True - - if strategy.is_main_worker(): - print('TIMER: epoch time:', time.time()-lt, 's') - epoch_time_tracker.add_epoch_time(epoch-1, time.time()-lt) - # print('DEBUG: accuracy:', acc_test, '%') - - # save state if found a better state - is_best = loss_acc < best_acc - if epoch % args.restart_int == 0: - save_state( - epoch=epoch, - distrib_model=distrib_model, - loss_acc=loss_acc, - optimizer=optimizer, - res_name=res_name, - is_best=is_best, - strategy=strategy - ) - # reset best_acc - best_acc = min(loss_acc, best_acc) - - # finalise - # save final state - save_state( - epoch=epoch, - distrib_model=distrib_model, - loss_acc=loss_acc, - optimizer=optimizer, - res_name=res_name, - is_best=True, - strategy=strategy - ) - - # some debug - if strategy.is_main_worker(): - print('\n--------------------------------------------------------') - print('DEBUG: training results:\n') - print('TIMER: first epoch time:', first_ep_t, ' s') - print('TIMER: last epoch time:', time.time()-lt, ' s') - print('TIMER: average epoch time:', (time.time()-et)/args.epochs, ' s') - print('TIMER: total epoch time:', time.time()-et, ' s') - if epoch > 1: - print('TIMER: total epoch-1 time:', - time.time()-et-first_ep_t, ' s') - print('TIMER: average epoch-1 time:', - (time.time()-et-first_ep_t)/(args.epochs-1), ' s') - # print('DEBUG: last accuracy:', acc_test, '%') - print('DEBUG: memory req:', - int(torch.cuda.memory_reserved(lrank)/1024/1024), 'MB') \ - if 
args.cuda else 'DEBUG: memory req: - MB' - print('DEBUG: memory summary:\n\n', - torch.cuda.memory_summary(0)) if args.cuda else '' - - if strategy.is_main_worker(): - print(f'TIMER: final time: {time.time()-st} s\n') - nnod = os.environ.get('SLURM_NNODES', 'unk') - epoch_time_tracker.save( - csv_file=f"epochtime_{args.strategy}_{nnod}N.csv") - - print(f" - TRAINING FINISHED") - strategy.clean_up() - sys.exit() diff --git a/tutorials/ml-workflows/basic_components.py b/tutorials/ml-workflows/basic_components.py index 49e74180..1fca03d8 100644 --- a/tutorials/ml-workflows/basic_components.py +++ b/tutorials/ml-workflows/basic_components.py @@ -70,12 +70,6 @@ def execute( """ return train_set, vaild_set, test_set, "my_trained_model" - def save_state(self): - return super().save_state() - - def load_state(self): - return super().load_state() - class MySaver(Saver): @monitor_exec diff --git a/use-cases/3dgan/Dockerfile b/use-cases/3dgan/Dockerfile index c10d8ec8..26cc3f29 100644 --- a/use-cases/3dgan/Dockerfile +++ b/use-cases/3dgan/Dockerfile @@ -1,19 +1,25 @@ -# FROM python:3.9.12 FROM nvcr.io/nvidia/pytorch:23.09-py3 +# FROM python:3.11 WORKDIR /usr/src/app -RUN pip install --upgrade pip -RUN pip install --no-cache-dir lightning +# Install itwinai +COPY pyproject.toml ./ +COPY src ./ +RUN pip install --upgrade pip \ + && pip install --no-cache-dir lightning \ + && pip install --no-cache-dir . -# Add 3DGAN custom requirements +# Add 3DGAN use case files and install additional requirements COPY use-cases/3dgan/requirements.txt ./ +COPY use-cases/3dgan/* ./ RUN pip install --no-cache-dir -r requirements.txt -# Install itwinai and dependencies -COPY pyproject.toml ./ -COPY src ./ -RUN pip install --no-cache-dir . +# Create non-root user +RUN groupadd -g 10001 dotnet \ + && useradd -m -u 10000 -g dotnet dotnet \ + && chown -R dotnet:dotnet /usr/src/app +USER dotnet:dotnet -# Add 3DGAN use case files -COPY use-cases/3dgan/* ./ \ No newline at end of file +# ENTRYPOINT [ "itwinai", "exec-pipeline" ] +# CMD [ "--config", "pipeline.yaml" ] \ No newline at end of file diff --git a/use-cases/3dgan/README.md b/use-cases/3dgan/README.md index d0bf2c82..53501e89 100644 --- a/use-cases/3dgan/README.md +++ b/use-cases/3dgan/README.md @@ -19,30 +19,22 @@ micromamba virtual environment. ## Training -At CERN, use the dedicated configuration file: +Launch training using `itwinai` and the training configuration: ```bash cd use-cases/3dgan -itwinai exec-pipeline --config cern-pipeline.yaml +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline # Or better: -micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu itwinai exec-pipeline --config cern-pipeline.yaml +micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu \ + itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline ``` -Anywhere else, use the general purpose training configuration: +To visualize the logs with MLFLow, if you set a local path as tracking URI, +run the following in the terminal: ```bash -cd use-cases/3dgan -itwinai exec-pipeline --config pipeline.yaml - -# Or better: -micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu itwinai exec-pipeline --config pipeline.yaml -``` - -To visualize the logs with MLFLow run the following in the terminal: - -```bash -micromamba run -p ../../.venv-pytorch mlflow ui --backend-store-uri ml_logs/mlflow_logs +micromamba run -p ../../.venv-pytorch mlflow ui --backend-store-uri LOCAL_TRACKING_URI ``` And select the "3DGAN" experiment. 
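For a fully local run, a minimal sketch (assuming you override the `mlflow_tracking_uri` key defined in `config.yaml` with a local folder instead of the remote tracking server it points to by default) could look like:

```bash
# Hypothetical local-only setup: point the MLFlow logger to a local folder
itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline \
    -o mlflow_tracking_uri=ml_logs/mlflow_logs

# Then inspect the logs from that same folder
micromamba run -p ../../.venv-pytorch mlflow ui --backend-store-uri ml_logs/mlflow_logs
```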
@@ -69,12 +61,8 @@ sub-folders: 2. As model, if a pre-trained checkpoint is not available, we can create a dummy version of it with: - ```python - import torch - from model import ThreeDGAN - # Same params as in the training config file! - my_gan = ThreeDGAN() - torch.save(my_gan, '3dgan-inference.pth') + ```bash + python create_inference_sample.py ``` 3. Run inference command. This will generate a `3dgan-generated-data` @@ -82,7 +70,7 @@ folder containing generated particle traces in form of torch tensors (.pth files) and 3D scatter plots (.jpg images). ```bash - itwinai exec-pipeline --config inference-pipeline.yaml + itwinai exec-pipeline --config config.yaml --pipe-key inference_pipeline ``` The inference execution will produce a folder called @@ -120,19 +108,20 @@ export STRATEGY="auto" # distributed strategy export DEVICES="0," # GPU devices list -itwinai exec-pipeline --print-config --config $CERN_CODE_ROOT/inference-pipeline.yaml \ --o pipeline.init_args.steps.dataloading_step.init_args.data_path=$TMP_DATA_ROOT/exp_data \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.logger.init_args.save_dir=$TMP_DATA_ROOT/ml_logs/mlflow_logs \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.strategy=$STRATEGY \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.devices=$DEVICES \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.accelerator=$ACCELERATOR \ --o pipeline.init_args.steps.inference_step.init_args.model.init_args.model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.datapath=$TMP_DATA_ROOT/exp_data/*/*.h5 \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.max_samples=$MAX_DATA_SAMPLES \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.batch_size=$BATCH_SIZE \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.num_workers=$NUM_WORKERS_DL \ --o pipeline.init_args.steps.saver_step.init_args.save_dir=$TMP_DATA_ROOT/3dgan-generated-data \ --o pipeline.init_args.steps.saver_step.init_args.aggregate_predictions=$AGGREGATE_PREDS +itwinai exec-pipeline --print-config --config $CERN_CODE_ROOT/config.yaml \ + --pipe-key inference_pipeline \ + -o dataset_location=$CERN_DATA_ROOT/exp_data \ + -o logs_dir=$TMP_DATA_ROOT/ml_logs/mlflow_logs \ + -o distributed_strategy=$STRATEGY \ + -o devices=$DEVICES \ + -o hw_accelerators=$ACCELERATOR \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o inference_model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ + -o max_dataset_size=$MAX_DATA_SAMPLES \ + -o batch_size=$BATCH_SIZE \ + -o num_workers_dataloader=$NUM_WORKERS_DL \ + -o inference_results_location=$TMP_DATA_ROOT/3dgan-generated-data \ + -o aggregate_predictions=$AGGREGATE_PREDS ``` ### Docker image @@ -196,17 +185,20 @@ export ACCELERATOR="gpu" # choose "cpu" or "gpu" docker run -it --rm --name running-inference \ -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.1 \ /bin/bash -c "itwinai exec-pipeline \ ---config inference-pipeline.yaml --print-config \ --o pipeline.init_args.steps.dataloading_step.init_args.data_path=$CERN_DATA_ROOT/exp_data \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.logger.init_args.save_dir=$CERN_DATA_ROOT/ml_logs/mlflow_logs \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.accelerator=$ACCELERATOR \ --o 
pipeline.init_args.steps.inference_step.init_args.model.init_args.model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.datapath=$CERN_DATA_ROOT/exp_data/*/*.h5 \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.max_samples=$MAX_DATA_SAMPLES \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.batch_size=$BATCH_SIZE \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.num_workers=$NUM_WORKERS_DL \ --o pipeline.init_args.steps.saver_step.init_args.save_dir=$CERN_DATA_ROOT/3dgan-generated-data \ --o pipeline.init_args.steps.saver_step.init_args.aggregate_predictions=$AGGREGATE_PREDS " + --print-config --config $CERN_CODE_ROOT/config.yaml \ + --pipe-key inference_pipeline \ + -o dataset_location=$CERN_DATA_ROOT/exp_data \ + -o logs_dir=$TMP_DATA_ROOT/ml_logs/mlflow_logs \ + -o distributed_strategy=$STRATEGY \ + -o devices=$DEVICES \ + -o hw_accelerators=$ACCELERATOR \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o inference_model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ + -o max_dataset_size=$MAX_DATA_SAMPLES \ + -o batch_size=$BATCH_SIZE \ + -o num_workers_dataloader=$NUM_WORKERS_DL \ + -o inference_results_location=$TMP_DATA_ROOT/3dgan-generated-data \ + -o aggregate_predictions=$AGGREGATE_PREDS " ``` #### How to fully exploit GPU resources @@ -231,7 +223,7 @@ Run Docker container with Singularity: ```bash singularity run --nv -B "$PWD":/usr/data docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.1 /bin/bash -c \ -"cd /usr/src/app && itwinai exec-pipeline --config inference-pipeline.yaml" +"cd /usr/src/app && itwinai exec-pipeline --config config.yaml --pipe-key inference_pipeline" ``` Example with overrides (as above for Docker): @@ -248,15 +240,18 @@ export ACCELERATOR="gpu" # choose "cpu" or "gpu" singularity run --nv -B "$PWD":/usr/data docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.1 /bin/bash -c \ "cd /usr/src/app && itwinai exec-pipeline \ ---config inference-pipeline.yaml --print-config \ --o pipeline.init_args.steps.dataloading_step.init_args.data_path=$CERN_DATA_ROOT/exp_data \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.logger.init_args.save_dir=$CERN_DATA_ROOT/ml_logs/mlflow_logs \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.accelerator=$ACCELERATOR \ --o pipeline.init_args.steps.inference_step.init_args.model.init_args.model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.datapath=$CERN_DATA_ROOT/exp_data/*/*.h5 \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.max_samples=$MAX_DATA_SAMPLES \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.batch_size=$BATCH_SIZE \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.num_workers=$NUM_WORKERS_DL \ --o pipeline.init_args.steps.saver_step.init_args.save_dir=$CERN_DATA_ROOT/3dgan-generated-data \ --o pipeline.init_args.steps.saver_step.init_args.aggregate_predictions=$AGGREGATE_PREDS " + --print-config --config $CERN_CODE_ROOT/config.yaml \ + --pipe-key inference_pipeline \ + -o dataset_location=$CERN_DATA_ROOT/exp_data \ + -o logs_dir=$TMP_DATA_ROOT/ml_logs/mlflow_logs \ + -o distributed_strategy=$STRATEGY \ + -o devices=$DEVICES \ + -o hw_accelerators=$ACCELERATOR \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o 
inference_model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ + -o max_dataset_size=$MAX_DATA_SAMPLES \ + -o batch_size=$BATCH_SIZE \ + -o num_workers_dataloader=$NUM_WORKERS_DL \ + -o inference_results_location=$TMP_DATA_ROOT/3dgan-generated-data \ + -o aggregate_predictions=$AGGREGATE_PREDS " ``` diff --git a/use-cases/3dgan/cern-pipeline.yaml b/use-cases/3dgan/cern-pipeline.yaml deleted file mode 100644 index 0bc9a756..00000000 --- a/use-cases/3dgan/cern-pipeline.yaml +++ /dev/null @@ -1,95 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - steps: - - class_path: dataloader.Lightning3DGANDownloader - init_args: - data_path: /eos/user/k/ktsolaki/data/3dgan_data - data_url: null # https://drive.google.com/drive/folders/1uPpz0tquokepptIfJenTzGpiENfo2xRX - - - class_path: trainer.Lightning3DGANTrainer - init_args: - # Pytorch lightning config for training - config: - seed_everything: 4231162351 - trainer: - accelerator: auto - accumulate_grad_batches: 1 - barebones: false - benchmark: null - callbacks: - - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping - init_args: - monitor: val_generator_loss - patience: 2 - - class_path: lightning.pytorch.callbacks.lr_monitor.LearningRateMonitor - init_args: - logging_interval: step - - class_path: lightning.pytorch.callbacks.ModelCheckpoint - init_args: - dirpath: checkpoints - filename: best-checkpoint - mode: min - monitor: val_generator_loss - save_top_k: 1 - verbose: true - check_val_every_n_epoch: 1 - default_root_dir: null - detect_anomaly: false - deterministic: null - devices: auto #[0] - enable_checkpointing: true - enable_model_summary: null - enable_progress_bar: null - fast_dev_run: false - gradient_clip_algorithm: null - gradient_clip_val: null - inference_mode: true - limit_predict_batches: null - limit_test_batches: null - limit_train_batches: null - limit_val_batches: null - log_every_n_steps: 2 - logger: - # - class_path: lightning.pytorch.loggers.CSVLogger - # init_args: - # save_dir: ml_logs/csv_logs - class_path: lightning.pytorch.loggers.MLFlowLogger - init_args: - experiment_name: 3DGAN - save_dir: ml_logs/mlflow_logs - log_model: all - max_epochs: 100 - max_steps: -1 - max_time: null - min_epochs: null - min_steps: null - num_sanity_val_steps: null - overfit_batches: 0.0 - plugins: null - profiler: null - reload_dataloaders_every_n_epochs: 0 - strategy: ddp_find_unused_parameters_true #auto - sync_batchnorm: false - use_distributed_sampler: true - val_check_interval: null - - # Lightning Model configuration - model: - class_path: model.ThreeDGAN - init_args: - latent_size: 256 - batch_size: 128 - loss_weights: [3, 0.1, 25, 0.1] - power: 0.85 - lr: 0.001 - checkpoint_path: checkpoints/3dgan.pth - - # Lightning data module configuration - data: - class_path: dataloader.ParticlesDataModule - init_args: - datapath: /eos/user/k/ktsolaki/data/3dgan_data/*.h5 # exp_data/*/*.h5 - batch_size: 128 - num_workers: 0 - max_samples: 10000 diff --git a/use-cases/3dgan/config.yaml b/use-cases/3dgan/config.yaml new file mode 100644 index 00000000..d23288d5 --- /dev/null +++ b/use-cases/3dgan/config.yaml @@ -0,0 +1,208 @@ +# Main configurations +dataset_location: exp_data/ +dataset_url: https://drive.google.com/drive/folders/1uPpz0tquokepptIfJenTzGpiENfo2xRX +hw_accelerators: auto +distributed_strategy: auto #ddp_find_unused_parameters_true +devices: auto #[0] +checkpoints_path: checkpoints +logs_dir: ml_logs +mlflow_tracking_uri: https://131.154.99.166.myip.cloud.infn.it +batch_size: 4 
+num_workers_dataloader: 0 +max_epochs: 2 +max_dataset_size: 48 +random_seed: 4231162351 +inference_results_location: 3dgan-generated-data/ +inference_model_uri: 3dgan-inference.pth +aggregate_predictions: false + +# Dataloading step is common and can be reused +dataloading_step: + class_path: dataloader.Lightning3DGANDownloader + init_args: + data_path: ${dataset_location} # Set to null to skip dataset download + data_url: ${dataset_url} + +# AI workflows +training_pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + dataloading_step: ${dataloading_step} + + training_step: + class_path: trainer.Lightning3DGANTrainer + init_args: + exp_root: ${logs_dir} + # Pytorch lightning config for training + config: + seed_everything: ${random_seed} + trainer: + accelerator: ${hw_accelerators} + accumulate_grad_batches: 1 + barebones: false + benchmark: null + callbacks: + - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping + init_args: + monitor: val_generator_loss + patience: 2 + - class_path: lightning.pytorch.callbacks.lr_monitor.LearningRateMonitor + init_args: + logging_interval: step + - class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + dirpath: ${checkpoints_path} + filename: best-checkpoint + mode: min + monitor: val_generator_loss + save_top_k: 1 + verbose: true + check_val_every_n_epoch: 1 + default_root_dir: null + detect_anomaly: false + deterministic: null + devices: ${devices} + enable_checkpointing: true + enable_model_summary: null + enable_progress_bar: null + fast_dev_run: false + gradient_clip_algorithm: null + gradient_clip_val: null + inference_mode: true + limit_predict_batches: null + limit_test_batches: null + limit_train_batches: null + limit_val_batches: null + log_every_n_steps: 1 + logger: + - class_path: lightning.pytorch.loggers.CSVLogger + init_args: + name: 3DGAN + save_dir: ${logs_dir} + - class_path: lightning.pytorch.loggers.MLFlowLogger + init_args: + experiment_name: 3DGAN + save_dir: null #ml_logs/mlflow_logs + tracking_uri: ${mlflow_tracking_uri} + log_model: all + max_epochs: ${max_epochs} + max_time: null + min_epochs: null + min_steps: null + num_sanity_val_steps: null + overfit_batches: 0.0 + plugins: null + profiler: null + reload_dataloaders_every_n_epochs: 0 + strategy: ${distributed_strategy} + sync_batchnorm: false + use_distributed_sampler: true + val_check_interval: null + + # Lightning Model configuration + model: + class_path: model.ThreeDGAN + init_args: + latent_size: 256 + loss_weights: [3, 0.1, 25, 0.1] + power: 0.85 + lr: 0.001 + checkpoints_dir: ${checkpoints_path} + + # Lightning data module configuration + data: + class_path: dataloader.ParticlesDataModule + init_args: + datapath: ${dataset_location} + batch_size: ${batch_size} + num_workers: ${num_workers_dataloader} + max_samples: ${max_dataset_size} + +inference_pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + dataloading_step: ${dataloading_step} + + inference_step: + class_path: trainer.Lightning3DGANPredictor + init_args: + model: + class_path: trainer.LightningModelLoader + init_args: + model_uri: ${inference_model_uri} + + # Pytorch lightning config for training + config: + seed_everything: ${random_seed} + trainer: + accelerator: ${hw_accelerators} + accumulate_grad_batches: 1 + barebones: false + benchmark: null + check_val_every_n_epoch: 1 + default_root_dir: null + detect_anomaly: false + deterministic: null + devices: ${devices} + enable_checkpointing: true + enable_model_summary: null + 
enable_progress_bar: null + fast_dev_run: false + gradient_clip_algorithm: null + gradient_clip_val: null + inference_mode: true + limit_predict_batches: null + limit_test_batches: null + limit_train_batches: null + limit_val_batches: null + log_every_n_steps: 2 + logger: + # - class_path: lightning.pytorch.loggers.CSVLogger + # init_args: + # save_dir: ml_logs/csv_logs + class_path: lightning.pytorch.loggers.MLFlowLogger + init_args: + experiment_name: 3DGAN + save_dir: ${logs_dir} + log_model: all + max_epochs: ${max_epochs} + max_steps: 20 + max_time: null + min_epochs: null + min_steps: null + num_sanity_val_steps: null + overfit_batches: 0.0 + plugins: null + profiler: null + reload_dataloaders_every_n_epochs: 0 + strategy: ${distributed_strategy} + sync_batchnorm: false + use_distributed_sampler: true + val_check_interval: null + + # Lightning Model configuration + model: + class_path: model.ThreeDGAN + init_args: + latent_size: 256 + loss_weights: [3, 0.1, 25, 0.1] + power: 0.85 + lr: 0.001 + checkpoints_dir: ${checkpoints_path} + + # Lightning data module configuration + data: + class_path: dataloader.ParticlesDataModule + init_args: + datapath: ${dataset_location} + batch_size: ${batch_size} #1024 + num_workers: ${num_workers_dataloader} #4 + max_samples: ${max_dataset_size} #null, 10000 + + saver_step: + class_path: saver.ParticleImagesSaver + init_args: + save_dir: ${inference_results_location} + aggregate_predictions: ${aggregate_predictions} \ No newline at end of file diff --git a/use-cases/3dgan/create_inference_sample.py b/use-cases/3dgan/create_inference_sample.py new file mode 100644 index 00000000..14b88870 --- /dev/null +++ b/use-cases/3dgan/create_inference_sample.py @@ -0,0 +1,23 @@ +"""Create a simple inference dataset sample and a checkpoint.""" + +import argparse +import os +import torch +from model import ThreeDGAN + + +def create_checkpoint( + root: str = '.', + ckpt_name: str = "3dgan-inference.pth" +): + ckpt_path = os.path.join(root, ckpt_name) + net = ThreeDGAN() + torch.save(net, ckpt_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--root", type=str, default='.') + parser.add_argument("--ckpt-name", type=str, default="3dgan-inference.pth") + args = parser.parse_args() + create_checkpoint(**vars(args)) diff --git a/use-cases/3dgan/dataloader.py b/use-cases/3dgan/dataloader.py index 89234895..49d2c2f5 100644 --- a/use-cases/3dgan/dataloader.py +++ b/use-cases/3dgan/dataloader.py @@ -35,7 +35,8 @@ def execute(self): gdown.download_folder( url=self.data_url, quiet=False, - output=self.data_path + output=self.data_path, + verify=False ) @@ -57,7 +58,8 @@ def __getitem__(self, idx): def fetch_data(self) -> None: print("Searching in :", self.datapath) - files = sorted(glob.glob(self.datapath)) + files = sorted(glob.glob(os.path.join( + self.datapath, '**/*.h5'), recursive=True)) print("Found {} files. 
".format(len(files))) if len(files) == 0: raise RuntimeError(f"No H5 files found at '{self.datapath}'!") diff --git a/use-cases/3dgan/inference-pipeline.yaml b/use-cases/3dgan/inference-pipeline.yaml deleted file mode 100644 index f5125576..00000000 --- a/use-cases/3dgan/inference-pipeline.yaml +++ /dev/null @@ -1,106 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - steps: - dataloading_step: - class_path: dataloader.Lightning3DGANDownloader - init_args: - data_path: /usr/data/exp_data/ - data_url: https://drive.google.com/drive/folders/1uPpz0tquokepptIfJenTzGpiENfo2xRX - - inference_step: - class_path: trainer.Lightning3DGANPredictor - init_args: - model: - class_path: trainer.LightningModelLoader - init_args: - model_uri: 3dgan-inference.pth - - # Pytorch lightning config for training - config: - seed_everything: 4231162351 - trainer: - accelerator: auto - accumulate_grad_batches: 1 - barebones: false - benchmark: null - # callbacks: - # # - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping - # # init_args: - # # monitor: val_loss - # # patience: 2 - # - class_path: lightning.pytorch.callbacks.lr_monitor.LearningRateMonitor - # init_args: - # logging_interval: step - # # - class_path: lightning.pytorch.callbacks.ModelCheckpoint - # # init_args: - # # dirpath: checkpoints - # # filename: best-checkpoint - # # mode: min - # # monitor: val_loss - # # save_top_k: 1 - # # verbose: true - check_val_every_n_epoch: 1 - default_root_dir: null - detect_anomaly: false - deterministic: null - devices: auto #[0] - enable_checkpointing: true - enable_model_summary: null - enable_progress_bar: null - fast_dev_run: false - gradient_clip_algorithm: null - gradient_clip_val: null - inference_mode: true - limit_predict_batches: null - limit_test_batches: null - limit_train_batches: null - limit_val_batches: null - log_every_n_steps: 2 - logger: - # - class_path: lightning.pytorch.loggers.CSVLogger - # init_args: - # save_dir: ml_logs/csv_logs - class_path: lightning.pytorch.loggers.MLFlowLogger - init_args: - experiment_name: 3DGAN - save_dir: /usr/data/ml_logs/mlflow_logs - log_model: all - max_epochs: 1 - max_steps: 20 - max_time: null - min_epochs: null - min_steps: null - num_sanity_val_steps: null - overfit_batches: 0.0 - plugins: null - profiler: null - reload_dataloaders_every_n_epochs: 0 - strategy: ddp_find_unused_parameters_true #auto - sync_batchnorm: false - use_distributed_sampler: true - val_check_interval: null - - # Lightning Model configuration - model: - class_path: model.ThreeDGAN - init_args: - latent_size: 256 - loss_weights: [3, 0.1, 25, 0.1] - power: 0.85 - lr: 0.001 - - # Lightning data module configuration - data: - class_path: dataloader.ParticlesDataModule - init_args: - datapath: /usr/data/exp_data/*/*.h5 - batch_size: 64 #1024 - num_workers: 2 #4 - max_samples: 10 #null, 10000 - - saver_step: - class_path: saver.ParticleImagesSaver - init_args: - save_dir: /usr/data/3dgan-generated-data - aggregate_predictions: false \ No newline at end of file diff --git a/use-cases/3dgan/interLink/3dgan-inference-cpu.yaml b/use-cases/3dgan/interLink/3dgan-inference-cpu.yaml index 2ba3c0a8..ef9016b4 100644 --- a/use-cases/3dgan/interLink/3dgan-inference-cpu.yaml +++ b/use-cases/3dgan/interLink/3dgan-inference-cpu.yaml @@ -1,29 +1,31 @@ apiVersion: v1 kind: Pod metadata: - name: 3dgan-cpu + name: 3dgan-inference-cpu annotations: - slurm-job.vk.io/flags: "-p gpu --gres=gpu:1 --cpus-per-task=4 --mem=100G --ntasks-per-node=1 --nodes=1" + 
slurm-job.vk.io/flags: "-p gpu --gres=gpu:1 --cpus-per-task=4 --mem=100G --ntasks-per-node=1 --nodes=1 --time=00:55:00" job.vk.io/singularity-mounts: "--bind /ceph/hpc/data/st2301-itwin-users/egarciagarcia:/exp_data" - #job.vk.io/pre-exec: "singularity pull /ceph/hpc/data/st2301-itwin-users/itwinaiv6_1.sif docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.2" + #job.vk.io/pre-exec: "singularity pull /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.4" spec: automountServiceAccountToken: false containers: - args: - -c - - "\" cd /usr/src/app && itwinai exec-pipeline --print-config --config \\$CERN_CODE_ROOT/inference-pipeline.yaml \ - -o pipeline.init_args.steps.dataloading_step.init_args.data_path=\\$CERN_DATA_ROOT \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.logger.init_args.save_dir=\\$TMP_DATA_ROOT/ml_logs/mlflow_logs \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.strategy=\\$STRATEGY \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.devices=\\$DEVICES \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.accelerator=\\$ACCELERATOR \ - -o pipeline.init_args.steps.inference_step.init_args.model.init_args.model_uri=\\$CERN_CODE_ROOT/3dgan-inference.pth \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.datapath=\\$CERN_DATA_ROOT/*.h5 \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.max_samples=\\$MAX_DATA_SAMPLES \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.batch_size=\\$BATCH_SIZE \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.num_workers=\\$NUM_WORKERS_DL \ - -o pipeline.init_args.steps.saver_step.init_args.save_dir=\\$TMP_DATA_ROOT/3dgan-generated-data \ - -o pipeline.init_args.steps.saver_step.init_args.aggregate_predictions=\\$AGGREGATE_PREDS \"" + - "\" cd /usr/src/app && itwinai exec-pipeline --print-config \ + --config \\$CERN_CODE_ROOT/config.yaml \ + --pipe-key inference_pipeline \ + -o dataset_location=\\$CERN_DATA_ROOT \ + -o logs_dir=\\$TMP_DATA_ROOT/ml_logs/mlflow_logs \ + -o distributed_strategy=\\$STRATEGY \ + -o devices=\\$DEVICES \ + -o hw_accelerators=\\$ACCELERATOR \ + -o inference_model_uri=\\$CERN_CODE_ROOT/3dgan-inference.pth \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o max_dataset_size=\\$MAX_DATA_SAMPLES \ + -o batch_size=\\$BATCH_SIZE \ + -o num_workers_dataloader=\\$NUM_WORKERS_DL \ + -o inference_results_location=\\$TMP_DATA_ROOT/3dgan-generated-data \ + -o aggregate_predictions=\\$AGGREGATE_PREDS \"" command: - /bin/sh env: @@ -47,7 +49,7 @@ spec: value: "auto" - name: DEVICES value: "auto" - image: /ceph/hpc/data/st2301-itwin-users/itwinaiv6_1.sif + image: /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif imagePullPolicy: Always name: oscar-container resources: diff --git a/use-cases/3dgan/interLink/3dgan-inference.yaml b/use-cases/3dgan/interLink/3dgan-inference.yaml index 4a9b4575..c07997de 100644 --- a/use-cases/3dgan/interLink/3dgan-inference.yaml +++ b/use-cases/3dgan/interLink/3dgan-inference.yaml @@ -1,29 +1,31 @@ apiVersion: v1 kind: Pod metadata: - name: 3dgan + name: 3dgan-inference annotations: - slurm-job.vk.io/flags: "-p gpu --gres=gpu:1 --cpus-per-task=4 --mem=100G --ntasks-per-node=1 --nodes=1" + slurm-job.vk.io/flags: "-p gpu --gres=gpu:1 --cpus-per-task=4 --mem=100G --ntasks-per-node=1 --nodes=1 --time=00:55:00" job.vk.io/singularity-mounts: "--bind 
/ceph/hpc/data/st2301-itwin-users/egarciagarcia:/exp_data" - #job.vk.io/pre-exec: "singularity pull /ceph/hpc/data/st2301-itwin-users/itwinaiv6_1.sif docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.2" + # job.vk.io/pre-exec: "singularity pull /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.4" spec: automountServiceAccountToken: false containers: - args: - -c - - "\" cd /usr/src/app && itwinai exec-pipeline --print-config --config \\$CERN_CODE_ROOT/inference-pipeline.yaml \ - -o pipeline.init_args.steps.dataloading_step.init_args.data_path=\\$CERN_DATA_ROOT \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.logger.init_args.save_dir=\\$TMP_DATA_ROOT/ml_logs/mlflow_logs \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.strategy=\\$STRATEGY \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.devices=\\$DEVICES \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.accelerator=\\$ACCELERATOR \ - -o pipeline.init_args.steps.inference_step.init_args.model.init_args.model_uri=\\$CERN_CODE_ROOT/3dgan-inference.pth \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.datapath=\\$CERN_DATA_ROOT/*.h5 \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.max_samples=\\$MAX_DATA_SAMPLES \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.batch_size=\\$BATCH_SIZE \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.num_workers=\\$NUM_WORKERS_DL \ - -o pipeline.init_args.steps.saver_step.init_args.save_dir=\\$TMP_DATA_ROOT/3dgan-generated-data \ - -o pipeline.init_args.steps.saver_step.init_args.aggregate_predictions=\\$AGGREGATE_PREDS \"" + - "\" cd /usr/src/app && itwinai exec-pipeline --print-config \ + --config \\$CERN_CODE_ROOT/config.yaml \ + --pipe-key inference_pipeline \ + -o dataset_location=\\$CERN_DATA_ROOT \ + -o logs_dir=\\$TMP_DATA_ROOT/ml_logs/mlflow_logs \ + -o distributed_strategy=\\$STRATEGY \ + -o devices=\\$DEVICES \ + -o hw_accelerators=\\$ACCELERATOR \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o inference_model_uri=\\$CERN_CODE_ROOT/3dgan-inference.pth \ + -o max_dataset_size=\\$MAX_DATA_SAMPLES \ + -o batch_size=\\$BATCH_SIZE \ + -o num_workers_dataloader=\\$NUM_WORKERS_DL \ + -o inference_results_location=\\$TMP_DATA_ROOT/3dgan-generated-data \ + -o aggregate_predictions=\\$AGGREGATE_PREDS \"" command: - /bin/sh env: @@ -47,7 +49,7 @@ spec: value: "auto" - name: DEVICES value: "auto" - image: /ceph/hpc/data/st2301-itwin-users/itwinaiv6_1.sif + image: /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif imagePullPolicy: Always name: oscar-container resources: diff --git a/use-cases/3dgan/interLink/3dgan-train.yaml b/use-cases/3dgan/interLink/3dgan-train.yaml new file mode 100644 index 00000000..e0885fd9 --- /dev/null +++ b/use-cases/3dgan/interLink/3dgan-train.yaml @@ -0,0 +1,88 @@ +apiVersion: v1 +kind: Pod +metadata: + name: 3dgan-train + annotations: + slurm-job.vk.io/flags: "-p gpu --gres=gpu:1 --cpus-per-task=4 --mem=100G --ntasks-per-node=1 --nodes=1 --time=00:55:00" + job.vk.io/singularity-mounts: "--bind /ceph/hpc/data/st2301-itwin-users/egarciagarcia:/exp_data" + # job.vk.io/pre-exec: "singularity pull /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.4" +spec: + automountServiceAccountToken: false + containers: + - args: + - -c + - "\" cd /usr/src/app && itwinai 
exec-pipeline --print-config \ + --config \\$CERN_CODE_ROOT/config.yaml \ + --pipe-key training_pipeline \ + -o dataset_location=\\$CERN_DATA_ROOT \ + -o pipeline.init_args.steps.training_step.init_args.exp_root=\\$TMP_DATA_ROOT \ + -o logs_dir=\\$TMP_DATA_ROOT/ml_logs \ + -o distributed_strategy=\\$STRATEGY \ + -o devices=\\$DEVICES \ + -o hw_accelerators=\\$ACCELERATOR \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o max_dataset_size=\\$MAX_DATA_SAMPLES \ + -o batch_size=\\$BATCH_SIZE \ + -o num_workers_dataloader=\\$NUM_WORKERS_DL \"" + command: + - /bin/sh + env: + - name: CERN_DATA_ROOT + value: "/exp_data" + - name: CERN_CODE_ROOT + value: "/usr/src/app" + - name: TMP_DATA_ROOT + value: "/exp_data" + - name: MAX_DATA_SAMPLES + value: "1000" + - name: BATCH_SIZE + value: "512" + - name: NUM_WORKERS_DL + value: "4" + - name: ACCELERATOR + value: "gpu" + - name: STRATEGY + value: "auto" + - name: DEVICES + value: "auto" + + # - name: MLFLOW_TRACKING_USERNAME + # valueFrom: + # secretKeyRef: + # name: mlflow-server + # key: username + # - name: MLFLOW_TRACKING_PASSWORD + # valueFrom: + # secretKeyRef: + # name: mlflow-server + # key: password + + - name: MLFLOW_TRACKING_USERNAME + value: "XXX" + - name: MLFLOW_TRACKING_PASSWORD + value: "XXX" + image: /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif + imagePullPolicy: Always + name: oscar-container + resources: + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: "1" + memory: 1Gi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + nodeSelector: + kubernetes.io/hostname: vega-new-vk + tolerations: + - key: virtual-node.interlink/no-schedule + operator: Exists + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 \ No newline at end of file diff --git a/use-cases/3dgan/interLink/README.md b/use-cases/3dgan/interLink/README.md index d4b6dcca..c2831f7b 100644 --- a/use-cases/3dgan/interLink/README.md +++ b/use-cases/3dgan/interLink/README.md @@ -53,3 +53,14 @@ nodeSelector: ``` Additional info in [interLink](https://github.com/interTwin-eu/interLink) docs. + +## Secrets + +See [this guide](https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#define-container-environment-variables-using-secret-data) +on how to set Kubernetes secrets as env variables of a container. + +Example: + +```bash +kubectl create secret generic mlflow-server --from-literal=username='XYZ' --from-literal=password='ABC' +``` diff --git a/use-cases/3dgan/model.py b/use-cases/3dgan/model.py index 9653c98e..60dd48f9 100644 --- a/use-cases/3dgan/model.py +++ b/use-cases/3dgan/model.py @@ -1,5 +1,5 @@ import sys -# import os +import os # import pickle from collections import defaultdict import math @@ -309,6 +309,7 @@ def __init__( loss_weights=[3, 0.1, 25, 0.1], power=0.85, lr=0.001, + checkpoints_dir: str = '.'
# checkpoint_path: str = '3Dgan.pth' ): super().__init__() @@ -319,6 +320,8 @@ def __init__( self.loss_weights = loss_weights self.lr = lr self.power = power + self.checkpoints_dir = checkpoints_dir + os.makedirs(self.checkpoints_dir, exist_ok=True) self.generator = Generator(self.latent_size) self.discriminator = Discriminator(self.power) @@ -544,9 +547,10 @@ def training_step(self, batch, batch_idx): if fake_batch_loss[3] == 100.0 and self.index > 10: # print("Empty image with Ecal loss equal to 100.0 " # f"for {self.index} batch") - torch.save(self.generator.state_dict(), "generator_weights.pth") - torch.save(self.discriminator.state_dict(), - "discriminator_weights.pth") + torch.save(self.generator.state_dict(), os.path.join( + self.checkpoints_dir, "generator_weights.pth")) + torch.save(self.discriminator.state_dict(), os.path.join( + self.checkpoints_dir, "discriminator_weights.pth")) # print("real_batch_loss", real_batch_loss) # print("fake_batch_loss", fake_batch_loss) sys.exit() @@ -609,9 +613,10 @@ def on_train_epoch_end(self): # outputs print(ROW_FMT.format("discriminator (train)", *self.train_history["discriminator"][-1])) - torch.save(self.generator.state_dict(), "generator_weights.pth") - torch.save(self.discriminator.state_dict(), - "discriminator_weights.pth") + torch.save(self.generator.state_dict(), os.path.join( + self.checkpoints_dir, "generator_weights.pth")) + torch.save(self.discriminator.state_dict(), os.path.join( + self.checkpoints_dir, "discriminator_weights.pth")) # with open(self.pklfile, "wb") as f: # pickle.dump({"train": self.train_history, diff --git a/use-cases/3dgan/pipeline.yaml b/use-cases/3dgan/pipeline.yaml deleted file mode 100644 index d6bade54..00000000 --- a/use-cases/3dgan/pipeline.yaml +++ /dev/null @@ -1,91 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - steps: - dataloading_step: - class_path: dataloader.Lightning3DGANDownloader - init_args: - data_path: exp_data/ # Set to null to skip dataset download - data_url: https://drive.google.com/drive/folders/1uPpz0tquokepptIfJenTzGpiENfo2xRX - - training_step: - class_path: trainer.Lightning3DGANTrainer - init_args: - # Pytorch lightning config for training - config: - seed_everything: 4231162351 - trainer: - accelerator: auto - accumulate_grad_batches: 1 - barebones: false - benchmark: null - callbacks: - - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping - init_args: - monitor: val_generator_loss - patience: 2 - - class_path: lightning.pytorch.callbacks.lr_monitor.LearningRateMonitor - init_args: - logging_interval: step - - class_path: lightning.pytorch.callbacks.ModelCheckpoint - init_args: - dirpath: checkpoints - filename: best-checkpoint - mode: min - monitor: val_generator_loss - save_top_k: 1 - verbose: true - check_val_every_n_epoch: 1 - default_root_dir: null - detect_anomaly: false - deterministic: null - devices: auto #[0] - enable_checkpointing: true - enable_model_summary: null - enable_progress_bar: null - fast_dev_run: false - gradient_clip_algorithm: null - gradient_clip_val: null - inference_mode: true - limit_predict_batches: null - limit_test_batches: null - limit_train_batches: null - limit_val_batches: null - log_every_n_steps: 1 - logger: - class_path: lightning.pytorch.loggers.MLFlowLogger - init_args: - experiment_name: 3DGAN - save_dir: ml_logs/mlflow_logs - log_model: all - max_epochs: 5 - max_time: null - min_epochs: null - min_steps: null - num_sanity_val_steps: null - overfit_batches: 0.0 - plugins: null - profiler: null - 
reload_dataloaders_every_n_epochs: 0 - strategy: auto #ddp_find_unused_parameters_true - sync_batchnorm: false - use_distributed_sampler: true - val_check_interval: null - - # Lightning Model configuration - model: - class_path: model.ThreeDGAN - init_args: - latent_size: 256 - loss_weights: [3, 0.1, 25, 0.1] - power: 0.85 - lr: 0.001 - - # Lightning data module configuration - data: - class_path: dataloader.ParticlesDataModule - init_args: - datapath: exp_data/*/*.h5 - batch_size: 4 - num_workers: 0 - max_samples: 48 diff --git a/use-cases/3dgan/trainer.py b/use-cases/3dgan/trainer.py index 3bb5a1fd..f51b5d5a 100644 --- a/use-cases/3dgan/trainer.py +++ b/use-cases/3dgan/trainer.py @@ -23,17 +23,22 @@ class Lightning3DGANTrainer(Trainer): - def __init__(self, config: Union[Dict, str]): + def __init__(self, config: Union[Dict, str], exp_root: str = '.'): self.save_parameters(**self.locals2params(locals())) super().__init__() if isinstance(config, str) and os.path.isfile(config): # Load from YAML config = load_yaml(config) self.conf = config + self.exp_root = exp_root @monitor_exec def execute(self) -> Any: - init_lightning_mlflow(self.conf, registered_model_name='3dgan-lite') + init_lightning_mlflow( + self.conf, + tmp_dir=os.path.join(self.exp_root, '.tmp'), + registered_model_name='3dgan-lite' + ) old_argv = sys.argv sys.argv = ['some_script_placeholder.py'] cli = LightningCLI( @@ -52,12 +57,6 @@ def execute(self) -> Any: cli.trainer.fit(cli.model, datamodule=cli.datamodule) teardown_lightning_mlflow() - def save_state(self): - return super().save_state() - - def load_state(self): - return super().load_state() - class LightningModelLoader(TorchModelLoader): """Loads a torch lightning model from somewhere. diff --git a/use-cases/cyclones/README.md b/use-cases/cyclones/README.md new file mode 100644 index 00000000..6b504fb0 --- /dev/null +++ b/use-cases/cyclones/README.md @@ -0,0 +1,12 @@ +# Tropical cyclone detection + +## Dataset + +If the automatic download from python does not work, try from the command line from +within the virtual environment: + +```bash +gdown https://drive.google.com/drive/folders/1TnmujO4T-8_j4bCxqNe5HEw9njJIIBQD -O data/tmp_data/trainval --folder +``` + +For more info visit the [gdown](https://github.com/wkentaro/gdown) repository. 
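As a rough sketch of what the automatic download does, the same folder can also be fetched programmatically with `gdown` (mirroring the `gdown.download_folder` call patched in `dataloader.py` below; the output path here is just an example):

```python
import gdown

# Hypothetical standalone download, equivalent to the CLI command above.
# verify=False disables TLS certificate verification, as in dataloader.py.
gdown.download_folder(
    url="https://drive.google.com/drive/folders/1TnmujO4T-8_j4bCxqNe5HEw9njJIIBQD",
    output="data/tmp_data/trainval",
    quiet=False,
    verify=False,
)
```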
diff --git a/use-cases/cyclones/dataloader.py b/use-cases/cyclones/dataloader.py index 7f224157..3cf1d97b 100644 --- a/use-cases/cyclones/dataloader.py +++ b/use-cases/cyclones/dataloader.py @@ -180,6 +180,7 @@ def setup_config(self, config: Dict) -> None: if not exists(join(root_dir, self.data_path)): gdown.download_folder( url=self.data_url, quiet=False, + verify=False, output=join(root_dir, self.data_path) ) diff --git a/use-cases/cyclones/trainer.py b/use-cases/cyclones/trainer.py index 1c47819b..054f772b 100644 --- a/use-cases/cyclones/trainer.py +++ b/use-cases/cyclones/trainer.py @@ -155,9 +155,3 @@ def setup_config(self, config: Dict) -> None: if self.model_backup: self.best_model_name = join(self.model_backup, "best_model.h5") self.last_model_name = join(self.run_dir, "last_model.h5") - - def load_state(self): - return super().load_state() - - def save_state(self): - return super().save_state() diff --git a/use-cases/mnist/tensorflow/pipeline.yaml b/use-cases/mnist/tensorflow/pipeline.yaml index 9fced327..314f78b1 100644 --- a/use-cases/mnist/tensorflow/pipeline.yaml +++ b/use-cases/mnist/tensorflow/pipeline.yaml @@ -32,9 +32,9 @@ pipeline: strategy: class_path: tensorflow.python.distribute.mirrored_strategy.MirroredStrategy - logger: - - class_path: itwinai.loggers.ConsoleLogger - - class_path: itwinai.loggers.MLFlowLogger - init_args: - experiment_name: MNIST classifier - log_freq: batch + # logger: + # - class_path: itwinai.loggers.ConsoleLogger + # - class_path: itwinai.loggers.MLFlowLogger + # init_args: + # experiment_name: MNIST classifier + # log_freq: batch diff --git a/use-cases/mnist/tensorflow/trainer.py b/use-cases/mnist/tensorflow/trainer.py index 17ef19a5..435f79f4 100644 --- a/use-cases/mnist/tensorflow/trainer.py +++ b/use-cases/mnist/tensorflow/trainer.py @@ -35,9 +35,3 @@ def __init__( @monitor_exec def execute(self, train_dataset, validation_dataset) -> Any: return super().execute(train_dataset, validation_dataset) - - def load_state(self): - return super().load_state() - - def save_state(self): - return super().save_state() diff --git a/use-cases/mnist/torch-lightning/README.md b/use-cases/mnist/torch-lightning/README.md new file mode 100644 index 00000000..bd769c70 --- /dev/null +++ b/use-cases/mnist/torch-lightning/README.md @@ -0,0 +1,17 @@ +# Torch Lightning example on MNIST dataset + +## Training + +```bash +# Download dataset and exit: only run first step in the pipeline (index=0) +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline --steps 0 + +# Run the whole training pipeline +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline +``` + +View training logs on MLFLow server (if activated from the configuration): + +```bash +mlflow ui --backend-store-uri mllogs/mlflow/ +``` diff --git a/use-cases/mnist/torch-lightning/pipeline.yaml b/use-cases/mnist/torch-lightning/config.yaml similarity index 96% rename from use-cases/mnist/torch-lightning/pipeline.yaml rename to use-cases/mnist/torch-lightning/config.yaml index cf754b2f..23fde03d 100644 --- a/use-cases/mnist/torch-lightning/pipeline.yaml +++ b/use-cases/mnist/torch-lightning/config.yaml @@ -1,4 +1,4 @@ -pipeline: +training_pipeline: class_path: itwinai.pipeline.Pipeline init_args: steps: @@ -6,7 +6,7 @@ pipeline: init_args: data_path: data/ - - class_path: trainer.LightningMNISTTrainer + - class_path: itwinai.torch.trainer.TorchLightningTrainer #trainer.LightningMNISTTrainer init_args: # Pytorch lightning config for training config: diff --git 
a/use-cases/mnist/torch-lightning/dataloader.py b/use-cases/mnist/torch-lightning/dataloader.py index 1f062fe5..b7e8d46e 100644 --- a/use-cases/mnist/torch-lightning/dataloader.py +++ b/use-cases/mnist/torch-lightning/dataloader.py @@ -31,7 +31,7 @@ def execute(self) -> None: self._downloader.setup(stage='predict') -class MNISTDataModule(L.LightningModule): +class MNISTDataModule(L.LightningDataModule): def __init__( self, data_path: str, diff --git a/use-cases/mnist/torch-lightning/train.py b/use-cases/mnist/torch-lightning/train.py deleted file mode 100644 index 97f53093..00000000 --- a/use-cases/mnist/torch-lightning/train.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Training pipeline. To run this script, use the following commands. - -On login node: - ->>> micromamba run -p ../../../.venv-pytorch/ \ - python train.py -p pipeline.yaml -d - -On compute nodes: - ->>> micromamba run -p ../../../.venv-pytorch/ \ - python train.py -p pipeline.yaml - -""" - -import argparse - -from itwinai.parser import ConfigParser - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-p", "--pipeline", type=str, required=True, - help='Configuration file to the pipeline to execute.' - ) - parser.add_argument( - '-d', '--download-only', - action=argparse.BooleanOptionalAction, - default=False, - help=('Whether to download only the dataset and exit execution ' - '(suggested on login nodes of HPC systems)') - ) - args = parser.parse_args() - - # Create parser for the pipeline - pipe_parser = ConfigParser(config=args.pipeline) - pipeline = pipe_parser.parse_pipeline() - - if args.download_only: - print('Downloading datasets and exiting...') - pipeline = pipeline[:1] - - pipeline.execute() diff --git a/use-cases/mnist/torch-lightning/trainer.py b/use-cases/mnist/torch-lightning/trainer.py deleted file mode 100644 index 128cf5c6..00000000 --- a/use-cases/mnist/torch-lightning/trainer.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -from typing import Union, Dict, Any - -from itwinai.components import Trainer, monitor_exec -from itwinai.torch.models.mnist import MNISTModel -from dataloader import MNISTDataModule -from lightning.pytorch.cli import LightningCLI -from utils import load_yaml - - -class LightningMNISTTrainer(Trainer): - def __init__(self, config: Union[Dict, str]): - super().__init__() - self.save_parameters(**self.locals2params(locals())) - if isinstance(config, str) and os.path.isfile(config): - # Load from YAML - config = load_yaml(config) - self.conf = config - - @monitor_exec - def execute(self) -> Any: - cli = LightningCLI( - args=self.conf, - model_class=MNISTModel, - datamodule_class=MNISTDataModule, - run=False, - save_config_kwargs={ - "overwrite": True, - "config_filename": "pl-training.yml", - }, - subclass_mode_model=True, - subclass_mode_data=True, - ) - cli.trainer.fit(cli.model, datamodule=cli.datamodule) - - def save_state(self): - return super().save_state() - - def load_state(self): - return super().load_state() diff --git a/use-cases/mnist/torch/Dockerfile b/use-cases/mnist/torch/Dockerfile index b4cf3654..5b96feb5 100644 --- a/use-cases/mnist/torch/Dockerfile +++ b/use-cases/mnist/torch/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.9.12 +# FROM python:3.9 +FROM nvcr.io/nvidia/pytorch:23.09-py3 WORKDIR /usr/src/app @@ -13,6 +14,3 @@ RUN pip install --no-cache-dir . 
# Add torch MNIST use case COPY use-cases/mnist/torch/* ./ - -# Run inference -CMD [ "python", "train.py", "-p", "inference-pipeline.yaml"] \ No newline at end of file diff --git a/use-cases/mnist/torch/README.md b/use-cases/mnist/torch/README.md index c953671f..e333f14b 100644 --- a/use-cases/mnist/torch/README.md +++ b/use-cases/mnist/torch/README.md @@ -3,10 +3,18 @@ ## Training ```bash -python train.py -p pipeline.yaml [-d] +# Download dataset and exit +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline --steps dataloading_step + +# Run the whole training pipeline +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline ``` -Use `-d` flag to run only the fist step in the pipeline. +View training logs on MLFLow server (if activated from the configuration): + +```bash +mlflow ui --backend-store-uri mllogs/mlflow/ +``` ## Inference @@ -30,24 +38,37 @@ Use `-d` flag to run only the fist step in the pipeline. folder containing a CSV file with the predictions as rows. ```bash - python train.py -p inference-pipeline.yaml + itwinai exec-pipeline --config config.yaml --pipe-key inference_pipeline ``` Note the same entry point as for training. -### Docker image +## Docker image Build from project root with ```bash # Local -docker buildx build -t itwinai-mnist-torch-inference -f use-cases/mnist/torch/Dockerfile . +docker buildx build -t itwinai:0.0.1-mnist-torch-0.1 -f use-cases/mnist/torch/Dockerfile . # Ghcr.io -docker buildx build -t ghcr.io/intertwin-eu/itwinai-mnist-torch-inference:0.0.1 -f use-cases/mnist/torch/Dockerfile . -docker push ghcr.io/intertwin-eu/itwinai-mnist-torch-inference:0.0.1 +docker buildx build -t ghcr.io/intertwin-eu/itwinai:0.0.1-mnist-torch-0.1 -f use-cases/mnist/torch/Dockerfile . +docker push ghcr.io/intertwin-eu/itwinai:0.0.1-mnist-torch-0.1 ``` +### Training with Docker container + +```bash +docker run -it --rm --name running-training \ + -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai:0.0.1-mnist-torch-0.1 \ + /bin/bash -c "itwinai exec-pipeline --print-config \ + --config /usr/src/app/config.yaml \ + --pipe-key training_pipeline \ + -o dataset_root=/usr/data/mnist-dataset " +``` + +### Inference with Docker container + From wherever a sample of MNIST jpg images is available (folder called 'mnist-sample-data/'): @@ -62,7 +83,14 @@ From wherever a sample of MNIST jpg images is available ``` ```bash -docker run -it --rm --name running-inference -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai-mnist-torch-inference:0.0.1 +docker run -it --rm --name running-inference \ + -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai:0.0.1-mnist-torch-0.1 \ + /bin/bash -c "itwinai exec-pipeline --print-config \ + --config /usr/src/app/config.yaml \ + --pipe-key inference_pipeline \ + -o test_data_path=/usr/data/mnist-sample-data \ + -o inference_model_mlflow_uri=/usr/src/app/mnist-pre-trained.pth \ + -o predictions_dir=/usr/data/mnist-predictions " ``` This command will store the results in a folder called "mnist-predictions": diff --git a/use-cases/mnist/torch/config.yaml b/use-cases/mnist/torch/config.yaml new file mode 100644 index 00000000..c5d71204 --- /dev/null +++ b/use-cases/mnist/torch/config.yaml @@ -0,0 +1,99 @@ +# General config +dataset_root: .tmp/ +num_classes: 10 +batch_size: 64 +num_workers_dataloader: 4 +pin_memory: False +lr: 0.001 +momentum: 0.9 +fp16_allreduce: False +use_adasum: False +gradient_predivide_factor: 1.0 +epochs: 2 +strategy: ddp +test_data_path: mnist-sample-data +inference_model_mlflow_uri: mnist-pre-trained.pth
+predictions_dir: mnist-predictions +predictions_file: predictions.csv +class_labels: null + +# Workflows configuration +training_pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + dataloading_step: + class_path: dataloader.MNISTDataModuleTorch + init_args: + save_path: ${dataset_root} + + training_step: + class_path: itwinai.torch.trainer.TorchTrainer + init_args: + config: + batch_size: ${batch_size} + num_workers: ${num_workers_dataloader} + pin_memory: ${pin_memory} + lr: ${lr} + momentum: ${momentum} + fp16_allreduce: ${fp16_allreduce} + use_adasum: ${use_adasum} + gradient_predivide_factor: ${gradient_predivide_factor} + + model: + class_path: model.Net + epochs: ${epochs} + metrics: + accuracy: + class_path: torchmetrics.classification.MulticlassAccuracy + init_args: + num_classes: ${num_classes} + precision: + class_path: torchmetrics.classification.MulticlassPrecision + init_args: + num_classes: ${num_classes} + recall: + class_path: torchmetrics.classification.MulticlassRecall + init_args: + num_classes: ${num_classes} + logger: + class_path: itwinai.loggers.LoggersCollection + init_args: + loggers: + - class_path: itwinai.loggers.ConsoleLogger + init_args: + log_freq: 100 + - class_path: itwinai.loggers.MLFlowLogger + init_args: + experiment_name: MNIST classifier + log_freq: batch + strategy: ${strategy} + # checkpoint_every: 1 + # cluster: + # class_path: itwinai.torch.cluster.LocalCluster + # init_args: + # gpus: '0,1,2' + # backend: nccl + +inference_pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + - class_path: dataloader.MNISTPredictLoader + init_args: + test_data_path: ${test_data_path} + + - class_path: itwinai.torch.inference.MulticlassTorchPredictor + init_args: + model: + class_path: itwinai.torch.inference.TorchModelLoader + init_args: + model_uri: ${inference_model_mlflow_uri} + test_dataloader_kwargs: + batch_size: ${batch_size} + + - class_path: saver.TorchMNISTLabelSaver + init_args: + save_dir: ${predictions_dir} + predictions_file: ${predictions_file} + class_labels: ${class_labels} \ No newline at end of file diff --git a/use-cases/mnist/torch/create_inference_sample.py b/use-cases/mnist/torch/create_inference_sample.py new file mode 100644 index 00000000..1c588c48 --- /dev/null +++ b/use-cases/mnist/torch/create_inference_sample.py @@ -0,0 +1,42 @@ +"""Create a simple inference dataset sample and a checkpoint.""" + +import torch +import os +import argparse + +from model import Net +from dataloader import InferenceMNIST + + +def mnist_torch_inference_files( + root: str = '.', + samples_path: str = 'mnist-sample-data/', + model_name: str = 'mnist-pre-trained.pth' +): + """Create sample dataset and fake model to test mnist + inference workflow. Assumes to be run from + the use case folder. + + Args: + root (str, optional): where to create the files. + Defaults to '.'. 
+ """ + + sample = os.path.join(root, samples_path) + InferenceMNIST.generate_jpg_sample(sample, 10) + + # Fake checkpoint + dummy_nn = Net() + mdl_ckpt = os.path.join(root, model_name) + torch.save(dummy_nn, mdl_ckpt) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--root", type=str, default='.') + parser.add_argument("--samples-path", type=str, + default='mnist-sample-data') + parser.add_argument("--model-name", type=str, + default='mnist-pre-trained.pth') + args = parser.parse_args() + mnist_torch_inference_files(**vars(args)) diff --git a/use-cases/mnist/torch/dataloader.py b/use-cases/mnist/torch/dataloader.py index e4243763..a19c647e 100644 --- a/use-cases/mnist/torch/dataloader.py +++ b/use-cases/mnist/torch/dataloader.py @@ -34,7 +34,7 @@ def execute(self) -> Tuple[Dataset, Dataset]: transforms.Normalize((0.1307,), (0.3081,)) ])) print("Train and validation datasets loaded.") - return train_dataset, validation_dataset + return train_dataset, validation_dataset, None class InferenceMNIST(Dataset): diff --git a/use-cases/mnist/torch/inference-pipeline.yaml b/use-cases/mnist/torch/inference-pipeline.yaml deleted file mode 100644 index 5edf6ce9..00000000 --- a/use-cases/mnist/torch/inference-pipeline.yaml +++ /dev/null @@ -1,22 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - steps: - - class_path: dataloader.MNISTPredictLoader - init_args: - test_data_path: /usr/data/mnist-sample-data - - - class_path: itwinai.torch.inference.MulticlassTorchPredictor - init_args: - model: - class_path: itwinai.torch.inference.TorchModelLoader - init_args: - model_uri: mnist-pre-trained.pth - test_dataloader_kwargs: - batch_size: 3 - - - class_path: saver.TorchMNISTLabelSaver - init_args: - save_dir: /usr/data/mnist-predictions - predictions_file: predictions.csv - class_labels: null \ No newline at end of file diff --git a/use-cases/mnist/torch/pipeline.yaml b/use-cases/mnist/torch/pipeline.yaml deleted file mode 100644 index 99f35c73..00000000 --- a/use-cases/mnist/torch/pipeline.yaml +++ /dev/null @@ -1,56 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - steps: - dataloading_step: - class_path: dataloader.MNISTDataModuleTorch - init_args: - save_path: .tmp/ - - training_step: - class_path: itwinai.torch.trainer.TorchTrainerMG - init_args: - model: - class_path: model.Net - loss: - class_path: torch.nn.NLLLoss - init_args: - reduction: mean - optimizer_class: torch.optim.SGD - optimizer_kwargs: - lr: 0.001 - train_dataloader_kwargs: - batch_size: 32 - pin_memory: True - shuffle: True - validation_dataloader_kwargs: - batch_size: 32 - pin_memory: True - shuffle: False - epochs: 2 - train_metrics: - accuracy: - class_path: torchmetrics.classification.MulticlassAccuracy - init_args: - num_classes: 10 - precision: - class_path: torchmetrics.classification.MulticlassPrecision - init_args: - num_classes: 10 - recall: - class_path: torchmetrics.classification.MulticlassRecall - init_args: - num_classes: 10 - logger: - - class_path: itwinai.loggers.ConsoleLogger - - class_path: itwinai.loggers.MLFlowLogger - init_args: - experiment_name: MNIST classifier - log_freq: batch - strategy: ddp - checkpoint_every: 1 - cluster: - class_path: itwinai.torch.cluster.LocalCluster - init_args: - gpus: '0,1,2' - backend: nccl diff --git a/use-cases/mnist/torch/runall.sh b/use-cases/mnist/torch/runall.sh new file mode 100644 index 00000000..e81ed74d --- /dev/null +++ b/use-cases/mnist/torch/runall.sh @@ -0,0 +1,39 @@ +#!/bin/bash 
+ +# Python virtual environment (no conda/micromamba) +PYTHON_VENV="../../../envAI_hdfml" + +# Clear SLURM logs (*.out and *.err files) +rm -rf logs_slurm +mkdir logs_slurm +rm -rf logs_torchrun + +# DDP itwinai +DIST_MODE="ddp" +RUN_NAME="ddp-itwinai" +TRAINING_CMD="$PYTHON_VENV/bin/itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=ddp" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# DeepSpeed itwinai +DIST_MODE="deepspeed" +RUN_NAME="deepspeed-itwinai" +TRAINING_CMD="$PYTHON_VENV/bin/itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=deepspeed" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# Horovod itwinai +DIST_MODE="horovod" +RUN_NAME="horovod-itwinai" +TRAINING_CMD="$PYTHON_VENV/bin/itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=horovod" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh \ No newline at end of file diff --git a/use-cases/mnist/torch/slurm.sh b/use-cases/mnist/torch/slurm.sh new file mode 100644 index 00000000..2a2a15d8 --- /dev/null +++ b/use-cases/mnist/torch/slurm.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +# SLURM jobscript for JSC systems + +# Job configuration +#SBATCH --job-name=distributed_training +#SBATCH --account=intertwin +#SBATCH --mail-user= +#SBATCH --mail-type=ALL +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --time=00:30:00 + +# Resources allocation +#SBATCH --partition=batch +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=4 +#SBATCH --cpus-per-gpu=4 +#SBATCH --exclusive + +# gres options have to be disabled for deepv +#SBATCH --gres=gpu:4 + +# Load environment modules +ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py + +# Job info +echo "DEBUG: TIME: $(date)" +sysN="$(uname -n | cut -f2- -d.)" +sysN="${sysN%%[0-9]*}" +echo "Running on system: $sysN" +echo "DEBUG: EXECUTE: $EXEC" +echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" +echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" +echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" +echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" +echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" +echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" +echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" +echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" +echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" +if [ "$DEBUG" = true ] ; then + echo "DEBUG: NCCL_DEBUG=INFO" + export NCCL_DEBUG=INFO +fi +echo + +# Setup env for distributed ML +export CUDA_VISIBLE_DEVICES="0,1,2,3" +export OMP_NUM_THREADS=1 +if [ "$SLURM_CPUS_PER_GPU" -gt 0 ] ; then + export OMP_NUM_THREADS=$SLURM_CPUS_PER_GPU +fi + +# Env vairables check +if [ -z "$DIST_MODE" ]; then + >&2 echo "ERROR: env variable DIST_MODE is not set. Allowed values are 'horovod', 'ddp' or 'deepspeed'" + exit 1 +fi +if [ -z "$RUN_NAME" ]; then + >&2 echo "WARNING: env variable RUN_NAME is not set. 
It's a way to identify some specific run of an experiment." + RUN_NAME=$DIST_MODE +fi +if [ -z "$TRAINING_CMD" ]; then + >&2 echo "ERROR: env variable TRAINING_CMD is not set. It's the python command to execute." + exit 1 +fi +if [ -z "$PYTHON_VENV" ]; then + >&2 echo "WARNING: env variable PYTHON_VENV is not set. It's the path to a python virtual environment." +else + # Activate Python virtual env + source $PYTHON_VENV/bin/activate +fi + +# Get GPUs info per node +srun --cpu-bind=none --ntasks-per-node=1 bash -c 'echo -e "NODE hostname: $(hostname)\n$(nvidia-smi)\n\n"' + +# Launch training +if [ "$DIST_MODE" == "ddp" ] ; then + echo "DDP training: $TRAINING_CMD" + srun --cpu-bind=none --ntasks-per-node=1 \ + bash -c "torchrun \ + --log_dir='logs_torchrun' \ + --nnodes=$SLURM_NNODES \ + --nproc_per_node=$SLURM_GPUS_PER_NODE \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ + --rdzv_backend=c10d \ + --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ + $TRAINING_CMD" +elif [ "$DIST_MODE" == "deepspeed" ] ; then + echo "DEEPSPEED training: $TRAINING_CMD" + MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i + export MASTER_ADDR + export MASTER_PORT=29500 + + srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ + $TRAINING_CMD + + # # Run with deepspeed launcher: set --ntasks-per-node=1 + # # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables + # export NCCL_IB_DISABLE=1 + # export NCCL_SOCKET_IFNAME=eth0 + # nodelist=$(scontrol show hostname $SLURM_NODELIST) + # echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile + # # Requires passwordless SSH access among compute node + # srun --cpu-bind=none deepspeed --hostfile=.hostfile $TRAINING_CMD --deepspeed + # rm .hostfile +elif [ "$DIST_MODE" == "horovod" ] ; then + echo "HOROVOD training: $TRAINING_CMD" + srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ + $TRAINING_CMD +else + >&2 echo "ERROR: unrecognized \$DIST_MODE env variable" + exit 1 +fi diff --git a/use-cases/mnist/torch/train.py b/use-cases/mnist/torch/train.py deleted file mode 100644 index 97f53093..00000000 --- a/use-cases/mnist/torch/train.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Training pipeline. To run this script, use the following commands. - -On login node: - ->>> micromamba run -p ../../../.venv-pytorch/ \ - python train.py -p pipeline.yaml -d - -On compute nodes: - ->>> micromamba run -p ../../../.venv-pytorch/ \ - python train.py -p pipeline.yaml - -""" - -import argparse - -from itwinai.parser import ConfigParser - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-p", "--pipeline", type=str, required=True, - help='Configuration file to the pipeline to execute.' - ) - parser.add_argument( - '-d', '--download-only', - action=argparse.BooleanOptionalAction, - default=False, - help=('Whether to download only the dataset and exit execution ' - '(suggested on login nodes of HPC systems)') - ) - args = parser.parse_args() - - # Create parser for the pipeline - pipe_parser = ConfigParser(config=args.pipeline) - pipeline = pipe_parser.parse_pipeline() - - if args.download_only: - print('Downloading datasets and exiting...') - pipeline = pipeline[:1] - - pipeline.execute()
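To submit a single distributed-training job by hand, rather than the three jobs launched together by `runall.sh`, the same `slurm.sh` job script can be reused. A minimal sketch, assuming the JSC virtual environment path used in `runall.sh`:

```bash
# Manual submission of one strategy (DDP shown); values mirror runall.sh
PYTHON_VENV="../../../envAI_hdfml"
DIST_MODE="ddp"
RUN_NAME="ddp-manual"
TRAINING_CMD="$PYTHON_VENV/bin/itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=$DIST_MODE"

# Log folders used by slurm.sh and torchrun
mkdir -p logs_slurm logs_torchrun

sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \
    --job-name="$RUN_NAME" \
    --output="logs_slurm/job-$RUN_NAME.out" \
    --error="logs_slurm/job-$RUN_NAME.err" \
    slurm.sh
```

For DeepSpeed or Horovod, change `DIST_MODE`, `RUN_NAME`, and the `-o strategy=...` override accordingly, exactly as in the corresponding blocks of `runall.sh`.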