diff --git a/.dockerignore b/.dockerignore
index 697f33f2..d6dc083a 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -12,6 +12,10 @@ CHANGELOG
 # Docs
 docs
 
+# interLink pods
+**/interLink
+**/interlink
+
 # Data
 **/MNIST
 **/*-predictions/
diff --git a/.github/linters/.jscpd.json b/.github/linters/.jscpd.json
index 1a035770..8a003c54 100644
--- a/.github/linters/.jscpd.json
+++ b/.github/linters/.jscpd.json
@@ -1,7 +1,6 @@
 {
     "threshold": 2.0,
     "ignore": [
-        "**/itwinai/loggers.py",
-        "**/itwinai/torch/engine.py"
+        "**/itwinai/loggers.py"
     ]
 }
\ No newline at end of file
diff --git a/.github/workflows/workflows-dt.yml b/.github/workflows/pytest.yml
similarity index 88%
rename from .github/workflows/workflows-dt.yml
rename to .github/workflows/pytest.yml
index 53a72e43..ecee2bc1 100644
--- a/.github/workflows/workflows-dt.yml
+++ b/.github/workflows/pytest.yml
@@ -1,10 +1,12 @@
 ---
-name: Test workflows
+name: Unit and integration tests
 
 on:
   pull_request:
     branches: [main, dev]
 
+# TODO: use container and set custom TORCH_ENV and TF_ENV env variables
+
 jobs:
   test-itwinai:
     name: Test itwinai with pytest
diff --git a/.gitignore b/.gitignore
index dd495607..74cf514d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,9 @@
 mnist-sample-data/
 exp_data/
 
+# Kubernetes
+secret*.yaml
+
 # Custom envs
 .venv*
 envAI_*
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 6f581e8c..08d06d81 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -11,6 +11,7 @@
         "Convolutional",
         "cuda",
         "dataloaders",
+        "dataloading",
         "fromlist",
         "hyperparameters",
         "hyperparams",
diff --git a/README.md b/README.md
index dc9a60dc..ce8b6684 100644
--- a/README.md
+++ b/README.md
@@ -96,7 +96,35 @@ pip install -e .[dev]
 
 #### Test with `pytest`
 
-To run tests on itwinai package:
+Do this only if you are a developer wanting to test your code with pytest.
+
+First, create the virtual environments for both torch and tensorflow.
+For instance, you can use:
+
+```bash
+make torch-cpu
+make tf-2.13-cpu
+```
+
+To select the names of the torch and tf environments, set the following
+environment variables. They allow you to run the tests in environments with
+custom names other than `.venv-pytorch` and `.venv-tf`.
+
+```bash
+export TORCH_ENV="my_torch_env"
+export TF_ENV="my_tf_env"
+```
+
+Functional tests (marked with `pytest.mark.functional`) are executed under
+`/tmp/pytest` to guarantee they run in a clean environment.
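The `functional` marker and the `/tmp/pytest` isolation mentioned above come from the repository's test suite rather than from this diff. A minimal sketch of how such a marker can be wired up, with hypothetical fixture and test names, might look like this:

```python
# Hypothetical sketch of a test carrying the `functional` marker; the fixture
# and test names are illustrative, and the marker is assumed to be registered
# in the project's pytest configuration.
import os

import pytest


@pytest.fixture
def clean_workdir(tmp_path_factory, monkeypatch):
    """Run the test from a throwaway directory under pytest's tmp root."""
    workdir = tmp_path_factory.mktemp("functional")
    monkeypatch.chdir(workdir)
    return workdir


@pytest.mark.functional
def test_runs_in_clean_directory(clean_workdir):
    # Selected by `pytest -m "functional"`; starts from an empty directory.
    assert os.listdir(os.getcwd()) == []
```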
+ +To run functional tests use: + +```bash +pytest -v tests/ -m "functional" +``` + +To run all tests on itwinai package: ```bash # Activate env diff --git a/docs/conf.py b/docs/conf.py index f4c9b297..a06c3011 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,7 +21,8 @@ sys.path.insert(0, os.path.abspath('../')) project = 'itwinai' -copyright = '2024, Matteo Bunino, Alexander Zoechbauer, Kalliopi Tsolaki, Rakesh Sarma on behalf of CERN & JSC' +copyright = ('2024, Matteo Bunino, Alexander Zoechbauer, ' + 'Kalliopi Tsolaki, Rakesh Sarma on behalf of CERN & JSC') author = 'Matteo Bunino, Alexander Zoechbauer, Kalliopi Tsolaki' # version = '0.0' # short version # release = '0.0.2' # full version @@ -43,7 +44,9 @@ def get_git_tag(): try: - return subprocess.check_output(['git', 'describe', '--tags', '--abbrev=0']).decode('utf-8').strip() + return subprocess.check_output( + ['git', 'describe', '--tags', '--abbrev=0'] + ).decode('utf-8').strip() except subprocess.CalledProcessError: return 'unknown' diff --git a/env-files/tensorflow/createEnvJSCTF.sh b/env-files/tensorflow/createEnvJSCTF.sh index 8838347c..377940d4 100644 --- a/env-files/tensorflow/createEnvJSCTF.sh +++ b/env-files/tensorflow/createEnvJSCTF.sh @@ -104,5 +104,8 @@ if [ "$cont1" = true ] ; then pip3 install -r reqs_TF.txt --ignore-installed fi +# Install itwinai +pip install --upgrade pip +pip install -e .[dev] # eof diff --git a/src/itwinai/cli.py b/src/itwinai/cli.py index 275d853a..6c27d069 100644 --- a/src/itwinai/cli.py +++ b/src/itwinai/cli.py @@ -16,7 +16,7 @@ import typer -app = typer.Typer() +app = typer.Typer(pretty_exceptions_enable=False) @app.command() @@ -27,9 +27,6 @@ def scalability_report( plot_title: Annotated[Optional[str], typer.Option( help=("Plot name.") )] = None, - logy: Annotated[bool, typer.Option( - help=("Log scale on y axis.") - )] = False, skip_id: Annotated[Optional[int], typer.Option( help=("Skip epoch ID.") )] = None, @@ -43,15 +40,17 @@ def scalability_report( Example: >>> itwinai scalability-report --pattern="^epoch.+\\.csv$" --skip-id 0 \\ - >>> --plot-title "Some title" --logy --archive archive_name + >>> --plot-title "Some title" --archive archive_name """ # TODO: add max depth and path different from CWD import os import re + import glob import shutil import pandas as pd + import matplotlib import matplotlib.pyplot as plt - # import numpy as np + import numpy as np regex = re.compile(r'{}'.format(pattern)) combined_df = pd.DataFrame() @@ -83,7 +82,13 @@ def scalability_report( if plot_title is not None: fig.suptitle(plot_title) - for name in set(avg_times.name.values): + sp_up_ax.set_yscale("log") + sp_up_ax.set_xscale("log") + + markers = iter("ov^s*dXpD.+12348") + + series_names = sorted(set(avg_times.name.values)) + for name in series_names: df = avg_times[avg_times.name == name].drop(columns='name') # Debug @@ -104,32 +109,27 @@ def scalability_report( df["Efficiency"] = df["Threadscaled Sim. Time / s"].iloc[0] / \ df["Threadscaled Sim. 
Time / s"] - # Plot - # when lines are very close to each other - if logy: - sp_up_ax.semilogy( - df["NGPUs"].values, df["Speedup"].values, - marker='*', lw=1.0, label=name) - else: - sp_up_ax.plot( - df["NGPUs"].values, df["Speedup"].values, - marker='*', lw=1.0, label=name) - - if logy: - sp_up_ax.semilogy(df["NGPUs"].values, df["Speedup - ideal"].values, - ls='dashed', lw=1.0, c='k', label="ideal") - else: - sp_up_ax.plot(df["NGPUs"].values, df["Speedup - ideal"].values, - ls='dashed', lw=1.0, c='k', label="ideal") + sp_up_ax.plot( + df["NGPUs"].values, df["Speedup"].values, + marker=next(markers), lw=1.0, label=name, alpha=0.7) + + sp_up_ax.plot(df["NGPUs"].values, df["Speedup - ideal"].values, + ls='dashed', lw=1.0, c='k', label="ideal") sp_up_ax.legend(ncol=1) sp_up_ax.set_xticks(df["NGPUs"].values) - # sp_up_ax.set_yticks( - # np.arange(1, np.max(df["Speedup - ideal"].values) + 2, 1)) + sp_up_ax.get_xaxis().set_major_formatter( + matplotlib.ticker.ScalarFormatter()) sp_up_ax.set_ylabel('Speedup') sp_up_ax.set_xlabel('NGPUs (4 per node)') sp_up_ax.grid() + + # Sort legend + handles, labels = sp_up_ax.get_legend_handles_labels() + order = np.argsort(labels) + plt.legend([handles[idx] for idx in order], [labels[idx] for idx in order]) + plot_png = f"scaling_plot_{plot_title}.png" plt.tight_layout() plt.savefig(plot_png, bbox_inches='tight', format='png', dpi=300) @@ -151,6 +151,18 @@ def scalability_report( os.path.basename(csvfile))) shutil.copyfile(plot_png, os.path.join(archive, plot_png)) avg_times.to_csv(os.path.join(archive, "avg_times.csv"), index=False) + print("Archived AVG epoch times CSV") + + # Copy SLURM logs: *.err *.out files + if os.path.exists('logs_slurm'): + print("Archived SLURM logs") + shutil.copytree('logs_slurm', os.path.join(archive, 'logs_slurm')) + # Copy other SLURM logs + for ext in ['*.out', '*.err']: + for file in glob.glob(ext): + shutil.copyfile(file, os.path.join(archive, file)) + + # Create archive archive_name = shutil.make_archive( base_name=archive, # archive file name format='gztar', @@ -170,6 +182,11 @@ def exec_pipeline( help=("Key in the configuration file identifying " "the pipeline object to execute.") )] = "pipeline", + steps: Annotated[Optional[str], typer.Option( + help=("Run only some steps of the pipeline. 
Accepted values are " + "indices, python slices (e.g., 0:3 or 2:10:100), and " + "string names of steps.") + )] = None, print_config: Annotated[bool, typer.Option( help=("Print config to be executed after overrides.") )] = False, @@ -195,11 +212,14 @@ def exec_pipeline( # to find the local python files imported from the pipeline file import os import sys + import re + from .utils import str_to_slice sys.path.append(os.path.dirname(config)) sys.path.append(os.getcwd()) # Parse and execute pipeline from itwinai.parser import ConfigParser + overrides_list = overrides_list if overrides_list is not None else [] overrides = { k: v for k, v in map(lambda x: (x.split('=')[0], x.split('=')[1]), overrides_list) @@ -213,8 +233,18 @@ def exec_pipeline( print("#="*50) print() pipeline = parser.parse_pipeline(pipeline_nested_key=pipe_key) + if steps: + if not re.match(r"\d+(:\d+)?(:\d+)?", steps): + print(f"Looking for step name '{steps}'") + else: + steps = str_to_slice(steps) + pipeline = pipeline[steps] pipeline.execute() + # Cleanup PYTHONPATH + sys.path.pop() + sys.path.pop() + @app.command() def mlflow_ui( diff --git a/src/itwinai/cluster.py b/src/itwinai/cluster.py deleted file mode 100644 index 7b9f57e0..00000000 --- a/src/itwinai/cluster.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Cluster environments where to run AI workflows.""" - -from __future__ import annotations -from abc import ABCMeta, abstractmethod -import os -from contextlib import contextmanager - - -def setup_for_distributed(is_main): - """ - This function disables printing when not in master process - """ - import builtins as __builtin__ - builtin_print = __builtin__.print - - def print(*args, **kwself): - force = kwself.pop('force', False) - if is_main or force: - builtin_print(*args, **kwself) - - __builtin__.print = print - - -def handle_sigusr1(signum, frame): - os.system(f'scontrol requeue {os.getenv("SLURM_JOB_ID")}') - exit() - - -def handle_sigterm(signum, frame): - pass - - -class ClusterEnvironment(metaclass=ABCMeta): - port: int = -1 - ngpus_per_node: int = -1 - global_world_size: int = -1 - global_rank: int = -1 - local_world_size: int = -1 - local_rank: int = -1 - rnd_seed: int = None - distributed: bool = False - # This flag tells whether the user wants to use the GPU(s) - use_cuda: bool = False - - @property - def backend(self) -> str: - return self._backend - - @backend.setter - def backend(self, backend_name: str) -> None: - self._set_backend(backend_name) - - def _set_backend(self, backend_name: str) -> None: - # Override to implement sanitization - self._backend = backend_name - - @abstractmethod - def is_main_worker(self) -> bool: - """Tells if the current process is the main/master process.""" - pass - - @abstractmethod - def is_cuda_available(self) -> bool: - pass - - @abstractmethod - @contextmanager - def init_dist_gpu(self, *args, **kwargs): - pass - - def cleanup_resources(self): - pass diff --git a/src/itwinai/components.py b/src/itwinai/components.py index 1f41bacd..eca2e570 100644 --- a/src/itwinai/components.py +++ b/src/itwinai/components.py @@ -216,14 +216,6 @@ def execute( validation dataset, test dataset, trained model. 
""" - @abstractmethod - def save_state(self): - pass - - @abstractmethod - def load_state(self): - pass - class Predictor(BaseComponent): """Applies a pre-trained machine learning model to unseen data.""" diff --git a/src/itwinai/loggers.py b/src/itwinai/loggers.py index d5ed0008..7f86ffcb 100644 --- a/src/itwinai/loggers.py +++ b/src/itwinai/loggers.py @@ -4,13 +4,12 @@ import csv from abc import ABCMeta, abstractmethod from contextlib import contextmanager -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Literal import pickle import pathlib import wandb import mlflow -# import mlflow.keras BASE_EXP_NAME: str = 'unk_experiment' @@ -38,12 +37,12 @@ class Logger(LogMixin, metaclass=ABCMeta): """ savedir: str = None supported_types: List[str] # Supported logging 'kinds' - _log_freq: Union[int, str] + _log_freq: Union[int, Literal['epoch', 'batch']] def __init__( self, savedir: str = 'mllogs', - log_freq: Union[int, str] = 'epoch' + log_freq: Union[int, Literal['epoch', 'batch']] = 'epoch' ) -> None: self.savedir = savedir self.log_freq = log_freq @@ -120,7 +119,7 @@ class ConsoleLogger(Logger): def __init__( self, savedir: str = 'mllogs', - log_freq: Union[int, str] = 'epoch' + log_freq: Union[int, Literal['epoch', 'batch']] = 'epoch' ) -> None: savedir = os.path.join(savedir, 'simple-logger') super().__init__(savedir=savedir, log_freq=log_freq) @@ -190,7 +189,7 @@ def __init__( experiment_name: str = BASE_EXP_NAME, tracking_uri: Optional[str] = None, run_description: Optional[str] = None, - log_freq: Union[int, str] = 'epoch' + log_freq: Union[int, Literal['epoch', 'batch']] = 'epoch' ): savedir = os.path.join(savedir, 'mlflow') super().__init__(savedir=savedir, log_freq=log_freq) @@ -203,7 +202,7 @@ def __init__( saved_abs_path = os.path.abspath(self.savedir) self.tracking_uri = pathlib.Path(saved_abs_path).as_uri() # self.tracking_uri = "file://" + self.savedir - print(f'MLFLOW URI: {self.tracking_uri}') + # print(f'MLFLOW URI: {self.tracking_uri}') # TODO: for pytorch lightning: # mlflow.pytorch.autolog() @@ -317,7 +316,7 @@ def __init__( self, savedir: str = 'mllogs', project_name: str = BASE_EXP_NAME, - log_freq: Union[int, str] = 'epoch' + log_freq: Union[int, Literal['epoch', 'batch']] = 'epoch' ) -> None: savedir = os.path.join(savedir, 'wandb') super().__init__(savedir=savedir, log_freq=log_freq) @@ -376,7 +375,7 @@ class TensorBoardLogger(Logger): def __init__( self, savedir: str = 'mllogs', - log_freq: Union[int, str] = 'epoch' + log_freq: Union[int, Literal['epoch', 'batch']] = 'epoch' ) -> None: savedir = os.path.join(savedir, 'tensorboard') super().__init__(savedir=savedir, log_freq=log_freq) @@ -425,7 +424,7 @@ def __init__( self, loggers: List[Logger] ) -> None: - super().__init__(savedir='/.tmp_mllogs_LoggersCollection', log_freq=0) + super().__init__(savedir='/.tmp_mllogs_LoggersCollection', log_freq=1) self.loggers = loggers def should_log(self, batch_idx: int = None) -> bool: @@ -450,6 +449,18 @@ def log( **kwargs ) + def create_logger_context(self): + for logger in self.loggers: + logger.create_logger_context() + + def destroy_logger_context(self): + for logger in self.loggers: + logger.destroy_logger_context() + + def save_hyperparameters(self, params: Dict[str, Any]) -> None: + for logger in self.loggers: + logger.save_hyperparameters(params=params) + class EpochTimeTracker: def __init__(self, series_name: str, csv_file: str) -> None: diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py index 
0001627b..254e91a9 100644 --- a/src/itwinai/parser.py +++ b/src/itwinai/parser.py @@ -76,14 +76,11 @@ class ConfigParser: >>> init_args: >>> save_path: .tmp/ >>> - >>> - class_path: itwinai.torch.trainer.TorchTrainerMG + >>> - class_path: itwinai.torch.trainer.TorchTrainer >>> init_args: >>> model: >>> class_path: model.Net - >>> loss: - >>> class_path: torch.nn.NLLLoss - >>> init_args: - >>> reduction: mean + >>> >>> from itwinai.parser import ConfigParser >>> >>> parser = ConfigParser( @@ -244,241 +241,3 @@ def __init__( "-c", "--config", action=ActionConfigFile, help="Path to a configuration file in json or yaml format." ) - - -# class ConfigParser2: -# """ -# Deprecated: this pipeline structure does not allow for -# nested pipelines. However, it is more readable and the linking -# from name to step data could be achieved with OmegaConf. This -# could be reused in the future: left as example. - -# Parses a configuration file, merging the steps into -# the pipeline and returning a pipeline object. -# It also provides functionalities for dynamic override -# of fields by means of nested key notation. - -# Example: - -# >>> # pipeline.yaml -# >>> pipeline: -# >>> class_path: itwinai.pipeline.Pipeline -# >>> steps: [server, client] -# >>> -# >>> server: -# >>> class_path: mycode.ServerOptions -# >>> init_args: -# >>> host: localhost -# >>> port: 80 -# >>> -# >>> client: -# >>> class_path: mycode.ClientOptions -# >>> init_args: -# >>> url: http://${server.init_args.host}:${server.init_args.port}/ - -# >>> from itwinai.parser import ConfigParser2 -# >>> -# >>> parser = ConfigParser2( -# >>> config='pipeline.yaml', -# >>> override_keys={ -# >>> 'server.init_args.port': 777 -# >>> } -# >>> ) -# >>> pipeline = parser.parse_pipeline() -# >>> print(pipeline) -# >>> print(pipeline.steps) -# >>> print(pipeline.steps['server'].port) -# >>> -# >>> server = parser.parse_step('server') -# >>> print(server) -# >>> print(server.port) -# """ - -# config: Dict -# pipeline: Pipeline - -# def __init__( -# self, -# config: Union[str, Dict], -# override_keys: Optional[Dict[str, Any]] = None -# ) -> None: -# self.config = config -# self.override_keys = override_keys -# if isinstance(self.config, str): -# self.config = load_yaml(self.config) -# self._dynamic_override_keys() -# self._omegaconf_interpolate() - -# def _dynamic_override_keys(self): -# if self.override_keys is not None: -# for key_chain, value in self.override_keys.items(): -# add_replace_field(self.config, key_chain, value) - -# def _omegaconf_interpolate(self) -> None: -# """Performs variable interpolation with OmegaConf on internal -# configuration file. -# """ -# conf = OmegaConf.create(self.config) -# self.config = OmegaConf.to_container(conf, resolve=True) - -# def parse_pipeline( -# self, -# pipeline_nested_key: str = "pipeline", -# verbose: bool = False -# ) -> Pipeline: -# """Merges steps into pipeline and parses it. - -# Args: -# pipeline_nested_key (str, optional): nested key in the -# configuration file identifying the pipeline object. -# Defaults to "pipeline". -# verbose (bool): if True, prints the assembled pipeline -# to console formatted as JSON. - -# Returns: -# Pipeline: instantiated pipeline. 
-# """ -# pipe_parser = JAPArgumentParser() -# pipe_parser.add_subclass_arguments(Pipeline, pipeline_nested_key) -# pipe_dict = self.config[pipeline_nested_key] - -# # Pop steps list from pipeline dictionary -# steps_list = pipe_dict['steps'] -# del pipe_dict['steps'] - -# # Link steps with respective dictionaries -# if not pipe_dict.get('init_args'): -# pipe_dict['init_args'] = {} -# steps_dict = pipe_dict['init_args']['steps'] = {} -# for step_name in steps_list: -# steps_dict[step_name] = self.config[step_name] -# pipe_dict = {pipeline_nested_key: pipe_dict} - -# if verbose: -# print("Assembled pipeline:") -# print(json.dumps(pipe_dict, indent=4)) - -# # Parse pipeline dict once merged with steps -# conf = pipe_parser.parse_object(pipe_dict) -# pipe = pipe_parser.instantiate_classes(conf) -# self.pipeline = pipe[pipeline_nested_key] -# return self.pipeline - -# def parse_step( -# self, -# step_name: str, -# verbose: bool = False -# ) -> BaseComponent: -# step_dict_config = self.config[step_name] - -# if verbose: -# print(f"STEP '{step_name}' CONFIG:") -# print(json.dumps(step_dict_config, indent=4)) - -# # Wrap config under "step" field and parse it -# step_dict_config = {'step': step_dict_config} -# step_parser = JAPArgumentParser() -# step_parser.add_subclass_arguments(BaseComponent, "step") -# parsed_namespace = step_parser.parse_object(step_dict_config) -# return step_parser.instantiate_classes(parsed_namespace)["step"] - - -# class ItwinaiCLI2: -# """ -# Deprecated: the dynamic override does not work with nested parameters -# and may be confusing. - -# CLI tool for executing a configuration file, with dynamic -# override of fields and variable interpolation with Omegaconf. - -# Example: - -# >>> # train.py -# >>> from itwinai.parser import ItwinaiCLI -# >>> cli = ItwinaiCLI() -# >>> cli.pipeline.execute() - -# >>> # pipeline.yaml -# >>> pipeline: -# >>> class_path: itwinai.pipeline.Pipeline -# >>> steps: [server, client] -# >>> -# >>> server: -# >>> class_path: mycode.ServerOptions -# >>> init_args: -# >>> host: localhost -# >>> port: 80 -# >>> -# >>> client: -# >>> class_path: mycode.ClientOptions -# >>> init_args: -# >>> url: http://${server.init_args.host}:${server.init_args.port}/ - -# From command line: - -# >>> python train.py --config itwinai-conf.yaml --help -# >>> python train.py --config itwinai-conf.yaml -# >>> python train.py --config itwinai-conf.yaml --server.port 8080 -# """ -# _parser: JAPArgumentParser -# _config: Dict -# pipeline: Pipeline - -# def __init__( -# self, -# pipeline_nested_key: str = "pipeline", -# parser_mode: str = "omegaconf" -# ) -> None: -# self.pipeline_nested_key = pipeline_nested_key -# self.parser_mode = parser_mode -# self._init_parser() -# self._parser.add_argument(f"--{self.pipeline_nested_key}", type=dict) -# self._add_steps_arguments() -# self._config = self._parser.parse_args() - -# # Merge steps into pipeline and parse it -# del self._config['config'] -# pipe_parser = ConfigParser2(config=self._config.as_dict()) -# self.pipeline = pipe_parser.parse_pipeline( -# pipeline_nested_key=self.pipeline_nested_key -# ) - -# def _init_parser(self): -# self._parser = JAPArgumentParser(parser_mode=self.parser_mode) -# self._parser.add_argument( -# "-c", "--config", action=ActionConfigFile, -# required=True, -# help="Path to a configuration file in json or yaml format." -# ) - -# def _add_steps_arguments(self): -# """Pre-parses the configuration file, dynamically adding all the -# component classes under 'steps' as arguments of the parser. 
-# """ -# if "--config" not in sys.argv: -# raise ValueError( -# "--config parameter has to be specified with a " -# "valid path to a configuration file." -# ) -# config_path = sys.argv.index("--config") + 1 -# config_path = sys.argv[config_path] -# config = load_yaml(config_path) - -# # Add steps to parser -# steps = filter( -# lambda itm: itm[0] != self.pipeline_nested_key, -# config.items() -# ) -# steps = { -# step_name: step_data['class_path'] -# for step_name, step_data in steps -# } - -# for st_nested_key, step_class_str in steps.items(): -# step_class = dynamically_import_class(step_class_str) -# self._add_step_arguments( -# step_class=step_class, nested_key=st_nested_key) - -# def _add_step_arguments(self, step_class, nested_key): -# self._parser.add_subclass_arguments( -# baseclass=step_class, nested_key=nested_key) diff --git a/src/itwinai/tensorflow/distributed.py b/src/itwinai/tensorflow/distributed.py index e6c5f28a..64945ca8 100644 --- a/src/itwinai/tensorflow/distributed.py +++ b/src/itwinai/tensorflow/distributed.py @@ -1,17 +1,23 @@ -import tensorflow as tf import os +import tensorflow as tf +import tensorflow.distribute as dist def get_strategy(): """Strategy for distributed TensorFlow training""" - cluster_resolver = tf.distribute.cluster_resolver.SlurmClusterResolver( + if not os.environ.get('SLURM_JOB_ID'): + # TODO: improve + print('not in SLURM env!') + tf_dist_strategy = dist.MirroredStrategy() + return tf_dist_strategy, tf_dist_strategy.num_replicas_in_sync + cluster_resolver = dist.cluster_resolver.SlurmClusterResolver( port_base=12345) - implementation = tf.distribute.experimental.CommunicationImplementation.NCCL - communication_options = tf.distribute.experimental.CommunicationOptions( + implementation = dist.experimental.CommunicationImplementation.NCCL + communication_options = dist.experimental.CommunicationOptions( implementation=implementation) # declare distribution strategy - tf_dist_strategy = tf.distribute.MultiWorkerMirroredStrategy( + tf_dist_strategy = dist.MultiWorkerMirroredStrategy( cluster_resolver=cluster_resolver, communication_options=communication_options ) diff --git a/src/itwinai/tensorflow/trainer.py b/src/itwinai/tensorflow/trainer.py index d8c40012..51bfb97c 100644 --- a/src/itwinai/tensorflow/trainer.py +++ b/src/itwinai/tensorflow/trainer.py @@ -28,12 +28,19 @@ def instance_from_dict(obj_dict: Any) -> Any: return obj_dict +# TODO: the TF trainer is incomplete: +# - strategy is not received from constructor argument: if not needed, +# remove it +# - dataset is not distributed +# - much commented code that has to be removed or included + + class TensorflowTrainer(Trainer): def __init__( self, epochs, - train_dataset, - validation_dataset, + # train_dataset, + # validation_dataset, batch_size, callbacks, model_dict: Dict, @@ -61,14 +68,14 @@ def __init__( # get total number of workers print("Number of devices: {}".format(n_devices)) # distribute datasets among MirroredStrategy's replicas - dist_train_dataset = ( - tf_dist_strategy.experimental_distribute_dataset( - train_dataset - )) - dist_validation_dataset = ( - tf_dist_strategy.experimental_distribute_dataset( - validation_dataset - )) + # dist_train_dataset = ( + # tf_dist_strategy.experimental_distribute_dataset( + # train_dataset + # )) + # dist_validation_dataset = ( + # tf_dist_strategy.experimental_distribute_dataset( + # validation_dataset + # )) with self.strategy.scope(): # TODO: move loss, optimizer and metrics instantiation under # here diff --git 
a/src/itwinai/torch/cluster.py b/src/itwinai/torch/cluster.py deleted file mode 100644 index aece16e2..00000000 --- a/src/itwinai/torch/cluster.py +++ /dev/null @@ -1,225 +0,0 @@ -"""Cluster environments where to run AI workflows. Partially adapted from: -https://github.com/facebookresearch/detr/blob/master/util/misc.py and -https://github.com/ramyamounir/Template/blob/main/lib/utils/distributed.py -""" - -from __future__ import annotations -from typing import Optional -import os -import signal -import subprocess -from pathlib import Path -from contextlib import contextmanager - -import numpy as np - -import torch -import torch.distributed as dist -import torch.backends.cudnn as cudnn - -from ..cluster import ( - ClusterEnvironment, - setup_for_distributed, - handle_sigusr1, - handle_sigterm -) -from .types import TorchDistributedBackend as BackendT - - -def fix_random_seeds(seed=31): - """ - Fix random seeds. - """ - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - np.random.seed(seed) - - -class TorchCluster(ClusterEnvironment): - def __init__(self) -> None: - super().__init__() - - def _set_backend(self, backend_name: str) -> None: - if backend_name not in BackendT: - raise ValueError( - "Unrecognized 'backend' field. Allowed values " - f"are: {BackendT.list()}. Received '{backend_name}'") - self._backend = backend_name - - def is_cuda_available(self) -> bool: - return self.use_cuda and torch.cuda.is_available() - - def is_main_worker(self) -> bool: - """Checks if the current process is the main/master process - in the whole job. - """ - return self.global_rank == 0 - - def cleanup_resources(self): - dist.barrier() - dist.destroy_process_group() - - -class LocalCluster(TorchCluster): - """Simple single node cluster with optional access to multiple GPUs.""" - - def __init__( - self, - backend: Optional[str] = None, - gpus: Optional[str] = '', - port: int = 49153, - rnd_seed: Optional[int] = 42 - ) -> None: - """Initialize local cluster for multi-GPU access. - - Args: - backend (Optional[str], optional): supported PyTorch backends. - If None, workload is not distributed. Defaults to None. - gpus (Optional[str], optional): list of visible GPU devices - (e.g., '1,2,3'). If empty string uses all available GPUs. - If None, CPU is used. Defaults to ''. - port (int, optional): TCP port used by the master process. - Defaults to 49153. - rnd_seed (Optional[int], optional): random seed to be setup after - all processes are setup. Defaults to 42. - """ - super().__init__() - self.backend = backend - self.gpus = gpus - self.port = port - self.dist_url = f'tcp://127.0.0.1:{self.port}' - self.rnd_seed = rnd_seed - - if self.gpus != '' and self.gpus is not None: - # Restrict the number of GPUs visible according to user needs - os.environ['CUDA_VISIBLE_DEVICES'] = self.gpus - - self.ngpus_per_node = torch.cuda.device_count() - self.global_rank = 0 - self.global_world_size = self.ngpus_per_node - - print(f"{self.ngpus_per_node} GPUs are available.") - self.distributed = True - # This flag tells whether the user wants to use the GPU(s) - self.use_cuda = ( - self.gpus is not None # GPU is not manually disabled - and torch.cuda.device_count() >= 1 # At least one GPU is selected - ) - if self.backend is None or self.ngpus_per_node <= 1: - print("Distributed has been disabled.") - self.distributed = False - self.dist_url = None - self.global_world_size = 1 - self.global_rank = 0 - if not self.is_cuda_available(): - print("CUDA disabled... 
Running on single CPU.") - self.use_cuda = False - self.distributed = False - self.dist_url = None - self.global_world_size = 1 - self.global_rank = 0 - - # Since single node case - self.local_world_size = self.global_world_size - - @contextmanager - def init_dist_gpu(self, worker_id) -> torch.device: - if self.distributed: - torch.cuda.set_device(worker_id) - self.global_rank += worker_id - # print(f'GLOBAL RANK: {self.global_rank}') - # Since single node case - self.local_rank = self.global_rank - # Simplification: worker ID mapped to GPU ID - self.gpu_id = worker_id - - try: - dist.init_process_group( - backend=self.backend, - init_method=self.dist_url, - world_size=self.global_world_size, - rank=self.global_rank - ) - fix_random_seeds(self.rnd_seed) - torch.cuda.set_device(self.gpu_id) - cudnn.benchmark = True - dist.barrier() - - setup_for_distributed(self.is_main_worker()) - print("SETUP DISTRIBUTED COMPLETE") - yield torch.device('cuda', worker_id) - finally: - self.cleanup_resources() - else: - # Distributed is disabled - # Since single node case - self.global_rank = 0 - self.local_rank = self.global_rank - if self.use_cuda: - torch.cuda.set_device(worker_id) - yield torch.device('cuda', worker_id) - else: - yield torch.device('cpu') - - -class SLURMCluster(TorchCluster): - """SLURM cluster with access to multi-node multi-GPU.""" - - def __init__( - self, - port: int = 49153, - backend: str = 'gloo', - rnd_seed: Optional[int] = 42 - ) -> None: - super().__init__() - self.port = port - self.backend = backend - self.rnd_seed = rnd_seed - if 'SLURM_JOB_ID' not in os.environ: - raise RuntimeError( - "'SLURM_JOB_ID' environment variable is not set. " - "Perhaps you are not running in a slurm cluster?" - ) - - self.ngpus_per_node = torch.cuda.device_count() - - # requeue job on SLURM preemption - signal.signal(signal.SIGUSR1, handle_sigusr1) - signal.signal(signal.SIGTERM, handle_sigterm) - - # find a common host name on all nodes - cmd = 'scontrol show hostnames ' + os.getenv('SLURM_JOB_NODELIST') - stdout = subprocess.check_output(cmd.split()) - host_name = stdout.decode().splitlines()[0] - self.dist_url = f'tcp://{host_name}:{self.port}' - - # distributed parameters - self.global_rank = int(os.getenv('SLURM_NODEID')) * self.ngpus_per_node - self.global_world_size = int( - os.getenv('SLURM_NNODES')) * self.ngpus_per_node - - @contextmanager - def init_dist_gpu(self): - import submitit - try: - job_env = submitit.JobEnvironment() - self.output_dir = Path( - str(self.output_dir).replace("%j", str(job_env.job_id))) - self.gpu = job_env.local_rank - self.global_rank = job_env.global_rank - - dist.init_process_group( - backend=self.backend, - init_method=self.dist_url, - world_size=self.global_world_size, - rank=self.global_rank - ) - fix_random_seeds(self.rnd_seed) - torch.cuda.set_device(self.gpu) - cudnn.benchmark = True - dist.barrier() - - setup_for_distributed(self.is_main_worker()) - yield - finally: - self.cleanup_resources() diff --git a/src/itwinai/torch/distributed.py b/src/itwinai/torch/distributed.py index 34174346..3bb48647 100644 --- a/src/itwinai/torch/distributed.py +++ b/src/itwinai/torch/distributed.py @@ -1,5 +1,5 @@ import abc -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional, Tuple, Union, Iterable from pathlib import Path import json import os @@ -12,18 +12,47 @@ import torch.optim as optim from torch.optim.lr_scheduler import _LRScheduler as LRScheduler from torch.optim.optimizer import Optimizer +from torch.utils.data import 
Dataset, Sampler, DistributedSampler, DataLoader +from torch.utils.data.dataloader import T_co, _worker_init_fn_t, _collate_fn_t from ..distributed import DistributedStrategy +from .types import UninitializedStrategyError, DistributedStrategyError + + +def distributed_resources_available() -> bool: + """Check if the current execution environment + has (enough) GPUs available to allow for distributed ML. + + Returns: + bool: env can support distributed ML. + """ + if torch.cuda.is_available() and torch.cuda.device_count() > 1: + return True + return False class TorchDistributedStrategy(DistributedStrategy): """Abstract class to define the distributed backend methods for PyTorch models. """ + is_distributed: bool = True + is_initialized: bool = False + + @property + def is_main_worker(self) -> bool: + """Checks if local worker has global rank equal to zero. + + Returns: + bool: True if main worker. + """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + return self.global_rank() == 0 + @abc.abstractmethod def init(self) -> None: """Initializes the chosen distributed backend""" - # @abc.abstractmethod # def distributed_engine( # self, model: nn.Module, optimizer: Optimizer, @@ -39,7 +68,7 @@ def distributed( """Setup model, optimizer and scheduler for distributed.""" @abc.abstractmethod - def dist_gwsize(self) -> int: + def global_world_size(self) -> int: """Returns the total number of processes (global world size). Returns: @@ -47,7 +76,7 @@ def dist_gwsize(self) -> int: """ @abc.abstractmethod - def dist_lwsize(self) -> int: + def local_world_size(self) -> int: """Returns the number of local workers available on a node (local world size). Usually it is equal to the number of available GPUs. @@ -57,7 +86,7 @@ def dist_lwsize(self) -> int: """ @abc.abstractmethod - def dist_grank(self) -> int: + def global_rank(self) -> int: """Returns the global rank of the current process. Rank ranges from 0 to world_size. @@ -66,28 +95,182 @@ def dist_grank(self) -> int: """ @abc.abstractmethod - def dist_lrank(self) -> int: + def local_rank(self) -> int: """Returns the local rank of the current process. Returns: int: local rank. """ - def is_main_worker(self) -> bool: - """Checks if local worker has global rank equal to zero. - - Returns: - bool: True if main worker. - """ - return self.dist_grank() == 0 - - def dist_device(self) -> str: + def device(self) -> str: """Device used by local worker. Returns: str: torch device in the form 'cuda:N'. """ - return f"cuda:{self.dist_lrank()}" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + return f"cuda:{self.local_rank()}" + + def create_dataloader( + self, dataset: Dataset[T_co], batch_size: Optional[int] = 1, + shuffle: Optional[bool] = None, + sampler: Union[Sampler, Iterable, None] = None, + batch_sampler: Union[Sampler[List], Iterable[List], None] = None, + num_workers: int = 0, collate_fn: Optional[_collate_fn_t] = None, + pin_memory: bool = False, drop_last: bool = False, + timeout: float = 0, + worker_init_fn: Optional[_worker_init_fn_t] = None, + multiprocessing_context=None, generator=None, + *, prefetch_factor: Optional[int] = None, + persistent_workers: bool = False, + pin_memory_device: str = "" + ): + """Create a distributed DataLoader by using ``DistributedSampler`` as + random sampler. + + Args: + dataset (Dataset): dataset from which to load the data. 
+ batch_size (int, optional): how many samples per batch to load + (default: ``1``). + shuffle (bool, optional): set to ``True`` to have the data + reshuffled at every epoch (default: ``False``). + sampler (Sampler or Iterable, optional): defines the strategy to + draw + samples from the dataset. Can be any ``Iterable`` with + ``__len__`` + implemented. If specified, :attr:`shuffle` must not be + specified. + batch_sampler (Sampler or Iterable, optional): like + :attr:`sampler`, but + returns a batch of indices at a time. Mutually exclusive with + :attr:`batch_size`, :attr:`shuffle`, :attr:`sampler`, + and :attr:`drop_last`. + num_workers (int, optional): how many subprocesses to use for data + loading. ``0`` means that the data will be loaded in the main + process. (default: ``0``) + collate_fn (Callable, optional): merges a list of samples to form a + mini-batch of Tensor(s). Used when using batched loading from + a map-style dataset. + pin_memory (bool, optional): If ``True``, the data loader will + copy Tensors + into device/CUDA pinned memory before returning them. If your + data elements + are a custom type, or your :attr:`collate_fn` returns a batch + that is a custom type, + see the example below. + drop_last (bool, optional): set to ``True`` to drop the last + incomplete batch, + if the dataset size is not divisible by the batch size. + If ``False`` and + the size of dataset is not divisible by the batch size, then + the last batch + will be smaller. (default: ``False``) + timeout (numeric, optional): if positive, the timeout value for + collecting a batch + from workers. Should always be non-negative. (default: ``0``) + worker_init_fn (Callable, optional): If not ``None``, + this will be called on each + worker subprocess with the worker id (an int in + ``[0, num_workers - 1]``) as + input, after seeding and before data loading. + (default: ``None``) + multiprocessing_context (str or + multiprocessing.context.BaseContext, optional): If + ``None``, the default `multiprocessing context`_ of + your operating system will + be used. (default: ``None``) + generator (torch.Generator, optional): If not ``None``, + this RNG will be used + by RandomSampler to generate random indexes and + multiprocessing to generate + ``base_seed`` for workers. (default: ``None``) + prefetch_factor (int, optional, keyword-only arg): Number of + batches loaded + in advance by each worker. ``2`` means there will be a total of + 2 * num_workers batches prefetched across all workers. + (default value depends + on the set value for num_workers. If value of num_workers=0 + default is ``None``. + Otherwise, if value of ``num_workers > 0`` default is ``2``). + persistent_workers (bool, optional): If ``True``, the data loader + will not shut down + the worker processes after a dataset has been consumed once. + This allows to + maintain the workers `Dataset` instances alive. + (default: ``False``) + pin_memory_device (str, optional): the device to + :attr:`pin_memory` to if ``pin_memory`` is ``True``. + + + .. warning:: If the ``spawn`` start method is used, + :attr:`worker_init_fn` + cannot be an unpicklable object, e.g., a lambda function. + See :ref:`multiprocessing-best-practices` on more + details related to multiprocessing in PyTorch. + + .. warning:: ``len(dataloader)`` heuristic is based on the length of + the sampler used. 
+ When :attr:`dataset` is an + :class:`~torch.utils.data.IterableDataset`, + it instead returns an estimate based on + ``len(dataset) / batch_size``, with proper + rounding depending on :attr:`drop_last`, regardless + of multi-process loading + configurations. This represents the best guess PyTorch + can make because PyTorch + trusts user :attr:`dataset` code in correctly handling + multi-process + loading to avoid duplicate data. + + However, if sharding results in multiple workers having + incomplete last batches, + this estimate can still be inaccurate, because (1) an + otherwise complete batch can + be broken into multiple ones and (2) more than one batch + worth of samples can be + dropped when :attr:`drop_last` is set. Unfortunately, + PyTorch can not detect such cases in general. + + See `Dataset Types`_ for more details on these two + types of datasets and how + :class:`~torch.utils.data.IterableDataset` interacts with + `Multi-process data loading`_. + + .. warning:: See :ref:`reproducibility`, and + :ref:`dataloader-workers-random-seed`, and + :ref:`data-loading-randomness` notes for random + seed related questions. + + .. _multiprocessing context: + https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods + """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + + if self.is_distributed: + if sampler is not None: + raise RuntimeError( + "User-provided sampler is not supported." + ) + sampler = DistributedSampler( + dataset, num_replicas=self.global_world_size(), + rank=self.global_rank(), + shuffle=shuffle + ) + # shuffle and batch_sampler must be unset + return DataLoader( + dataset=dataset, batch_size=batch_size, sampler=sampler, + num_workers=num_workers, collate_fn=collate_fn, + pin_memory=pin_memory, drop_last=drop_last, timeout=timeout, + worker_init_fn=worker_init_fn, + multiprocessing_context=multiprocessing_context, + generator=generator, prefetch_factor=prefetch_factor, + persistent_workers=persistent_workers, + pin_memory_device=pin_memory_device + ) @abc.abstractmethod def clean_up(self) -> None: @@ -105,8 +288,8 @@ def par_allgather_obj(self, obj: Any) -> List[Any]: """ -class DDPDistributedStrategy(TorchDistributedStrategy): - """PyTorch DDP distributed strategy class. +class TorchDDPStrategy(TorchDistributedStrategy): + """PyTorch ``DistributedDataParallel`` distributed strategy class. Args: backend (str): Name of the communication backend to employ. @@ -121,12 +304,21 @@ def __init__(self, backend: str) -> None: def init(self) -> None: """Initializes the distributed process group and the distributed package. + + Raises: + RuntimeError: when there are not (enough) GPUs available. + DistributedStrategyError: when trying to initialize a strategy + already initialized. """ - if torch.cuda.is_available() and torch.cuda.device_count() > 1: - dist.init_process_group(backend=self.backend) - else: - print("WARNING: trying to run distributed on insufficient" - " resources. 
Skipping distributed process group setup.") + if not distributed_resources_available(): + raise RuntimeError( + "Trying to run distributed on insufficient resources.") + if self.is_initialized: + raise DistributedStrategyError("Strategy was already initialized") + dist.init_process_group(backend=self.backend) + self.is_initialized = True + + torch.cuda.device(self.local_rank()) # def distributed_engine( # self, model: nn.Module, optimizer: Optimizer, @@ -158,55 +350,73 @@ def distributed( **kwargs ) -> Tuple[nn.Module, Optimizer, Optional[LRScheduler]]: """Setup model, optimizer and scheduler for distributed.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") if torch.cuda.is_available(): # device = self.dist_lrank() - model = model.to(self.dist_device()) + model = model.to(self.device()) dist_model = torch.nn.parallel.DistributedDataParallel( model, - device_ids=[self.dist_device()], - output_device=self.dist_device() + device_ids=[self.device()], + output_device=self.device() ) else: dist_model = model return dist_model, optimizer, lr_scheduler - def dist_gwsize(self) -> int: + def global_world_size(self) -> int: """Returns the total number of processes (global world size). Returns: int: global world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_world_size() - def dist_lwsize(self) -> int: + def local_world_size(self) -> int: """Returns the local number of workers available per node, which is usually the number of GPUs available. Returns: int: local world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return torch.cuda.device_count() - def dist_grank(self) -> int: + def global_rank(self) -> int: """Returns the global rank of the current process, where rank ranges from 0 to world_size. Returns: int: global rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_rank() - def dist_lrank(self) -> int: + def local_rank(self) -> int: """Returns the local rank of the current process. Returns: int: local rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_rank() % torch.cuda.device_count() def clean_up(self) -> None: """Destroys the current process group.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") if torch.cuda.is_available(): dist.barrier() dist.destroy_process_group() @@ -221,12 +431,15 @@ def par_allgather_obj(self, obj: Any) -> List[Any]: Returns: List[Any]: List of gathered objects. """ - res = [None] * self.dist_gwsize() + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + res = [None] * self.global_world_size() dist.all_gather_object(res, obj) return res -class DSDistributedStrategy(TorchDistributedStrategy): +class DeepSpeedStrategy(TorchDistributedStrategy): """DeepSpeed distributed strategy class. Args: @@ -256,7 +469,19 @@ def _load_config(self, ds_config) -> None: def init(self) -> None: """Initializes the distributed process group and the distributed package. + + Raises: + RuntimeError: when there are not (enough) GPUs available. 
+ DistributedStrategyError: when trying to initialize a strategy + already initialized. """ + if not distributed_resources_available(): + raise RuntimeError( + "Trying to run distributed on insufficient resources.") + + if self.is_initialized: + raise DistributedStrategyError("Strategy was already initialized") + # https://github.com/Lightning-AI/pytorch-lightning/issues/13567 ompi_lrank = os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK') os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] = os.environ.get( @@ -264,6 +489,9 @@ def init(self) -> None: # https://deepspeed.readthedocs.io/en/latest/initialize.html#training-initialization deepspeed.init_distributed(dist_backend=self.backend) + self.is_initialized = True + + torch.cuda.device(self.local_rank()) def distributed( self, model: nn.Module, optimizer: Optional[Optimizer] = None, @@ -272,6 +500,10 @@ def distributed( **init_kwargs ) -> Tuple[nn.Module, Optimizer, Optional[LRScheduler]]: """Setup model, optimizer and scheduler for distributed.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + if init_kwargs.get("config"): self._load_config(init_kwargs.get("config")) # https://deepspeed.readthedocs.io/en/latest/initialize.html#training-initialization @@ -286,42 +518,57 @@ def distributed( ) return distrib_model, optimizer, lr_scheduler - def dist_gwsize(self) -> int: + def global_world_size(self) -> int: """Returns the total number of processes (global world size). Returns: int: global world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_world_size() - def dist_lwsize(self) -> int: + def local_world_size(self) -> int: """Returns the local number of workers available per node, which is usually the number of GPUs available. Returns: int: local world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return torch.cuda.device_count() - def dist_grank(self) -> int: + def global_rank(self) -> int: """Returns the global rank of the current process, where rank ranges from 0 to world_size. Returns: int: global rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_rank() - def dist_lrank(self) -> int: + def local_rank(self) -> int: """Returns the local rank of the current process. Returns: int: local rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return dist.get_rank() % torch.cuda.device_count() def clean_up(self) -> None: """Destroys the current process group.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") deepspeed.sys.exit() def par_allgather_obj(self, obj: Any) -> list[Any]: @@ -334,18 +581,34 @@ def par_allgather_obj(self, obj: Any) -> list[Any]: Returns: List[Any]: List of gathered objects. """ - res = [None] * self.dist_gwsize() + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. 
Use the init method.") + res = [None] * self.global_world_size() dist.all_gather_object(res, obj) return res -class HVDDistributedStrategy(TorchDistributedStrategy): +class HorovodStrategy(TorchDistributedStrategy): """Horovod distributed strategy class.""" def init(self) -> None: - """Initializes the Horovod distributed backend.""" + """Initializes the Horovod distributed backend. + + Raises: + RuntimeError: when there are not (enough) GPUs available. + DistributedStrategyError: when trying to initialize a strategy + already initialized. + """ + if not distributed_resources_available(): + raise RuntimeError( + "Trying to run distributed on insufficient resources.") + if self.is_initialized: + raise DistributedStrategyError("Strategy was already initialized") hvd.init() - torch.cuda.set_device(hvd.local_rank()) + self.is_initialized = True + + torch.cuda.device(self.local_rank()) def distributed( self, model: nn.Module, optimizer: Optional[Optimizer] = None, @@ -353,8 +616,11 @@ def distributed( **optim_kwargs ) -> Tuple[nn.Module, Optimizer, Optional[LRScheduler]]: """Setup model, optimizer and scheduler for distributed.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") - model.to(self.dist_device()) + model.to(self.device()) # Scale learning rate # https://github.com/horovod/horovod/issues/1653#issuecomment-574764452 @@ -389,42 +655,57 @@ def _broadcast_params( hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=-0) - def dist_gwsize(self) -> int: + def global_world_size(self) -> int: """Returns the total number of processes (global world size). Returns: int: global world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return hvd.size() - def dist_lwsize(self) -> int: + def local_world_size(self) -> int: """Returns the local number of workers available per node, which is usually the number of GPUs available. Returns: int: local world size. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return hvd.local_size() - def dist_grank(self) -> int: + def global_rank(self) -> int: """Returns the global rank of the current process, where rank ranges from 0 to world_size. Returns: int: global rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return hvd.rank() - def dist_lrank(self) -> int: + def local_rank(self) -> int: """Returns the local rank of the current process. Returns: int: local rank. """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") return hvd.local_rank() def clean_up(self) -> None: """Shuts Horovod down.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") hvd.shutdown() def par_allgather_obj(self, obj: Any) -> list[Any]: @@ -437,484 +718,99 @@ def par_allgather_obj(self, obj: Any) -> list[Any]: Returns: list: gathered list with size(#worker). """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. 
Use the init method.") return hvd.allgather_object(obj) -# class TorchDistributedStrategy_old(DistributedStrategy): -# """Abstract class to define the distributed backend methods for -# PyTorch models. -# """ -# @abc.abstractmethod -# def init_backend(self) -> None: -# """Initializes the chosen distributed backend""" +class NonDistributedStrategy(TorchDistributedStrategy): + """Dummy class for non-distributed environments.""" -# @abc.abstractmethod -# def distribute_model(self, model: Any) -> Any: -# """Distributes a machine learning model. + is_distributed: bool = False -# Args: -# model (Any): a generic ML model to be distributed. + def init(self) -> None: + """If CUDA is available set CUDA device, and do nothing more. -# Returns: -# Any: distributed model instance. -# """ + Raises: + DistributedStrategyError: when trying to initialize a strategy + already initialized. + """ + if self.is_initialized: + raise DistributedStrategyError("Strategy was already initialized") + if torch.cuda.is_available(): + torch.cuda.device(self.local_rank()) + self.is_initialized = True -# @abc.abstractmethod -# def broadcast_params(self, model: Any, optimizer: Any) -> None: -# """Broadcasts variables from root rank to all other processes/ + def device(self) -> str: + """Device used by local worker. -# Args: -# model (Any): distributed model. -# optimizer (Any): optimizer. -# """ + Returns: + str: cpu device if CUDA is not available. + """ + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + if torch.cuda.is_available(): + return super().device() + return "cpu" + + def distributed( + self, model: nn.Module, optimizer: Optional[Optimizer] = None, + lr_scheduler: Optional[LRScheduler] = None, + **kwargs + ) -> Tuple[nn.Module, Optimizer, Optional[LRScheduler]]: + """Do nothing and return model, optimizer and scheduler.""" + if not self.is_initialized: + raise UninitializedStrategyError( + "Strategy has not been initialized. Use the init method.") + if torch.cuda.is_available(): + model = model.cuda() + return model, optimizer, lr_scheduler + + def global_world_size(self) -> int: + """Returns the total number of processes (global world size). + + Returns: + int: global world size. + """ + return 1 + + def local_world_size(self) -> int: + """Returns the local number of workers available per node, + which is usually the number of GPUs available. + + Returns: + int: local world size. + """ + return 1 + + def global_rank(self) -> int: + """Returns the global rank of the current process, where + rank ranges from 0 to world_size. + + Returns: + int: global rank. + """ + return 0 + + def local_rank(self) -> int: + """Returns the local rank of the current process. -# @abc.abstractmethod -# def distribute_optimizer(self, optimizer: Any, model: Any) -> Any: -# """Distribute optimizer. + Returns: + int: local rank. + """ + return 0 -# Args: -# optimizer (Any): optimizer. -# model (Any): distributed model. + def clean_up(self) -> None: + """Do nothing.""" -# Returns: -# Any: distributed optimizer. -# """ + def par_allgather_obj(self, obj: Any) -> list[Any]: + """Raise error as this operation is not available. -# @abc.abstractmethod -# def dist_gwsize(self) -> int: -# """Returns the total number of processes (global world size). - -# Returns: -# int: global world size. -# """ - -# @abc.abstractmethod -# def dist_lwsize(self) -> int: -# """Returns the number of local workers available on a node -# (local world size). 
-# Usually it is equal to the number of available GPUs. - -# Returns: -# int: local world size. -# """ - -# @abc.abstractmethod -# def dist_grank(self) -> int: -# """Returns the global rank of the current process. -# Rank ranges from 0 to world_size. - -# Returns: -# int: global rank. -# """ - -# @abc.abstractmethod -# def dist_lrank(self) -> int: -# """Returns the local rank of the current process. - -# Returns: -# int: local rank. -# """ - -# def is_main_worker(self) -> bool: -# """Checks if local worker has global rank equal to zero. - -# Returns: -# bool: True if main worker. -# """ -# return self.dist_grank() == 0 - -# def dist_device(self) -> str: -# """Device used by local worker. - -# Returns: -# str: torch device in the form 'cuda:N'. -# """ -# return f"cuda:{self.dist_lrank()}" - -# @abc.abstractmethod -# def clean_up(self) -> None: -# """Cleans up resources allocated by distributed strategy.""" - -# @abc.abstractmethod -# def par_allgather_obj(self, obj: Any) -> List[Any]: -# """Gathers any object from the whole group in a list -# (to all workers). - -# Args: -# obj (Any): object to gather from all workers. - -# Returns: -# List[Any]: list of objects gathered from all workers. -# """ - - -# class DDPDistributedStrategy_old(TorchDistributedStrategy_old): -# """PyTorch DDP distributed strategy class. - -# Args: -# backend (str): Name of the communication backend to employ. -# """ - -# backend: str - -# def __init__(self, backend: str) -> None: -# super().__init__() -# self.backend = backend - -# def init_backend(self) -> None: -# """Initializes the distributed process group and the distributed -# package. -# """ -# if torch.cuda.is_available(): -# dist.init_process_group(backend=self.backend) - -# def distribute_model(self, model: nn.Module) -> nn.Module: -# """Achieves data parallelism by synchronizing the gradients -# across each model replica located in each available -# computing device. - -# Args: -# model (nn.Module): ML model to be distributed. - -# Returns: -# nn.Module: Distributed model replicas across all devices. -# that are to be synchronized. -# """ -# if torch.cuda.is_available(): -# # device = self.dist_lrank() -# model = model.to(self.dist_device()) -# dist_model = torch.nn.parallel.DistributedDataParallel( -# model, -# device_ids=[self.dist_device()], -# output_device=self.dist_device() -# ) -# else: -# dist_model = model - -# return dist_model - -# def broadcast_params( -# self, -# model: nn.Module, -# optimizer: optim.Optimizer -# ) -> None: -# """Do nothing. Only applicable for Horovod. - -# Args: -# model (nn.Module): ML model -# optimizer (optim.Optimizer): Optimizer -# """ -# pass - -# def distribute_optimizer( -# self, -# optimizer: optim.Optimizer, -# model: nn.Module = None -# ) -> optim.Optimizer: -# """Returns the optimizer from argument. - -# Args: -# optimizer (optim.Optimizer): optimizer. -# model (nn.Module): ML model. Unused here. - -# Returns: -# optim.Optimizer: Distributed optimizer. -# """ -# return optimizer - -# def dist_gwsize(self) -> int: -# """Returns the total number of processes (global world size). - -# Returns: -# int: global world size. -# """ -# return dist.get_world_size() - -# def dist_lwsize(self) -> int: -# """Returns the local number of workers available per node, -# which is usually the number of GPUs available. - -# Returns: -# int: local world size. -# """ -# return torch.cuda.device_count() - -# def dist_grank(self) -> int: -# """Returns the global rank of the current process, where -# rank ranges from 0 to world_size. 
- -# Returns: -# int: global rank. -# """ -# return dist.get_rank() - -# def dist_lrank(self) -> int: -# """Returns the local rank of the current process. - -# Returns: -# int: local rank. -# """ -# return dist.get_rank() % torch.cuda.device_count() - -# def clean_up(self) -> None: -# """Destroys the current process group.""" -# if torch.cuda.is_available(): -# dist.barrier() -# dist.destroy_process_group() - -# def par_allgather_obj(self, obj: Any) -> List[Any]: -# """Gathers any object from the whole group -# in a list (to all workers). - -# Args: -# obj (Any): Object to gather from all workers. - -# Returns: -# List[Any]: List of gathered objects. -# """ -# res = [None] * self.dist_gwsize() -# dist.all_gather_object(res, obj) -# return res - - -# class DSDistributedStrategy_old(TorchDistributedStrategy_old): -# """DeepSpeed distributed strategy class. - -# Args: -# backend (str): Name of the communication backend to employ. -# config (Union[dict, Path, str]): DeepSpeed config. Either a -# dictionary or a path to a JSON file. -# """ - -# config: Dict = None -# backend: str - -# def __init__( -# self, -# backend: str, -# config: Union[Dict, Path, str] -# ) -> None: -# super().__init__() -# self.backend = backend -# self._load_config(config) - -# def _load_config(self, ds_config): -# if isinstance(ds_config, (str, Path)): -# with open(ds_config) as fp: -# self.config = json.load(fp) -# elif isinstance(ds_config, dict): -# self.config = ds_config -# else: -# raise ValueError("ds_config is not a dictionary not a path.") - -# def init_backend(self) -> None: -# """Initializes the distributed process group and the distributed -# package. -# """ -# deepspeed.init_distributed(dist_backend=self.backend) - -# def distribute_model(self, model: nn.Module) -> nn.Module: -# """Achieves data parallelism by synchronizing the gradients -# across each model replica located in each available -# computing device. - -# Args: -# model (nn.Module): ML model to be distributed. - -# Returns: -# nn.Module: Distributed model replicas across all devices -# that are to be synchronized. -# """ -# distrib_model, __, __, __ = deepspeed.initialize( -# model=model, -# model_parameters=model.parameters(), -# dist_init_required=True, -# config=self.config -# ) -# return distrib_model - -# def broadcast_params( -# self, model: nn.Module, optimizer: optim.Optimizer -# ) -> None: -# """Only applicable for Horovod. Does nothing. - -# Args: -# model (nn.Module): ML model. -# optimizer (optim.Optimizer): optimizer. -# """ -# pass - -# def distribute_optimizer( -# self, -# optimizer: optim.Optimizer, -# model: nn.Module = None -# ) -> optim.Optimizer: -# """Returns the optimizer from argument. - -# Args: -# optimizer (optim.Optimizer): torch optimizer. -# model (nn.Module): torch neural network. - -# Returns: -# optim.Optimizer: distributed optimizer. -# """ -# return optimizer - -# def dist_gwsize(self) -> int: -# """Returns the total number of processes (global world size). - -# Returns: -# int: global world size. -# """ -# return dist.get_world_size() - -# def dist_lwsize(self) -> int: -# """Returns the local number of workers available per node, -# which is usually the number of GPUs available. - -# Returns: -# int: local world size. -# """ -# return torch.cuda.device_count() - -# def dist_grank(self) -> int: -# """Returns the global rank of the current process, where -# rank ranges from 0 to world_size. - -# Returns: -# int: global rank. 
-# """ -# return dist.get_rank() - -# def dist_lrank(self) -> int: -# """Returns the local rank of the current process. - -# Returns: -# int: local rank. -# """ -# return dist.get_rank() % torch.cuda.device_count() - -# def clean_up(self) -> None: -# """Destroys the current process group.""" -# deepspeed.sys.exit() - -# def par_allgather_obj(self, obj: Any) -> list[Any]: -# """Gathers any object from the whole group -# in a list (to all workers). - -# Args: -# obj (Any): Object to gather from all workers. - -# Returns: -# List[Any]: List of gathered objects. -# """ -# res = [None] * self.dist_gwsize() -# dist.all_gather_object(res, obj) -# return res - - -# class HVDDistributedStrategy_old(TorchDistributedStrategy_old): -# """Horovod distributed strategy class.""" - -# def init_backend(self) -> None: -# """Initializes the Horovod distributed backend.""" -# hvd.init() - -# def distribute_model(self, model: nn.Module) -> nn.Module: -# """Only applicable for DDP and DeepSpeed. -# For Horovod, returns the same model passed as argument. - -# Args: -# model (nn.Module): ML model to be distributed. - -# Returns: -# nn.Module: ML model passed in the argument. -# """ -# return model - -# def broadcast_params( -# self, model: nn.Module, optimizer: optim.Optimizer -# ) -> None: -# """Broadcasts variables from root rank to all other processes. - -# Args: -# model (nn.Module): ML model that is to be broadcasted -# across processes. -# optimizer (optim.Optimizer): Optimizer that is to be broadcasted -# across processes. -# """ -# hvd.broadcast_parameters(model.state_dict(), root_rank=0) -# hvd.broadcast_optimizer_state(optimizer, root_rank=-0) - -# def distribute_optimizer( -# self, -# optimizer: optim.Optimizer, -# model: nn.Module -# ) -> optim.Optimizer: -# """Constructs a DistributedOptimizer, for computing single-process -# gradient values and applying gradient updates after the gradients -# have been combined across all the Horovod ranks. - -# Args: -# optimizer (optim.Optimizer): Optimizer to be distributed. -# model (nn.Module): ML model to be trained. - -# Returns: -# optim.Optimizer: Distributed optimizer across all ranks. -# """ -# distOptimizer = hvd.DistributedOptimizer( -# optimizer, -# named_parameters=model.named_parameters(), -# op=hvd.Average -# ) -# return distOptimizer - -# def dist_gwsize(self) -> int: -# """Returns the total number of processes (global world size). - -# Returns: -# int: global world size. -# """ -# return hvd.size() - -# def dist_lwsize(self) -> int: -# """Returns the local number of workers available per node, -# which is usually the number of GPUs available. - -# Returns: -# int: local world size. -# """ -# return hvd.local_size() - -# def dist_grank(self) -> int: -# """Returns the global rank of the current process, where -# rank ranges from 0 to world_size. - -# Returns: -# int: global rank. -# """ -# return hvd.rank() - -# def dist_lrank(self) -> int: -# """Returns the local rank of the current process. - -# Returns: -# int: local rank. -# """ -# return hvd.local_rank() - -# def clean_up(self) -> None: -# """Shuts Horovod down.""" -# hvd.shutdown() - -# def par_allgather_obj(self, obj: Any) -> list[Any]: -# """Gathers scalar objects across all workers to a -# list with size(#worker), uses horovod communicator - -# Args: -# obj (Any): object in a worker. - -# Returns: -# list: gathered list with size(#worker). -# """ -# return hvd.allgather_object(obj) + Args: + obj (Any): object in a worker. 
+ """ + raise RuntimeError( + f"{self.__class__.__name__} does not support this operation." + ) diff --git a/src/itwinai/torch/engine.py b/src/itwinai/torch/engine.py deleted file mode 100644 index 7084d6ec..00000000 --- a/src/itwinai/torch/engine.py +++ /dev/null @@ -1,276 +0,0 @@ -""" -Model engine which wraps a torch NN. Still under development. May be removed... -""" - -import abc -from typing import Any, Union, Optional, Callable - -from pydantic import BaseModel - -import torch -import torch.nn as nn -import torch.optim as optim -from torch.optim.lr_scheduler import _LRScheduler as LRScheduler -from torch.cuda import amp -from torch import autocast - - -class OptimizerConfig: - def __init__(self, optim_class, **kwargs) -> None: - self.optim_class = optim_class - self.kwargs = kwargs - - def to_optim(self, parameters) -> optim.Optimizer: - return self.optim_class(parameters, **self.kwargs) - - -class LRSchedulerConfig: - def __init__(self, scheduler_class, **kwargs) -> None: - self.scheduler_class = scheduler_class - self.kwargs = kwargs - - def to_scheduler(self, optim) -> LRScheduler: - return self.scheduler_class(optim, **self.kwargs) - - -class ModelEngineConfig(BaseModel): - mixed_precision: bool = False - - -class ModelEngine(abc.ABC): - """Wrapper around ML model, which abstracts from distributed and - mixed-precision models. - """ - - model: nn.Module - _model_parameters: Any - optimizer: optim.Optimizer - lr_scheduler: LRScheduler - # config: ModelEngineConfig - mixed_precision: bool = False - grad_scaler: amp.GradScaler = None - - def __init__( - self, - model: nn.Module, - # model_parameters: Any, - optimizer: Union[optim.Optimizer, OptimizerConfig], - lr_scheduler: Optional[Union[LRScheduler, LRSchedulerConfig]] = None, - mixed_precision: bool = False - # config: Optional[ModelEngineConfig] = None - ) -> None: - super().__init__() - self.model = model - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - # self._model_parameters = model_parameters - # if isinstance(optimizer, OptimizerConfig): - # self.optimizer = optimizer.to_optim(model_parameters) - # else: - # self.optimizer = optimizer - - # if isinstance(lr_scheduler, LRSchedulerConfig): - # self.lr_scheduler = lr_scheduler.to_scheduler(self.optimizer) - # else: - # self.lr_scheduler = lr_scheduler - - # if not config: - # self.config = ModelEngineConfig() - self.mixed_precision = mixed_precision - if mixed_precision: - self.grad_scaler = amp.GradScaler() - - def __call__(self, *args: Any, **kwds: Any) -> Any: - """Performs the forward operation.""" - # Wrapper of self.forward() - return self.forward(*args, **kwds) - - def forward(self, *args: Any, **kwds: Any) -> Any: - """Performs the forward operation.""" - return self.model(*args, **kwds) - - def train(self, mode: bool = True) -> nn.Module: - """Set model in training mode.""" - self.model.train(mode=mode) - return self.model - - def eval(self) -> nn.Module: - """Set model in inference mode.""" - self.model.eval() - return self.model - - def to(self, device) -> nn.Module: - """Move model to specified device.""" - self.model.to(device) - return self.model - - @abc.abstractmethod - def zero_grad(): - """Set gradients to zero for the optimizer.""" - - @abc.abstractmethod - def backward(self, loss_fn: Callable, *loss_args) -> torch.Tensor: - """Perform backward pass and return the loss. - - Args: - loss_fn (Callable): computes the loss. - *loss_args: are the arguments to be passed to ``loss_fn``. - - Returns: - torch.Tensor: computed loss. 
- """ - - @abc.abstractmethod - def optimizer_step(self): - """Perform optimizer step.""" - - @abc.abstractmethod - def lr_scheduler_step(self): - """Perform lr scheduler step, if present.""" - # This should be incorporated in the optim step: - # https://deepspeed.readthedocs.io/en/latest/schedulers.html - # scheduler is updated automatically at each training step - - @abc.abstractmethod - def save_checkpoint(self): - """Save checkpoint to persistent storage.""" - - -class DDPModelEngine(ModelEngine): - """Model engine for torch DDP distributed strategy.""" - - def forward(self, *args: Any, **kwds: Any) -> Any: - """Performs the forward operation.""" - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html - # Runs the forward pass with autocasting. - with autocast(device_type='cuda', dtype=torch.float16): - return self.model(*args, **kwds) - else: - return self.model(*args, **kwds) - - def zero_grad(self): - """Set gradients to zero for the optimizer.""" - self.optimizer.zero_grad() - - def backward(self, loss_fn: Callable, *loss_args) -> torch.Tensor: - """Perform backward pass and return the loss. - - Args: - loss_fn (Callable): computes the loss. - *loss_args: are the arguments to be passed to ``loss_fn``. - - Returns: - torch.Tensor: computed loss. - """ - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html - # Runs the forward pass with autocasting. - with autocast(device_type='cuda', dtype=torch.float16): - loss = loss_fn(*loss_args) - - # Scales loss. Calls backward() on scaled loss to create scaled - # gradients. - # Backward passes under autocast are not recommended. - # Backward ops run in the same dtype autocast chose for - # corresponding forward ops. - loss = self.grad_scaler.scale(loss) - else: - loss = loss_fn(*loss_args) - loss.backward() - return loss - - def optimizer_step(self): - """Perform optimizer step.""" - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html#typical-mixed-precision-training - # scaler.step() first unscales the gradients of the optimizer's - # assigned params. - # If these gradients do not contain infs or NaNs, optimizer.step() - # is then called, - # otherwise, optimizer.step() is skipped. - self.grad_scaler.step(self.optimizer) - - # Updates the scale for next iteration. - self.grad_scaler.update() - else: - self.optimizer.step() - - def lr_scheduler_step(self): - """Perform lr scheduler step, if present.""" - if self.lr_scheduler: - self.lr_scheduler.step() - - def save_checkpoint(self): - """Save checkpoint to persistent storage.""" - raise NotImplementedError - - -class DSModelEngine(ModelEngine): - """Model engine for DeeSpeed distributed strategy.""" - - def forward(self, *args: Any, **kwds: Any) -> Any: - """Performs the forward operation.""" - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html - # Runs the forward pass with autocasting. - with autocast(device_type='cuda', dtype=torch.float16): - return self.model(*args, **kwds) - else: - return self.model(*args, **kwds) - - def zero_grad(self): - """Set gradients to zero for the optimizer.""" - self.optimizer.zero_grad() - - def backward(self, loss_fn: Callable, *loss_args) -> torch.Tensor: - """Perform backward pass and return the loss. - - Args: - loss_fn (Callable): computes the loss. - *loss_args: are the arguments to be passed to ``loss_fn``. - - Returns: - torch.Tensor: computed loss. 
- """ - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html - # Runs the forward pass with autocasting. - with autocast(device_type='cuda', dtype=torch.float16): - loss = loss_fn(*loss_args) - - # Scales loss. Calls backward() on scaled loss to create scaled - # gradients. - # Backward passes under autocast are not recommended. - # Backward ops run in the same dtype autocast chose for - # corresponding forward ops. - loss = self.grad_scaler.scale(loss) - else: - loss = loss_fn(*loss_args) - loss.backward() - return loss - - def optimizer_step(self): - """Perform optimizer step.""" - if self.mixed_precision: - # https://pytorch.org/docs/stable/notes/amp_examples.html#typical-mixed-precision-training - # scaler.step() first unscales the gradients of the optimizer's - # assigned params. - # If these gradients do not contain infs or NaNs, optimizer.step() - # is then called, - # otherwise, optimizer.step() is skipped. - self.grad_scaler.step(self.optimizer) - - # Updates the scale for next iteration. - self.grad_scaler.update() - else: - self.optimizer.step() - - def lr_scheduler_step(self): - """Perform lr scheduler step, if present.""" - if self.lr_scheduler: - self.lr_scheduler.step() - - def save_checkpoint(self): - """Save checkpoint to persistent storage.""" - raise NotImplementedError diff --git a/src/itwinai/torch/inference.py b/src/itwinai/torch/inference.py index 02882f06..bb9af300 100644 --- a/src/itwinai/torch/inference.py +++ b/src/itwinai/torch/inference.py @@ -6,8 +6,7 @@ from torch import nn from torch.utils.data import DataLoader, Dataset -from ..utils import dynamically_import_class -from .utils import clear_key +from ..utils import dynamically_import_class, clear_key from ..components import Predictor, monitor_exec from .types import TorchDistributedStrategy as StrategyT from .types import Metric, Batch diff --git a/src/itwinai/torch/mlflow.py b/src/itwinai/torch/mlflow.py index 18a014ff..36992393 100644 --- a/src/itwinai/torch/mlflow.py +++ b/src/itwinai/torch/mlflow.py @@ -16,6 +16,8 @@ def _get_mlflow_logger_conf(pl_config: Dict) -> Optional[Dict]: Optional[Dict]: if present, MLFLowLogger constructor arguments (under 'init_args' key). """ + if not pl_config['trainer'].get('logger'): + return None if isinstance(pl_config['trainer']['logger'], list): # If multiple loggers are provided for logger_conf in pl_config['trainer']['logger']: @@ -35,6 +37,7 @@ def _mlflow_log_pl_config(pl_config: Dict, local_yaml_path: str) -> None: def init_lightning_mlflow( pl_config: Dict, default_experiment_name: str = 'Default', + tmp_dir: str = '.tmp', **autolog_kwargs ) -> None: """Initialize mlflow for pytorch lightning, also setting up @@ -45,6 +48,7 @@ def init_lightning_mlflow( pl_config (Dict): pytorch lightning configuration loaded in memory. default_experiment_name (str, optional): used as experiment name if it is not given in the lightning conf. Defaults to 'Default'. + tmp_dir (str): where to temporarily store some artifacts. **autolog_kwargs (kwargs): args for mlflow.pytorch.autolog(...). 
""" mlflow_conf: Optional[Dict] = _get_mlflow_logger_conf(pl_config) @@ -63,12 +67,13 @@ def init_lightning_mlflow( mlflow.set_tracking_uri(tracking_uri) mlflow.set_experiment(experiment_name) mlflow.pytorch.autolog(**autolog_kwargs) - mlflow.start_run() + run = mlflow.start_run() + print(f"MLFlow's artifacts URI: {run.info.artifact_uri}") mlflow_conf['experiment_name'] = experiment_name mlflow_conf['run_id'] = mlflow.active_run().info.run_id - _mlflow_log_pl_config(pl_config, '.tmp/pl_config.yml') + _mlflow_log_pl_config(pl_config, os.path.join(tmp_dir, 'pl_config.yml')) def teardown_lightning_mlflow() -> None: diff --git a/src/itwinai/torch/reproducibility.py b/src/itwinai/torch/reproducibility.py new file mode 100644 index 00000000..1513c82a --- /dev/null +++ b/src/itwinai/torch/reproducibility.py @@ -0,0 +1,48 @@ +""" +This module provides the tools to support reproducible execution of +torch scripts. +""" + +from typing import Optional +import numpy as np +import random + +import torch + + +def seed_worker(worker_id): + """Seed DataLoader worker.""" + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + + +def set_seed( + rnd_seed: Optional[int], + deterministic_cudnn: bool = True +) -> torch.Generator: + """Set torch random seed and return a PRNG object. + + Args: + rnd_seed (Optional[int]): random seed. If None, the seed is not set. + deterministic_cudnn (bool): if True, sets + ``torch.backends.cudnn.benchmark = False``, which may affect + performances. + + Returns: + torch.Generator: PRNG object. + """ + g = torch.Generator() + if rnd_seed is not None: + # Deterministic execution + np.random.seed(rnd_seed) + random.seed(rnd_seed) + torch.manual_seed(rnd_seed) + g.manual_seed(rnd_seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(rnd_seed) + torch.cuda.manual_seed_all(rnd_seed) + if deterministic_cudnn: + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + return g diff --git a/src/itwinai/torch/trainer.py b/src/itwinai/torch/trainer.py index f0ad1c03..4e7a108f 100644 --- a/src/itwinai/torch/trainer.py +++ b/src/itwinai/torch/trainer.py @@ -1,15 +1,12 @@ """Provides training logic for PyTorch models via Trainer classes.""" from typing import ( - Optional, Dict, Union, Tuple, Type, List, Any + Optional, Dict, Union, Tuple, List, Any, Literal ) -import time import os import sys -import numpy as np import torch -import torch.multiprocessing as mp from torch.utils.data import DataLoader, Dataset from torch.utils.data.distributed import DistributedSampler import torch.distributed as dist @@ -17,494 +14,319 @@ import torch.nn as nn from torch.optim.optimizer import Optimizer +import lightning as L +from lightning.pytorch.cli import LightningCLI + +import horovod.torch as hvd + from ..components import Trainer, monitor_exec -from .utils import seed_worker, par_allgather_obj, clear_key from .types import ( Batch, Loss, LrScheduler, Metric ) -from .types import TorchDistributedStrategy as StrategyT -from ..loggers import LogMixin, Logger, ConsoleLogger -from ..utils import dynamically_import_class -from ..cluster import ClusterEnvironment -# from .distributed import ( -# TorchDistributedStrategy, -# DDPDistributedStrategy, -# DSDistributedStrategy, -# HVDDistributedStrategy -# ) +from ..loggers import LogMixin, Logger +from .reproducibility import seed_worker, set_seed +from .distributed import ( + TorchDistributedStrategy, + TorchDDPStrategy, + HorovodStrategy, + DeepSpeedStrategy, + 
NonDistributedStrategy, + distributed_resources_available +) +from ..utils import load_yaml +from .mlflow import ( + init_lightning_mlflow, + teardown_lightning_mlflow +) -def preproc_dataloader(dataloader: DataLoader, gwsize, grank): - """Makes a Dataloader distributed.""" - sampler = DistributedSampler( - dataloader.dataset, - num_replicas=gwsize, - rank=grank, - shuffle=True - ) - # Recreate dataloader, with updated sampler - return DataLoader( - dataloader.dataset, - batch_size=dataloader.batch_size, - sampler=sampler, - num_workers=dataloader.num_workers, - collate_fn=dataloader.collate_fn, - pin_memory=dataloader.pin_memory, - drop_last=dataloader.drop_last, - timeout=dataloader.timeout, - worker_init_fn=seed_worker, # dataloader.worker_init_fn, - multiprocessing_context=dataloader.multiprocessing_context, - generator=dataloader.generator, - prefetch_factor=dataloader.prefetch_factor, - persistent_workers=dataloader.persistent_workers, - pin_memory_device=dataloader.pin_memory_device - ) +class Config: + def __init__(self, my_dict: Optional[Dict] = None): + my_dict = my_dict if my_dict is not None else {} + self.__dict__.update(my_dict) -def distributed(func): - """The decorated function must have a standard signature. - Its first arguments must be: - model, train_dataloader, validation_dataloader, device (in this order). +class TorchTrainer(Trainer, LogMixin): + """Trainer class for torch training algorithms. - Additional args or kwargs are allowed consistently with the signature - of the decorated function. + Args: + config (Dict): training configuration containing hyperparameters. + epochs (int): number of training epochs. + model (Optional[nn.Module], optional): model to train. + Defaults to None. + strategy (Literal["ddp", "deepspeed", + "horovod"], optional): distributed strategy. + Defaults to 'ddp'. + validation_every (Optional[int], optional): run a validation epoch + every ``validation_every`` epochs. Disabled if None. Defaults to 1. + test_every (Optional[int], optional): run a test epoch + every ``test_every`` epochs. Disabled if None. Defaults to None. + random_seed (Optional[int], optional): set random seed for + reproducibility. If None, the seed is not set. Defaults to None. + logger (Optional[Logger], optional): logger for ML tracking. + Defaults to None. + log_all_workers (bool, optional): if True, the ``log`` method is + called on all workers in the distributed context. Defaults to False. + metrics (Optional[Dict[str, Metric]], optional): map of torchmetrics + metrics. Defaults to None. + name (Optional[str], optional): trainer custom name. Defaults to None. 
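A hypothetical instantiation of the new `TorchTrainer`; the keys placed in `config` (lr, momentum, batch_size, num_workers, pin_memory) mirror the ones read later by the default `create_model_loss_optimizer` and `create_dataloaders`:

```python
import torch.nn as nn

from itwinai.torch.trainer import TorchTrainer

model = nn.Sequential(                       # toy classifier for NLL loss
    nn.Flatten(), nn.Linear(28 * 28, 10), nn.LogSoftmax(dim=1)
)
trainer = TorchTrainer(
    config={
        'lr': 1e-3,
        'momentum': 0.9,
        'batch_size': 32,
        'num_workers': 4,
        'pin_memory': True,
    },
    epochs=5,
    model=model,
    strategy='ddp',        # falls back to non-distributed mode if no cluster
    validation_every=1,
    random_seed=42,
)
```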
""" - def dist_train( - model, train_dataloader, validation_dataloader=None, device='cpu', - *args, **kwargs - ): - if torch.cuda.is_available(): - dist.init_process_group(backend='nccl') - - if torch.cuda.is_available(): - lwsize = torch.cuda.device_count() # local world size - per node - gwsize = dist.get_world_size() # global world size - per run - grank = dist.get_rank() # global rank - assign per run - lrank = dist.get_rank() % lwsize # local rank - assign per node - else: - gwsize = 1 - grank = 0 - lrank = 0 - - device = torch.device( - 'cuda' if torch.cuda.is_available() else 'cpu', lrank) - if torch.cuda.is_available(): - torch.cuda.set_device(lrank) - - model = model.to(device) - model = DDP(model, device_ids=[device], output_device=device) - - train_dataloader = preproc_dataloader(train_dataloader, gwsize, grank) - if validation_dataloader is not None: - validation_dataloader = preproc_dataloader( - validation_dataloader, gwsize, grank) - - try: - func(model, train_dataloader, validation_dataloader, device, - *args, **kwargs) - finally: - if torch.cuda.is_available(): - dist.barrier() - dist.destroy_process_group() - return dist_train + # TODO: + # - add checkpointing. + # - extract BaseTorchTrainer and extend it creating a set of trainer + # templates (e.g.. GAN, Classifier, Transformer) allowing scientists + # to reuse ML algos. + # - improve get from configuration object + _strategy: TorchDistributedStrategy = None -class TorchTrainerMG(Trainer, LogMixin): - """ - Torch trainer for optionally distributed data-parallel (DDP) workload. - Multi-GPU distribution. - - Args: - model (nn.Module): neural network instance. - loss (Loss): torch loss function instance. - optimizer_class (str): path to optimizer class - (e.g., 'torch.optim.SGD') - optimizer_kwargs (Optional[Dict], optional): optimizer constructor - arguments (except from parameters). Defaults to None. - lr_scheduler_class (Optional[str], optional): path to learning - rate scheduler class. Defaults to None. - lr_scheduler_kwargs (Optional[Dict], optional): constructor arguments - of the learning rate scheduler, except for the optimizer. - Defaults to None. - train_dataloader_class (str, optional): train dataloader class path. - Defaults to 'torch.utils.data.DataLoader'. - train_dataloader_kwargs (Optional[Dict], optional): constructor - arguments of the train dataloader, except for the dataset - instance. Defaults to None. - validation_dataloader_class (str, optional): validation dataloader - class path. Defaults to 'torch.utils.data.DataLoader'. - validation_dataloader_kwargs (Optional[Dict], optional): constructor - arguments of the validation dataloader, except for the dataset - instance. If None, it replicates `train_dataloader_kwargs`. - Defaults to None. - epochs (int, optional): number of training epochs. Defaults to 1. - strategy (Optional[TorchDistributedStrategy], optional): distributed - strategy. Defaults to StrategyT.NONE.value. - backend (TorchDistributedBackend, optional): computing backend. - Defaults to BackendT.NCCL.value. - shuffle_dataset (bool, optional): whether shuffle dataset before - sampling batches from dataloader. Defaults to False. - use_cuda (bool, optional): whether to use GPU. Defaults to True. - benchrun (bool, optional): sets up a debug run. Defaults to False. - testrun (bool, optional): deterministic training seeding everything. - Defaults to False. - seed (Optional[int], optional): random seed. Defaults to None. - logger (Optional[List[Logger]], optional): logger. Defaults to None. 
- checkpoint_every (int, optional): how often (epochs) to checkpoint the - best model. Defaults to 10. - cluster (Optional[ClusterEnvironment], optional): cluster environment - object describing the context in which the trainer is executed. - Defaults to None. - train_metrics (Optional[Dict[str, Metric]], optional): - list of metrics computed in the training step on the predictions. - It's a dictionary with the form - ``{'metric_unique_name': CallableMetric}``. Defaults to None. - validation_metrics (Optional[Dict[str, Metric]], optional): same - as ``training_metrics``. If not given, it mirrors the training - metrics. Defaults to None. - - Raises: - RuntimeError: When trying to use DDP without CUDA support. - NotImplementedError: when trying to use a strategy different from the - ones provided by TorchDistributedStrategy. - """ + train_dataloader: DataLoader = None + validation_dataloader: DataLoader = None + test_dataloader: DataLoader = None model: nn.Module = None loss: Loss = None optimizer: Optimizer = None - lr_scheduler = None - _strategy: StrategyT = StrategyT.NONE.value - train_dataset: Dataset - validation_dataset: Dataset - train_dataloader: DataLoader = None - validation_dataloader: DataLoader = None - epoch_idx: int = 0 + lr_scheduler: LrScheduler = None + + torch_rng: torch.Generator = None + logger: Logger = None train_glob_step: int = 0 validation_glob_step: int = 0 - train_metrics: Dict[str, Metric] - validation_metrics: Dict[str, Metric] + test_glob_step: int = 0 + metrics: Dict[str, Metric] def __init__( self, - model: nn.Module, - loss: Loss, - optimizer_class: str, - optimizer_kwargs: Optional[Dict] = None, - lr_scheduler_class: Optional[str] = None, - lr_scheduler_kwargs: Optional[Dict] = None, - train_dataloader_class: str = 'torch.utils.data.DataLoader', - train_dataloader_kwargs: Optional[Dict] = None, - validation_dataloader_class: str = 'torch.utils.data.DataLoader', - validation_dataloader_kwargs: Optional[Dict] = None, - epochs: int = 1, - strategy: str = StrategyT.NONE.value, - benchrun: bool = False, - testrun: bool = False, - seed: Optional[int] = None, - logger: Optional[List[Logger]] = None, - checkpoint_every: int = 10, - cluster: Optional[ClusterEnvironment] = None, - train_metrics: Optional[Dict[str, Metric]] = None, - validation_metrics: Optional[Dict[str, Metric]] = None + config: Dict, + epochs: int, + model: Optional[nn.Module] = None, + strategy: Literal["ddp", "deepspeed", "horovod"] = 'ddp', + validation_every: Optional[int] = 1, + test_every: Optional[int] = None, + random_seed: Optional[int] = None, + logger: Optional[Logger] = None, + log_all_workers: bool = False, + metrics: Optional[Dict[str, Metric]] = None, + name: Optional[str] = None ) -> None: - """Sets up the distributed backend and loggers. - Makes the model a DDP model. - """ - super().__init__() + super().__init__(name) self.save_parameters(**self.locals2params(locals())) - self.model = model - self.loss = loss + + # config is mean to store all hyperparameters, which can very from use + # case to use case + # and include learning_rate, batch_size.... 
+ self.config = Config(config) self.epochs = epochs - self.testrun = testrun - self.seed = seed + self.model = model self.strategy = strategy - self.benchrun = benchrun - self.cluster = cluster - # Checkpoint every n epochs - self.checkpoint_every = checkpoint_every - - # Train and validation dataloaders - self.train_dataloader_class = dynamically_import_class( - train_dataloader_class - ) - self.validation_dataloader_class = dynamically_import_class( - validation_dataloader_class - ) - train_dataloader_kwargs = ( - train_dataloader_kwargs - if train_dataloader_kwargs is not None else {} - ) - self.train_dataloader_kwargs = clear_key( - train_dataloader_kwargs, 'train_dataloader_kwargs', 'dataset' - ) - # If validation_dataloader_kwargs is not given, - # copy train_dataloader_kwargs - validation_dataloader_kwargs = ( - validation_dataloader_kwargs if validation_dataloader_kwargs - is not None else train_dataloader_kwargs - ) - self.validation_dataloader_kwargs = clear_key( - validation_dataloader_kwargs, 'validation_dataloader_kwargs', - 'dataset' - ) - - # Optimizer and scheduler - optim_class = dynamically_import_class(optimizer_class) - optimizer_kwargs = ( - optimizer_kwargs if optimizer_kwargs is not None else {} - ) - optimizer_kwargs = clear_key( - optimizer_kwargs, 'optimizer_kwargs', 'parameters' - ) - self.optimizer: Optimizer = optim_class( - self.model.parameters(), **optimizer_kwargs - ) - if lr_scheduler_class is not None: - scheduler_class = dynamically_import_class(lr_scheduler_class) - lr_scheduler_kwargs = ( - lr_scheduler_kwargs if lr_scheduler_kwargs is not None else {} - ) - lr_scheduler_kwargs = clear_key( - lr_scheduler_kwargs, 'lr_scheduler_kwargs', 'optimizer' - ) - self.lr_scheduler: LrScheduler = scheduler_class( - self.optimizer, **lr_scheduler_kwargs - ) - - # Loggers - self.logger = logger if logger is not None else ConsoleLogger() - - # Metrics - self.train_metrics = ( - {} if train_metrics is None else train_metrics - ) - self.validation_metrics = ( - self.train_metrics if validation_metrics is None - else validation_metrics - ) + self.validation_every = validation_every + self.test_every = test_every + self.random_seed = random_seed + self.logger = logger + self.log_all_workers = log_all_workers + self.metrics = metrics if metrics is not None else {} @property - def strategy(self) -> Optional[str]: + def strategy(self) -> TorchDistributedStrategy: return self._strategy @strategy.setter - def strategy(self, strategy_name) -> None: - if strategy_name not in StrategyT: - raise ValueError( - "Unrecognized 'strategy' field. Allowed values " - f"are: {StrategyT.list()}. 
Received '{strategy_name}'") - self._strategy = strategy_name + def strategy(self, strategy: Union[str, TorchDistributedStrategy]) -> None: + if isinstance(strategy, TorchDistributedStrategy): + self._strategy = strategy + else: + self._strategy = self._detect_strategy(strategy) @property - def global_step(self) -> int: - return self.train_glob_step + self.validation_glob_step + def device(self) -> str: + return self.strategy.device() + + def _detect_strategy(self, strategy: str) -> TorchDistributedStrategy: + if not distributed_resources_available(): + print("WARNING: falling back to non-distributed strategy.") + dist_str = NonDistributedStrategy() + elif strategy == 'ddp': + dist_str = TorchDDPStrategy(backend='nccl') + elif strategy == 'horovod': + dist_str = HorovodStrategy() + elif strategy == 'deepspeed': + dist_str = DeepSpeedStrategy(backend='nccl') + else: + raise NotImplementedError( + f"Strategy '{strategy}' is not recognized/implemented.") + return dist_str - def set_seed(self, seed: Optional[int] = None): - """Deterministic operations for reproducibility. - Sets the random seed. + def _init_distributed_strategy(self) -> None: + if not self.strategy.is_initialized: + self.strategy.init() - Args: - seed (Optional[int], optional): if not None, overrides - `self.seed`. Defaults to None. + def create_model_loss_optimizer(self) -> None: + """ + Instantiate a torch model, loss, optimizer, and LR scheduler using the + configuration provided in the Trainer constructor. + Generally a user-define method. """ - seed = seed if seed is not None else self.seed - np.random.seed(seed) - self.torch_rng = torch.Generator() - if seed is not None: - torch.manual_seed(seed) - self.torch_rng.manual_seed(seed) - if self.cluster.is_cuda_available(): - torch.cuda.manual_seed(seed) + ################################### + # Dear user, this is a method you # + # may be interested to override! # + ################################### + + if self.model is None: + # Model was not passed to the constructor. + # Create a model here + raise ValueError( + "self.model is None! Either pass it to the constructor or " + "override this method." 
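The `strategy` setter above accepts either a name or an already constructed strategy object; a small sketch of both options (the `gloo` backend and the toy model are assumptions for illustration):

```python
import torch.nn as nn

from itwinai.torch.distributed import TorchDDPStrategy
from itwinai.torch.trainer import TorchTrainer

model = nn.Linear(4, 2)  # toy model; hyperparameters omitted from config

# 1) By name: resolved via _detect_strategy, which falls back to
#    NonDistributedStrategy when no distributed resources are available.
trainer_by_name = TorchTrainer(config={}, epochs=1, model=model,
                               strategy='deepspeed')

# 2) By instance: used as-is.
trainer_by_instance = TorchTrainer(
    config={}, epochs=1, model=model,
    strategy=TorchDDPStrategy(backend='gloo'),
)
```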
+ ) - @monitor_exec - def execute( - self, - train_dataset: Dataset, - validation_dataset: Dataset, - model: nn.Module = None, - optimizer: Optimizer = None, - lr_scheduler: LrScheduler = None, - ) -> Any: - self.train_dataset = train_dataset - self.validation_dataset = validation_dataset - - # Update parameters passed for "interactive" use - if model is not None: - self.model = model - if optimizer is not None: - self.optimizer = optimizer - if lr_scheduler is not None: - self.lr_scheduler = lr_scheduler - - # Start training - if self.cluster.distributed: - # Make training distributed - result = mp.spawn(self._train, nprocs=self.cluster.ngpus_per_node) - else: - result = self._train(0) + # A simple NLLLoss + self.loss = nn.functional.nll_loss - # Return value compliant with Executable.execute format - return result + # TODO: improve robustness of getting from config + self.optimizer = torch.optim.SGD( + self.model.parameters(), + lr=self.config.lr, + momentum=self.config.momentum + ) + # Create self.lr_scheduler if needed - def _train( - self, - worker_id: int - ): - # Each worker has a different deterministic seed - # Here, 'worker' = replica of the training function - worker_seed = ( - self.seed + worker_id if self.seed is not None else self.seed + # IMPORTANT: model, optimizer, and scheduler need to be distributed + + # First, define strategy-wise optional configurations + # TODO: improve robustness of getting from config + if isinstance(self.strategy, DeepSpeedStrategy): + # Batch size definition is not optional for DeepSpeedStrategy! + distribute_kwargs = dict( + config_params=dict( + train_micro_batch_size_per_gpu=self.config.batch_size + ) + ) + elif isinstance(self.strategy, HorovodStrategy): + distribute_kwargs = dict( + compression=( + hvd.Compression.fp16 if self.config.fp16_allreduce + else hvd.Compression.none + ), + op=hvd.Adasum if self.config.use_adasum else hvd.Average, + gradient_predivide_factor=self.config.gradient_predivide_factor + ) + else: + distribute_kwargs = {} + + # Distributed model, optimizer, and scheduler + ( + self.model, + self.optimizer, + self.lr_scheduler + ) = self.strategy.distributed( + self.model, self.optimizer, self.lr_scheduler, **distribute_kwargs ) - self.set_seed(worker_seed) - # Instantiate dataloaders - self.train_dataloader = self._instantiate_dataloader( - dataloader_class=self.train_dataloader_class, - dataset=self.train_dataset, - init_kwargs=self.train_dataloader_kwargs + def create_dataloaders( + self, + train_dataset: Dataset, + validation_dataset: Optional[Dataset] = None, + test_dataset: Optional[Dataset] = None + ) -> None: + """ + Create train, validation and test dataloaders using the + configuration provided in the Trainer constructor. + Generally a user-define method. + + Args: + train_dataset (Dataset): training dataset object. + validation_dataset (Optional[Dataset]): validation dataset object. + Default None. + test_dataset (Optional[Dataset]): test dataset object. + Default None. + """ + + ################################### + # Dear user, this is a method you # + # may be interested to override! 
# + ################################### + + # TODO: improve robustness of getting from config + self.train_dataloader = self.strategy.create_dataloader( + dataset=train_dataset, + batch_size=self.config.batch_size, + num_workers=self.config.num_workers, + pin_memory=self.config.pin_memory, + generator=self.torch_rng ) - if self.validation_dataset is not None: - self.validation_dataloader = self._instantiate_dataloader( - dataloader_class=self.validation_dataloader_class, - dataset=self.validation_dataset, - init_kwargs=self.validation_dataloader_kwargs + if validation_dataset is not None: + self.validation_dataloader = self.strategy.create_dataloader( + dataset=train_dataset, + batch_size=self.config.batch_size, + num_workers=self.config.num_workers, + pin_memory=self.config.pin_memory, + generator=self.torch_rng + ) + if test_dataset is not None: + self.test_dataloader = self.strategy.create_dataloader( + dataset=train_dataset, + batch_size=self.config.batch_size, + num_workers=self.config.num_workers, + pin_memory=self.config.pin_memory, + generator=self.torch_rng ) - # Launch actual training: - - # Single worker case - if not self.cluster.distributed: - with self.cluster.init_dist_gpu(worker_id) as device: - self.device: torch.device = device - self.model = self.model.to(self.device) - self.setup_logger() - self._setup_metrics() - try: - train_result = self.train() - except Exception as exc: - print(exc) - raise exc - finally: - print("INFO: Training ended") - self.destroy_logger() - train_result = None - return train_result - - # Init / connect to distributed backend - with self.cluster.init_dist_gpu(worker_id) as device: - self.device: torch.device = device - self._distribute_model() - self.setup_logger() - self._setup_metrics() - try: - train_result = self.train() - except Exception as exc: - print(exc) - raise exc - finally: - print("INFO: Training ended") - self.destroy_logger() - train_result = None - return train_result - - def _instantiate_dataloader( + def _setup_metrics(self): + """Move metrics to current device.""" + for m_name, metric in self.metrics.items(): + self.metrics[m_name] = metric.to(self.device) + + @monitor_exec + def execute( self, - dataloader_class: Type, - dataset: Dataset, - init_kwargs: Dict - ) -> DataLoader: - """Make dataloader distributed if using distributed training strategy. + train_dataset: Dataset, + validation_dataset: Dataset, + test_dataset: Dataset + ) -> Tuple[Dataset, Dataset, Dataset, Any]: + """Prepares distributed environment and data structures + for the actual training. Args: - dataloader_class (Type): some torch DataLoader type. - dataset (Dataset): torch dataset instance. - init_kwargs (Dict): constructor args. + train_dataset (Dataset): training dataset. + validation_dataset (Dataset): validation dataset. + test_dataset (Dataset): test dataset. + + Returns: + Tuple[Dataset, Dataset, Dataset, Any]: training dataset, + validation dataset, test dataset, trained model. 
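The two `create_*` hooks above are the intended extension points; a sketch of a subclass overriding both follows (the loss, Adam optimizer and dataloader kwargs are assumptions, with each split built from its own dataset):

```python
import torch
import torch.nn as nn

from itwinai.torch.trainer import TorchTrainer


class MyTrainer(TorchTrainer):

    def create_model_loss_optimizer(self) -> None:
        # Build loss/optimizer from self.config, then let the strategy
        # distribute model, optimizer and scheduler (extra kwargs omitted).
        self.loss = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(
            self.model.parameters(), lr=self.config.lr
        )
        self.model, self.optimizer, self.lr_scheduler = self.strategy.distributed(
            self.model, self.optimizer, self.lr_scheduler
        )

    def create_dataloaders(self, train_dataset, validation_dataset=None,
                           test_dataset=None) -> None:
        # Each split gets a dataloader built from its own dataset.
        self.train_dataloader = self.strategy.create_dataloader(
            dataset=train_dataset,
            batch_size=self.config.batch_size,
            generator=self.torch_rng,
        )
        if validation_dataset is not None:
            self.validation_dataloader = self.strategy.create_dataloader(
                dataset=validation_dataset,
                batch_size=self.config.batch_size,
                generator=self.torch_rng,
            )
        if test_dataset is not None:
            self.test_dataloader = self.strategy.create_dataloader(
                dataset=test_dataset,
                batch_size=self.config.batch_size,
                generator=self.torch_rng,
            )
```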
""" - init_kwargs['generator'] = init_kwargs.get( - 'generator', self.torch_rng - ) - init_kwargs['worker_init_fn'] = init_kwargs.get( - 'worker_init_fn', seed_worker + self.torch_rng = set_seed(self.random_seed) + self._init_distributed_strategy() + self._setup_metrics() + + self.create_dataloaders( + train_dataset=train_dataset, + validation_dataset=validation_dataset, + test_dataset=test_dataset ) + self.create_model_loss_optimizer() - if self.strategy == StrategyT.DDP.value and self.cluster.distributed: - sampler = DistributedSampler( - dataset=dataset, - num_replicas=self.cluster.global_world_size, - rank=self.cluster.global_rank, - shuffle=init_kwargs.get( - 'shuffle', False - ) - ) - # Overwrite existing sampler, if given. - # TODO: improve using wrapper: - # https://discuss.pytorch.org/t/how-to-use-my-own-sampler-when-i-already-use-distributedsampler/62143?page=2 - init_kwargs['sampler'] = sampler - if init_kwargs.get('shuffle') is not None: - # sampler option is mutually exclusive with shuffle - del init_kwargs['shuffle'] + if self.strategy.is_main_worker: + self.logger.create_logger_context() - return dataloader_class(dataset, **init_kwargs) + self.train() - def _setup_metrics(self): - for m_name, metric in self.train_metrics.items(): - self.train_metrics[m_name] = metric.to(self.device) - for m_name, metric in self.validation_metrics.items(): - self.validation_metrics[m_name] = metric.to(self.device) - - def _distribute_model(self): - if self.cluster.distributed: - # Distribute model - self.model = self.model.to(self.device) - if self.strategy == StrategyT.NONE.value: - print( - "WARNING: A GPU cluster is available but no distributed " - "strategy was given... Falling back to single worker...") - if not self.cluster.is_main_worker(): - # Use only GPU:0 for single worker - sys.exit(0) - elif self.strategy == StrategyT.DDP.value: - self.model = DDP( - self.model, - device_ids=[self.device.index], - output_device=self.device - ) - else: - raise NotImplementedError("Only DDP strategy is implemented.") - else: - raise RuntimeError( - "Trying to distribute a model when a " - "distributed cluster is not available." - ) + if self.strategy.is_main_worker: + self.logger.destroy_logger_context() + self.strategy.clean_up() + return train_dataset, validation_dataset, test_dataset, self.model - def setup_logger(self): - if self.cluster.is_main_worker(): - # Only setup loggers on main worker - if isinstance(self.logger, list): - for logger in self.logger: - logger.create_logger_context() - elif isinstance(self.logger, Logger): - self.logger.create_logger_context() - else: - raise TypeError( - "Unrecognized self.logger. Allowed types are 'list' and " - f"'Logger'. Received {type(self.logger)}" - ) - else: - self.logger = [] - - def destroy_logger(self): - if self.cluster.is_main_worker(): - if isinstance(self.logger, list): - for logger in self.logger: - logger.destroy_logger_context() - elif isinstance(self.logger, Logger): - self.logger.destroy_logger_context() - else: - raise TypeError( - "Unrecognized self.logger. Allowed types are 'list' and " - f"'Logger'. Received {type(self.logger)}" - ) + def _set_epoch_dataloaders(self, epoch: int): + """ + Sets epoch in the distributed sampler of a dataloader when using it. 
+ """ + if self.strategy.is_distributed: + self.train_dataloader.sampler.set_epoch(epoch) + if self.validation_dataloader is not None: + self.validation_dataloader.sampler.set_epoch(epoch) + if self.test_dataloader is not None: + self.test_dataloader.sampler.set_epoch(epoch) def log( self, @@ -513,39 +335,44 @@ def log( kind: str = 'metric', step: Optional[int] = None, batch_idx: Optional[int] = None, - every_worker: bool = False, **kwargs ) -> None: - if self.cluster.is_main_worker() or every_worker: - # Only log on main worker if not specified otherwise - if isinstance(self.logger, list): - for logger in self.logger: - logger.log( - item=item, - identifier=identifier, - kind=kind, - step=step, - batch_idx=batch_idx, - **kwargs - ) - elif isinstance(self.logger, Logger): - self.logger.log( - item=item, - identifier=identifier, - kind=kind, - step=step, - batch_idx=batch_idx, - **kwargs - ) - else: - raise TypeError( - "Unrecognized self.logger. Allowed types are 'list' and " - f"'Logger'. Received {type(self.logger)}" - ) + if self.logger and ( + self.strategy.is_main_worker or self.log_all_workers): + self.logger.log( + item=item, + identifier=identifier, + kind=kind, + step=step, + batch_idx=batch_idx, + **kwargs + ) + + def train(self): + """Trains a machine learning model. + Main training loop/logic. + + Args: + train_dataset (Dataset): training dataset. + validation_dataset (Dataset): validation dataset. + test_dataset (Dataset): test dataset. + + Returns: + Tuple[Dataset, Dataset, Dataset, Any]: training dataset, + validation dataset, test dataset, trained model. + """ + # start_time = time.perf_counter() + for epoch in range(self.epochs): + epoch_n = epoch + 1 + self._set_epoch_dataloaders(epoch) + self.train_epoch() + if self.validation_every and self.validation_every % epoch_n == 0: + self.validation_epoch() + if self.test_every and self.test_every % epoch_n == 0: + self.test_epoch() def compute_metrics( self, - metrics: Dict[str, Metric], true: Batch, pred: Batch, logger_step: int, @@ -566,7 +393,7 @@ def compute_metrics( Dict[str, Any]: metric values. 
""" m_values = {} - for m_name, metric in metrics.items(): + for m_name, metric in self.metrics.items(): # metric = metric.to(self.device) m_val = metric(pred, true).detach().cpu().numpy() self.log( @@ -596,7 +423,6 @@ def training_step( batch_idx=batch_idx ) metrics: Dict[str, Any] = self.compute_metrics( - metrics=self.train_metrics, true=y, pred=pred_y, logger_step=self.train_glob_step, @@ -612,8 +438,9 @@ def validation_step( ) -> Tuple[Loss, Dict[str, Any]]: x, y = batch x, y = x.to(self.device), y.to(self.device) - pred_y = self.model(x) - loss: Loss = self.loss(pred_y, y) + with torch.no_grad(): + pred_y = self.model(x) + loss: Loss = self.loss(pred_y, y) self.log( item=loss.item(), identifier='validation_loss', @@ -622,7 +449,6 @@ def validation_step( batch_idx=batch_idx ) metrics: Dict[str, Any] = self.compute_metrics( - metrics=self.validation_metrics, true=y, pred=pred_y, logger_step=self.validation_glob_step, @@ -631,7 +457,7 @@ def validation_step( ) return loss, metrics - def training_epoch(self) -> Loss: + def train_epoch(self) -> Loss: self.model.train() train_losses = [] for batch_idx, train_batch in enumerate(self.train_dataloader): @@ -684,264 +510,130 @@ def validation_epoch(self) -> Loss: ) return avg_loss - def train(self): + def test_epoch(self): + # TODO: implement test epoch + raise NotImplementedError() - if self.optimizer is None: - raise ValueError("Undefined optimizer!") - - if self.loss is None: - raise ValueError("Undefined loss function!") - - st = time.time() - - # Resume state - self.start_epoch = 1 - self.best_loss = np.Inf - self.load_state() - - # start training/testing loop - if self.cluster.is_main_worker(): - print(f'TIMER: broadcast: {time.time()-st}s') - print('DEBUG: start training') - print('-'*56) - - ############################## - # Start training: run epochs # - ############################## - - et = time.time() - for self.epoch_idx in range(self.start_epoch, self.epochs + 1): - lt = time.time() - - ####################################################### - # Perform one training epoch and one validation epoch # - ####################################################### - - if self.benchrun and self.epoch_idx == self.epochs: - # TODO: move profiler into cluster environment - # profiling (done on last epoch - slower!) 
- with torch.autograd.profiler.profile( - use_cuda=self.cluster.is_cuda_available(), - profile_memory=True - ) as prof: - train_loss = self.training_epoch() - else: - train_loss = self.training_epoch() - val_loss = self.validation_epoch() - - ##################################### - # Save checkpoint if model improved # - ##################################### - - ref_loss = val_loss if val_loss is not None else train_loss - is_best = ref_loss < self.best_loss - if (self.epoch_idx % self.checkpoint_every == 0 - and not self.benchrun): - self.save_state( - loss_val=ref_loss, - is_best=is_best - ) - self.best_loss = min(ref_loss, self.best_loss) - - ########################### - # End of epoch operations # - ########################### - - # save first epoch timer - if self.epoch_idx == self.start_epoch: - first_ep_t = time.time()-lt - - # Final epoch - if self.epoch_idx + 1 == self.epochs: - self.train_dataloader.last_epoch = True - self.validation_dataloader.last_epoch = True - - if self.cluster.is_main_worker(): - print(f'TIMER: epoch time: {time.time()-lt}s') - if self.benchrun and self.epoch_idx == self.epochs: - print('-'*56) - print('benchmark of last epoch:') - what1 = ( - 'cuda' if self.cluster.is_cuda_available() else 'cpu' - ) - print( - prof.key_averages().table( - sort_by='self_'+str(what1)+'_time_total' - ) - ) - - ########################## - # Training has completed # - ########################## - - # save final state - if not self.benchrun: - self.save_state( - loss_val=ref_loss, - is_best=is_best - ) - if self.cluster.is_cuda_available() and self.cluster.distributed: - dist.barrier() - - ######################## - # Print training stats # - ######################## - - if self.cluster.is_main_worker(): - print('-'*56) - print('training results:') - print(f'TIMER: first epoch time: {first_ep_t}s') - print(f'TIMER: last epoch time: {time.time()-lt}s') - print( - f'TIMER: average epoch time: {(time.time()-et)/self.epochs}s') - print(f'TIMER: total epoch time: {time.time()-et}s') - if self.epoch_idx > 1: - print( - f'TIMER: total epoch-1 time: {time.time()-et-first_ep_t}s' - ) - print( - 'TIMER: average epoch-1 time: ' - f'{(time.time()-et-first_ep_t)/(self.epochs-1)}s') - if self.benchrun: - print( - f'TIMER: total epoch-2 time: {lt-first_ep_t}s') - print('TIMER: average epoch-2 time: ' - f'{(lt-first_ep_t)/(self.epochs-2)}s') - mem = int(torch.cuda.memory_reserved( - self.cluster.local_rank)/1024/1024) - print( - f'memory req: {mem} MB' - if self.cluster.is_cuda_available() - and self.cluster.distributed else 'memory req: - MB' - ) - if self.cluster.is_cuda_available(): - print( - f'memory summary:\n {torch.cuda.memory_summary(0)}') - - if self.cluster.is_main_worker(): - print(f'TIMER: final time: {time.time()-st} s') - - def save_state(self, loss_val: Any, is_best: bool): - """Save training state.""" - res_name = 'checkpoint.pth.tar' - rt = time.time() - - if (self.cluster.is_cuda_available() and self.cluster.distributed): - # find if is_best happened in any worker - is_best_m = par_allgather_obj( - is_best, self.cluster.global_world_size - ) - if any(is_best_m): - # TODO: is this strategy really good? Checkpointing when - # at least one worker improves the loss on their local - # data split is prone to overfitting, especially when - # the dataset in unbalanced! 
- - # find which rank is_best happened - select first rank - # if multiple - best_rank = np.where(np.array(is_best_m))[0][0] - if self.cluster.global_rank == best_rank: - self._save_sate( - epoch=self.epoch_idx+1, - loss_val=loss_val, - save_path=res_name - ) - print( - f'DEBUG: state in {self.cluster.global_rank} is ' - f'saved on epoch:{self.epoch_idx} ' - f'in {time.time()-rt} s') - else: - self._save_sate( - epoch=self.epoch_idx+1, - loss_val=loss_val, - save_path=res_name - ) - print( - f'DEBUG: state in {self.cluster.global_rank} ' - f'is saved on epoch:{self.epoch_idx} in {time.time()-rt} s') - def _save_sate( +class TorchLightningTrainer(Trainer): + """Generic trainer for torch Lightning workflows. + + Args: + config (Union[Dict, str]): (path to a) Lightning configuration + https://pytorch-lightning.readthedocs.io/en/1.6.5/common/lightning_cli.html + mlflow_saved_model (str, optional): name of the model created in + MLFlow. Defaults to 'my_model'. + """ + + def __init__( self, - epoch: int, - loss_val: Any, - save_path: str + config: Union[Dict, str], + mlflow_saved_model: str = 'my_model' ): - """Save state on disk.""" - sched = ( - self.lr_scheduler.state_dict() - if self.lr_scheduler is not None else None + self.save_parameters(**self.locals2params(locals())) + super().__init__() + if isinstance(config, str) and os.path.isfile(config): + # Load from YAML + config = load_yaml(config) + self.conf = config + self.mlflow_saved_model = mlflow_saved_model + + @monitor_exec + def execute(self) -> Any: + init_lightning_mlflow( + self.conf, + tmp_dir='/tmp', + registered_model_name=self.mlflow_saved_model ) - state = { - 'epoch': epoch, - 'state_dict': self.model.state_dict(), - 'best_loss': loss_val, - 'optimizer': self.optimizer.state_dict(), - 'lr_scheduler': sched - } - self.log( - item=state, - identifier=save_path, - kind='torch', - epoch_step=self.epoch_idx, - batch_step=0 + old_argv = sys.argv + sys.argv = ['some_script_placeholder.py'] + cli = LightningCLI( + args=self.conf, + model_class=L.LightningModule, + datamodule_class=L.LightningDataModule, + run=False, + save_config_kwargs={ + "overwrite": True, + "config_filename": "pl-training.yml", + }, + subclass_mode_model=True, + subclass_mode_data=True, ) + sys.argv = old_argv + cli.trainer.fit(cli.model, datamodule=cli.datamodule) + teardown_lightning_mlflow() - def load_state(self): - """Load training state.""" - res_name = 'checkpoint.pth.tar' - if os.path.isfile(res_name) and not self.benchrun: - try: - if (self.cluster.is_cuda_available() - and self.cluster.distributed): - dist.barrier() - # Map model to be loaded to specified single gpu. 
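A hypothetical use of the new `TorchLightningTrainer`; the YAML path and the registered model name are placeholders, and the file is expected to follow the LightningCLI schema referenced in the docstring above:

```python
from itwinai.torch.trainer import TorchLightningTrainer

# The config can be a path to a YAML file or an equivalent dict already
# loaded in memory (trainer/model/data sections, LightningCLI style).
trainer = TorchLightningTrainer(
    config='lightning-config.yml',
    mlflow_saved_model='mnist-classifier',
)
trainer.execute()
```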
- # loc = ( - # {'cuda:%d' % 0: 'cuda:%d' % self.cluster.local_rank} - # if self.cluster.is_cuda_available() - # else {'cpu:%d' % 0: 'cpu:%d' % self.cluster.local_rank} - # ) - # checkpoint = torch.load(res_name, map_location=loc) - checkpoint = torch.load( - res_name, map_location=self.device - ) - else: - checkpoint = torch.load(res_name, map_location='cpu') - self.start_epoch = checkpoint['epoch'] - self.best_loss = checkpoint['best_loss'] - self.model.load_state_dict(checkpoint['state_dict']) - self.optimizer.load_state_dict(checkpoint['optimizer']) - if self.lr_scheduler is not None: - self.lr_scheduler.load_state_dict( - checkpoint['lr_scheduler'] - ) - if self.cluster.is_cuda_available(): - if self.cluster.is_main_worker(): - print( - f'WARNING: restarting from {self.start_epoch} ' - 'epoch') - else: - print( - f'WARNING: restarting from {self.start_epoch} epoch') - except Exception: - if self.cluster.is_cuda_available(): - if self.cluster.is_main_worker(): - print( - 'restart file cannot be loaded, restarting!') - else: - print( - 'WARNING: restart file cannot be loaded, restarting!') - - if self.start_epoch >= self.epochs + 1: - if self.cluster.is_cuda_available() and self.cluster.distributed: - if self.cluster.is_main_worker(): - print( - 'WARNING: given epochs are less than the ' - 'one in the restart file!') - print('WARNING: SYS.EXIT is issued') - sys.exit() - else: - print( - 'WARNING: given epochs are less than the ' - 'one in the restart file!') - print('WARNING: SYS.EXIT is issued') - sys.exit() + +def preproc_dataloader(dataloader: DataLoader, gwsize, grank): + """Makes a Dataloader distributed.""" + sampler = DistributedSampler( + dataloader.dataset, + num_replicas=gwsize, + rank=grank, + shuffle=True + ) + # Recreate dataloader, with updated sampler + return DataLoader( + dataloader.dataset, + batch_size=dataloader.batch_size, + sampler=sampler, + num_workers=dataloader.num_workers, + collate_fn=dataloader.collate_fn, + pin_memory=dataloader.pin_memory, + drop_last=dataloader.drop_last, + timeout=dataloader.timeout, + worker_init_fn=seed_worker, # dataloader.worker_init_fn, + multiprocessing_context=dataloader.multiprocessing_context, + generator=dataloader.generator, + prefetch_factor=dataloader.prefetch_factor, + persistent_workers=dataloader.persistent_workers, + pin_memory_device=dataloader.pin_memory_device + ) + + +def distributed(func): + """The decorated function must have a standard signature. + Its first arguments must be: + model, train_dataloader, validation_dataloader, device (in this order). + + Additional args or kwargs are allowed consistently with the signature + of the decorated function. 
+ """ + def dist_train( + model, train_dataloader, validation_dataloader=None, device='cpu', + *args, **kwargs + ): + if torch.cuda.is_available(): + dist.init_process_group(backend='nccl') + + if torch.cuda.is_available(): + lwsize = torch.cuda.device_count() # local world size - per node + gwsize = dist.get_world_size() # global world size - per run + grank = dist.get_rank() # global rank - assign per run + lrank = dist.get_rank() % lwsize # local rank - assign per node + else: + gwsize = 1 + grank = 0 + lrank = 0 + + device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu', lrank) + if torch.cuda.is_available(): + torch.cuda.set_device(lrank) + + model = model.to(device) + model = DDP(model, device_ids=[device], output_device=device) + + train_dataloader = preproc_dataloader(train_dataloader, gwsize, grank) + if validation_dataloader is not None: + validation_dataloader = preproc_dataloader( + validation_dataloader, gwsize, grank) + + try: + func(model, train_dataloader, validation_dataloader, device, + *args, **kwargs) + finally: + if torch.cuda.is_available(): + dist.barrier() + dist.destroy_process_group() + return dist_train diff --git a/src/itwinai/torch/types.py b/src/itwinai/torch/types.py index 614462ad..0b6f88ad 100644 --- a/src/itwinai/torch/types.py +++ b/src/itwinai/torch/types.py @@ -64,3 +64,11 @@ class TorchOptimizer(BaseEnum): """ SGD = 'SGD' ADAM = 'Adam' + + +class UninitializedStrategyError(Exception): + """Error raised when a strategy has not been initialized.""" + + +class DistributedStrategyError(Exception): + """Error raised when a strategy has already been initialized.""" diff --git a/src/itwinai/torch/utils.py b/src/itwinai/torch/utils.py deleted file mode 100644 index 99bcd246..00000000 --- a/src/itwinai/torch/utils.py +++ /dev/null @@ -1,84 +0,0 @@ -from typing import Hashable, Dict -import time -import numpy as np -import random - -import torch -import torch.distributed as dist - - -def save_state( - epoch, distrib_model, loss_val, optimizer, res_name, grank, gwsize, - is_best, distributed: bool = True -): - """Save training state""" - rt = time.time() - # find if is_best happened in any worker - if torch.cuda.is_available() and distributed: - is_best_m = par_allgather_obj(is_best, gwsize) - - if torch.cuda.is_available() and distributed: - if any(is_best_m): - # find which rank is_best happened - select first rank if multiple - is_best_rank = np.where(np.array(is_best_m))[0][0] - - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_loss': loss_val, - 'optimizer': optimizer.state_dict()} - - # write on worker with is_best - if grank == is_best_rank: - torch.save(state, './'+res_name) - print(f'DEBUG: state in {grank} is saved on ' - f'epoch:{epoch} in {time.time()-rt} s') - else: - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_loss': loss_val, - 'optimizer': optimizer.state_dict()} - - torch.save(state, './'+res_name) - print( - f'DEBUG: state in {grank} is saved on epoch:{epoch} ' - f'in {time.time()-rt} s') - - -def seed_worker(worker_id): - """deterministic dataloader""" - worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) - random.seed(worker_seed) - - -def par_allgather_obj(obj, gwsize): - """gathers any object from the whole group in a list (to all workers)""" - res = [None]*gwsize - dist.all_gather_object(res, obj, group=None) - # print(f'ALLGATHER: {res}') - return res - - -def clear_key( - my_dict: Dict, - 
dict_name: str, - key: Hashable, - complain: bool = True -) -> Dict: - """Remove key from dictionary if present and complain. - - Args: - my_dict (Dict): Dictionary. - dict_name (str): name of the dictionary. - key (Hashable): Key to remove. - """ - if key in my_dict: - if complain: - print( - f"Field '{key}' should not be present " - f"in dictionary '{dict_name}'" - ) - del my_dict[key] - return my_dict diff --git a/src/itwinai/utils.py b/src/itwinai/utils.py index 52279aeb..280de5d3 100644 --- a/src/itwinai/utils.py +++ b/src/itwinai/utils.py @@ -1,14 +1,11 @@ """ Utilities for itwinai package. """ -from typing import Dict, Type, Callable, Tuple -import os +from typing import Dict, Type, Callable, Tuple, Hashable import sys import inspect from collections.abc import MutableMapping import yaml -from omegaconf import OmegaConf -from omegaconf.dictconfig import DictConfig def load_yaml(path: str) -> Dict: @@ -32,32 +29,6 @@ def load_yaml(path: str) -> Dict: return loaded_config -def load_yaml_with_deps(path: str) -> DictConfig: - """ - Load YAML file with OmegaConf and merge it with its dependencies - specified in the `conf-dependencies` field. - Assume that the dependencies live in the same folder of the - YAML file which is importing them. - - Args: - path (str): path to YAML file. - - Raises: - exc: yaml.YAMLError for loading/parsing errors. - - Returns: - DictConfig: nested representation of parsed YAML file. - """ - yaml_conf = load_yaml(path) - use_case_dir = os.path.dirname(path) - deps = [] - if yaml_conf.get("conf-dependencies"): - for dependency in yaml_conf["conf-dependencies"]: - deps.append(load_yaml(os.path.join(use_case_dir, dependency))) - - return OmegaConf.merge(yaml_conf, *deps) - - def dynamically_import_class(name: str) -> Type: """ Dynamically import class by module path. @@ -115,18 +86,6 @@ def flatten_dict( return dict(items) -# Parse (part of) YAML loaded in memory -def parse_pipe_config(yaml_file, parser): - with open(yaml_file, "r", encoding="utf-8") as f: - try: - config = yaml.safe_load(f) - except yaml.YAMLError as exc: - print(exc) - raise exc - - return parser.parse_object(config) - - class SignatureInspector: """Provides the functionalities to inspect the signature of a function or a method. @@ -181,3 +140,42 @@ def max_params_num(self) -> int: if self.has_kwargs or self.has_varargs: return self.INFTY return len(self.func_params) + + +def str_to_slice(interval: str) -> slice: + import re + # TODO: add support for slices starting with empty index + # e.g., :20:3 + if not re.match(r"\d+(:\d+)?(:\d+)?", interval): + raise ValueError( + f"Received invalid interval for slice: '{interval}'" + ) + if ":" in interval: + return slice(*map( + lambda x: int(x.strip()) if x.strip() else None, + interval.split(':') + )) + return int(interval) + + +def clear_key( + my_dict: Dict, + dict_name: str, + key: Hashable, + complain: bool = True +) -> Dict: + """Remove key from dictionary if present and complain. + + Args: + my_dict (Dict): Dictionary. + dict_name (str): name of the dictionary. + key (Hashable): Key to remove. + """ + if key in my_dict: + if complain: + print( + f"Field '{key}' should not be present " + f"in dictionary '{dict_name}'" + ) + del my_dict[key] + return my_dict diff --git a/tests/components/test_components.py b/tests/components/test_components.py index 3ec55453..890188d7 100644 --- a/tests/components/test_components.py +++ b/tests/components/test_components.py @@ -74,11 +74,6 @@ class MyTrainer(Trainer): def execute(self): ... 
- def save_state(self): - ... - - def load_state(self): - ... comp = MyTrainer() with pytest.raises(SerializationError) as exc_info: dict_serializ = comp.to_dict() diff --git a/tests/test_cli.py b/tests/test_cli.py deleted file mode 100644 index 26b57cb0..00000000 --- a/tests/test_cli.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Test itwinai CLI. -""" - -import subprocess -import pytest - - -@pytest.mark.skip(reason="cli deprecated") -def test_datasets_viz(): - """ - Test visualization of use case's dataset registry. - """ - USE_CASE = "use-cases/mnist/" - subprocess.run( - f"itwinai datasets --use-case {USE_CASE}".split(), check=True) - - -@pytest.mark.skip(reason="cli deprecated") -def test_workflows_viz(): - """ - Test visualization of use case's workflows. - """ - USE_CASE = "./use-cases/mnist/" - subprocess.run( - f"itwinai workflows --use-case {USE_CASE}".split(), check=True) diff --git a/tests/use-cases/conftest.py b/tests/use-cases/conftest.py index d080e0a8..69229db6 100644 --- a/tests/use-cases/conftest.py +++ b/tests/use-cases/conftest.py @@ -2,9 +2,9 @@ from typing import Callable import pytest import subprocess +import random +import string -pytest.TORCH_PREFIX = './.venv-pytorch' -pytest.TF_PREFIX = './.venv-tf' FNAMES = [ 'pipeline.yaml', @@ -12,6 +12,52 @@ ] +def rnd_string(len: int = 26): + return ''.join(random.sample(string.ascii_lowercase, len)) + + +@pytest.fixture +def tmp_test_dir(): + root = '/tmp/pytest' + os.makedirs(root, exist_ok=True) + test_dir = os.path.join(root, rnd_string()) + while os.path.exists(test_dir): + test_dir = os.path.join(root, rnd_string()) + os.makedirs(test_dir, exist_ok=True) + + yield test_dir + + # Optional: remove dir here... + + +@pytest.fixture +def torch_env() -> str: + """ + Return absolute path to torch virtual environment parsing it + from environment variables, if provided, otherwise fall back + to ``./.venv-pytorch``. + """ + if os.environ.get('TORCH_ENV') is None: + env_p = './.venv-pytorch' + else: + env_p = os.environ.get('TORCH_ENV') + return os.path.abspath(env_p) + + +@pytest.fixture +def tf_env() -> str: + """ + Return absolute path to tensorflow virtual environment parsing it + from environment variables, if provided, otherwise fall back + to ``./.venv-tf``. + """ + if os.environ.get('TF_ENV') is None: + env_p = './.venv-tf' + else: + env_p = os.environ.get('TF_ENV') + return os.path.abspath(env_p) + + @pytest.fixture def check_folder_structure() -> Callable: """ @@ -31,7 +77,6 @@ def install_requirements() -> Callable: def _install_reqs(root: str, env_prefix: str): req_path = os.path.join(root, 'requirements.txt') if os.path.isfile(req_path): - cmd = (f"micromamba run -p {env_prefix} " - f"pip install -r {req_path}") + cmd = f"{env_prefix}/bin/pip install -r {req_path}" subprocess.run(cmd.split(), check=True) return _install_reqs diff --git a/tests/use-cases/test_3dgan.py b/tests/use-cases/test_3dgan.py index c57e21ff..7f4503e4 100644 --- a/tests/use-cases/test_3dgan.py +++ b/tests/use-cases/test_3dgan.py @@ -3,73 +3,63 @@ """ import pytest import subprocess -# from itwinai.utils import dynamically_import_class +import os CERN_PATH = "use-cases/3dgan" -CKPT_PATH = "3dgan-inference.pth" - - -@pytest.fixture(scope="module") -def fake_model_checkpoint() -> None: - """ - Create a dummy model checkpoint for inference. 
- """ - import sys - import torch - sys.path.append(CERN_PATH) - from model import ThreeDGAN - # ThreeDGAN = dynamically_import_class('model.ThreeDGAN') - net = ThreeDGAN() - torch.save(net, CKPT_PATH) +CKPT_NAME = "3dgan-inference.pth" +@pytest.mark.skip("deprecated") def test_structure_3dgan(check_folder_structure): """Test 3DGAN folder structure.""" check_folder_structure(CERN_PATH) @pytest.mark.functional -def test_3dgan_train(install_requirements): +def test_3dgan_train(torch_env, tmp_test_dir, install_requirements): """ Test 3DGAN torch lightning trainer by running it end-to-end. """ - install_requirements(CERN_PATH, pytest.TORCH_PREFIX) - # cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} python " - # f"{CERN_PATH}/train.py -p {CERN_PATH}/pipeline.yaml") - trainer_params = "pipeline.init_args.steps.training_step.init_args" - cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} itwinai exec-pipeline " - f"--config {CERN_PATH}/pipeline.yaml " - f'-o {trainer_params}.config.trainer.accelerator=cpu ' - f'-o {trainer_params}.config.trainer.strategy=auto ' + install_requirements(CERN_PATH, torch_env) + conf = os.path.join(os.path.abspath(CERN_PATH), 'config.yaml') + cmd = (f"{torch_env}/bin/itwinai exec-pipeline " + f"--config {conf} --pipe-key training_pipeline " + '-o hw_accelerators=auto ' + '-o distributed_strategy=auto ' ) - subprocess.run(cmd.split(), check=True) + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) @pytest.mark.functional -def test_3dgan_inference(install_requirements, fake_model_checkpoint): +def test_3dgan_inference( + torch_env, + tmp_test_dir, + install_requirements, + # fake_model_checkpoint +): """ Test 3DGAN torch lightning trainer by running it end-to-end. """ - install_requirements(CERN_PATH, pytest.TORCH_PREFIX) - # cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} python " - # f"{CERN_PATH}/train.py -p {CERN_PATH}/pipeline.yaml") - # cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} itwinai exec-pipeline " - # f"--config {CERN_PATH}/inference-pipeline.yaml") + install_requirements(CERN_PATH, torch_env) + + # Create fake inference dataset and checkpoint + exec = os.path.join(os.path.abspath(CERN_PATH), + 'create_inference_sample.py') + cmd = (f"{torch_env}/bin/python {exec} " + f"--root {tmp_test_dir} " + f"--ckpt-name {CKPT_NAME}") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) - getter_params = "pipeline.init_args.steps.dataloading_step.init_args" - trainer_params = "pipeline.init_args.steps.inference_step.init_args" - logger_params = trainer_params + ".config.trainer.logger.init_args" - data_params = trainer_params + ".config.data.init_args" - saver_params = "pipeline.init_args.steps.saver_step.init_args" + # Test inference + conf = os.path.join(os.path.abspath(CERN_PATH), 'config.yaml') cmd = ( - 'itwinai exec-pipeline ' - '--config use-cases/3dgan/inference-pipeline.yaml ' - f'-o {getter_params}.data_path=exp_data ' - f'-o {trainer_params}.model.init_args.model_uri={CKPT_PATH} ' - f'-o {trainer_params}.config.trainer.accelerator=cpu ' - f'-o {trainer_params}.config.trainer.strategy=auto ' - f'-o {logger_params}.save_dir=ml_logs/mlflow_logs ' - f'-o {data_params}.datapath=exp_data/*/*.h5 ' - f'-o {saver_params}.save_dir=3dgan-generated-data ' + f'{torch_env}/bin/itwinai exec-pipeline ' + f'--config {conf} --pipe-key inference_pipeline ' + '-o dataset_location=exp_data ' + f'-o inference_model_uri={CKPT_NAME} ' + '-o hw_accelerators=auto ' + '-o distributed_strategy=auto ' + '-o logs_dir=ml_logs/mlflow_logs ' + '-o 
inference_results_location=3dgan-generated-data ' ) - subprocess.run(cmd.split(), check=True) + subprocess.run(cmd.split(), check=True, cwd=CERN_PATH) diff --git a/tests/use-cases/test_cyclones.py b/tests/use-cases/test_cyclones.py index 1a5ebb3f..d6a1ea2c 100644 --- a/tests/use-cases/test_cyclones.py +++ b/tests/use-cases/test_cyclones.py @@ -7,10 +7,12 @@ import pytest import subprocess +import os CYCLONES_PATH = "use-cases/cyclones" +@pytest.mark.skip("deprecated") def test_structure_cyclones(check_folder_structure): """Test cyclones folder structure.""" check_folder_structure(CYCLONES_PATH) @@ -18,11 +20,14 @@ def test_structure_cyclones(check_folder_structure): @pytest.mark.functional @pytest.mark.memory_heavy -def test_cyclones_train_tf(install_requirements): +def test_cyclones_train_tf(tf_env, tmp_test_dir, install_requirements): """ Test Cyclones tensorflow trainer by running it end-to-end. """ - install_requirements(CYCLONES_PATH, pytest.TF_PREFIX) - cmd = (f"micromamba run -p {pytest.TF_PREFIX} python " - f"{CYCLONES_PATH}/train.py -p {CYCLONES_PATH}/pipeline.yaml") - subprocess.run(cmd.split(), check=True) + # TODO: create a small sample dataset for tests only + install_requirements(CYCLONES_PATH, tf_env) + pipe = os.path.join(os.path.abspath(CYCLONES_PATH), 'pipeline.yaml') + train = os.path.join(os.path.abspath(CYCLONES_PATH), 'train.py') + cmd = (f"{tf_env}/bin/python {train} " + f"-p {pipe}") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) diff --git a/tests/use-cases/test_mnist.py b/tests/use-cases/test_mnist.py index d32aab1c..1f18a8e6 100644 --- a/tests/use-cases/test_mnist.py +++ b/tests/use-cases/test_mnist.py @@ -7,72 +7,100 @@ import pytest import subprocess +import os +# from itwinai.cli import exec_pipeline TORCH_PATH = "use-cases/mnist/torch" LIGHTNING_PATH = "use-cases/mnist/torch-lightning" TF_PATH = "use-cases/mnist/tensorflow" +@pytest.mark.skip(reason="structure changed") def test_structure_mnist_torch(check_folder_structure): """Test MNIST folder structure for torch native trainer.""" check_folder_structure(TORCH_PATH) +@pytest.mark.skip(reason="structure changed") def test_structure_mnist_lightning(check_folder_structure): """Test MNIST folder structure for torch lightning trainer.""" check_folder_structure(LIGHTNING_PATH) +@pytest.mark.skip(reason="structure changed") def test_structure_mnist_tf(check_folder_structure): """Test MNIST folder structure for tensorflow trainer.""" check_folder_structure(TF_PATH) @pytest.mark.functional -def test_mnist_train_torch(install_requirements): +def test_mnist_train_torch(torch_env, tmp_test_dir, install_requirements): """ Test MNIST torch native trainer by running it end-to-end. + + To set the torch env path set the ``TORCH_ENV`` env variable: + + >>> export TORCH_ENV="my_env" """ - install_requirements(TORCH_PATH, pytest.TORCH_PREFIX) - cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} python " - f"{TORCH_PATH}/train.py -p {TORCH_PATH}/pipeline.yaml") - subprocess.run(cmd.split(), check=True) + install_requirements(TORCH_PATH, torch_env) + conf = os.path.join(os.path.abspath(TORCH_PATH), 'config.yaml') + cmd = (f"{torch_env}/bin/itwinai exec-pipeline " + f"--config {conf} --pipe-key training_pipeline") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) @pytest.mark.functional -def test_mnist_train_lightning(install_requirements): +def test_mnist_inference_torch(torch_env, tmp_test_dir, install_requirements): """ - Test MNIST torch lightning trainer by running it end-to-end. 
+ Test MNIST torch native inference by running it end-to-end. + + To set the torch env path set the ``TORCH_ENV`` env variable: + + >>> export TORCH_ENV="my_env" """ - install_requirements(TORCH_PATH, pytest.TORCH_PREFIX) - cmd = (f"micromamba run -p {pytest.TORCH_PREFIX} python " - f"{LIGHTNING_PATH}/train.py -p {LIGHTNING_PATH}/pipeline.yaml") - subprocess.run(cmd.split(), check=True) + install_requirements(TORCH_PATH, torch_env) + + # Create fake inference dataset and checkpoint + exec = os.path.join(os.path.abspath(TORCH_PATH), + 'create_inference_sample.py') + cmd = (f"{torch_env}/bin/python {exec} " + f"--root {tmp_test_dir}") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) + + # Test inference + conf = os.path.join(os.path.abspath(TORCH_PATH), 'config.yaml') + cmd = (f"{torch_env}/bin/itwinai exec-pipeline " + f"--config {conf} --pipe-key inference_pipeline") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) @pytest.mark.functional -def test_mnist_train_tf(install_requirements): +def test_mnist_train_torch_lightning( + torch_env, + tmp_test_dir, + install_requirements +): """ - Test MNIST tensorflow trainer by running it end-to-end. + Test MNIST torch lightning trainer by running it end-to-end. + + To set the torch env path set the ``TORCH_ENV`` env variable: + + >>> export TORCH_ENV="my_env" """ - install_requirements(TF_PATH, pytest.TF_PREFIX) - cmd = (f"micromamba run -p {pytest.TF_PREFIX} python " - f"{TF_PATH}/train.py -p {TF_PATH}/pipeline.yaml") - subprocess.run(cmd.split(), check=True) + install_requirements(LIGHTNING_PATH, torch_env) + conf = os.path.join(os.path.abspath(LIGHTNING_PATH), 'config.yaml') + cmd = (f"{torch_env}/bin/itwinai exec-pipeline " + f"--config {conf} --pipe-key training_pipeline") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) -@pytest.mark.skip(reason="workflow changed. Left as example") -@pytest.mark.integration -def test_mnist_train_legacy(): +@pytest.mark.functional +def test_mnist_train_tf(tf_env, tmp_test_dir, install_requirements): """ - Test MNIST training workflow(s) by running it end-to-end. + Test MNIST tensorflow trainer by running it end-to-end. 
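+
+    To set the tensorflow env path set the ``TF_ENV`` env variable:
+
+    >>> export TF_ENV="my_env"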
""" - workflows = [ - "./use-cases/mnist/torch/workflows/training-workflow.yml", - "./use-cases/mnist/tensorflow/workflows/training-workflow.yml", - ] - - for workflow in workflows: - cmd = f"micromamba run -p ./.venv python run-workflow.py -f {workflow}" - subprocess.run(cmd.split(), check=True) - subprocess.run(cmd.split() + ["--cwl"], check=True) + install_requirements(TF_PATH, tf_env) + conf = os.path.join(os.path.abspath(TF_PATH), 'pipeline.yaml') + cmd = (f"{tf_env}/bin/itwinai exec-pipeline " + f"--config {conf} --pipe-key pipeline") + subprocess.run(cmd.split(), check=True, cwd=tmp_test_dir) diff --git a/tutorials/distributed-ml/torch-scaling-test/README.md b/tutorials/distributed-ml/torch-scaling-test/README.md index 74e316c0..1344504e 100644 --- a/tutorials/distributed-ml/torch-scaling-test/README.md +++ b/tutorials/distributed-ml/torch-scaling-test/README.md @@ -38,11 +38,16 @@ setting SLURM environment variables using the `--export` option: ```bash # Launch a distributed training setup with Torch DDP -DIST_MODE="ddp" -RUN_NAME="ddp-bl-imagenent" -TRAINING_CMD="ddp_trainer.py -c config/base.yaml -c config/ddp.yaml" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD" \ - --job-name="$RUN_NAME" slurm.sh +export DIST_MODE="ddp" +export RUN_NAME="ddp-bl-imagenent" +export TRAINING_CMD="ddp_trainer.py -c config/base.yaml -c config/ddp.yaml" +export PYTHON_VENV="../../../envAI_hdfml" +export N=2 # Number of nodes +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + --nodes=$N slurm.sh ``` ## Run all training configurations diff --git a/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py b/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py index 54f64fef..0a25ae5b 100755 --- a/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py +++ b/tutorials/distributed-ml/torch-scaling-test/ddp_trainer.py @@ -18,8 +18,11 @@ from itwinai.parser import ArgumentParser as ItAIArgumentParser from itwinai.loggers import EpochTimeTracker +from itwinai.torch.reproducibility import ( + seed_worker, set_seed +) -from utils import seed_worker, imagenet_dataset, set_seed +from utils import imagenet_dataset def parse_params(): @@ -121,7 +124,7 @@ def main(): dist.init_process_group(backend=args.backend) # Set random seed for reproducibility - torch_prng = set_seed(args.rnd_seed, use_cuda) + torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False) if is_distributed: # get job rank info - rank==0 master gpu diff --git a/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py b/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py index 691712e8..e6022021 100644 --- a/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py +++ b/tutorials/distributed-ml/torch-scaling-test/deepspeed_trainer.py @@ -18,8 +18,11 @@ from itwinai.parser import ArgumentParser as ItAIArgumentParser from itwinai.loggers import EpochTimeTracker +from itwinai.torch.reproducibility import ( + seed_worker, set_seed +) -from utils import seed_worker, set_seed, imagenet_dataset +from utils import imagenet_dataset def parse_params(): @@ -124,7 +127,7 @@ def main(): deepspeed.init_distributed(dist_backend=args.backend) # Set random seed for reproducibility - torch_prng = set_seed(args.rnd_seed, use_cuda) + torch_prng = set_seed(args.rnd_seed, 
deterministic_cudnn=False) if is_distributed: # Get job rank info - rank==0 master gpu @@ -248,7 +251,7 @@ def main(): print('TIMER: epoch time:', timer()-lt, 's') epoch_time_tracker.add_epoch_time(epoch-1, timer()-lt) - if torch.cuda.is_available(): + if is_distributed: dist.barrier() if grank == 0: diff --git a/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py b/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py index 501b545c..a4c3eaa4 100755 --- a/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py +++ b/tutorials/distributed-ml/torch-scaling-test/horovod_trainer.py @@ -19,8 +19,11 @@ from itwinai.parser import ArgumentParser as ItAIArgumentParser from itwinai.loggers import EpochTimeTracker +from itwinai.torch.reproducibility import ( + seed_worker, set_seed +) -from utils import imagenet_dataset, seed_worker, set_seed +from utils import imagenet_dataset def parse_params(): @@ -129,7 +132,7 @@ def main(): hvd.init() # Set random seed for reproducibility - torch_prng = set_seed(args.rnd_seed, use_cuda) + torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False) # is_main_worker = True # if is_distributed and (hvd.rank() != 0 or hvd.local_rank() != 0): diff --git a/tutorials/distributed-ml/torch-scaling-test/img/report.png b/tutorials/distributed-ml/torch-scaling-test/img/report.png index 53bb708a..4e81996e 100644 Binary files a/tutorials/distributed-ml/torch-scaling-test/img/report.png and b/tutorials/distributed-ml/torch-scaling-test/img/report.png differ diff --git a/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py b/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py index a1eacc20..cded83af 100644 --- a/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py +++ b/tutorials/distributed-ml/torch-scaling-test/itwinai_trainer.py @@ -21,14 +21,17 @@ from itwinai.torch.distributed import ( TorchDistributedStrategy, - DDPDistributedStrategy, - HVDDistributedStrategy, - DSDistributedStrategy, + TorchDDPStrategy, + HorovodStrategy, + DeepSpeedStrategy, ) from itwinai.parser import ArgumentParser as ItAIArgumentParser from itwinai.loggers import EpochTimeTracker +from itwinai.torch.reproducibility import ( + seed_worker, set_seed +) -from utils import seed_worker, imagenet_dataset, set_seed +from utils import imagenet_dataset def parse_params() -> argparse.Namespace: @@ -116,8 +119,8 @@ def train( model.train() t_list = [] loss_acc = 0 - gwsize = strategy.dist_gwsize() - if strategy.is_main_worker(): + gwsize = strategy.global_world_size() + if strategy.is_main_worker: print("\n") for batch_idx, (data, target) in enumerate(train_loader): t = timer() @@ -127,7 +130,7 @@ def train( loss = F.nll_loss(output, target) loss.backward() optimizer.step() - if (strategy.is_main_worker() and args.log_int > 0 + if (strategy.is_main_worker and args.log_int > 0 and batch_idx % args.log_int == 0): print( f'Train epoch: {epoch} ' @@ -136,7 +139,7 @@ def train( f'Loss: {loss.item():.6f}') t_list.append(timer() - t) loss_acc += loss.item() - if strategy.is_main_worker(): + if strategy.is_main_worker: print('TIMER: train time', sum(t_list) / len(t_list), 's') return loss_acc @@ -151,10 +154,10 @@ def main(): or not torch.cuda.device_count() > 1): raise RuntimeError('Resources unavailable') - strategy = DDPDistributedStrategy(backend=args.backend) + strategy = TorchDDPStrategy(backend=args.backend) distribute_kwargs = {} elif args.strategy == 'horovod': - strategy = HVDDistributedStrategy() + strategy = HorovodStrategy() 
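+        # Horovod-specific options: gradient compression, reduction op, pre-divide factor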
distribute_kwargs = dict( compression=( hvd.Compression.fp16 if args.fp16_allreduce @@ -164,7 +167,7 @@ def main(): gradient_predivide_factor=args.gradient_predivide_factor ) elif args.strategy == 'deepspeed': - strategy = DSDistributedStrategy(backend=args.backend) + strategy = DeepSpeedStrategy(backend=args.backend) distribute_kwargs = dict( config_params=dict(train_micro_batch_size_per_gpu=args.batch_size) ) @@ -182,19 +185,19 @@ def main(): # Limit # of CPU threads to be used per worker # torch.set_num_threads(1) - # start the timer for profiling + # Start the timer for profiling st = timer() # Set random seed for reproducibility - torch_prng = set_seed(args.rnd_seed, use_cuda) + torch_prng = set_seed(args.rnd_seed, deterministic_cudnn=False) - # get job rank info - rank==0 master gpu + # Get job rank info - rank==0 master gpu if is_distributed: # local world size - per node - lwsize = strategy.dist_lwsize() # local world size - per run - gwsize = strategy.dist_gwsize() # global world size - per run - grank = strategy.dist_grank() # global rank - assign per run - lrank = strategy.dist_lrank() # local rank - assign per node + lwsize = strategy.local_world_size() # local world size - per run + gwsize = strategy.global_world_size() # global world size - per run + grank = strategy.global_rank() # global rank - assign per run + lrank = strategy.local_rank() # local rank - assign per node else: # Use a single worker (either on GPU or CPU) lwsize = 1 @@ -202,7 +205,7 @@ def main(): grank = 0 lrank = 0 - if strategy.is_main_worker(): + if strategy.is_main_worker: print('TIMER: initialise:', timer()-st, 's') print('DEBUG: local ranks:', lwsize, '/ global ranks:', gwsize) print('DEBUG: sys.version:', sys.version) @@ -221,7 +224,7 @@ def main(): # Encapsulate the model on the GPU assigned to the current process device = torch.device( - strategy.dist_device() if use_cuda and torch.cuda.is_available() + strategy.device() if use_cuda else 'cpu') if use_cuda: torch.cuda.set_device(lrank) @@ -263,7 +266,7 @@ def main(): ) # Start training loop - if strategy.is_main_worker(): + if strategy.is_main_worker: print('TIMER: broadcast:', timer()-st, 's') print('\nDEBUG: start training') print('--------------------------------------------------------') @@ -302,11 +305,11 @@ def main(): if epoch + 1 == args.epochs: train_loader.last_epoch = True - if strategy.is_main_worker(): + if strategy.is_main_worker: print('TIMER: epoch time:', timer()-lt, 's') epoch_time_tracker.add_epoch_time(epoch-1, timer()-lt) - if strategy.is_main_worker(): + if strategy.is_main_worker: print('\n--------------------------------------------------------') print('DEBUG: training results:\n') print('TIMER: first epoch time:', first_ep_t, ' s') @@ -327,7 +330,7 @@ def main(): print(f'TIMER: final time: {timer()-st} s\n') time.sleep(1) - print(f" - TRAINING FINISHED") + print(f" - TRAINING FINISHED") # Clean-up if is_distributed: diff --git a/tutorials/distributed-ml/torch-scaling-test/runall.sh b/tutorials/distributed-ml/torch-scaling-test/runall.sh index 4f9efdcf..22958c16 100644 --- a/tutorials/distributed-ml/torch-scaling-test/runall.sh +++ b/tutorials/distributed-ml/torch-scaling-test/runall.sh @@ -15,47 +15,75 @@ else fi # Common options -CMD="--nodes=$N --time=$T --account=atmo-rep --partition=booster slurm.sh" -PYTHON_VENV="../../../envAI_juwels" +CMD="--nodes=$N --time=$T --account=intertwin --partition=batch slurm.sh" +PYTHON_VENV="../../../envAI_hdfml" echo "Distributing training over $N nodes. 
Timeout set to: $T" +# Clear SLURM logs (*.out and *.err files) rm -rf logs_slurm mkdir logs_slurm -rm *.out *.err *.csv #*checkpoint.pth.tar +rm -rf logs_torchrun + +# Clear scaling test logs +rm *.csv # *checkpoint.pth.tar # DDP baseline DIST_MODE="ddp" RUN_NAME="ddp-bl-imagenent" TRAINING_CMD="ddp_trainer.py -c config/base.yaml -c config/ddp.yaml" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD # DeepSpeed baseline DIST_MODE="deepspeed" RUN_NAME="deepspeed-bl-imagenent" TRAINING_CMD="deepspeed_trainer.py -c config/base.yaml -c config/deepspeed.yaml" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD # Horovod baseline DIST_MODE="horovod" RUN_NAME="horovod-bl-imagenent" TRAINING_CMD="horovod_trainer.py -c config/base.yaml -c config/horovod.yaml" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD # DDP itwinai DIST_MODE="ddp" RUN_NAME="ddp-itwinai-imagenent" TRAINING_CMD="itwinai_trainer.py -c config/base.yaml -c config/ddp.yaml -s ddp" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD # DeepSpeed itwinai DIST_MODE="deepspeed" RUN_NAME="deepspeed-itwinai-imagenent" TRAINING_CMD="itwinai_trainer.py -c config/base.yaml -c config/deepspeed.yaml -s deepspeed" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD # Horovod itwinai DIST_MODE="horovod" RUN_NAME="horovod-itwinai-imagenent" TRAINING_CMD="itwinai_trainer.py -c config/base.yaml -c config/horovod.yaml -s horovod" -sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" --job-name="$RUN_NAME-n$N" $CMD \ No newline at end of file +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + 
--output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + $CMD \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-scaling-test/slurm.sh b/tutorials/distributed-ml/torch-scaling-test/slurm.sh index 93dd4349..c53e3da5 100644 --- a/tutorials/distributed-ml/torch-scaling-test/slurm.sh +++ b/tutorials/distributed-ml/torch-scaling-test/slurm.sh @@ -15,7 +15,7 @@ #SBATCH --partition=batch #SBATCH --nodes=2 #SBATCH --gpus-per-node=4 -#SBATCH --cpus-per-gpu=8 +#SBATCH --cpus-per-gpu=4 #SBATCH --exclusive # gres options have to be disabled for deepv @@ -72,13 +72,13 @@ else source $PYTHON_VENV/bin/activate fi +# Get GPUs info per node +srun --cpu-bind=none --ntasks-per-node=1 bash -c 'echo -e "NODE hostname: $(hostname)\n$(nvidia-smi)\n\n"' + # Launch training if [ "$DIST_MODE" == "ddp" ] ; then echo "DDP training: $TRAINING_CMD" srun --cpu-bind=none --ntasks-per-node=1 \ - --job-name="$RUN_NAME-n$SLURM_NNODES" \ - --output="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.out" \ - --error="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.err" \ bash -c "torchrun \ --log_dir='logs_torchrun' \ --nnodes=$SLURM_NNODES \ @@ -95,9 +95,6 @@ elif [ "$DIST_MODE" == "deepspeed" ] ; then export MASTER_PORT=29500 srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ - --job-name="$RUN_NAME-n$SLURM_NNODES" \ - --output="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.out" \ - --error="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.err" \ python -u $TRAINING_CMD --deepspeed # # Run with deepspeed launcher: set --ntasks-per-node=1 @@ -112,9 +109,6 @@ elif [ "$DIST_MODE" == "deepspeed" ] ; then elif [ "$DIST_MODE" == "horovod" ] ; then echo "HOROVOD training: $TRAINING_CMD" srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ - --job-name="$RUN_NAME-imagenet-n$SLURM_NNODES" \ - --output="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.out" \ - --error="logs_slurm/job-$RUN_NAME-n$SLURM_NNODES.err" \ python -u $TRAINING_CMD else >&2 echo "ERROR: unrecognized \$DIST_MODE env variable" diff --git a/tutorials/distributed-ml/torch-scaling-test/utils.py b/tutorials/distributed-ml/torch-scaling-test/utils.py index cbd6aace..a5dc591e 100644 --- a/tutorials/distributed-ml/torch-scaling-test/utils.py +++ b/tutorials/distributed-ml/torch-scaling-test/utils.py @@ -1,40 +1,6 @@ -from typing import Optional -import numpy as np -import random - -import torch from torchvision import datasets, transforms -def seed_worker(worker_id): - worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) - random.seed(worker_seed) - - -def set_seed(rnd_seed: Optional[int], use_cuda: bool) -> torch.Generator: - """Set torch random seed and return a PRNG object. - - Args: - rnd_seed (Optional[int]): random seed. If None, the seed is not set. - use_cuda (bool): whether GPU is available. - - Returns: - torch.Generator: PRNG object. 
- """ - g = torch.Generator() - if rnd_seed is not None: - # Deterministic execution - np.random.seed(rnd_seed) - random.seed(rnd_seed) - torch.manual_seed(rnd_seed) - g.manual_seed(rnd_seed) - if use_cuda: - torch.cuda.manual_seed(rnd_seed) - torch.cuda.manual_seed_all(rnd_seed) - return g - - def imagenet_dataset(data_root: str): """Create a torch dataset object for Imagenet.""" transform = transforms.Compose([ diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/README.md b/tutorials/distributed-ml/torch-tutorial-0-basics/README.md index 5ddcd635..43d42565 100644 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/README.md +++ b/tutorials/distributed-ml/torch-tutorial-0-basics/README.md @@ -23,19 +23,43 @@ should be used to run it: If you want to distribute the code in `train.py` with **torch DDP**, run from terminal: ```bash -sbatch ddp_slurm.sh +export DIST_MODE="ddp" +export RUN_NAME="ddp-itwinai" +export TRAINING_CMD="train.py -s ddp" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` If you want to distribute the code in `train.py` with **DeepSpeed**, run from terminal: ```bash -sbatch deepspeed_slurm.sh +export DIST_MODE="deepspeed" +export RUN_NAME="deepspeed-itwinai" +export TRAINING_CMD="train.py -s deepspeed" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` If you want to distribute the code in `train.py` with **Horovod**, run from terminal: ```bash -sbatch hvd_slurm.sh +export DIST_MODE="deepspeed" +export RUN_NAME="deepspeed-itwinai" +export TRAINING_CMD="train.py -s deepspeed" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` You can run all of them with: diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/ddp_slurm.sh b/tutorials/distributed-ml/torch-tutorial-0-basics/ddp_slurm.sh deleted file mode 100644 index 1b53f04c..00000000 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/ddp_slurm.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DDP_tutorial-0 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ddp.out -#SBATCH --error=job-ddp.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: 
SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch training -TRAINING_CMD="train.py -s ddp" - -srun --cpu-bind=none bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $TRAINING_CMD" - diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/deepspeed_slurm.sh b/tutorials/distributed-ml/torch-tutorial-0-basics/deepspeed_slurm.sh deleted file mode 100644 index b12009de..00000000 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/deepspeed_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DeepSpeed_tutorial-0 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ds.out -#SBATCH --error=job-ds.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=4 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set env vars -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i -export MASTER_ADDR -export MASTER_PORT=29500 - -TRAINING_CMD="train.py -s deepspeed" - -# Run without launcher: set --ntasks-per-node=NUM_GPUS -srun --cpu-bind=none python -u $TRAINING_CMD #--deepspeed - -# srun pwd - -# # Run with deepspeed launcher: set --ntasks-per-node=1 -# # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables -# export NCCL_IB_DISABLE=1 -# export NCCL_SOCKET_IFNAME=eth0 -# nodelist=$(scontrol show hostname $SLURM_NODELIST) -# echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile -# # Requires passwordless SSH access among compute node -# srun --cpu-bind=none deepspeed 
--hostfile=.hostfile $TRAINING_CMD --deepspeed -# rm .hostfile \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/hvd_slurm.sh b/tutorials/distributed-ml/torch-tutorial-0-basics/hvd_slurm.sh deleted file mode 100644 index a2a06e6c..00000000 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/hvd_slurm.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_HVD_tutorial-0 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-hvd.out -#SBATCH --error=job-hvd.err -#SBATCH --time=00:15:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=8 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set vars -# export NCCL_DEBUG=INFO -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -TRAINING_CMD="train.py -s horovod" - -srun --cpu-bind=none python -u $TRAINING_CMD - diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/runall.sh b/tutorials/distributed-ml/torch-tutorial-0-basics/runall.sh index 17c0f190..48a8f1e0 100644 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/runall.sh +++ b/tutorials/distributed-ml/torch-tutorial-0-basics/runall.sh @@ -1,6 +1,39 @@ #!/bin/bash -# Run all versions of distributed ML -rm *.out *.err -echo "Torch DDP training: $(sbatch ddp_slurm.sh)" -echo "DeepSpeed training: $(sbatch deepspeed_slurm.sh)" -echo "Horovod training: $(sbatch hvd_slurm.sh)" \ No newline at end of file + +# Python virtual environment +PYTHON_VENV="../../../envAI_hdfml" + +# Clear SLURM logs (*.out and *.err files) +rm -rf logs_slurm +mkdir logs_slurm +rm -rf logs_torchrun + +# DDP itwinai +DIST_MODE="ddp" +RUN_NAME="ddp-itwinai" +TRAINING_CMD="train.py -s ddp" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# DeepSpeed itwinai +DIST_MODE="deepspeed" +RUN_NAME="deepspeed-itwinai" +TRAINING_CMD="train.py -s deepspeed" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# Horovod itwinai +DIST_MODE="horovod" +RUN_NAME="horovod-itwinai" 
+TRAINING_CMD="train.py -s horovod" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/slurm.sh b/tutorials/distributed-ml/torch-tutorial-0-basics/slurm.sh new file mode 100644 index 00000000..c53e3da5 --- /dev/null +++ b/tutorials/distributed-ml/torch-tutorial-0-basics/slurm.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# SLURM jobscript for JSC systems + +# Job configuration +#SBATCH --job-name=distributed_training +#SBATCH --account=intertwin +#SBATCH --mail-user= +#SBATCH --mail-type=ALL +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --time=00:30:00 + +# Resources allocation +#SBATCH --partition=batch +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=4 +#SBATCH --cpus-per-gpu=4 +#SBATCH --exclusive + +# gres options have to be disabled for deepv +#SBATCH --gres=gpu:4 + +# Load environment modules +ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py + +# Job info +echo "DEBUG: TIME: $(date)" +sysN="$(uname -n | cut -f2- -d.)" +sysN="${sysN%%[0-9]*}" +echo "Running on system: $sysN" +echo "DEBUG: EXECUTE: $EXEC" +echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" +echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" +echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" +echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" +echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" +echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" +echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" +echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" +echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" +if [ "$DEBUG" = true ] ; then + echo "DEBUG: NCCL_DEBUG=INFO" + export NCCL_DEBUG=INFO +fi +echo + +# Setup env for distributed ML +export CUDA_VISIBLE_DEVICES="0,1,2,3" +export OMP_NUM_THREADS=1 +if [ "$SLURM_CPUS_PER_GPU" -gt 0 ] ; then + export OMP_NUM_THREADS=$SLURM_CPUS_PER_GPU +fi + +# Env vairables check +if [ -z "$DIST_MODE" ]; then + >&2 echo "ERROR: env variable DIST_MODE is not set. Allowed values are 'horovod', 'ddp' or 'deepspeed'" + exit 1 +fi +if [ -z "$RUN_NAME" ]; then + >&2 echo "WARNING: env variable RUN_NAME is not set. It's a way to identify some specific run of an experiment." + RUN_NAME=$DIST_MODE +fi +if [ -z "$TRAINING_CMD" ]; then + >&2 echo "ERROR: env variable TRAINING_CMD is not set. It's the python command to execute." + exit 1 +fi +if [ -z "$PYTHON_VENV" ]; then + >&2 echo "WARNING: env variable PYTHON_VENV is not set. It's the path to a python virtual environment." 
+else + # Activate Python virtual env + source $PYTHON_VENV/bin/activate +fi + +# Get GPUs info per node +srun --cpu-bind=none --ntasks-per-node=1 bash -c 'echo -e "NODE hostname: $(hostname)\n$(nvidia-smi)\n\n"' + +# Launch training +if [ "$DIST_MODE" == "ddp" ] ; then + echo "DDP training: $TRAINING_CMD" + srun --cpu-bind=none --ntasks-per-node=1 \ + bash -c "torchrun \ + --log_dir='logs_torchrun' \ + --nnodes=$SLURM_NNODES \ + --nproc_per_node=$SLURM_GPUS_PER_NODE \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ + --rdzv_backend=c10d \ + --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ + $TRAINING_CMD" +elif [ "$DIST_MODE" == "deepspeed" ] ; then + echo "DEEPSPEED training: $TRAINING_CMD" + MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i + export MASTER_ADDR + export MASTER_PORT=29500 + + srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ + python -u $TRAINING_CMD --deepspeed + + # # Run with deepspeed launcher: set --ntasks-per-node=1 + # # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables + # export NCCL_IB_DISABLE=1 + # export NCCL_SOCKET_IFNAME=eth0 + # nodelist=$(scontrol show hostname $SLURM_NODELIST) + # echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile + # # Requires passwordless SSH access among compute node + # srun --cpu-bind=none deepspeed --hostfile=.hostfile $TRAINING_CMD --deepspeed + # rm .hostfile +elif [ "$DIST_MODE" == "horovod" ] ; then + echo "HOROVOD training: $TRAINING_CMD" + srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ + python -u $TRAINING_CMD +else + >&2 echo "ERROR: unrecognized \$DIST_MODE env variable" + exit 1 +fi + diff --git a/tutorials/distributed-ml/torch-tutorial-0-basics/train.py b/tutorials/distributed-ml/torch-tutorial-0-basics/train.py index 614b56e4..29c0d272 100644 --- a/tutorials/distributed-ml/torch-tutorial-0-basics/train.py +++ b/tutorials/distributed-ml/torch-tutorial-0-basics/train.py @@ -2,19 +2,23 @@ Show how to use DDP, Horovod and DeepSpeed strategies interchangeably with an extremely simple neural network. """ -from typing import Any -import os +from typing import Dict import argparse +import time import torch from torch import nn -from torch.utils.data import DataLoader, Dataset, DistributedSampler +from torch.utils.data import Dataset + +import horovod.torch as hvd from itwinai.torch.distributed import ( + distributed_resources_available, TorchDistributedStrategy, - DDPDistributedStrategy, - HVDDistributedStrategy, - DSDistributedStrategy, + TorchDDPStrategy, + HorovodStrategy, + DeepSpeedStrategy, + NonDistributedStrategy ) @@ -29,6 +33,9 @@ def parse_args() -> argparse.Namespace: "--shuffle_dataloader", action=argparse.BooleanOptionalAction ) + parser.add_argument( + '--batch-size', type=int, default=10, + help='input batch size for training (default: 10)') # DeepSpeed: needs to be removed import deepspeed @@ -55,42 +62,31 @@ def __getitem__(self, index): return torch.rand(self.x_size), torch.rand(self.y_size) -def trainer_entrypoint_fn( - foo: Any, args: argparse.Namespace, strategy: TorchDistributedStrategy +def training_fn( + args: argparse.Namespace, + strategy: TorchDistributedStrategy, + distribute_kwargs: Dict ) -> int: - """Dummy training function. This emulates custom code developed - by some use case. 
- """ + """Dummy training function.""" strategy.init() - print(f"{foo}: {os.environ.get('RANK')} {os.environ.get('LOCAL_RANK')} " - f"{os.environ.get('MASTER_ADDR')} {os.environ.get('MASTER_PORT')}") # Local model model = nn.Linear(3, 4) optim = torch.optim.Adam(model.parameters(), lr=1e-3) loss_fn = nn.MSELoss() # Distributed model - deepspeed_config = dict(train_batch_size=32) - # 'config_params' key is ignored if strategy != DSDistributedStrategy model, optim, lr_sched = strategy.distributed( - model, optim, lr_scheduler=None, config_params=deepspeed_config + model, optim, lr_scheduler=None, **distribute_kwargs ) # Data train_set = UniformRndDataset(x_size=3, y_size=4) # Distributed dataloader - train_loader = DataLoader( - train_set, batch_size=10, num_workers=1, - sampler=DistributedSampler( - train_set, - num_replicas=strategy.dist_gwsize(), - rank=strategy.dist_grank(), - shuffle=args.shuffle_dataloader - ) - ) + train_loader = strategy.create_dataloader( + train_set, batch_size=args.batch_size, num_workers=1) # Device allocated for this worker - device = strategy.dist_device() + device = strategy.device() for epoch in range(2): for (x, y) in train_loader: @@ -107,7 +103,7 @@ def trainer_entrypoint_fn( optim.step() - if strategy.is_main_worker(): + if strategy.is_main_worker: print(f"Loss [epoch={epoch}]: {loss.item()}") print(f"NNLoss [epoch={epoch}]: {loss.item()}") @@ -115,7 +111,8 @@ def trainer_entrypoint_fn( if lr_sched: lr_sched.step() - print(f" - TRAINING FINISHED") + time.sleep(1) + print(f" - TRAINING FINISHED") strategy.clean_up() return 123 @@ -125,19 +122,27 @@ def trainer_entrypoint_fn( args = parse_args() # Instantiate Strategy - if args.strategy == 'ddp': - if (not torch.cuda.is_available() - or not torch.cuda.device_count() > 1): - raise RuntimeError('Resources unavailable') - - strategy = DDPDistributedStrategy(backend='nccl') + if not distributed_resources_available(): + print("WARNING: falling back to non-distributed strategy.") + strategy = NonDistributedStrategy() + distribute_kwargs = {} + elif args.strategy == 'ddp': + strategy = TorchDDPStrategy(backend='nccl') + distribute_kwargs = {} elif args.strategy == 'horovod': - strategy = HVDDistributedStrategy() + strategy = HorovodStrategy() + distribute_kwargs = dict( + compression=hvd.Compression.none, + op=hvd.Average, + gradient_predivide_factor=1.0 + ) elif args.strategy == 'deepspeed': - strategy = DSDistributedStrategy(backend='nccl') + strategy = DeepSpeedStrategy(backend='nccl') + distribute_kwargs = dict( + config_params=dict(train_micro_batch_size_per_gpu=args.batch_size) + ) else: raise NotImplementedError( f"Strategy {args.strategy} is not recognized/implemented.") - # Launch distributed training - trainer_entrypoint_fn("foobar", args, strategy) + training_fn(args, strategy, distribute_kwargs) diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/README.md b/tutorials/distributed-ml/torch-tutorial-1-mnist/README.md index 6f22d3ef..70178f0d 100644 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/README.md +++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/README.md @@ -33,19 +33,43 @@ should be used to run it: If you want to distribute the code in `train.py` with **torch DDP**, run from terminal: ```bash -sbatch ddp_slurm.sh +export DIST_MODE="ddp" +export RUN_NAME="ddp-itwinai" +export TRAINING_CMD="train.py -s ddp -c config.yaml" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch 
--export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` If you want to distribute the code in `train.py` with **DeepSpeed**, run from terminal: ```bash -sbatch deepspeed_slurm.sh +export DIST_MODE="deepspeed" +export RUN_NAME="deepspeed-itwinai" +export TRAINING_CMD="train.py -s deepspeed -c config.yaml" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` If you want to distribute the code in `train.py` with **Horovod**, run from terminal: ```bash -sbatch hvd_slurm.sh +export DIST_MODE="horovod" +export RUN_NAME="horovod-itwinai" +export TRAINING_CMD="train.py -s horovod -c config.yaml" +export PYTHON_VENV="../../../envAI_hdfml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh ``` You can run all of them with: diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml b/tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml index cb221dec..331d6d04 100644 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml +++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/config.yaml @@ -1,26 +1,28 @@ -# I/O +# Data and logging data_dir: ./ +log_int: 10 +verbose: True restart_int: 10 download_only: False -verbose: True +dataset_replication: 10 +shuff: False +nworker: 4 # num workers dataloader +prefetch: 2 # Model batch_size: 64 epochs: 2 lr: 0.001 -concM: 100 momentum: 0.5 -shuff: False -# Debugging -testrun: False -nseed: 10 -log_int: 10 +# Reproducibility +rnd_seed: 10 # Distributed ML -backend: nccl -nworker: 4 # num workers dataloader -prefetch: 2 -no_cuda: False +backend: nccl # ignored when using Horovod +# Horovod: ignored when NOT using Horovod +fp16_allreduce: False +use_adasum: False +gradient_predivide_factor: 1.0 diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/ddp_slurm.sh b/tutorials/distributed-ml/torch-tutorial-1-mnist/ddp_slurm.sh deleted file mode 100644 index 3d5d4bb3..00000000 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/ddp_slurm.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DDP_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ddp.out -#SBATCH --error=job-ddp.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo 
"DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch training -TRAINING_CMD="train.py -s ddp -c config.yaml" - -srun --cpu-bind=none bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $TRAINING_CMD" - diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/deepspeed_slurm.sh b/tutorials/distributed-ml/torch-tutorial-1-mnist/deepspeed_slurm.sh deleted file mode 100644 index 8e5f7881..00000000 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/deepspeed_slurm.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DeepSpeed_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ds.out -#SBATCH --error=job-ds.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=4 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set env vars -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i -export MASTER_ADDR -export MASTER_PORT=29500 - -TRAINING_CMD="train.py -s deepspeed -c config.yaml" - -# Run without launcher: set --ntasks-per-node=NUM_GPUS -srun --cpu-bind=none python -u $TRAINING_CMD --deepspeed - -# # Run with deepspeed launcher: set --ntasks-per-node=1 -# # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables -# export NCCL_IB_DISABLE=1 -# export NCCL_SOCKET_IFNAME=eth0 -# nodelist=$(scontrol show hostname $SLURM_NODELIST) -# echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile -# # Requires passwordless SSH access among compute node -# srun --cpu-bind=none deepspeed --hostfile=.hostfile $TRAINING_CMD --deepspeed 
-# rm .hostfile - diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/hvd_slurm.sh b/tutorials/distributed-ml/torch-tutorial-1-mnist/hvd_slurm.sh deleted file mode 100644 index 3774b6e1..00000000 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/hvd_slurm.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_HVD_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-hvd.out -#SBATCH --error=job-hvd.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=8 -#SBATCH --gpus-per-node=4 -# SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set vars -# export NCCL_DEBUG=INFO -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -TRAINING_CMD="train.py -s horovod -c config.yaml" - -srun --cpu-bind=none python -u $TRAINING_CMD - diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/runall.sh b/tutorials/distributed-ml/torch-tutorial-1-mnist/runall.sh index b1470d75..5a89b4fe 100644 --- a/tutorials/distributed-ml/torch-tutorial-1-mnist/runall.sh +++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/runall.sh @@ -1,6 +1,39 @@ #!/bin/bash -# Run all versions of distributed ML for MNIST -rm *checkpoint.pth.tar *.out *.err -echo "Torch DDP training: $(sbatch ddp_slurm.sh)" -echo "DeepSpeed training: $(sbatch deepspeed_slurm.sh)" -echo "Horovod training: $(sbatch hvd_slurm.sh)" \ No newline at end of file + +# Python virtual environment +PYTHON_VENV="../../../envAI_hdfml" + +# Clear SLURM logs (*.out and *.err files) +rm -rf logs_slurm +mkdir logs_slurm +rm -rf logs_torchrun + +# DDP itwinai +DIST_MODE="ddp" +RUN_NAME="ddp-itwinai" +TRAINING_CMD="train.py -s ddp -c config.yaml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# DeepSpeed itwinai +DIST_MODE="deepspeed" +RUN_NAME="deepspeed-itwinai" +TRAINING_CMD="train.py -s deepspeed -c config.yaml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# Horovod itwinai +DIST_MODE="horovod" +RUN_NAME="horovod-itwinai" 
+TRAINING_CMD="train.py -s horovod -c config.yaml" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/slurm.sh b/tutorials/distributed-ml/torch-tutorial-1-mnist/slurm.sh new file mode 100644 index 00000000..3eef38ae --- /dev/null +++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/slurm.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +# SLURM jobscript for JSC systems + +# Job configuration +#SBATCH --job-name=distributed_training +#SBATCH --account=intertwin +#SBATCH --mail-user= +#SBATCH --mail-type=ALL +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --time=00:30:00 + +# Resources allocation +#SBATCH --partition=batch +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=4 +#SBATCH --cpus-per-gpu=4 +#SBATCH --exclusive + +# gres options have to be disabled for deepv +#SBATCH --gres=gpu:4 + +# Load environment modules +ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py + +# Job info +echo "DEBUG: TIME: $(date)" +sysN="$(uname -n | cut -f2- -d.)" +sysN="${sysN%%[0-9]*}" +echo "Running on system: $sysN" +echo "DEBUG: EXECUTE: $EXEC" +echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" +echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" +echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" +echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" +echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" +echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" +echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" +echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" +echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" +if [ "$DEBUG" = true ] ; then + echo "DEBUG: NCCL_DEBUG=INFO" + export NCCL_DEBUG=INFO +fi +echo + +# Setup env for distributed ML +export CUDA_VISIBLE_DEVICES="0,1,2,3" +export OMP_NUM_THREADS=1 +if [ "$SLURM_CPUS_PER_GPU" -gt 0 ] ; then + export OMP_NUM_THREADS=$SLURM_CPUS_PER_GPU +fi + +# Env vairables check +if [ -z "$DIST_MODE" ]; then + >&2 echo "ERROR: env variable DIST_MODE is not set. Allowed values are 'horovod', 'ddp' or 'deepspeed'" + exit 1 +fi +if [ -z "$RUN_NAME" ]; then + >&2 echo "WARNING: env variable RUN_NAME is not set. It's a way to identify some specific run of an experiment." + RUN_NAME=$DIST_MODE +fi +if [ -z "$TRAINING_CMD" ]; then + >&2 echo "ERROR: env variable TRAINING_CMD is not set. It's the python command to execute." + exit 1 +fi +if [ -z "$PYTHON_VENV" ]; then + >&2 echo "WARNING: env variable PYTHON_VENV is not set. It's the path to a python virtual environment." 
+else
+  # Activate Python virtual env
+  source $PYTHON_VENV/bin/activate
+fi
+
+# Get GPUs info per node
+srun --cpu-bind=none --ntasks-per-node=1 bash -c 'echo -e "NODE hostname: $(hostname)\n$(nvidia-smi)\n\n"'
+
+# Launch training
+if [ "$DIST_MODE" == "ddp" ] ; then
+  echo "DDP training: $TRAINING_CMD"
+  srun --cpu-bind=none --ntasks-per-node=1 \
+    bash -c "torchrun \
+    --log_dir='logs_torchrun' \
+    --nnodes=$SLURM_NNODES \
+    --nproc_per_node=$SLURM_GPUS_PER_NODE \
+    --rdzv_id=$SLURM_JOB_ID \
+    --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \
+    --rdzv_backend=c10d \
+    --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \
+    $TRAINING_CMD"
+elif [ "$DIST_MODE" == "deepspeed" ] ; then
+  echo "DEEPSPEED training: $TRAINING_CMD"
+  MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i
+  export MASTER_ADDR
+  export MASTER_PORT=29500
+
+  srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \
+    python -u $TRAINING_CMD --deepspeed
+
+  # # Run with deepspeed launcher: set --ntasks-per-node=1
+  # # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables
+  # export NCCL_IB_DISABLE=1
+  # export NCCL_SOCKET_IFNAME=eth0
+  # nodelist=$(scontrol show hostname $SLURM_NODELIST)
+  # echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile
+  # # Requires passwordless SSH access among compute nodes
+  # srun --cpu-bind=none deepspeed --hostfile=.hostfile $TRAINING_CMD --deepspeed
+  # rm .hostfile
+elif [ "$DIST_MODE" == "horovod" ] ; then
+  echo "HOROVOD training: $TRAINING_CMD"
+  srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \
+    python -u $TRAINING_CMD
+else
+  >&2 echo "ERROR: unrecognized \$DIST_MODE env variable"
+  exit 1
+fi
diff --git a/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py b/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py
index 365a9048..809480dd 100644
--- a/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py
+++ b/tutorials/distributed-ml/torch-tutorial-1-mnist/train.py
@@ -1,34 +1,38 @@
 """
 Show how to use DDP, Horovod and DeepSpeed strategies interchangeably
-with a simple neural network trained on MNIST dataset, showing how
-to use checkpoints.
+with a simple neural network trained on the MNIST dataset.
""" -import os +from typing import Tuple import argparse import sys import time -import numpy as np -import random +from timeit import default_timer as timer import torch -import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F from torchvision import datasets, transforms -from torch.utils.data import DataLoader, DistributedSampler +from torch.utils.data import Dataset + +import horovod.torch as hvd import deepspeed from itwinai.torch.distributed import ( + distributed_resources_available, TorchDistributedStrategy, - DDPDistributedStrategy, - HVDDistributedStrategy, - DSDistributedStrategy, + TorchDDPStrategy, + HorovodStrategy, + DeepSpeedStrategy, + NonDistributedStrategy ) from itwinai.parser import ArgumentParser as ItAIArgumentParser +from itwinai.torch.reproducibility import ( + seed_worker, set_seed +) -def parse_args() -> argparse.Namespace: +def parse_params() -> argparse.Namespace: """ Parse CLI args, which can also be loaded from a configuration file using the --config flag: @@ -44,54 +48,59 @@ def parse_args() -> argparse.Namespace: default='ddp' ) - # IO parsers + # Data and logging parser.add_argument('--data-dir', default='./', help=('location of the training dataset in the local ' 'filesystem')) + parser.add_argument('--log-int', type=int, default=10, + help='log interval per training') + parser.add_argument('--verbose', + action=argparse.BooleanOptionalAction, + help='Print parsed arguments') parser.add_argument('--restart-int', type=int, default=10, help='restart interval per epoch (default: 10)') parser.add_argument('--download-only', action=argparse.BooleanOptionalAction, help='Download dataset and exit') - parser.add_argument('--verbose', - action=argparse.BooleanOptionalAction, - help='Print parsed arguments') + parser.add_argument('--dataset-replication', type=int, default=100, + help='concatenate MNIST to this factor (default: 100)') + parser.add_argument('--shuff', action='store_true', default=False, + help='shuffle dataset (default: False)') + parser.add_argument('--nworker', type=int, default=0, + help=('number of workers in DataLoader (default: 0 -' + ' only main)')) + parser.add_argument('--prefetch', type=int, default=2, + help='prefetch data in DataLoader (default: 2)') - # model parsers + # Model parser.add_argument('--batch-size', type=int, default=64, help='input batch size for training (default: 64)') parser.add_argument('--epochs', type=int, default=10, help='number of epochs to train (default: 10)') parser.add_argument('--lr', type=float, default=0.01, help='learning rate (default: 0.01)') - parser.add_argument('--concM', type=int, default=100, - help='concatenate MNIST to this factor (default: 100)') parser.add_argument('--momentum', type=float, default=0.5, help='momentum in SGD optimizer (default: 0.5)') - parser.add_argument('--shuff', action='store_true', default=False, - help='shuffle dataset (default: False)') - # debug parsers - parser.add_argument('--testrun', action='store_true', default=False, - help='do a test run with seed (default: False)') - parser.add_argument('--nseed', type=int, default=0, + # Reproducibility + parser.add_argument('--rnd-seed', type=int, default=0, help='seed integer for reproducibility (default: 0)') - parser.add_argument('--log-int', type=int, default=10, - help='log interval per training') - # parallel parsers + # Distributed ML parser.add_argument('--backend', type=str, default='nccl', help='backend for parrallelisation (default: nccl)') - parser.add_argument('--nworker', type=int, 
default=0, - help=('number of workers in DataLoader (default: 0 -' - ' only main)')) - parser.add_argument('--prefetch', type=int, default=2, - help='prefetch data in DataLoader (default: 2)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables GPGPUs') parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') + # Horovod: ignored when not using Horovod + parser.add_argument('--fp16-allreduce', action='store_true', default=False, + help='use fp16 compression during allreduce') + parser.add_argument('--use-adasum', action='store_true', default=False, + help='use adasum algorithm to do reduction') + parser.add_argument('--gradient-predivide-factor', type=float, default=1.0, + help=('apply gradient pre-divide factor in optimizer ' + '(default: 1.0)')) + # DeepSpeed parser = deepspeed.add_config_arguments(parser) args = parser.parse_args() @@ -127,7 +136,7 @@ def forward(self, x): def train( - model, device, train_loader, optimizer, epoch, + model, train_loader, optimizer, epoch, strategy: TorchDistributedStrategy, args ): """ @@ -136,108 +145,62 @@ def train( model.train() t_list = [] loss_acc = 0 - gwsize = strategy.dist_gwsize() - if strategy.is_main_worker(): + if strategy.is_main_worker: print("\n") for batch_idx, (data, target) in enumerate(train_loader): - t = time.perf_counter() - data, target = data.to(device), target.to(device) + t = timer() + data = data.to(strategy.device()) + target = target.to(strategy.device()) optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() optimizer.step() - if batch_idx % args.log_int == 0 and strategy.is_main_worker(): + if (strategy.is_main_worker and args.log_int > 0 + and batch_idx % args.log_int == 0): + dl_size = len(train_loader.dataset)//strategy.global_world_size() print( f'Train epoch: {epoch} ' - f'[{batch_idx * len(data)}/{len(train_loader.dataset)/gwsize} ' + f'[{batch_idx * len(data)}/{dl_size} ' f'({100.0 * batch_idx / len(train_loader):.0f}%)]\t\t' f'Loss: {loss.item():.6f}') - t_list.append(time.perf_counter() - t) + t_list.append(timer() - t) loss_acc += loss.item() - if strategy.is_main_worker(): + if strategy.is_main_worker: print('TIMER: train time', sum(t_list) / len(t_list), 's') return loss_acc -def test(model, device, test_loader, strategy: TorchDistributedStrategy): +def test(model, test_loader, strategy: TorchDistributedStrategy): """ Model validation. 
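+    Returns the test accuracy (%) estimated from this worker's shard of
+    the test set.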
""" model.eval() test_loss = 0 correct = 0 - gwsize = strategy.dist_gwsize() with torch.no_grad(): for data, target in test_loader: - data, target = data.to(device), target.to(device) + data = data.to(strategy.device()) + target = target.to(strategy.device()) output = model(data) - # sum up batch loss + # Sum up batch loss test_loss += F.nll_loss(output, target, reduction="sum").item() - # get the index of the max log-probability + # Get the index of the max log-probability pred = output.argmax(dim=1, keepdim=True) correct += pred.eq(target.view_as(pred)).sum().item() test_loss /= len(test_loader.dataset) - if strategy.is_main_worker(): + if strategy.is_main_worker: + dl_size = len(test_loader.dataset)//strategy.global_world_size() print( f'Test set: average loss: {test_loss:.4f}\t' - f'accurate samples: {correct}/{len(test_loader.dataset)/gwsize}') - acc_test = 100.0 * correct * gwsize / len(test_loader.dataset) + f'accurate samples: {correct}/{dl_size}') + acc_test = ( + 100.0 * correct * strategy.global_world_size() + / len(test_loader.dataset) + ) return acc_test -def save_state( - epoch, distrib_model, loss_acc, optimizer, - res_name, is_best, strategy: TorchDistributedStrategy -): - """ - Save training state. - """ - grank = strategy.dist_grank() - rt = time.time() - # find if is_best happened in any worker - if torch.cuda.is_available(): - is_best_m = strategy.par_allgather_obj(is_best) - - if torch.cuda.is_available(): - if any(is_best_m): - # find which rank is_best happened - select first rank if multiple - is_best_rank = np.where(np.array(is_best_m))[0][0] - - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_acc': loss_acc, - 'optimizer': optimizer.state_dict()} - - # write on worker with is_best - if grank == is_best_rank: - torch.save(state, './'+res_name) - print( - f'DEBUG: state in {grank} is saved on epoch:{epoch} ' - f'in {time.time()-rt} s') - else: - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_acc': loss_acc, - 'optimizer': optimizer.state_dict()} - - torch.save(state, './'+res_name) - print( - f'DEBUG: state in {grank} is saved on epoch:{epoch} in ' - f'{time.time()-rt} s') - - -def seed_worker(worker_id): - """ - Seed dataloader worker. - """ - worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) - random.seed(worker_seed) - - def download_mnist(): """ Use built-in torch datasets functions to pull MNIST dataset. @@ -257,212 +220,154 @@ def download_mnist(): ])) +def mnist_dataset(dataset_replication: int = 1) -> Tuple[Dataset, Dataset]: + """Load MNIST train and test datasets, replicating them. + + Args: + dataset_replication (int): dataset replication factor. Default 1. + + Returns: + Tuple[Dataset, Dataset]: train dataset and test dataset. 
+ """ + replicated_data = [ + datasets.MNIST(args.data_dir, train=True, download=False, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + for _ in range(dataset_replication) + ] + train_dataset = torch.utils.data.ConcatDataset(replicated_data) + + replicated_data = [ + datasets.MNIST(args.data_dir, train=False, download=False, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])) + for _ in range(dataset_replication) + ] + test_dataset = torch.utils.data.ConcatDataset(replicated_data) + return train_dataset, test_dataset + + if __name__ == "__main__": - args = parse_args() + args = parse_params() if args.download_only: - # Download datasets and exit + # Download datasets from a location with internet access and exit. + # This is convenient when submitting training jobs to + # a batch system where worker nodes have no internet + # access, like in some HPCs. download_mnist() sys.exit() # Instantiate Strategy - if args.strategy == 'ddp': - if (not torch.cuda.is_available() - or not torch.cuda.device_count() > 1): - raise RuntimeError('Resources unavailable') - - strategy = DDPDistributedStrategy(backend=args.backend) + if not distributed_resources_available(): + print("WARNING: falling back to non-distributed strategy.") + strategy = NonDistributedStrategy() + distribute_kwargs = {} + elif args.strategy == 'ddp': + strategy = TorchDDPStrategy(backend=args.backend) + distribute_kwargs = {} elif args.strategy == 'horovod': - strategy = HVDDistributedStrategy() + strategy = HorovodStrategy() + distribute_kwargs = dict( + compression=( + hvd.Compression.fp16 if args.fp16_allreduce + else hvd.Compression.none + ), + op=hvd.Adasum if args.use_adasum else hvd.Average, + gradient_predivide_factor=args.gradient_predivide_factor + ) elif args.strategy == 'deepspeed': - strategy = DSDistributedStrategy(backend=args.backend) + strategy = DeepSpeedStrategy(backend=args.backend) + distribute_kwargs = dict( + config_params=dict(train_micro_batch_size_per_gpu=args.batch_size) + ) else: raise NotImplementedError( f"Strategy {args.strategy} is not recognized/implemented.") - strategy.init() - - # check CUDA availability - args.cuda = not args.no_cuda and torch.cuda.is_available() - - # limit # of CPU threads to be used per worker - torch.set_num_threads(1) - - # get directory - program_dir = os.getcwd() - - # start the time.time for profiling - st = time.time() - # deterministic testrun - if args.testrun: - torch.manual_seed(args.nseed) - g = torch.Generator() - g.manual_seed(args.nseed) - - # get job rank info - rank==0 master gpu - if torch.cuda.is_available(): - # local world size - per node - lwsize = strategy.dist_lwsize() if args.cuda else 0 - gwsize = strategy.dist_gwsize() # global world size - per run - grank = strategy.dist_grank() # global rank - assign per run - lrank = strategy.dist_lrank() # local rank - assign per node - else: - gwsize = 1 - grank = 0 - - # some debug - if strategy.is_main_worker(): - print('TIMER: initialise:', time.time()-st, 's') - - # move the model on the GPU assigned to the current process - device = torch.device( - strategy.dist_device() if args.cuda and torch.cuda.is_available() - else 'cpu') - if args.cuda: - torch.cuda.set_device(lrank) - # deterministic testrun - if args.testrun: - torch.cuda.manual_seed(args.nseed) - - # read data - mnist_scale = args.concM - largeData = [] - for i in range(mnist_scale): - largeData.append( - 
datasets.MNIST(args.data_dir, train=True, download=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - ) - - # concat data - train_dataset = torch.utils.data.ConcatDataset(largeData) - - mnist_scale = args.concM - largeData = [] - for i in range(mnist_scale): - largeData.append( - datasets.MNIST(args.data_dir, train=False, download=False, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - ) + # Initialize strategy + strategy.init() - # concat data - test_dataset = torch.utils.data.ConcatDataset(largeData) - - # restricts data loading to a subset of the dataset exclusive to the - # current process - args.shuff = args.shuff and not args.testrun - if torch.cuda.is_available(): - train_sampler = DistributedSampler( - train_dataset, num_replicas=gwsize, rank=grank, shuffle=args.shuff) - test_sampler = DistributedSampler( - test_dataset, num_replicas=gwsize, rank=grank, shuffle=args.shuff) - # distribute dataset to workers - # persistent workers is not possible for nworker=0 - pers_w = True if args.nworker > 1 else False - - # deterministic testrun - the same dataset each run - kwargs = {'worker_init_fn': seed_worker, - 'generator': g} if args.testrun else {} - - if torch.cuda.is_available(): - train_loader = DataLoader( - train_dataset, batch_size=args.batch_size, - sampler=train_sampler, num_workers=args.nworker, pin_memory=True, - persistent_workers=pers_w, prefetch_factor=args.prefetch, **kwargs - ) - test_loader = DataLoader( - test_dataset, batch_size=args.batch_size, - sampler=test_sampler, num_workers=args.nworker, pin_memory=True, - persistent_workers=pers_w, prefetch_factor=args.prefetch, **kwargs - ) - else: - train_loader = DataLoader( - train_dataset, batch_size=args.batch_size) - test_loader = DataLoader( - test_dataset, batch_size=args.batch_size) + # Start the timer for profiling + st = timer() + + # Set random seed for reproducibility + torch_prng = set_seed(args.rnd_seed) + + if strategy.is_main_worker: + print('TIMER: initialise:', timer()-st, 's') + print('DEBUG: local ranks:', strategy.local_world_size(), + '/ global ranks:', strategy.global_world_size()) + print('DEBUG: sys.version:', sys.version) + print('DEBUG: args.data_dir:', args.data_dir) + print('DEBUG: args.log_int:', args.log_int) + print('DEBUG: args.nworker:', args.nworker) + print('DEBUG: args.prefetch:', args.prefetch) + print('DEBUG: args.batch_size:', args.batch_size) + print('DEBUG: args.epochs:', args.epochs) + print('DEBUG: args.lr:', args.lr) + print('DEBUG: args.momentum:', args.momentum) + print('DEBUG: args.shuff:', args.shuff) + print('DEBUG: args.rnd_seed:', args.rnd_seed) + print('DEBUG: args.backend:', args.backend) + + # Dataset + train_dataset, test_dataset = mnist_dataset(args.dataset_replication) + # Distributed dataloaders + train_loader = strategy.create_dataloader( + train_dataset, batch_size=args.batch_size, + num_workers=args.nworker, pin_memory=True, + persistent_workers=(args.nworker > 1), + prefetch_factor=args.prefetch, generator=torch_prng, + worker_init_fn=seed_worker + ) + test_loader = strategy.create_dataloader( + test_dataset, batch_size=args.batch_size, + num_workers=args.nworker, pin_memory=True, + persistent_workers=(args.nworker > 1), + prefetch_factor=args.prefetch, generator=torch_prng, + worker_init_fn=seed_worker + ) - if strategy.is_main_worker(): - print('TIMER: read and concat data:', time.time()-st, 's') + if strategy.is_main_worker: + 
print('TIMER: read and concat data:', timer()-st, 's') - # create CNN model - model = Net().to(device) + # Create CNN model + model = Net().to(strategy.device()) - # optimizer + # Optimizer optimizer = torch.optim.SGD( model.parameters(), lr=args.lr, momentum=args.momentum) - deepspeed_config = dict(train_batch_size=args.batch_size) - # 'config_params' key is ignored if strategy != DSDistributedStrategy - distrib_model, optimizer, _ = strategy.distributed( - model, optimizer, lr_scheduler=None, config_params=deepspeed_config + # Distributed model, optimizer, and scheduler + model, optimizer, _ = strategy.distributed( + model, optimizer, lr_scheduler=None, **distribute_kwargs ) - # resume state - start_epoch = 1 - best_acc = np.Inf - res_name = f'{args.strategy}-checkpoint.pth.tar' - if os.path.isfile(res_name): - try: - if torch.cuda.is_available(): - dist.barrier() - # Map model to be loaded to specified single gpu. - loc = {'cuda:%d' % 0: 'cuda:%d' % lrank} if args.cuda else { - 'cpu:%d' % 0: 'cpu:%d' % lrank} - checkpoint = torch.load( - program_dir+'/'+res_name, map_location=loc) - else: - checkpoint = torch.load(program_dir+'/'+res_name) - start_epoch = checkpoint['epoch'] - best_acc = checkpoint['best_acc'] - distrib_model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print(f'WARNING: restarting from {start_epoch} epoch') - else: - print(f'WARNING: restarting from {start_epoch} epoch') - except Exception: - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print('WARNING: restart file cannot be loaded, ' - 'restarting!') - else: - print('WARNING: restart file cannot be loaded, restarting!') - - if start_epoch > args.epochs: - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print('WARNING: given epochs are less than the one in the ' - 'restart file!\n' - 'WARNING: SYS.EXIT is issued') - - strategy.clean_up() - sys.exit() - else: - print('WARNING: given epochs are less than the one in ' - 'the restart file!\n' - 'WARNING: SYS.EXIT is issued') - sys.exit() - - # start trainin/testing loop - if strategy.is_main_worker(): - print('TIMER: broadcast:', time.time()-st, 's') + # Start training and test loop + if strategy.is_main_worker: + print('TIMER: broadcast:', timer()-st, 's') print('\nDEBUG: start training') print('--------------------------------------------------------') - et = time.time() + et = timer() + start_epoch = 1 for epoch in range(start_epoch, args.epochs + 1): - lt = time.time() - # training + lt = timer() + if strategy.is_distributed: + # Inform the sampler that a new epoch started: shuffle + # may be needed + train_loader.sampler.set_epoch(epoch) + test_loader.sampler.set_epoch(epoch) + + # Training loss_acc = train( - model=distrib_model, - device=device, + model=model, train_loader=train_loader, optimizer=optimizer, epoch=epoch, @@ -470,77 +375,52 @@ def download_mnist(): args=args ) - # testing + # Testing acc_test = test( - model=distrib_model, - device=device, + model=model, test_loader=test_loader, strategy=strategy ) - # save first epoch timer + # Save first epoch timer if epoch == start_epoch: - first_ep_t = time.time()-lt + first_ep_t = timer()-lt - # final epoch + # Final epoch if epoch + 1 == args.epochs: train_loader.last_epoch = True test_loader.last_epoch = True - if strategy.is_main_worker(): - print('TIMER: epoch time:', time.time()-lt, 's') + if strategy.is_main_worker: + print('TIMER: epoch time:', timer()-lt, 
's') print('DEBUG: accuracy:', acc_test, '%') - # save state if found a better state - is_best = loss_acc < best_acc - if epoch % args.restart_int == 0: - save_state( - epoch=epoch, - distrib_model=distrib_model, - loss_acc=loss_acc, - optimizer=optimizer, - res_name=res_name, - is_best=is_best, - strategy=strategy - ) - # reset best_acc - best_acc = min(loss_acc, best_acc) - - # finalise - # save final state - save_state( - epoch=epoch, - distrib_model=distrib_model, - loss_acc=loss_acc, - optimizer=optimizer, - res_name=res_name, - is_best=True, - strategy=strategy - ) - - # some debug - if strategy.is_main_worker(): + if strategy.is_main_worker: print('\n--------------------------------------------------------') print('DEBUG: training results:\n') print('TIMER: first epoch time:', first_ep_t, ' s') - print('TIMER: last epoch time:', time.time()-lt, ' s') - print('TIMER: average epoch time:', (time.time()-et)/args.epochs, ' s') - print('TIMER: total epoch time:', time.time()-et, ' s') + print('TIMER: last epoch time:', timer()-lt, ' s') + print('TIMER: average epoch time:', (timer()-et)/args.epochs, ' s') + print('TIMER: total epoch time:', timer()-et, ' s') if epoch > 1: print('TIMER: total epoch-1 time:', - time.time()-et-first_ep_t, ' s') + timer()-et-first_ep_t, ' s') print('TIMER: average epoch-1 time:', - (time.time()-et-first_ep_t)/(args.epochs-1), ' s') + (timer()-et-first_ep_t)/(args.epochs-1), ' s') print('DEBUG: last accuracy:', acc_test, '%') - print('DEBUG: memory req:', - int(torch.cuda.memory_reserved(lrank)/1024/1024), 'MB') \ - if args.cuda else 'DEBUG: memory req: - MB' - print('DEBUG: memory summary:\n\n', - torch.cuda.memory_summary(0)) if args.cuda else '' + if torch.cuda.is_available(): + print('DEBUG: memory req:', + int(torch.cuda.memory_reserved( + strategy.local_rank())/1024/1024), + 'MB') + print('DEBUG: memory summary:\n\n', + torch.cuda.memory_summary(0)) + + print(f'TIMER: final time: {timer()-st} s\n') - if strategy.is_main_worker(): - print(f'TIMER: final time: {time.time()-st} s\n') + time.sleep(1) + print(f" - TRAINING FINISHED") - print(f" - TRAINING FINISHED") + # Clean-up strategy.clean_up() sys.exit() diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/README.md b/tutorials/distributed-ml/torch-tutorial-2-imagenet/README.md deleted file mode 100644 index 780eb278..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/README.md +++ /dev/null @@ -1,47 +0,0 @@ -# Tutorial: distributed strategies for PyTorch model trained on MNIST dataset - -In this tutorial we show how to use torch `DistributedDataParallel` (DDP), Horovod and -DeepSpeed from the same client code. -Note that the environment is tested on the HDFML system at JSC. For other systems, -the module versions might need change accordingly. - -## Setup - -First, from the root of this repository, build the environment containing -pytorch, horovod and deepspeed. You can *try* with: - -```bash -# Creates a Python venv called envAI_hdfml -make torch-gpu-jsc -``` - -The Imagenet dataset is assumed to be already downloaded to some location. 
- -## Distributed training - -Each distributed strategy has its own SLURM job script, which -should be used to run it: - -If you want to distribute the code in `train.py` with **torch DDP**, run from terminal: - -```bash -sbatch ddp_slurm.sh -``` - -If you want to distribute the code in `train.py` with **DeepSpeed**, run from terminal: - -```bash -sbatch deepspeed_slurm.sh -``` - -If you want to distribute the code in `train.py` with **Horovod**, run from terminal: - -```bash -sbatch hvd_slurm.sh -``` - -You can run all of them with: - -```bash -bash runall.sh -``` diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/config.yaml b/tutorials/distributed-ml/torch-tutorial-2-imagenet/config.yaml deleted file mode 100644 index 2473d346..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/config.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# I/O -data_dir: /p/project/intertwin/datasets/Imagenet_sub/ImageNet_uncompressed/train/ #/p/project/intertwin/datasets/ImageNet_uncompressed/train -restart_int: 10 -verbose: True - -# Model -batch_size: 64 -epochs: 3 -lr: 0.001 -momentum: 0.5 -shuff: False -num_classes: 1000 - -# Debugging -testrun: False -nseed: 10 -log_int: 10 - -# Distributed ML -backend: nccl -nworker: 4 # num workers dataloader -prefetch: 2 -no_cuda: False - - diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/ddp_slurm.sh b/tutorials/distributed-ml/torch-tutorial-2-imagenet/ddp_slurm.sh deleted file mode 100644 index 4e9749c2..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/ddp_slurm.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DDP_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ddp.out -#SBATCH --error=job-ddp.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=1 -#SBATCH --cpus-per-task=32 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set comm -export CUDA_VISIBLE_DEVICES="0,1,2,3" -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi - -# launch training -TRAINING_CMD="train.py -s ddp -c config.yaml" - -srun --cpu-bind=none bash -c "torchrun \ - --log_dir='logs' \ - --nnodes=$SLURM_NNODES \ - --nproc_per_node=$SLURM_GPUS_PER_NODE \ - --rdzv_id=$SLURM_JOB_ID \ - --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ - --rdzv_backend=c10d \ - --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ - $TRAINING_CMD" - diff --git 
a/tutorials/distributed-ml/torch-tutorial-2-imagenet/deepspeed_slurm.sh b/tutorials/distributed-ml/torch-tutorial-2-imagenet/deepspeed_slurm.sh deleted file mode 100644 index 8f1c2d2d..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/deepspeed_slurm.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_DeepSpeed_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-ds.out -#SBATCH --error=job-ds.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=4 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source ../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set env vars -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i -export MASTER_ADDR -export MASTER_PORT=29500 - -TRAINING_CMD="train.py -s deepspeed -c config.yaml" - -# Run without launcher: set --ntasks-per-node=NUM_GPUS -srun --cpu-bind=none python -u $TRAINING_CMD --deepspeed - -# # Run with deepspeed launcher: set --ntasks-per-node=1 -# # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables -# export NCCL_IB_DISABLE=1 -# export NCCL_SOCKET_IFNAME=eth0 -# nodelist=$(scontrol show hostname $SLURM_NODELIST) -# echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile -# # Requires passwordless SSH access among compute node -# srun --cpu-bind=none deepspeed --hostfile=.hostfile $TRAINING_CMD --deepspeed -# rm .hostfile - diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/hvd_slurm.sh b/tutorials/distributed-ml/torch-tutorial-2-imagenet/hvd_slurm.sh deleted file mode 100644 index 69b9d51e..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/hvd_slurm.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# general configuration of the job -#SBATCH --job-name=Torch_HVD_tutorial-1 -#SBATCH --account=intertwin -#SBATCH --mail-user= -#SBATCH --mail-type=ALL -#SBATCH --output=job-hvd.out -#SBATCH --error=job-hvd.err -#SBATCH --time=00:30:00 - -# configure node and process count on the CM -#SBATCH --partition=batch -#SBATCH --nodes=2 -#SBATCH --ntasks-per-node=4 -#SBATCH --cpus-per-task=8 -#SBATCH --gpus-per-node=4 -#SBATCH --exclusive - -# gres options have to be disabled for deepv -#SBATCH --gres=gpu:4 - -# set modules -ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py - -# set env -source 
../../../envAI_hdfml/bin/activate - -# job info -debug=false -echo "DEBUG: TIME: $(date)" -echo "DEBUG: EXECUTE: $EXEC" -echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" -echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" -echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" -echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" -echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" -echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" -echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" -echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" -echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" -if [ "$debug" = true ] ; then - export NCCL_DEBUG=INFO -fi -echo - -# set vars -# export NCCL_DEBUG=INFO -export SRUN_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} -export OMP_NUM_THREADS=1 -if [ "$SLURM_CPUS_PER_TASK" -gt 0 ] ; then - export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK -fi -export CUDA_VISIBLE_DEVICES="0,1,2,3" - -# launch training -TRAINING_CMD="train.py -s horovod -c config.yaml" - -srun --cpu-bind=none python -u $TRAINING_CMD - diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/runall.sh b/tutorials/distributed-ml/torch-tutorial-2-imagenet/runall.sh deleted file mode 100644 index 21c02a22..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/runall.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -# Run all versions of distributed ML version -rm *checkpoint.pth.tar *.out *.err *.csv -echo "Torch DDP training: $(sbatch ddp_slurm.sh)" -echo "DeepSpeed training: $(sbatch deepspeed_slurm.sh)" -echo "Horovod training: $(sbatch hvd_slurm.sh)" \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/scaling-test.sh b/tutorials/distributed-ml/torch-tutorial-2-imagenet/scaling-test.sh deleted file mode 100644 index 275f7fb7..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/scaling-test.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -rm *checkpoint.pth.tar *.out *.err *.csv - -timeout="01:01:00" -for N in 1 2 4 8 -do - sbatch --job-name="DDP-imagenet-n$N" --nodes=$N --output="job-ddp-n$N.out" --error="job-ddp-n$N.err" --time=$timeout ddp_slurm.sh - sbatch --job-name="DS-imagenet-n$N" --nodes=$N --output="job-ds-n$N.out" --error="job-ds-n$N.err" --time=$timeout deepspeed_slurm.sh - sbatch --job-name="HVD-imagenet-n$N" --nodes=$N --output="job-hvd-n$N.out" --error="job-hvd-n$N.err" --time=$timeout hvd_slurm.sh -done \ No newline at end of file diff --git a/tutorials/distributed-ml/torch-tutorial-2-imagenet/train.py b/tutorials/distributed-ml/torch-tutorial-2-imagenet/train.py deleted file mode 100644 index 6bd71214..00000000 --- a/tutorials/distributed-ml/torch-tutorial-2-imagenet/train.py +++ /dev/null @@ -1,499 +0,0 @@ -""" -Show how to use DDP, Horovod and DeepSpeed strategies interchangeably -with a large neural network trained on Imagenet dataset, showing how -to use checkpoints. 
-""" -import os -import argparse -import sys -import time -import numpy as np -import random - -import torch -from torch import nn -import torch.distributed as dist -import torch.nn.functional as F -import torchvision -from torchvision import transforms -from torch.utils.data import DataLoader, DistributedSampler - -import deepspeed - -from itwinai.torch.distributed import ( - TorchDistributedStrategy, - DDPDistributedStrategy, - HVDDistributedStrategy, - DSDistributedStrategy, -) -from itwinai.parser import ArgumentParser as ItAIArgumentParser -from itwinai.loggers import EpochTimeTracker - - -def parse_args() -> argparse.Namespace: - """ - Parse CLI args, which can also be loaded from a configuration file - using the --config flag: - - >>> train.py --strategy ddp --config config.yaml - """ - parser = ItAIArgumentParser(description='PyTorch MNIST Example') - - # Distributed ML strategy - parser.add_argument( - "--strategy", "-s", type=str, - choices=['ddp', 'horovod', 'deepspeed'], - default='ddp' - ) - - # IO parsers - parser.add_argument('--data-dir', default='./', - help=('location of the training dataset in the local ' - 'filesystem')) - parser.add_argument('--restart-int', type=int, default=10, - help='restart interval per epoch (default: 10)') - parser.add_argument('--verbose', - action=argparse.BooleanOptionalAction, - help='Print parsed arguments') - - # model parsers - parser.add_argument('--batch-size', type=int, default=64, - help='input batch size for training (default: 64)') - parser.add_argument('--epochs', type=int, default=10, - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, - help='momentum in SGD optimizer (default: 0.5)') - parser.add_argument('--shuff', action='store_true', default=False, - help='shuffle dataset (default: False)') - parser.add_argument('--num-classes', type=int, default=1000, - help='number of classes in dataset') - - # debug parsers - parser.add_argument('--testrun', action='store_true', default=False, - help='do a test run with seed (default: False)') - parser.add_argument('--nseed', type=int, default=0, - help='seed integer for reproducibility (default: 0)') - parser.add_argument('--log-int', type=int, default=10, - help='log interval per training') - - # parallel parsers - parser.add_argument('--backend', type=str, default='nccl', - help='backend for parrallelisation (default: nccl)') - parser.add_argument('--nworker', type=int, default=0, - help=('number of workers in DataLoader (default: 0 -' - ' only main)')) - parser.add_argument('--prefetch', type=int, default=2, - help='prefetch data in DataLoader (default: 2)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables GPGPUs') - parser.add_argument('--local_rank', type=int, default=-1, - help='local rank passed from distributed launcher') - - # DeepSpeed - parser = deepspeed.add_config_arguments(parser) - args = parser.parse_args() - - if args.verbose: - args_list = [f"{key}: {val}" for key, val in args.items()] - print("PARSED ARGS:\n", '\n'.join(args_list)) - - return args - - -def train( - model, device, train_loader, optimizer, epoch, - strategy: TorchDistributedStrategy, args -): - """ - Training function, representing an epoch. 
- """ - model.train() - t_list = [] - loss_acc = 0 - gwsize = strategy.dist_gwsize() - if strategy.is_main_worker(): - print("\n") - for batch_idx, (data, target) in enumerate(train_loader): - t = time.perf_counter() - data, target = data.to(device), target.to(device) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % args.log_int == 0 and strategy.is_main_worker(): - print( - f'Train epoch: {epoch} ' - f'[{batch_idx * len(data)}/{len(train_loader.dataset)/gwsize} ' - f'({100.0 * batch_idx / len(train_loader):.0f}%)]\t\t' - f'Loss: {loss.item():.6f}') - t_list.append(time.perf_counter() - t) - loss_acc += loss.item() - if strategy.is_main_worker(): - print('TIMER: train time', sum(t_list) / len(t_list), 's') - return loss_acc - - -def test(model, device, test_loader, strategy: TorchDistributedStrategy): - """ - Model validation. - """ - model.eval() - test_loss = 0 - correct = 0 - gwsize = strategy.dist_gwsize() - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - # sum up batch loss - test_loss += F.nll_loss(output, target, reduction="sum").item() - # get the index of the max log-probability - pred = output.argmax(dim=1, keepdim=True) - correct += pred.eq(target.view_as(pred)).sum().item() - test_loss /= len(test_loader.dataset) - if strategy.is_main_worker(): - print( - f'Test set: average loss: {test_loss:.4f}\t' - f'accurate samples: {correct}/{len(test_loader.dataset)/gwsize}') - acc_test = 100.0 * correct * gwsize / len(test_loader.dataset) - return acc_test - - -def save_state( - epoch, distrib_model, loss_acc, optimizer, - res_name, is_best, strategy: TorchDistributedStrategy -): - """ - Save training state. - """ - grank = strategy.dist_grank() - rt = time.time() - # find if is_best happened in any worker - if torch.cuda.is_available(): - is_best_m = strategy.par_allgather_obj(is_best) - - if torch.cuda.is_available(): - if any(is_best_m): - # find which rank is_best happened - select first rank if multiple - is_best_rank = np.where(np.array(is_best_m))[0][0] - - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_acc': loss_acc, - 'optimizer': optimizer.state_dict()} - - # write on worker with is_best - if grank == is_best_rank: - torch.save(state, './'+res_name) - print( - f'DEBUG: state in {grank} is saved on epoch:{epoch} ' - f'in {time.time()-rt} s') - else: - # collect state - state = {'epoch': epoch + 1, - 'state_dict': distrib_model.state_dict(), - 'best_acc': loss_acc, - 'optimizer': optimizer.state_dict()} - - torch.save(state, './'+res_name) - print( - f'DEBUG: state in {grank} is saved on epoch:{epoch} in ' - f'{time.time()-rt} s') - - -def seed_worker(worker_id): - """ - Seed dataloader worker. 
- """ - worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) - random.seed(worker_seed) - - -if __name__ == "__main__": - - args = parse_args() - - # Instantiate Strategy - if args.strategy == 'ddp': - if (not torch.cuda.is_available() - or not torch.cuda.device_count() > 1): - raise RuntimeError('Resources unavailable') - - strategy = DDPDistributedStrategy(backend=args.backend) - elif args.strategy == 'horovod': - strategy = HVDDistributedStrategy() - elif args.strategy == 'deepspeed': - strategy = DSDistributedStrategy(backend=args.backend) - else: - raise NotImplementedError( - f"Strategy {args.strategy} is not recognized/implemented.") - strategy.init() - - # check CUDA availability - args.cuda = not args.no_cuda and torch.cuda.is_available() - - # limit # of CPU threads to be used per worker - torch.set_num_threads(1) - - # get directory - program_dir = os.getcwd() - - # start the time.time for profiling - st = time.time() - - # deterministic testrun - if args.testrun: - torch.manual_seed(args.nseed) - g = torch.Generator() - g.manual_seed(args.nseed) - - # get job rank info - rank==0 master gpu - if torch.cuda.is_available(): - # local world size - per node - lwsize = strategy.dist_lwsize() if args.cuda else 0 - gwsize = strategy.dist_gwsize() # global world size - per run - grank = strategy.dist_grank() # global rank - assign per run - lrank = strategy.dist_lrank() # local rank - assign per node - else: - gwsize = 1 - grank = 0 - - # some debug - if strategy.is_main_worker(): - print('TIMER: initialise:', time.time()-st, 's') - - # move the model on the GPU assigned to the current process - device = torch.device( - strategy.dist_device() if args.cuda and torch.cuda.is_available() - else 'cpu') - if args.cuda: - torch.cuda.set_device(lrank) - # deterministic testrun - if args.testrun: - torch.cuda.manual_seed(args.nseed) - - # dataset - # Initialize transformations for data augmentation - transform = transforms.Compose([ - transforms.Resize(256), - transforms.RandomHorizontalFlip(), - transforms.RandomVerticalFlip(), - transforms.RandomRotation(degrees=45), - transforms.ColorJitter( - brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ]) - - # Load the ImageNet Object Localization Challenge dataset - train_dataset = torchvision.datasets.ImageFolder( - root=args.data_dir, - transform=transform - ) - # test_dataset = ... 
- - # restricts data loading to a subset of the dataset exclusive to the - # current process - args.shuff = args.shuff and not args.testrun - if torch.cuda.is_available(): - train_sampler = DistributedSampler( - train_dataset, num_replicas=gwsize, rank=grank, shuffle=args.shuff) - # test_sampler = DistributedSampler( - # test_dataset, num_replicas=gwsize, rank=grank, - # shuffle=args.shuff) - # distribute dataset to workers - # persistent workers is not possible for nworker=0 - pers_w = True if args.nworker > 1 else False - - # deterministic testrun - the same dataset each run - kwargs = {'worker_init_fn': seed_worker, - 'generator': g} if args.testrun else {} - - if torch.cuda.is_available(): - train_loader = DataLoader( - train_dataset, batch_size=args.batch_size, - sampler=train_sampler, num_workers=args.nworker, pin_memory=True, - persistent_workers=pers_w, prefetch_factor=args.prefetch, **kwargs - ) - # test_loader = DataLoader( - # test_dataset, batch_size=args.batch_size, - # sampler=test_sampler, num_workers=args.nworker, pin_memory=True, - # persistent_workers=pers_w, prefetch_factor=args.prefetch, - # **kwargs - # ) - else: - train_loader = DataLoader( - train_dataset, batch_size=args.batch_size) - # test_loader = DataLoader( - # test_dataset, batch_size=args.batch_size) - - if strategy.is_main_worker(): - print('TIMER: read and concat data:', time.time()-st, 's') - - # create CNN model: resnet 50, resnet101, resnet152 - model = torchvision.models.resnet152() - model.fc = nn.Linear(2048, args.num_classes) - - # optimizer - optimizer = torch.optim.SGD( - model.parameters(), lr=args.lr, momentum=args.momentum) - - deepspeed_config = dict(train_micro_batch_size_per_gpu=args.batch_size) - # 'config_params' key is ignored if strategy != DSDistributedStrategy - distrib_model, optimizer, _ = strategy.distributed( - model, optimizer, lr_scheduler=None, config_params=deepspeed_config - ) - - # resume state - start_epoch = 1 - best_acc = np.Inf - nnod = os.environ.get('SLURM_NNODES', 'unk') - res_name = f'{args.strategy}-{nnod}N-checkpoint.pth.tar' - if os.path.isfile(res_name): - try: - if torch.cuda.is_available(): - dist.barrier() - # Map model to be loaded to specified single gpu. 
- loc = {'cuda:%d' % 0: 'cuda:%d' % lrank} if args.cuda else { - 'cpu:%d' % 0: 'cpu:%d' % lrank} - checkpoint = torch.load( - program_dir+'/'+res_name, map_location=loc) - else: - checkpoint = torch.load(program_dir+'/'+res_name) - start_epoch = checkpoint['epoch'] - best_acc = checkpoint['best_acc'] - distrib_model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print(f'WARNING: restarting from {start_epoch} epoch') - else: - print(f'WARNING: restarting from {start_epoch} epoch') - except Exception: - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print('WARNING: restart file cannot be loaded, ' - 'restarting!') - else: - print('WARNING: restart file cannot be loaded, restarting!') - - if start_epoch > args.epochs: - if torch.cuda.is_available(): - if strategy.is_main_worker(): - print('WARNING: given epochs are less than the one in the ' - 'restart file!\n' - 'WARNING: SYS.EXIT is issued') - - strategy.clean_up() - sys.exit() - else: - print('WARNING: given epochs are less than the one in ' - 'the restart file!\n' - 'WARNING: SYS.EXIT is issued') - sys.exit() - - # start trainin/testing loop - if strategy.is_main_worker(): - print('TIMER: broadcast:', time.time()-st, 's') - print('\nDEBUG: start training') - print('--------------------------------------------------------') - epoch_time_tracker = EpochTimeTracker(series_name=args.strategy) - - et = time.time() - for epoch in range(start_epoch, args.epochs + 1): - lt = time.time() - # training - loss_acc = train( - model=distrib_model, - device=device, - train_loader=train_loader, - optimizer=optimizer, - epoch=epoch, - strategy=strategy, - args=args - ) - - # # testing - # acc_test = test( - # model=distrib_model, - # device=device, - # test_loader=test_loader, - # strategy=strategy - # ) - - # save first epoch timer - if epoch == start_epoch: - first_ep_t = time.time()-lt - - # final epoch - if epoch + 1 == args.epochs: - train_loader.last_epoch = True - # test_loader.last_epoch = True - - if strategy.is_main_worker(): - print('TIMER: epoch time:', time.time()-lt, 's') - epoch_time_tracker.add_epoch_time(epoch-1, time.time()-lt) - # print('DEBUG: accuracy:', acc_test, '%') - - # save state if found a better state - is_best = loss_acc < best_acc - if epoch % args.restart_int == 0: - save_state( - epoch=epoch, - distrib_model=distrib_model, - loss_acc=loss_acc, - optimizer=optimizer, - res_name=res_name, - is_best=is_best, - strategy=strategy - ) - # reset best_acc - best_acc = min(loss_acc, best_acc) - - # finalise - # save final state - save_state( - epoch=epoch, - distrib_model=distrib_model, - loss_acc=loss_acc, - optimizer=optimizer, - res_name=res_name, - is_best=True, - strategy=strategy - ) - - # some debug - if strategy.is_main_worker(): - print('\n--------------------------------------------------------') - print('DEBUG: training results:\n') - print('TIMER: first epoch time:', first_ep_t, ' s') - print('TIMER: last epoch time:', time.time()-lt, ' s') - print('TIMER: average epoch time:', (time.time()-et)/args.epochs, ' s') - print('TIMER: total epoch time:', time.time()-et, ' s') - if epoch > 1: - print('TIMER: total epoch-1 time:', - time.time()-et-first_ep_t, ' s') - print('TIMER: average epoch-1 time:', - (time.time()-et-first_ep_t)/(args.epochs-1), ' s') - # print('DEBUG: last accuracy:', acc_test, '%') - print('DEBUG: memory req:', - int(torch.cuda.memory_reserved(lrank)/1024/1024), 'MB') \ - if 
args.cuda else 'DEBUG: memory req: - MB' - print('DEBUG: memory summary:\n\n', - torch.cuda.memory_summary(0)) if args.cuda else '' - - if strategy.is_main_worker(): - print(f'TIMER: final time: {time.time()-st} s\n') - nnod = os.environ.get('SLURM_NNODES', 'unk') - epoch_time_tracker.save( - csv_file=f"epochtime_{args.strategy}_{nnod}N.csv") - - print(f" - TRAINING FINISHED") - strategy.clean_up() - sys.exit() diff --git a/tutorials/ml-workflows/basic_components.py b/tutorials/ml-workflows/basic_components.py index 49e74180..1fca03d8 100644 --- a/tutorials/ml-workflows/basic_components.py +++ b/tutorials/ml-workflows/basic_components.py @@ -70,12 +70,6 @@ def execute( """ return train_set, vaild_set, test_set, "my_trained_model" - def save_state(self): - return super().save_state() - - def load_state(self): - return super().load_state() - class MySaver(Saver): @monitor_exec diff --git a/use-cases/3dgan/Dockerfile b/use-cases/3dgan/Dockerfile index c10d8ec8..26cc3f29 100644 --- a/use-cases/3dgan/Dockerfile +++ b/use-cases/3dgan/Dockerfile @@ -1,19 +1,25 @@ -# FROM python:3.9.12 FROM nvcr.io/nvidia/pytorch:23.09-py3 +# FROM python:3.11 WORKDIR /usr/src/app -RUN pip install --upgrade pip -RUN pip install --no-cache-dir lightning +# Install itwinai +COPY pyproject.toml ./ +COPY src ./ +RUN pip install --upgrade pip \ + && pip install --no-cache-dir lightning \ + && pip install --no-cache-dir . -# Add 3DGAN custom requirements +# Add 3DGAN use case files and install additional requirements COPY use-cases/3dgan/requirements.txt ./ +COPY use-cases/3dgan/* ./ RUN pip install --no-cache-dir -r requirements.txt -# Install itwinai and dependencies -COPY pyproject.toml ./ -COPY src ./ -RUN pip install --no-cache-dir . +# Create non-root user +RUN groupadd -g 10001 dotnet \ + && useradd -m -u 10000 -g dotnet dotnet \ + && chown -R dotnet:dotnet /usr/src/app +USER dotnet:dotnet -# Add 3DGAN use case files -COPY use-cases/3dgan/* ./ \ No newline at end of file +# ENTRYPOINT [ "itwinai", "exec-pipeline" ] +# CMD [ "--config", "pipeline.yaml" ] \ No newline at end of file diff --git a/use-cases/3dgan/README.md b/use-cases/3dgan/README.md index d0bf2c82..53501e89 100644 --- a/use-cases/3dgan/README.md +++ b/use-cases/3dgan/README.md @@ -19,30 +19,22 @@ micromamba virtual environment. ## Training -At CERN, use the dedicated configuration file: +Launch training using `itwinai` and the training configuration: ```bash cd use-cases/3dgan -itwinai exec-pipeline --config cern-pipeline.yaml +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline # Or better: -micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu itwinai exec-pipeline --config cern-pipeline.yaml +micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu \ + itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline ``` -Anywhere else, use the general purpose training configuration: +To visualize the logs with MLFLow, if you set a local path as tracking URI, +run the following in the terminal: ```bash -cd use-cases/3dgan -itwinai exec-pipeline --config pipeline.yaml - -# Or better: -micromamba run -p ../../.venv-pytorch/ torchrun --nproc_per_node gpu itwinai exec-pipeline --config pipeline.yaml -``` - -To visualize the logs with MLFLow run the following in the terminal: - -```bash -micromamba run -p ../../.venv-pytorch mlflow ui --backend-store-uri ml_logs/mlflow_logs +micromamba run -p ../../.venv-pytorch mlflow ui --backend-store-uri LOCAL_TRACKING_URI ``` And select the "3DGAN" experiment. 
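For a fully local run, a minimal sketch (assuming you override the `mlflow_tracking_uri` key defined in `config.yaml` with a local folder instead of the remote tracking server it points to by default) could look like:

```bash
# Hypothetical local-only setup: point the MLFlow logger to a local folder
itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline \
    -o mlflow_tracking_uri=ml_logs/mlflow_logs

# Then inspect the logs from that same folder
micromamba run -p ../../.venv-pytorch mlflow ui --backend-store-uri ml_logs/mlflow_logs
```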
@@ -69,12 +61,8 @@ sub-folders: 2. As model, if a pre-trained checkpoint is not available, we can create a dummy version of it with: - ```python - import torch - from model import ThreeDGAN - # Same params as in the training config file! - my_gan = ThreeDGAN() - torch.save(my_gan, '3dgan-inference.pth') + ```bash + python create_inference_sample.py ``` 3. Run inference command. This will generate a `3dgan-generated-data` @@ -82,7 +70,7 @@ folder containing generated particle traces in form of torch tensors (.pth files) and 3D scatter plots (.jpg images). ```bash - itwinai exec-pipeline --config inference-pipeline.yaml + itwinai exec-pipeline --config config.yaml --pipe-key inference_pipeline ``` The inference execution will produce a folder called @@ -120,19 +108,20 @@ export STRATEGY="auto" # distributed strategy export DEVICES="0," # GPU devices list -itwinai exec-pipeline --print-config --config $CERN_CODE_ROOT/inference-pipeline.yaml \ --o pipeline.init_args.steps.dataloading_step.init_args.data_path=$TMP_DATA_ROOT/exp_data \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.logger.init_args.save_dir=$TMP_DATA_ROOT/ml_logs/mlflow_logs \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.strategy=$STRATEGY \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.devices=$DEVICES \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.accelerator=$ACCELERATOR \ --o pipeline.init_args.steps.inference_step.init_args.model.init_args.model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.datapath=$TMP_DATA_ROOT/exp_data/*/*.h5 \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.max_samples=$MAX_DATA_SAMPLES \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.batch_size=$BATCH_SIZE \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.num_workers=$NUM_WORKERS_DL \ --o pipeline.init_args.steps.saver_step.init_args.save_dir=$TMP_DATA_ROOT/3dgan-generated-data \ --o pipeline.init_args.steps.saver_step.init_args.aggregate_predictions=$AGGREGATE_PREDS +itwinai exec-pipeline --print-config --config $CERN_CODE_ROOT/config.yaml \ + --pipe-key inference_pipeline \ + -o dataset_location=$CERN_DATA_ROOT/exp_data \ + -o logs_dir=$TMP_DATA_ROOT/ml_logs/mlflow_logs \ + -o distributed_strategy=$STRATEGY \ + -o devices=$DEVICES \ + -o hw_accelerators=$ACCELERATOR \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o inference_model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ + -o max_dataset_size=$MAX_DATA_SAMPLES \ + -o batch_size=$BATCH_SIZE \ + -o num_workers_dataloader=$NUM_WORKERS_DL \ + -o inference_results_location=$TMP_DATA_ROOT/3dgan-generated-data \ + -o aggregate_predictions=$AGGREGATE_PREDS ``` ### Docker image @@ -196,17 +185,20 @@ export ACCELERATOR="gpu" # choose "cpu" or "gpu" docker run -it --rm --name running-inference \ -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.1 \ /bin/bash -c "itwinai exec-pipeline \ ---config inference-pipeline.yaml --print-config \ --o pipeline.init_args.steps.dataloading_step.init_args.data_path=$CERN_DATA_ROOT/exp_data \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.logger.init_args.save_dir=$CERN_DATA_ROOT/ml_logs/mlflow_logs \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.accelerator=$ACCELERATOR \ --o 
pipeline.init_args.steps.inference_step.init_args.model.init_args.model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.datapath=$CERN_DATA_ROOT/exp_data/*/*.h5 \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.max_samples=$MAX_DATA_SAMPLES \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.batch_size=$BATCH_SIZE \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.num_workers=$NUM_WORKERS_DL \ --o pipeline.init_args.steps.saver_step.init_args.save_dir=$CERN_DATA_ROOT/3dgan-generated-data \ --o pipeline.init_args.steps.saver_step.init_args.aggregate_predictions=$AGGREGATE_PREDS " + --print-config --config $CERN_CODE_ROOT/config.yaml \ + --pipe-key inference_pipeline \ + -o dataset_location=$CERN_DATA_ROOT/exp_data \ + -o logs_dir=$TMP_DATA_ROOT/ml_logs/mlflow_logs \ + -o distributed_strategy=$STRATEGY \ + -o devices=$DEVICES \ + -o hw_accelerators=$ACCELERATOR \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o inference_model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ + -o max_dataset_size=$MAX_DATA_SAMPLES \ + -o batch_size=$BATCH_SIZE \ + -o num_workers_dataloader=$NUM_WORKERS_DL \ + -o inference_results_location=$TMP_DATA_ROOT/3dgan-generated-data \ + -o aggregate_predictions=$AGGREGATE_PREDS " ``` #### How to fully exploit GPU resources @@ -231,7 +223,7 @@ Run Docker container with Singularity: ```bash singularity run --nv -B "$PWD":/usr/data docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.1 /bin/bash -c \ -"cd /usr/src/app && itwinai exec-pipeline --config inference-pipeline.yaml" +"cd /usr/src/app && itwinai exec-pipeline --config config.yaml --pipe-key inference_pipeline" ``` Example with overrides (as above for Docker): @@ -248,15 +240,18 @@ export ACCELERATOR="gpu" # choose "cpu" or "gpu" singularity run --nv -B "$PWD":/usr/data docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.1 /bin/bash -c \ "cd /usr/src/app && itwinai exec-pipeline \ ---config inference-pipeline.yaml --print-config \ --o pipeline.init_args.steps.dataloading_step.init_args.data_path=$CERN_DATA_ROOT/exp_data \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.logger.init_args.save_dir=$CERN_DATA_ROOT/ml_logs/mlflow_logs \ --o pipeline.init_args.steps.inference_step.init_args.config.trainer.accelerator=$ACCELERATOR \ --o pipeline.init_args.steps.inference_step.init_args.model.init_args.model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.datapath=$CERN_DATA_ROOT/exp_data/*/*.h5 \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.max_samples=$MAX_DATA_SAMPLES \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.batch_size=$BATCH_SIZE \ --o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.num_workers=$NUM_WORKERS_DL \ --o pipeline.init_args.steps.saver_step.init_args.save_dir=$CERN_DATA_ROOT/3dgan-generated-data \ --o pipeline.init_args.steps.saver_step.init_args.aggregate_predictions=$AGGREGATE_PREDS " + --print-config --config $CERN_CODE_ROOT/config.yaml \ + --pipe-key inference_pipeline \ + -o dataset_location=$CERN_DATA_ROOT/exp_data \ + -o logs_dir=$TMP_DATA_ROOT/ml_logs/mlflow_logs \ + -o distributed_strategy=$STRATEGY \ + -o devices=$DEVICES \ + -o hw_accelerators=$ACCELERATOR \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o 
inference_model_uri=$CERN_CODE_ROOT/3dgan-inference.pth \ + -o max_dataset_size=$MAX_DATA_SAMPLES \ + -o batch_size=$BATCH_SIZE \ + -o num_workers_dataloader=$NUM_WORKERS_DL \ + -o inference_results_location=$TMP_DATA_ROOT/3dgan-generated-data \ + -o aggregate_predictions=$AGGREGATE_PREDS " ``` diff --git a/use-cases/3dgan/cern-pipeline.yaml b/use-cases/3dgan/cern-pipeline.yaml deleted file mode 100644 index 0bc9a756..00000000 --- a/use-cases/3dgan/cern-pipeline.yaml +++ /dev/null @@ -1,95 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - steps: - - class_path: dataloader.Lightning3DGANDownloader - init_args: - data_path: /eos/user/k/ktsolaki/data/3dgan_data - data_url: null # https://drive.google.com/drive/folders/1uPpz0tquokepptIfJenTzGpiENfo2xRX - - - class_path: trainer.Lightning3DGANTrainer - init_args: - # Pytorch lightning config for training - config: - seed_everything: 4231162351 - trainer: - accelerator: auto - accumulate_grad_batches: 1 - barebones: false - benchmark: null - callbacks: - - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping - init_args: - monitor: val_generator_loss - patience: 2 - - class_path: lightning.pytorch.callbacks.lr_monitor.LearningRateMonitor - init_args: - logging_interval: step - - class_path: lightning.pytorch.callbacks.ModelCheckpoint - init_args: - dirpath: checkpoints - filename: best-checkpoint - mode: min - monitor: val_generator_loss - save_top_k: 1 - verbose: true - check_val_every_n_epoch: 1 - default_root_dir: null - detect_anomaly: false - deterministic: null - devices: auto #[0] - enable_checkpointing: true - enable_model_summary: null - enable_progress_bar: null - fast_dev_run: false - gradient_clip_algorithm: null - gradient_clip_val: null - inference_mode: true - limit_predict_batches: null - limit_test_batches: null - limit_train_batches: null - limit_val_batches: null - log_every_n_steps: 2 - logger: - # - class_path: lightning.pytorch.loggers.CSVLogger - # init_args: - # save_dir: ml_logs/csv_logs - class_path: lightning.pytorch.loggers.MLFlowLogger - init_args: - experiment_name: 3DGAN - save_dir: ml_logs/mlflow_logs - log_model: all - max_epochs: 100 - max_steps: -1 - max_time: null - min_epochs: null - min_steps: null - num_sanity_val_steps: null - overfit_batches: 0.0 - plugins: null - profiler: null - reload_dataloaders_every_n_epochs: 0 - strategy: ddp_find_unused_parameters_true #auto - sync_batchnorm: false - use_distributed_sampler: true - val_check_interval: null - - # Lightning Model configuration - model: - class_path: model.ThreeDGAN - init_args: - latent_size: 256 - batch_size: 128 - loss_weights: [3, 0.1, 25, 0.1] - power: 0.85 - lr: 0.001 - checkpoint_path: checkpoints/3dgan.pth - - # Lightning data module configuration - data: - class_path: dataloader.ParticlesDataModule - init_args: - datapath: /eos/user/k/ktsolaki/data/3dgan_data/*.h5 # exp_data/*/*.h5 - batch_size: 128 - num_workers: 0 - max_samples: 10000 diff --git a/use-cases/3dgan/config.yaml b/use-cases/3dgan/config.yaml new file mode 100644 index 00000000..d23288d5 --- /dev/null +++ b/use-cases/3dgan/config.yaml @@ -0,0 +1,208 @@ +# Main configurations +dataset_location: exp_data/ +dataset_url: https://drive.google.com/drive/folders/1uPpz0tquokepptIfJenTzGpiENfo2xRX +hw_accelerators: auto +distributed_strategy: auto #ddp_find_unused_parameters_true +devices: auto #[0] +checkpoints_path: checkpoints +logs_dir: ml_logs +mlflow_tracking_uri: https://131.154.99.166.myip.cloud.infn.it +batch_size: 4 
+num_workers_dataloader: 0 +max_epochs: 2 +max_dataset_size: 48 +random_seed: 4231162351 +inference_results_location: 3dgan-generated-data/ +inference_model_uri: 3dgan-inference.pth +aggregate_predictions: false + +# Dataloading step is common and can be reused +dataloading_step: + class_path: dataloader.Lightning3DGANDownloader + init_args: + data_path: ${dataset_location} # Set to null to skip dataset download + data_url: ${dataset_url} + +# AI workflows +training_pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + dataloading_step: ${dataloading_step} + + training_step: + class_path: trainer.Lightning3DGANTrainer + init_args: + exp_root: ${logs_dir} + # Pytorch lightning config for training + config: + seed_everything: ${random_seed} + trainer: + accelerator: ${hw_accelerators} + accumulate_grad_batches: 1 + barebones: false + benchmark: null + callbacks: + - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping + init_args: + monitor: val_generator_loss + patience: 2 + - class_path: lightning.pytorch.callbacks.lr_monitor.LearningRateMonitor + init_args: + logging_interval: step + - class_path: lightning.pytorch.callbacks.ModelCheckpoint + init_args: + dirpath: ${checkpoints_path} + filename: best-checkpoint + mode: min + monitor: val_generator_loss + save_top_k: 1 + verbose: true + check_val_every_n_epoch: 1 + default_root_dir: null + detect_anomaly: false + deterministic: null + devices: ${devices} + enable_checkpointing: true + enable_model_summary: null + enable_progress_bar: null + fast_dev_run: false + gradient_clip_algorithm: null + gradient_clip_val: null + inference_mode: true + limit_predict_batches: null + limit_test_batches: null + limit_train_batches: null + limit_val_batches: null + log_every_n_steps: 1 + logger: + - class_path: lightning.pytorch.loggers.CSVLogger + init_args: + name: 3DGAN + save_dir: ${logs_dir} + - class_path: lightning.pytorch.loggers.MLFlowLogger + init_args: + experiment_name: 3DGAN + save_dir: null #ml_logs/mlflow_logs + tracking_uri: ${mlflow_tracking_uri} + log_model: all + max_epochs: ${max_epochs} + max_time: null + min_epochs: null + min_steps: null + num_sanity_val_steps: null + overfit_batches: 0.0 + plugins: null + profiler: null + reload_dataloaders_every_n_epochs: 0 + strategy: ${distributed_strategy} + sync_batchnorm: false + use_distributed_sampler: true + val_check_interval: null + + # Lightning Model configuration + model: + class_path: model.ThreeDGAN + init_args: + latent_size: 256 + loss_weights: [3, 0.1, 25, 0.1] + power: 0.85 + lr: 0.001 + checkpoints_dir: ${checkpoints_path} + + # Lightning data module configuration + data: + class_path: dataloader.ParticlesDataModule + init_args: + datapath: ${dataset_location} + batch_size: ${batch_size} + num_workers: ${num_workers_dataloader} + max_samples: ${max_dataset_size} + +inference_pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + dataloading_step: ${dataloading_step} + + inference_step: + class_path: trainer.Lightning3DGANPredictor + init_args: + model: + class_path: trainer.LightningModelLoader + init_args: + model_uri: ${inference_model_uri} + + # Pytorch lightning config for training + config: + seed_everything: ${random_seed} + trainer: + accelerator: ${hw_accelerators} + accumulate_grad_batches: 1 + barebones: false + benchmark: null + check_val_every_n_epoch: 1 + default_root_dir: null + detect_anomaly: false + deterministic: null + devices: ${devices} + enable_checkpointing: true + enable_model_summary: null + 
enable_progress_bar: null + fast_dev_run: false + gradient_clip_algorithm: null + gradient_clip_val: null + inference_mode: true + limit_predict_batches: null + limit_test_batches: null + limit_train_batches: null + limit_val_batches: null + log_every_n_steps: 2 + logger: + # - class_path: lightning.pytorch.loggers.CSVLogger + # init_args: + # save_dir: ml_logs/csv_logs + class_path: lightning.pytorch.loggers.MLFlowLogger + init_args: + experiment_name: 3DGAN + save_dir: ${logs_dir} + log_model: all + max_epochs: ${max_epochs} + max_steps: 20 + max_time: null + min_epochs: null + min_steps: null + num_sanity_val_steps: null + overfit_batches: 0.0 + plugins: null + profiler: null + reload_dataloaders_every_n_epochs: 0 + strategy: ${distributed_strategy} + sync_batchnorm: false + use_distributed_sampler: true + val_check_interval: null + + # Lightning Model configuration + model: + class_path: model.ThreeDGAN + init_args: + latent_size: 256 + loss_weights: [3, 0.1, 25, 0.1] + power: 0.85 + lr: 0.001 + checkpoints_dir: ${checkpoints_path} + + # Lightning data module configuration + data: + class_path: dataloader.ParticlesDataModule + init_args: + datapath: ${dataset_location} + batch_size: ${batch_size} #1024 + num_workers: ${num_workers_dataloader} #4 + max_samples: ${max_dataset_size} #null, 10000 + + saver_step: + class_path: saver.ParticleImagesSaver + init_args: + save_dir: ${inference_results_location} + aggregate_predictions: ${aggregate_predictions} \ No newline at end of file diff --git a/use-cases/3dgan/create_inference_sample.py b/use-cases/3dgan/create_inference_sample.py new file mode 100644 index 00000000..14b88870 --- /dev/null +++ b/use-cases/3dgan/create_inference_sample.py @@ -0,0 +1,23 @@ +"""Create a simple inference dataset sample and a checkpoint.""" + +import argparse +import os +import torch +from model import ThreeDGAN + + +def create_checkpoint( + root: str = '.', + ckpt_name: str = "3dgan-inference.pth" +): + ckpt_path = os.path.join(root, ckpt_name) + net = ThreeDGAN() + torch.save(net, ckpt_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--root", type=str, default='.') + parser.add_argument("--ckpt-name", type=str, default="3dgan-inference.pth") + args = parser.parse_args() + create_checkpoint(**vars(args)) diff --git a/use-cases/3dgan/dataloader.py b/use-cases/3dgan/dataloader.py index 89234895..49d2c2f5 100644 --- a/use-cases/3dgan/dataloader.py +++ b/use-cases/3dgan/dataloader.py @@ -35,7 +35,8 @@ def execute(self): gdown.download_folder( url=self.data_url, quiet=False, - output=self.data_path + output=self.data_path, + verify=False ) @@ -57,7 +58,8 @@ def __getitem__(self, idx): def fetch_data(self) -> None: print("Searching in :", self.datapath) - files = sorted(glob.glob(self.datapath)) + files = sorted(glob.glob(os.path.join( + self.datapath, '**/*.h5'), recursive=True)) print("Found {} files. 
".format(len(files))) if len(files) == 0: raise RuntimeError(f"No H5 files found at '{self.datapath}'!") diff --git a/use-cases/3dgan/inference-pipeline.yaml b/use-cases/3dgan/inference-pipeline.yaml deleted file mode 100644 index f5125576..00000000 --- a/use-cases/3dgan/inference-pipeline.yaml +++ /dev/null @@ -1,106 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - steps: - dataloading_step: - class_path: dataloader.Lightning3DGANDownloader - init_args: - data_path: /usr/data/exp_data/ - data_url: https://drive.google.com/drive/folders/1uPpz0tquokepptIfJenTzGpiENfo2xRX - - inference_step: - class_path: trainer.Lightning3DGANPredictor - init_args: - model: - class_path: trainer.LightningModelLoader - init_args: - model_uri: 3dgan-inference.pth - - # Pytorch lightning config for training - config: - seed_everything: 4231162351 - trainer: - accelerator: auto - accumulate_grad_batches: 1 - barebones: false - benchmark: null - # callbacks: - # # - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping - # # init_args: - # # monitor: val_loss - # # patience: 2 - # - class_path: lightning.pytorch.callbacks.lr_monitor.LearningRateMonitor - # init_args: - # logging_interval: step - # # - class_path: lightning.pytorch.callbacks.ModelCheckpoint - # # init_args: - # # dirpath: checkpoints - # # filename: best-checkpoint - # # mode: min - # # monitor: val_loss - # # save_top_k: 1 - # # verbose: true - check_val_every_n_epoch: 1 - default_root_dir: null - detect_anomaly: false - deterministic: null - devices: auto #[0] - enable_checkpointing: true - enable_model_summary: null - enable_progress_bar: null - fast_dev_run: false - gradient_clip_algorithm: null - gradient_clip_val: null - inference_mode: true - limit_predict_batches: null - limit_test_batches: null - limit_train_batches: null - limit_val_batches: null - log_every_n_steps: 2 - logger: - # - class_path: lightning.pytorch.loggers.CSVLogger - # init_args: - # save_dir: ml_logs/csv_logs - class_path: lightning.pytorch.loggers.MLFlowLogger - init_args: - experiment_name: 3DGAN - save_dir: /usr/data/ml_logs/mlflow_logs - log_model: all - max_epochs: 1 - max_steps: 20 - max_time: null - min_epochs: null - min_steps: null - num_sanity_val_steps: null - overfit_batches: 0.0 - plugins: null - profiler: null - reload_dataloaders_every_n_epochs: 0 - strategy: ddp_find_unused_parameters_true #auto - sync_batchnorm: false - use_distributed_sampler: true - val_check_interval: null - - # Lightning Model configuration - model: - class_path: model.ThreeDGAN - init_args: - latent_size: 256 - loss_weights: [3, 0.1, 25, 0.1] - power: 0.85 - lr: 0.001 - - # Lightning data module configuration - data: - class_path: dataloader.ParticlesDataModule - init_args: - datapath: /usr/data/exp_data/*/*.h5 - batch_size: 64 #1024 - num_workers: 2 #4 - max_samples: 10 #null, 10000 - - saver_step: - class_path: saver.ParticleImagesSaver - init_args: - save_dir: /usr/data/3dgan-generated-data - aggregate_predictions: false \ No newline at end of file diff --git a/use-cases/3dgan/interLink/3dgan-inference-cpu.yaml b/use-cases/3dgan/interLink/3dgan-inference-cpu.yaml index 2ba3c0a8..ef9016b4 100644 --- a/use-cases/3dgan/interLink/3dgan-inference-cpu.yaml +++ b/use-cases/3dgan/interLink/3dgan-inference-cpu.yaml @@ -1,29 +1,31 @@ apiVersion: v1 kind: Pod metadata: - name: 3dgan-cpu + name: 3dgan-inference-cpu annotations: - slurm-job.vk.io/flags: "-p gpu --gres=gpu:1 --cpus-per-task=4 --mem=100G --ntasks-per-node=1 --nodes=1" + 
slurm-job.vk.io/flags: "-p gpu --gres=gpu:1 --cpus-per-task=4 --mem=100G --ntasks-per-node=1 --nodes=1 --time=00:55:00" job.vk.io/singularity-mounts: "--bind /ceph/hpc/data/st2301-itwin-users/egarciagarcia:/exp_data" - #job.vk.io/pre-exec: "singularity pull /ceph/hpc/data/st2301-itwin-users/itwinaiv6_1.sif docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.2" + #job.vk.io/pre-exec: "singularity pull /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.4" spec: automountServiceAccountToken: false containers: - args: - -c - - "\" cd /usr/src/app && itwinai exec-pipeline --print-config --config \\$CERN_CODE_ROOT/inference-pipeline.yaml \ - -o pipeline.init_args.steps.dataloading_step.init_args.data_path=\\$CERN_DATA_ROOT \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.logger.init_args.save_dir=\\$TMP_DATA_ROOT/ml_logs/mlflow_logs \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.strategy=\\$STRATEGY \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.devices=\\$DEVICES \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.accelerator=\\$ACCELERATOR \ - -o pipeline.init_args.steps.inference_step.init_args.model.init_args.model_uri=\\$CERN_CODE_ROOT/3dgan-inference.pth \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.datapath=\\$CERN_DATA_ROOT/*.h5 \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.max_samples=\\$MAX_DATA_SAMPLES \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.batch_size=\\$BATCH_SIZE \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.num_workers=\\$NUM_WORKERS_DL \ - -o pipeline.init_args.steps.saver_step.init_args.save_dir=\\$TMP_DATA_ROOT/3dgan-generated-data \ - -o pipeline.init_args.steps.saver_step.init_args.aggregate_predictions=\\$AGGREGATE_PREDS \"" + - "\" cd /usr/src/app && itwinai exec-pipeline --print-config \ + --config \\$CERN_CODE_ROOT/config.yaml \ + --pipe-key inference_pipeline \ + -o dataset_location=\\$CERN_DATA_ROOT \ + -o logs_dir=\\$TMP_DATA_ROOT/ml_logs/mlflow_logs \ + -o distributed_strategy=\\$STRATEGY \ + -o devices=\\$DEVICES \ + -o hw_accelerators=\\$ACCELERATOR \ + -o inference_model_uri=\\$CERN_CODE_ROOT/3dgan-inference.pth \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o max_dataset_size=\\$MAX_DATA_SAMPLES \ + -o batch_size=\\$BATCH_SIZE \ + -o num_workers_dataloader=\\$NUM_WORKERS_DL \ + -o inference_results_location=\\$TMP_DATA_ROOT/3dgan-generated-data \ + -o aggregate_predictions=\\$AGGREGATE_PREDS \"" command: - /bin/sh env: @@ -47,7 +49,7 @@ spec: value: "auto" - name: DEVICES value: "auto" - image: /ceph/hpc/data/st2301-itwin-users/itwinaiv6_1.sif + image: /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif imagePullPolicy: Always name: oscar-container resources: diff --git a/use-cases/3dgan/interLink/3dgan-inference.yaml b/use-cases/3dgan/interLink/3dgan-inference.yaml index 4a9b4575..c07997de 100644 --- a/use-cases/3dgan/interLink/3dgan-inference.yaml +++ b/use-cases/3dgan/interLink/3dgan-inference.yaml @@ -1,29 +1,31 @@ apiVersion: v1 kind: Pod metadata: - name: 3dgan + name: 3dgan-inference annotations: - slurm-job.vk.io/flags: "-p gpu --gres=gpu:1 --cpus-per-task=4 --mem=100G --ntasks-per-node=1 --nodes=1" + slurm-job.vk.io/flags: "-p gpu --gres=gpu:1 --cpus-per-task=4 --mem=100G --ntasks-per-node=1 --nodes=1 --time=00:55:00" job.vk.io/singularity-mounts: "--bind 
/ceph/hpc/data/st2301-itwin-users/egarciagarcia:/exp_data" - #job.vk.io/pre-exec: "singularity pull /ceph/hpc/data/st2301-itwin-users/itwinaiv6_1.sif docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.2" + # job.vk.io/pre-exec: "singularity pull /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.4" spec: automountServiceAccountToken: false containers: - args: - -c - - "\" cd /usr/src/app && itwinai exec-pipeline --print-config --config \\$CERN_CODE_ROOT/inference-pipeline.yaml \ - -o pipeline.init_args.steps.dataloading_step.init_args.data_path=\\$CERN_DATA_ROOT \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.logger.init_args.save_dir=\\$TMP_DATA_ROOT/ml_logs/mlflow_logs \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.strategy=\\$STRATEGY \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.devices=\\$DEVICES \ - -o pipeline.init_args.steps.inference_step.init_args.config.trainer.accelerator=\\$ACCELERATOR \ - -o pipeline.init_args.steps.inference_step.init_args.model.init_args.model_uri=\\$CERN_CODE_ROOT/3dgan-inference.pth \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.datapath=\\$CERN_DATA_ROOT/*.h5 \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.max_samples=\\$MAX_DATA_SAMPLES \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.batch_size=\\$BATCH_SIZE \ - -o pipeline.init_args.steps.inference_step.init_args.config.data.init_args.num_workers=\\$NUM_WORKERS_DL \ - -o pipeline.init_args.steps.saver_step.init_args.save_dir=\\$TMP_DATA_ROOT/3dgan-generated-data \ - -o pipeline.init_args.steps.saver_step.init_args.aggregate_predictions=\\$AGGREGATE_PREDS \"" + - "\" cd /usr/src/app && itwinai exec-pipeline --print-config \ + --config \\$CERN_CODE_ROOT/config.yaml \ + --pipe-key inference_pipeline \ + -o dataset_location=\\$CERN_DATA_ROOT \ + -o logs_dir=\\$TMP_DATA_ROOT/ml_logs/mlflow_logs \ + -o distributed_strategy=\\$STRATEGY \ + -o devices=\\$DEVICES \ + -o hw_accelerators=\\$ACCELERATOR \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o inference_model_uri=\\$CERN_CODE_ROOT/3dgan-inference.pth \ + -o max_dataset_size=\\$MAX_DATA_SAMPLES \ + -o batch_size=\\$BATCH_SIZE \ + -o num_workers_dataloader=\\$NUM_WORKERS_DL \ + -o inference_results_location=\\$TMP_DATA_ROOT/3dgan-generated-data \ + -o aggregate_predictions=\\$AGGREGATE_PREDS \"" command: - /bin/sh env: @@ -47,7 +49,7 @@ spec: value: "auto" - name: DEVICES value: "auto" - image: /ceph/hpc/data/st2301-itwin-users/itwinaiv6_1.sif + image: /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif imagePullPolicy: Always name: oscar-container resources: diff --git a/use-cases/3dgan/interLink/3dgan-train.yaml b/use-cases/3dgan/interLink/3dgan-train.yaml new file mode 100644 index 00000000..e0885fd9 --- /dev/null +++ b/use-cases/3dgan/interLink/3dgan-train.yaml @@ -0,0 +1,88 @@ +apiVersion: v1 +kind: Pod +metadata: + name: 3dgan-train + annotations: + slurm-job.vk.io/flags: "-p gpu --gres=gpu:1 --cpus-per-task=4 --mem=100G --ntasks-per-node=1 --nodes=1 --time=00:55:00" + job.vk.io/singularity-mounts: "--bind /ceph/hpc/data/st2301-itwin-users/egarciagarcia:/exp_data" + # job.vk.io/pre-exec: "singularity pull /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif docker://ghcr.io/intertwin-eu/itwinai:0.0.1-3dgan-0.4" +spec: + automountServiceAccountToken: false + containers: + - args: + - -c + - "\" cd /usr/src/app && itwinai 
exec-pipeline --print-config \ + --config \\$CERN_CODE_ROOT/config.yaml \ + --pipe-key training_pipeline \ + -o dataset_location=\\$CERN_DATA_ROOT \ + -o pipeline.init_args.steps.training_step.init_args.exp_root=\\$TMP_DATA_ROOT \ + -o logs_dir=\\$TMP_DATA_ROOT/ml_logs \ + -o distributed_strategy=\\$STRATEGY \ + -o devices=\\$DEVICES \ + -o hw_accelerators=\\$ACCELERATOR \ + -o checkpoints_path=\\$TMP_DATA_ROOT/checkpoints \ + -o max_dataset_size=\\$MAX_DATA_SAMPLES \ + -o batch_size=\\$BATCH_SIZE \ + -o num_workers_dataloader=\\$NUM_WORKERS_DL \"" + command: + - /bin/sh + env: + - name: CERN_DATA_ROOT + value: "/exp_data" + - name: CERN_CODE_ROOT + value: "/usr/src/app" + - name: TMP_DATA_ROOT + value: "/exp_data" + - name: MAX_DATA_SAMPLES + value: "1000" + - name: BATCH_SIZE + value: "512" + - name: NUM_WORKERS_DL + value: "4" + - name: ACCELERATOR + value: "gpu" + - name: STRATEGY + value: "auto" + - name: DEVICES + value: "auto" + + # - name: MLFLOW_TRACKING_USERNAME + # valueFrom: + # secretKeyRef: + # name: mlflow-server + # key: username + # - name: MLFLOW_TRACKING_PASSWORD + # valueFrom: + # secretKeyRef: + # name: mlflow-server + # key: password + + - name: MLFLOW_TRACKING_USERNAME + value: "XXX" + - name: MLFLOW_TRACKING_PASSWORD + value: "XXX" + image: /ceph/hpc/data/st2301-itwin-users/itwinai_v9.5.sif + imagePullPolicy: Always + name: oscar-container + resources: + limits: + cpu: "1" + memory: 1Gi + requests: + cpu: "1" + memory: 1Gi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + nodeSelector: + kubernetes.io/hostname: vega-new-vk + tolerations: + - key: virtual-node.interlink/no-schedule + operator: Exists + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 \ No newline at end of file diff --git a/use-cases/3dgan/interLink/README.md b/use-cases/3dgan/interLink/README.md index d4b6dcca..c2831f7b 100644 --- a/use-cases/3dgan/interLink/README.md +++ b/use-cases/3dgan/interLink/README.md @@ -53,3 +53,14 @@ nodeSelector: ``` Additional info in [interLink](https://github.com/interTwin-eu/interLink) docs. + +## Secrets + +See [this guide](https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/#define-container-environment-variables-using-secret-data) +on how to set Kubernetes secrets as env variables of a container. + +Example: + +```bash +kubectl create secret generic mlflow-server --from-literal=username='XYZ' --from-literal=password='ABC' +``` diff --git a/use-cases/3dgan/model.py b/use-cases/3dgan/model.py index 9653c98e..60dd48f9 100644 --- a/use-cases/3dgan/model.py +++ b/use-cases/3dgan/model.py @@ -1,5 +1,5 @@ import sys -# import os +import os # import pickle from collections import defaultdict import math @@ -309,6 +309,7 @@ def __init__( loss_weights=[3, 0.1, 25, 0.1], power=0.85, lr=0.001, + checkpoints_dir: str = '.'
# checkpoint_path: str = '3Dgan.pth' ): super().__init__() @@ -319,6 +320,8 @@ def __init__( self.loss_weights = loss_weights self.lr = lr self.power = power + self.checkpoints_dir = checkpoints_dir + os.makedirs(self.checkpoints_dir, exist_ok=True) self.generator = Generator(self.latent_size) self.discriminator = Discriminator(self.power) @@ -544,9 +547,10 @@ def training_step(self, batch, batch_idx): if fake_batch_loss[3] == 100.0 and self.index > 10: # print("Empty image with Ecal loss equal to 100.0 " # f"for {self.index} batch") - torch.save(self.generator.state_dict(), "generator_weights.pth") - torch.save(self.discriminator.state_dict(), - "discriminator_weights.pth") + torch.save(self.generator.state_dict(), os.path.join( + self.checkpoints_dir, "generator_weights.pth")) + torch.save(self.discriminator.state_dict(), os.path.join( + self.checkpoints_dir, "discriminator_weights.pth")) # print("real_batch_loss", real_batch_loss) # print("fake_batch_loss", fake_batch_loss) sys.exit() @@ -609,9 +613,10 @@ def on_train_epoch_end(self): # outputs print(ROW_FMT.format("discriminator (train)", *self.train_history["discriminator"][-1])) - torch.save(self.generator.state_dict(), "generator_weights.pth") - torch.save(self.discriminator.state_dict(), - "discriminator_weights.pth") + torch.save(self.generator.state_dict(), os.path.join( + self.checkpoints_dir, "generator_weights.pth")) + torch.save(self.discriminator.state_dict(), os.path.join( + self.checkpoints_dir, "discriminator_weights.pth")) # with open(self.pklfile, "wb") as f: # pickle.dump({"train": self.train_history, diff --git a/use-cases/3dgan/pipeline.yaml b/use-cases/3dgan/pipeline.yaml deleted file mode 100644 index d6bade54..00000000 --- a/use-cases/3dgan/pipeline.yaml +++ /dev/null @@ -1,91 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - steps: - dataloading_step: - class_path: dataloader.Lightning3DGANDownloader - init_args: - data_path: exp_data/ # Set to null to skip dataset download - data_url: https://drive.google.com/drive/folders/1uPpz0tquokepptIfJenTzGpiENfo2xRX - - training_step: - class_path: trainer.Lightning3DGANTrainer - init_args: - # Pytorch lightning config for training - config: - seed_everything: 4231162351 - trainer: - accelerator: auto - accumulate_grad_batches: 1 - barebones: false - benchmark: null - callbacks: - - class_path: lightning.pytorch.callbacks.early_stopping.EarlyStopping - init_args: - monitor: val_generator_loss - patience: 2 - - class_path: lightning.pytorch.callbacks.lr_monitor.LearningRateMonitor - init_args: - logging_interval: step - - class_path: lightning.pytorch.callbacks.ModelCheckpoint - init_args: - dirpath: checkpoints - filename: best-checkpoint - mode: min - monitor: val_generator_loss - save_top_k: 1 - verbose: true - check_val_every_n_epoch: 1 - default_root_dir: null - detect_anomaly: false - deterministic: null - devices: auto #[0] - enable_checkpointing: true - enable_model_summary: null - enable_progress_bar: null - fast_dev_run: false - gradient_clip_algorithm: null - gradient_clip_val: null - inference_mode: true - limit_predict_batches: null - limit_test_batches: null - limit_train_batches: null - limit_val_batches: null - log_every_n_steps: 1 - logger: - class_path: lightning.pytorch.loggers.MLFlowLogger - init_args: - experiment_name: 3DGAN - save_dir: ml_logs/mlflow_logs - log_model: all - max_epochs: 5 - max_time: null - min_epochs: null - min_steps: null - num_sanity_val_steps: null - overfit_batches: 0.0 - plugins: null - profiler: null - 
reload_dataloaders_every_n_epochs: 0 - strategy: auto #ddp_find_unused_parameters_true - sync_batchnorm: false - use_distributed_sampler: true - val_check_interval: null - - # Lightning Model configuration - model: - class_path: model.ThreeDGAN - init_args: - latent_size: 256 - loss_weights: [3, 0.1, 25, 0.1] - power: 0.85 - lr: 0.001 - - # Lightning data module configuration - data: - class_path: dataloader.ParticlesDataModule - init_args: - datapath: exp_data/*/*.h5 - batch_size: 4 - num_workers: 0 - max_samples: 48 diff --git a/use-cases/3dgan/trainer.py b/use-cases/3dgan/trainer.py index 3bb5a1fd..f51b5d5a 100644 --- a/use-cases/3dgan/trainer.py +++ b/use-cases/3dgan/trainer.py @@ -23,17 +23,22 @@ class Lightning3DGANTrainer(Trainer): - def __init__(self, config: Union[Dict, str]): + def __init__(self, config: Union[Dict, str], exp_root: str = '.'): self.save_parameters(**self.locals2params(locals())) super().__init__() if isinstance(config, str) and os.path.isfile(config): # Load from YAML config = load_yaml(config) self.conf = config + self.exp_root = exp_root @monitor_exec def execute(self) -> Any: - init_lightning_mlflow(self.conf, registered_model_name='3dgan-lite') + init_lightning_mlflow( + self.conf, + tmp_dir=os.path.join(self.exp_root, '.tmp'), + registered_model_name='3dgan-lite' + ) old_argv = sys.argv sys.argv = ['some_script_placeholder.py'] cli = LightningCLI( @@ -52,12 +57,6 @@ def execute(self) -> Any: cli.trainer.fit(cli.model, datamodule=cli.datamodule) teardown_lightning_mlflow() - def save_state(self): - return super().save_state() - - def load_state(self): - return super().load_state() - class LightningModelLoader(TorchModelLoader): """Loads a torch lightning model from somewhere. diff --git a/use-cases/cyclones/README.md b/use-cases/cyclones/README.md new file mode 100644 index 00000000..6b504fb0 --- /dev/null +++ b/use-cases/cyclones/README.md @@ -0,0 +1,12 @@ +# Tropical cyclone detection + +## Dataset + +If the automatic download from python does not work, try from the command line from +within the virtual environment: + +```bash +gdown https://drive.google.com/drive/folders/1TnmujO4T-8_j4bCxqNe5HEw9njJIIBQD -O data/tmp_data/trainval --folder +``` + +For more info visit the [gdown](https://github.com/wkentaro/gdown) repository. 
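As a rough sketch of what the automatic download does, the same folder can also be fetched programmatically with `gdown` (mirroring the `gdown.download_folder` call patched in `dataloader.py` below; the output path here is just an example):

```python
import gdown

# Hypothetical standalone download, equivalent to the CLI command above.
# verify=False disables TLS certificate verification, as in dataloader.py.
gdown.download_folder(
    url="https://drive.google.com/drive/folders/1TnmujO4T-8_j4bCxqNe5HEw9njJIIBQD",
    output="data/tmp_data/trainval",
    quiet=False,
    verify=False,
)
```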
diff --git a/use-cases/cyclones/dataloader.py b/use-cases/cyclones/dataloader.py index 7f224157..3cf1d97b 100644 --- a/use-cases/cyclones/dataloader.py +++ b/use-cases/cyclones/dataloader.py @@ -180,6 +180,7 @@ def setup_config(self, config: Dict) -> None: if not exists(join(root_dir, self.data_path)): gdown.download_folder( url=self.data_url, quiet=False, + verify=False, output=join(root_dir, self.data_path) ) diff --git a/use-cases/cyclones/trainer.py b/use-cases/cyclones/trainer.py index 1c47819b..054f772b 100644 --- a/use-cases/cyclones/trainer.py +++ b/use-cases/cyclones/trainer.py @@ -155,9 +155,3 @@ def setup_config(self, config: Dict) -> None: if self.model_backup: self.best_model_name = join(self.model_backup, "best_model.h5") self.last_model_name = join(self.run_dir, "last_model.h5") - - def load_state(self): - return super().load_state() - - def save_state(self): - return super().save_state() diff --git a/use-cases/mnist/tensorflow/pipeline.yaml b/use-cases/mnist/tensorflow/pipeline.yaml index 9fced327..314f78b1 100644 --- a/use-cases/mnist/tensorflow/pipeline.yaml +++ b/use-cases/mnist/tensorflow/pipeline.yaml @@ -32,9 +32,9 @@ pipeline: strategy: class_path: tensorflow.python.distribute.mirrored_strategy.MirroredStrategy - logger: - - class_path: itwinai.loggers.ConsoleLogger - - class_path: itwinai.loggers.MLFlowLogger - init_args: - experiment_name: MNIST classifier - log_freq: batch + # logger: + # - class_path: itwinai.loggers.ConsoleLogger + # - class_path: itwinai.loggers.MLFlowLogger + # init_args: + # experiment_name: MNIST classifier + # log_freq: batch diff --git a/use-cases/mnist/tensorflow/trainer.py b/use-cases/mnist/tensorflow/trainer.py index 17ef19a5..435f79f4 100644 --- a/use-cases/mnist/tensorflow/trainer.py +++ b/use-cases/mnist/tensorflow/trainer.py @@ -35,9 +35,3 @@ def __init__( @monitor_exec def execute(self, train_dataset, validation_dataset) -> Any: return super().execute(train_dataset, validation_dataset) - - def load_state(self): - return super().load_state() - - def save_state(self): - return super().save_state() diff --git a/use-cases/mnist/torch-lightning/README.md b/use-cases/mnist/torch-lightning/README.md new file mode 100644 index 00000000..bd769c70 --- /dev/null +++ b/use-cases/mnist/torch-lightning/README.md @@ -0,0 +1,17 @@ +# Torch Lightning example on MNIST dataset + +## Training + +```bash +# Download dataset and exit: only run first step in the pipeline (index=0) +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline --steps 0 + +# Run the whole training pipeline +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline +``` + +View training logs on MLFLow server (if activated from the configuration): + +```bash +mlflow ui --backend-store-uri mllogs/mlflow/ +``` diff --git a/use-cases/mnist/torch-lightning/pipeline.yaml b/use-cases/mnist/torch-lightning/config.yaml similarity index 96% rename from use-cases/mnist/torch-lightning/pipeline.yaml rename to use-cases/mnist/torch-lightning/config.yaml index cf754b2f..23fde03d 100644 --- a/use-cases/mnist/torch-lightning/pipeline.yaml +++ b/use-cases/mnist/torch-lightning/config.yaml @@ -1,4 +1,4 @@ -pipeline: +training_pipeline: class_path: itwinai.pipeline.Pipeline init_args: steps: @@ -6,7 +6,7 @@ pipeline: init_args: data_path: data/ - - class_path: trainer.LightningMNISTTrainer + - class_path: itwinai.torch.trainer.TorchLightningTrainer #trainer.LightningMNISTTrainer init_args: # Pytorch lightning config for training config: diff --git 
a/use-cases/mnist/torch-lightning/dataloader.py b/use-cases/mnist/torch-lightning/dataloader.py index 1f062fe5..b7e8d46e 100644 --- a/use-cases/mnist/torch-lightning/dataloader.py +++ b/use-cases/mnist/torch-lightning/dataloader.py @@ -31,7 +31,7 @@ def execute(self) -> None: self._downloader.setup(stage='predict') -class MNISTDataModule(L.LightningModule): +class MNISTDataModule(L.LightningDataModule): def __init__( self, data_path: str, diff --git a/use-cases/mnist/torch-lightning/train.py b/use-cases/mnist/torch-lightning/train.py deleted file mode 100644 index 97f53093..00000000 --- a/use-cases/mnist/torch-lightning/train.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Training pipeline. To run this script, use the following commands. - -On login node: - ->>> micromamba run -p ../../../.venv-pytorch/ \ - python train.py -p pipeline.yaml -d - -On compute nodes: - ->>> micromamba run -p ../../../.venv-pytorch/ \ - python train.py -p pipeline.yaml - -""" - -import argparse - -from itwinai.parser import ConfigParser - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-p", "--pipeline", type=str, required=True, - help='Configuration file to the pipeline to execute.' - ) - parser.add_argument( - '-d', '--download-only', - action=argparse.BooleanOptionalAction, - default=False, - help=('Whether to download only the dataset and exit execution ' - '(suggested on login nodes of HPC systems)') - ) - args = parser.parse_args() - - # Create parser for the pipeline - pipe_parser = ConfigParser(config=args.pipeline) - pipeline = pipe_parser.parse_pipeline() - - if args.download_only: - print('Downloading datasets and exiting...') - pipeline = pipeline[:1] - - pipeline.execute() diff --git a/use-cases/mnist/torch-lightning/trainer.py b/use-cases/mnist/torch-lightning/trainer.py deleted file mode 100644 index 128cf5c6..00000000 --- a/use-cases/mnist/torch-lightning/trainer.py +++ /dev/null @@ -1,40 +0,0 @@ -import os -from typing import Union, Dict, Any - -from itwinai.components import Trainer, monitor_exec -from itwinai.torch.models.mnist import MNISTModel -from dataloader import MNISTDataModule -from lightning.pytorch.cli import LightningCLI -from utils import load_yaml - - -class LightningMNISTTrainer(Trainer): - def __init__(self, config: Union[Dict, str]): - super().__init__() - self.save_parameters(**self.locals2params(locals())) - if isinstance(config, str) and os.path.isfile(config): - # Load from YAML - config = load_yaml(config) - self.conf = config - - @monitor_exec - def execute(self) -> Any: - cli = LightningCLI( - args=self.conf, - model_class=MNISTModel, - datamodule_class=MNISTDataModule, - run=False, - save_config_kwargs={ - "overwrite": True, - "config_filename": "pl-training.yml", - }, - subclass_mode_model=True, - subclass_mode_data=True, - ) - cli.trainer.fit(cli.model, datamodule=cli.datamodule) - - def save_state(self): - return super().save_state() - - def load_state(self): - return super().load_state() diff --git a/use-cases/mnist/torch/Dockerfile b/use-cases/mnist/torch/Dockerfile index b4cf3654..5b96feb5 100644 --- a/use-cases/mnist/torch/Dockerfile +++ b/use-cases/mnist/torch/Dockerfile @@ -1,4 +1,5 @@ -FROM python:3.9.12 +# FROM python:3.9 +FROM nvcr.io/nvidia/pytorch:23.09-py3 WORKDIR /usr/src/app @@ -13,6 +14,3 @@ RUN pip install --no-cache-dir . 
# Add torch MNIST use case COPY use-cases/mnist/torch/* ./ - -# Run inference -CMD [ "python", "train.py", "-p", "inference-pipeline.yaml"] \ No newline at end of file diff --git a/use-cases/mnist/torch/README.md b/use-cases/mnist/torch/README.md index c953671f..e333f14b 100644 --- a/use-cases/mnist/torch/README.md +++ b/use-cases/mnist/torch/README.md @@ -3,10 +3,18 @@ ## Training ```bash -python train.py -p pipeline.yaml [-d] +# Download dataset and exit +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline --steps dataloading_step + +# Run the whole training pipeline +itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline ``` -Use `-d` flag to run only the fist step in the pipeline. +View training logs on MLFLow server (if activated from the configuration): + +```bash +mlflow ui --backend-store-uri mllogs/mlflow/ +``` ## Inference @@ -30,24 +38,37 @@ Use `-d` flag to run only the fist step in the pipeline. folder containing a CSV file with the predictions as rows. ```bash - python train.py -p inference-pipeline.yaml + itwinai exec-pipeline --config config.yaml --pipe-key inference_pipeline ``` Note the same entry point as for training. -### Docker image +## Docker image Build from project root with ```bash # Local -docker buildx build -t itwinai-mnist-torch-inference -f use-cases/mnist/torch/Dockerfile . +docker buildx build -t itwinai:0.0.1-mnist-torch-0.1 -f use-cases/mnist/torch/Dockerfile . # Ghcr.io -docker buildx build -t ghcr.io/intertwin-eu/itwinai-mnist-torch-inference:0.0.1 -f use-cases/mnist/torch/Dockerfile . -docker push ghcr.io/intertwin-eu/itwinai-mnist-torch-inference:0.0.1 +docker buildx build -t ghcr.io/intertwin-eu/itwinai:0.0.1-mnist-torch-0.1 -f use-cases/mnist/torch/Dockerfile . +docker push ghcr.io/intertwin-eu/itwinai:0.0.1-mnist-torch-0.1 ``` +### Training with Docker container + +```bash +docker run -it --rm --name running-training \ + -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai:0.0.1-mnist-torch-0.1 \ + /bin/bash -c "itwinai exec-pipeline --print-config \ + --config /usr/src/app/config.yaml \ + --pipe-key training_pipeline \ + -o dataset_root=/usr/data/mnist-dataset " +``` + +### Inference with Docker container + From wherever a sample of MNIST jpg images is available (folder called 'mnist-sample-data/'): @@ -62,7 +83,14 @@ From wherever a sample of MNIST jpg images is available ``` ```bash -docker run -it --rm --name running-inference -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai-mnist-torch-inference:0.0.1 +docker run -it --rm --name running-inference \ + -v "$PWD":/usr/data ghcr.io/intertwin-eu/itwinai:0.0.1-mnist-torch-0.1 \ + /bin/bash -c "itwinai exec-pipeline --print-config \ + --config /usr/src/app/config.yaml \ + --pipe-key inference_pipeline \ + -o test_data_path=/usr/data/mnist-sample-data \ + -o inference_model_mlflow_uri=/usr/src/app/mnist-pre-trained.pth \ + -o predictions_dir=/usr/data/mnist-predictions " ``` This command will store the results in a folder called "mnist-predictions": diff --git a/use-cases/mnist/torch/config.yaml b/use-cases/mnist/torch/config.yaml new file mode 100644 index 00000000..c5d71204 --- /dev/null +++ b/use-cases/mnist/torch/config.yaml @@ -0,0 +1,99 @@ +# General config +dataset_root: .tmp/ +num_classes: 10 +batch_size: 64 +num_workers_dataloader: 4 +pin_memory: False +lr: 0.001 +momentum: 0.9 +fp16_allreduce: False +use_adasum: False +gradient_predivide_factor: 1.0 +epochs: 2 +strategy: ddp +test_data_path: mnist-sample-data +inference_model_mlflow_uri: mnist-pre-trained.pth
+predictions_dir: mnist-predictions +predictions_file: predictions.csv +class_labels: null + +# Workflows configuration +training_pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + dataloading_step: + class_path: dataloader.MNISTDataModuleTorch + init_args: + save_path: ${dataset_root} + + training_step: + class_path: itwinai.torch.trainer.TorchTrainer + init_args: + config: + batch_size: ${batch_size} + num_workers: ${num_workers_dataloader} + pin_memory: ${pin_memory} + lr: ${lr} + momentum: ${momentum} + fp16_allreduce: ${fp16_allreduce} + use_adasum: ${use_adasum} + gradient_predivide_factor: ${gradient_predivide_factor} + + model: + class_path: model.Net + epochs: ${epochs} + metrics: + accuracy: + class_path: torchmetrics.classification.MulticlassAccuracy + init_args: + num_classes: ${num_classes} + precision: + class_path: torchmetrics.classification.MulticlassPrecision + init_args: + num_classes: ${num_classes} + recall: + class_path: torchmetrics.classification.MulticlassRecall + init_args: + num_classes: ${num_classes} + logger: + class_path: itwinai.loggers.LoggersCollection + init_args: + loggers: + - class_path: itwinai.loggers.ConsoleLogger + init_args: + log_freq: 100 + - class_path: itwinai.loggers.MLFlowLogger + init_args: + experiment_name: MNIST classifier + log_freq: batch + strategy: ${strategy} + # checkpoint_every: 1 + # cluster: + # class_path: itwinai.torch.cluster.LocalCluster + # init_args: + # gpus: '0,1,2' + # backend: nccl + +inference_pipeline: + class_path: itwinai.pipeline.Pipeline + init_args: + steps: + - class_path: dataloader.MNISTPredictLoader + init_args: + test_data_path: ${test_data_path} + + - class_path: itwinai.torch.inference.MulticlassTorchPredictor + init_args: + model: + class_path: itwinai.torch.inference.TorchModelLoader + init_args: + model_uri: ${inference_model_mlflow_uri} + test_dataloader_kwargs: + batch_size: ${batch_size} + + - class_path: saver.TorchMNISTLabelSaver + init_args: + save_dir: ${predictions_dir} + predictions_file: ${predictions_file} + class_labels: ${class_labels} \ No newline at end of file diff --git a/use-cases/mnist/torch/create_inference_sample.py b/use-cases/mnist/torch/create_inference_sample.py new file mode 100644 index 00000000..1c588c48 --- /dev/null +++ b/use-cases/mnist/torch/create_inference_sample.py @@ -0,0 +1,42 @@ +"""Create a simple inference dataset sample and a checkpoint.""" + +import torch +import os +import argparse + +from model import Net +from dataloader import InferenceMNIST + + +def mnist_torch_inference_files( + root: str = '.', + samples_path: str = 'mnist-sample-data/', + model_name: str = 'mnist-pre-trained.pth' +): + """Create sample dataset and fake model to test mnist + inference workflow. Assumes to be run from + the use case folder. + + Args: + root (str, optional): where to create the files. + Defaults to '.'. 
+ """ + + sample = os.path.join(root, samples_path) + InferenceMNIST.generate_jpg_sample(sample, 10) + + # Fake checkpoint + dummy_nn = Net() + mdl_ckpt = os.path.join(root, model_name) + torch.save(dummy_nn, mdl_ckpt) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--root", type=str, default='.') + parser.add_argument("--samples-path", type=str, + default='mnist-sample-data') + parser.add_argument("--model-name", type=str, + default='mnist-pre-trained.pth') + args = parser.parse_args() + mnist_torch_inference_files(**vars(args)) diff --git a/use-cases/mnist/torch/dataloader.py b/use-cases/mnist/torch/dataloader.py index e4243763..a19c647e 100644 --- a/use-cases/mnist/torch/dataloader.py +++ b/use-cases/mnist/torch/dataloader.py @@ -34,7 +34,7 @@ def execute(self) -> Tuple[Dataset, Dataset]: transforms.Normalize((0.1307,), (0.3081,)) ])) print("Train and validation datasets loaded.") - return train_dataset, validation_dataset + return train_dataset, validation_dataset, None class InferenceMNIST(Dataset): diff --git a/use-cases/mnist/torch/inference-pipeline.yaml b/use-cases/mnist/torch/inference-pipeline.yaml deleted file mode 100644 index 5edf6ce9..00000000 --- a/use-cases/mnist/torch/inference-pipeline.yaml +++ /dev/null @@ -1,22 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - steps: - - class_path: dataloader.MNISTPredictLoader - init_args: - test_data_path: /usr/data/mnist-sample-data - - - class_path: itwinai.torch.inference.MulticlassTorchPredictor - init_args: - model: - class_path: itwinai.torch.inference.TorchModelLoader - init_args: - model_uri: mnist-pre-trained.pth - test_dataloader_kwargs: - batch_size: 3 - - - class_path: saver.TorchMNISTLabelSaver - init_args: - save_dir: /usr/data/mnist-predictions - predictions_file: predictions.csv - class_labels: null \ No newline at end of file diff --git a/use-cases/mnist/torch/pipeline.yaml b/use-cases/mnist/torch/pipeline.yaml deleted file mode 100644 index 99f35c73..00000000 --- a/use-cases/mnist/torch/pipeline.yaml +++ /dev/null @@ -1,56 +0,0 @@ -pipeline: - class_path: itwinai.pipeline.Pipeline - init_args: - steps: - dataloading_step: - class_path: dataloader.MNISTDataModuleTorch - init_args: - save_path: .tmp/ - - training_step: - class_path: itwinai.torch.trainer.TorchTrainerMG - init_args: - model: - class_path: model.Net - loss: - class_path: torch.nn.NLLLoss - init_args: - reduction: mean - optimizer_class: torch.optim.SGD - optimizer_kwargs: - lr: 0.001 - train_dataloader_kwargs: - batch_size: 32 - pin_memory: True - shuffle: True - validation_dataloader_kwargs: - batch_size: 32 - pin_memory: True - shuffle: False - epochs: 2 - train_metrics: - accuracy: - class_path: torchmetrics.classification.MulticlassAccuracy - init_args: - num_classes: 10 - precision: - class_path: torchmetrics.classification.MulticlassPrecision - init_args: - num_classes: 10 - recall: - class_path: torchmetrics.classification.MulticlassRecall - init_args: - num_classes: 10 - logger: - - class_path: itwinai.loggers.ConsoleLogger - - class_path: itwinai.loggers.MLFlowLogger - init_args: - experiment_name: MNIST classifier - log_freq: batch - strategy: ddp - checkpoint_every: 1 - cluster: - class_path: itwinai.torch.cluster.LocalCluster - init_args: - gpus: '0,1,2' - backend: nccl diff --git a/use-cases/mnist/torch/runall.sh b/use-cases/mnist/torch/runall.sh new file mode 100644 index 00000000..e81ed74d --- /dev/null +++ b/use-cases/mnist/torch/runall.sh @@ -0,0 +1,39 @@ +#!/bin/bash 
+ +# Python virtual environment (no conda/micromamba) +PYTHON_VENV="../../../envAI_hdfml" + +# Clear SLURM logs (*.out and *.err files) +rm -rf logs_slurm +mkdir logs_slurm +rm -rf logs_torchrun + +# DDP itwinai +DIST_MODE="ddp" +RUN_NAME="ddp-itwinai" +TRAINING_CMD="$PYTHON_VENV/bin/itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=ddp" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# DeepSpeed itwinai +DIST_MODE="deepspeed" +RUN_NAME="deepspeed-itwinai" +TRAINING_CMD="$PYTHON_VENV/bin/itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=deepspeed" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh + +# Horovod itwinai +DIST_MODE="horovod" +RUN_NAME="horovod-itwinai" +TRAINING_CMD="$PYTHON_VENV/bin/itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=horovod" +sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \ + --job-name="$RUN_NAME-n$N" \ + --output="logs_slurm/job-$RUN_NAME-n$N.out" \ + --error="logs_slurm/job-$RUN_NAME-n$N.err" \ + slurm.sh \ No newline at end of file diff --git a/use-cases/mnist/torch/slurm.sh b/use-cases/mnist/torch/slurm.sh new file mode 100644 index 00000000..2a2a15d8 --- /dev/null +++ b/use-cases/mnist/torch/slurm.sh @@ -0,0 +1,116 @@ +#!/bin/bash + +# SLURM jobscript for JSC systems + +# Job configuration +#SBATCH --job-name=distributed_training +#SBATCH --account=intertwin +#SBATCH --mail-user= +#SBATCH --mail-type=ALL +#SBATCH --output=job.out +#SBATCH --error=job.err +#SBATCH --time=00:30:00 + +# Resources allocation +#SBATCH --partition=batch +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=4 +#SBATCH --cpus-per-gpu=4 +#SBATCH --exclusive + +# gres options have to be disabled for deepv +#SBATCH --gres=gpu:4 + +# Load environment modules +ml Stages/2024 GCC OpenMPI CUDA/12 MPI-settings/CUDA Python HDF5 PnetCDF libaio mpi4py + +# Job info +echo "DEBUG: TIME: $(date)" +sysN="$(uname -n | cut -f2- -d.)" +sysN="${sysN%%[0-9]*}" +echo "Running on system: $sysN" +echo "DEBUG: EXECUTE: $EXEC" +echo "DEBUG: SLURM_SUBMIT_DIR: $SLURM_SUBMIT_DIR" +echo "DEBUG: SLURM_JOB_ID: $SLURM_JOB_ID" +echo "DEBUG: SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST" +echo "DEBUG: SLURM_NNODES: $SLURM_NNODES" +echo "DEBUG: SLURM_NTASKS: $SLURM_NTASKS" +echo "DEBUG: SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE" +echo "DEBUG: SLURM_SUBMIT_HOST: $SLURM_SUBMIT_HOST" +echo "DEBUG: SLURMD_NODENAME: $SLURMD_NODENAME" +echo "DEBUG: CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" +if [ "$DEBUG" = true ] ; then + echo "DEBUG: NCCL_DEBUG=INFO" + export NCCL_DEBUG=INFO +fi +echo + +# Setup env for distributed ML +export CUDA_VISIBLE_DEVICES="0,1,2,3" +export OMP_NUM_THREADS=1 +if [ "$SLURM_CPUS_PER_GPU" -gt 0 ] ; then + export OMP_NUM_THREADS=$SLURM_CPUS_PER_GPU +fi + +# Env vairables check +if [ -z "$DIST_MODE" ]; then + >&2 echo "ERROR: env variable DIST_MODE is not set. Allowed values are 'horovod', 'ddp' or 'deepspeed'" + exit 1 +fi +if [ -z "$RUN_NAME" ]; then + >&2 echo "WARNING: env variable RUN_NAME is not set. 
It's a way to identify some specific run of an experiment." + RUN_NAME=$DIST_MODE +fi +if [ -z "$TRAINING_CMD" ]; then + >&2 echo "ERROR: env variable TRAINING_CMD is not set. It's the python command to execute." + exit 1 +fi +if [ -z "$PYTHON_VENV" ]; then + >&2 echo "WARNING: env variable PYTHON_VENV is not set. It's the path to a python virtual environment." +else + # Activate Python virtual env + source $PYTHON_VENV/bin/activate +fi + +# Get GPUs info per node +srun --cpu-bind=none --ntasks-per-node=1 bash -c 'echo -e "NODE hostname: $(hostname)\n$(nvidia-smi)\n\n"' + +# Launch training +if [ "$DIST_MODE" == "ddp" ] ; then + echo "DDP training: $TRAINING_CMD" + srun --cpu-bind=none --ntasks-per-node=1 \ + bash -c "torchrun \ + --log_dir='logs_torchrun' \ + --nnodes=$SLURM_NNODES \ + --nproc_per_node=$SLURM_GPUS_PER_NODE \ + --rdzv_id=$SLURM_JOB_ID \ + --rdzv_conf=is_host=\$(((SLURM_NODEID)) && echo 0 || echo 1) \ + --rdzv_backend=c10d \ + --rdzv_endpoint='$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)'i:29500 \ + $TRAINING_CMD" +elif [ "$DIST_MODE" == "deepspeed" ] ; then + echo "DEEPSPEED training: $TRAINING_CMD" + MASTER_ADDR=$(scontrol show hostnames "\$SLURM_JOB_NODELIST" | head -n 1)i + export MASTER_ADDR + export MASTER_PORT=29500 + + srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ + $TRAINING_CMD + + # # Run with deepspeed launcher: set --ntasks-per-node=1 + # # https://www.deepspeed.ai/getting-started/#multi-node-environment-variables + # export NCCL_IB_DISABLE=1 + # export NCCL_SOCKET_IFNAME=eth0 + # nodelist=$(scontrol show hostname $SLURM_NODELIST) + # echo "$nodelist" | sed -e 's/$/ slots=4/' > .hostfile + # # Requires passwordless SSH access among compute node + # srun --cpu-bind=none deepspeed --hostfile=.hostfile $TRAINING_CMD --deepspeed + # rm .hostfile +elif [ "$DIST_MODE" == "horovod" ] ; then + echo "HOROVOD training: $TRAINING_CMD" + srun --cpu-bind=none --ntasks-per-node=$SLURM_GPUS_PER_NODE --cpus-per-task=$SLURM_CPUS_PER_GPU \ + $TRAINING_CMD +else + >&2 echo "ERROR: unrecognized \$DIST_MODE env variable" + exit 1 +fi diff --git a/use-cases/mnist/torch/train.py b/use-cases/mnist/torch/train.py deleted file mode 100644 index 97f53093..00000000 --- a/use-cases/mnist/torch/train.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Training pipeline. To run this script, use the following commands. - -On login node: - ->>> micromamba run -p ../../../.venv-pytorch/ \ - python train.py -p pipeline.yaml -d - -On compute nodes: - ->>> micromamba run -p ../../../.venv-pytorch/ \ - python train.py -p pipeline.yaml - -""" - -import argparse - -from itwinai.parser import ConfigParser - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-p", "--pipeline", type=str, required=True, - help='Configuration file to the pipeline to execute.' - ) - parser.add_argument( - '-d', '--download-only', - action=argparse.BooleanOptionalAction, - default=False, - help=('Whether to download only the dataset and exit execution ' - '(suggested on login nodes of HPC systems)') - ) - args = parser.parse_args() - - # Create parser for the pipeline - pipe_parser = ConfigParser(config=args.pipeline) - pipeline = pipe_parser.parse_pipeline() - - if args.download_only: - print('Downloading datasets and exiting...') - pipeline = pipeline[:1] - - pipeline.execute()
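To submit a single distributed-training job by hand, rather than the three jobs launched together by `runall.sh`, the same `slurm.sh` job script can be reused. A minimal sketch, assuming the JSC virtual environment path used in `runall.sh`:

```bash
# Manual submission of one strategy (DDP shown); values mirror runall.sh
PYTHON_VENV="../../../envAI_hdfml"
DIST_MODE="ddp"
RUN_NAME="ddp-manual"
TRAINING_CMD="$PYTHON_VENV/bin/itwinai exec-pipeline --config config.yaml --pipe-key training_pipeline -o strategy=$DIST_MODE"

# Log folders used by slurm.sh and torchrun
mkdir -p logs_slurm logs_torchrun

sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \
    --job-name="$RUN_NAME" \
    --output="logs_slurm/job-$RUN_NAME.out" \
    --error="logs_slurm/job-$RUN_NAME.err" \
    slurm.sh
```

For DeepSpeed or Horovod, change `DIST_MODE`, `RUN_NAME`, and the `-o strategy=...` override accordingly, exactly as in the corresponding blocks of `runall.sh`.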