diff --git a/composer/_version.py b/composer/_version.py index 6a46c95e08..cb43d310d0 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.19.1' +__version__ = '0.20.0' diff --git a/composer/devices/device_tpu.py b/composer/devices/device_tpu.py index b91d1bc478..813fc49924 100644 --- a/composer/devices/device_tpu.py +++ b/composer/devices/device_tpu.py @@ -26,6 +26,7 @@ class DeviceTPU(Device): More details. """ + dist_backend = 'xla' name = 'tpu' def __init__(self): diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 7411dc4393..0d2349bf93 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -2567,6 +2567,11 @@ def _train_microbatch(self, use_grad_scaling: bool, current_batch_size: int, microbatch_loss.mul_(microbatch_num_samples / current_batch_size) microbatch_loss.backward(create_graph=self._backwards_create_graph) + if self.state.device.dist_backend == 'xla': + # For xla devices, the program between any pair of mark_step() calls is compiled. Without this, the + # microbatching loop is unrolled, drastically increasing compile time. 
+ xm.mark_step() + self.engine.run_event(Event.AFTER_BACKWARD) # Use microbatch outputs to update training metrics diff --git a/composer/utils/dist.py b/composer/utils/dist.py index 65edb5e80c..5b8dd5df68 100644 --- a/composer/utils/dist.py +++ b/composer/utils/dist.py @@ -37,6 +37,7 @@ import logging import os import pickle +import sys import time from contextlib import contextmanager from typing import TYPE_CHECKING, Any, List, Optional, Sequence, TypeVar, Union, cast @@ -44,8 +45,12 @@ import torch import torch.distributed as dist import torch.utils.data +from packaging import version -from composer.utils.device import get_device, is_hpu_installed +from composer.utils.device import get_device, is_hpu_installed, is_tpu_installed + +if is_tpu_installed(): + import torch_xla if TYPE_CHECKING: from composer.devices import Device @@ -534,7 +539,15 @@ def initialize_dist(device: Union[str, Device], timeout: float = 300.0): dist_env_vars_match_defaults = all(os.environ.get(k, v) == v for (k, v) in dist_env_var_defaults.items()) - if dist_env_vars_match_defaults: + if device_obj.dist_backend == 'xla': + if not 'torch_xla' in sys.modules: + raise RuntimeError('PyTorch XLA package not found. In order to use XLA based devices ' + 'PyTorch XLA must be installed.') + if version.parse(torch_xla.__version__) < version.parse('2.1.0'): + raise RuntimeError(f'PyTorch XLA version must be at least 2.1.0, found {torch_xla.__version__}.') + # XLA initialization requires the init_method to be set + dist.init_process_group(device_obj.dist_backend, init_method='xla://') + elif dist_env_vars_match_defaults: # Fill in the remaining single-rank variables os.environ.update(dist_env_var_defaults) dist.init_process_group(device_obj.dist_backend, store=dist.HashStore(), world_size=1, rank=0) diff --git a/docker/README.md b/docker/README.md index e66ab3049f..d0624e2665 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. 
They are built on top of the | Composer Version | CUDA Support | Docker Tag | |--------------------|----------------|----------------------------------------------------------------| -| 0.19.1 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.19.1` | -| 0.19.1 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.19.1_cpu` | +| 0.20.0 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.20.0` | +| 0.20.0 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.20.0_cpu` | **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 13803e45db..21c36347e9 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -246,9 +246,9 @@ TORCHVISION_VERSION: 0.18.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.19.1 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.20.0 CUDA_VERSION: 12.1.0 - IMAGE_NAME: composer-0-19-1 + IMAGE_NAME: composer-0-20-0 MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: cuda>=12.1 brand=tesla,driver>=450,driver<451 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 @@ -269,15 +269,15 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/composer:0.19.1 + - mosaicml/composer:0.20.0 - mosaicml/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.16.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.19.1 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.20.0 CUDA_VERSION: '' - IMAGE_NAME: composer-0-19-1-cpu + IMAGE_NAME: composer-0-20-0-cpu MOFED_VERSION: 5.5-1.0.3.2 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.10' @@ -285,7 +285,7 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.1.2 TAGS: - - mosaicml/composer:0.19.1_cpu + - 
mosaicml/composer:0.20.0_cpu - mosaicml/composer:latest_cpu TARGET: composer_stage TORCHVISION_VERSION: 0.16.2 diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index d59219f811..ca378388c6 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -261,7 +261,7 @@ def _main(): composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images - composer_versions = ['0.19.1'] # Only build images for the latest composer version + composer_versions = ['0.20.0'] # Only build images for the latest composer version composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest for product in itertools.product(composer_python_versions, composer_versions, cuda_options):