Tensor Parallelism Integration #3269

Merged: 89 commits, May 24, 2024

Commits (89 total; the file changes shown below are from 3 of these commits)
ee61da1
v1
mvpatel2000 May 8, 2024
b93364d
add test
mvpatel2000 May 8, 2024
9c38204
fix test
mvpatel2000 May 8, 2024
a761be4
v1
May 8, 2024
21d67bd
some lint
mvpatel2000 May 8, 2024
5353727
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 8, 2024
a5b4ef7
lint
May 8, 2024
3db87d2
merge
May 8, 2024
1dd4245
lint and pytorch pinning
mvpatel2000 May 8, 2024
d38c254
results
mvpatel2000 May 8, 2024
2c4cfc3
tweak warnings
May 8, 2024
bb74919
fix lint
May 8, 2024
8ebc28a
lint
mvpatel2000 May 8, 2024
e8253f5
Merge branch 'mvpatel2000/nd-parallelism' of github.com-mvpatel2000:m…
mvpatel2000 May 8, 2024
3cdf544
filter
May 9, 2024
e491ebc
Merge branch 'mvpatel2000/nd-parallelism' of github.com-mvpatel2000:m…
May 9, 2024
1ec42c6
add ckpt
May 9, 2024
d74082b
checkdown
May 13, 2024
a47d3df
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 13, 2024
da05e5d
fix tests
May 13, 2024
5d628d0
fix
May 13, 2024
998da6b
fix test
May 13, 2024
ee185e1
fix tests
May 13, 2024
8b6664c
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 15, 2024
50ff5bb
fix tests
May 16, 2024
3964e87
Merge branch 'mvpatel2000/nd-parallelism' of github.com-mvpatel2000:m…
May 16, 2024
6f6d0d9
paralleli config
May 16, 2024
b150a33
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 16, 2024
69111b4
fix some arg parsing
May 16, 2024
2883d40
Merge branch 'mvpatel2000/nd-parallelism' of github.com-mvpatel2000:m…
May 16, 2024
5057ba1
rename to parallelism config
May 16, 2024
36dab25
lint
May 16, 2024
675bf6d
fix docs
May 16, 2024
3b0e8b8
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 16, 2024
4c70715
fix edge case
May 16, 2024
d68557e
Merge branch 'mvpatel2000/nd-parallelism' of github.com-mvpatel2000:m…
May 16, 2024
aba7964
lint
May 16, 2024
1fa7fbd
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 17, 2024
73d9126
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 18, 2024
02f1494
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 20, 2024
9015bbe
log
May 20, 2024
8c2b7e6
change slicing
May 20, 2024
4df1cfb
fix patching
May 20, 2024
d7c3668
fix core
May 20, 2024
da30aca
lint
May 20, 2024
f192812
clean up v1
May 20, 2024
cb50a10
lint
May 20, 2024
67e19d8
shallow copy
May 20, 2024
a86ca84
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 20, 2024
1b23477
add checks
May 20, 2024
50c456b
device mesh
May 20, 2024
1ab4bf3
fix type checking
May 20, 2024
cea1d70
fix bugs
May 20, 2024
8ac1900
fix tests
May 21, 2024
74ef898
rename variables and fix checkpointing
May 21, 2024
7dc3978
lint
May 21, 2024
39ed363
lint
May 21, 2024
547fb08
v1 refacotr
May 21, 2024
3916a3b
lint
May 21, 2024
760cb2d
tests
May 21, 2024
806090d
add enum
May 21, 2024
3ebaf34
fix test
May 21, 2024
00ce8c6
fix
May 21, 2024
6b08e81
fix fsdp submesh order
May 21, 2024
cb0c604
fix tests
May 22, 2024
f21c6ae
change to world size
May 22, 2024
223f58e
add docs v1
May 22, 2024
18c97b8
fix docs
May 22, 2024
84e06c2
add tests
May 22, 2024
f0f2045
v1 of tp test
May 22, 2024
037cd9d
fix lint
May 22, 2024
886a252
fix arg prop
May 22, 2024
5968bea
fix tests
May 22, 2024
9d97611
lint
May 22, 2024
773e29d
brians comments
May 23, 2024
bfa0300
pr review
May 23, 2024
03ae706
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 23, 2024
122a756
add more gating
May 24, 2024
6346ad3
lint
May 24, 2024
59ec49a
fix
May 24, 2024
8ee01c6
fix lint
May 24, 2024
a249166
fix lint
May 24, 2024
b26d2ec
lint
May 24, 2024
9a36d71
force to run
May 24, 2024
fc76734
fix assert
May 24, 2024
78de92a
parallelism config
May 24, 2024
135f729
Merge branch 'dev' into mvpatel2000/nd-parallelism
mvpatel2000 May 24, 2024
a85b7b6
tweak some parallelism issues
May 24, 2024
33a2e71
remove assert
May 24, 2024
5 changes: 3 additions & 2 deletions composer/callbacks/checkpoint_saver.py
@@ -468,8 +468,9 @@ def _save_checkpoint(self, state: State, logger: Logger):
is_deepspeed,
keep_placeholders=True,
).lstrip('/')
assert state.sharded_ckpt_prefix_dir is not None
remote_prefix = state.sharded_ckpt_prefix_dir
assert state.fsdp_config is not None
remote_prefix = state.fsdp_config['sharded_ckpt_prefix_dir']
assert remote_prefix is not None
ckpt_filename = checkpoint._TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME
remote_file_name = os.path.join(pathlib.Path(remote_file_name).parent, remote_prefix, ckpt_filename)
remote_file_name = format_name_with_dist_and_time(remote_file_name, state.run_name, state.timestamp)
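This hunk reads the sharded-checkpoint prefix directory from `state.fsdp_config` instead of a dedicated `state.sharded_ckpt_prefix_dir` attribute, which the `state.py` diff below removes. A minimal, self-contained sketch of the resulting path construction, with illustrative stand-in values for Composer's config and filename constant:

```python
import os
import pathlib

# Stand-ins for state.fsdp_config and checkpoint._TORCH_DISTRIBUTED_CHECKPOINTS_FILENAME;
# the concrete values here are illustrative only.
fsdp_config = {'sharded_ckpt_prefix_dir': 'ep{epoch}-ba{batch}'}
ckpt_filename = '__{rank}_0.distcp'
remote_file_name = 'checkpoints/run-name/latest-rank{rank}.pt'

assert fsdp_config is not None
remote_prefix = fsdp_config['sharded_ckpt_prefix_dir']
assert remote_prefix is not None

# Sharded checkpoints land under <parent of remote_file_name>/<prefix_dir>/<shard filename>;
# Composer then fills the {placeholders} via format_name_with_dist_and_time.
remote_file_name = os.path.join(pathlib.Path(remote_file_name).parent, remote_prefix, ckpt_filename)
print(remote_file_name)  # checkpoints/run-name/ep{epoch}-ba{batch}/__{rank}_0.distcp
```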
106 changes: 52 additions & 54 deletions composer/core/state.py
@@ -539,28 +539,72 @@ def __init__(
parallelism_config = parallelism_config or {}
self.fsdp_config = parallelism_config.get('fsdp', None)
self.tp_config = parallelism_config.get('tp', None)
if self.fsdp_config is not None:
from composer.distributed import patch_pytorch

# Add an earlier call to patch_pytorch as we require device_mesh slicing before any
# model wrapping.
patch_pytorch()
self._validate_parallelism_configs()

self.device_mesh: Optional[DeviceMesh] = _create_device_mesh(self.device, self.fsdp_config, self.tp_config)
if self.fsdp_config is not None and self.device_mesh is not None:
fsdp_mesh_dim_names = []
if self.device_mesh.mesh_dim_names is not None and ParallelismType.DATA_PARALLEL_REPLICATE.value in self.device_mesh.mesh_dim_names:
fsdp_mesh_dim_names.append(ParallelismType.DATA_PARALLEL_REPLICATE.value)
fsdp_mesh_dim_names.append(ParallelismType.DATA_PARALLEL_SHARD.value)
self.fsdp_config['device_mesh'] = self.device_mesh[tuple(fsdp_mesh_dim_names)] # type: ignore
if self.tp_config is not None and self.device_mesh is not None:
self.tp_config['device_mesh'] = self.device_mesh[ParallelismType.TENSOR_PARALLEL.value]

# Set defaults for transient variables (to make pyright happy)
self.batch: Any = None
self.loss: Union[torch.Tensor, Sequence[torch.Tensor], Dict[Any, torch.Tensor]] = torch.Tensor()
self.outputs: Union[torch.Tensor, Sequence[torch.Tensor]] = torch.Tensor()

# These attributes will be serialized using .state_dict(), and loaded with .load_state_dict()
# All other attributes will not be serialized.
# For simplicity, omit the leading underscore for private attributes.
# For example, even though the optimizers are stored on the state
# as the "_optimizers" attribute, here we specify just "optimizers"
self.serialized_attributes = [
'model',
'optimizers',
'schedulers',
'algorithms',
'callbacks',
'scaler',
'timestamp',
'rank_zero_seed',
'train_metrics',
'eval_metrics',
'run_name',
'dataset_state',
]

self.train_metrics: Optional[Dict[str, Metric]] = {}
self.eval_metrics: Dict[str, Dict[str, Metric]] = {}
self.train_metric_values: Dict[str, float] = {}
self.eval_metric_values: Dict[str, float] = {}
self.total_loss_dict: Dict[str, float] = {}

self.metric_outputs: Dict[str, Any] = {}

def _validate_parallelism_configs(self):
# Validate TP config
if self.tp_config is not None:
warnings.warn('Tensor parallelism (TP) is experimental and may change in future versions.', FutureWarning)
if version.parse(torch.__version__.split('.dev')[0]) < version.parse('2.3.0'):
raise ValueError('Tensor parallelism (TP) requires torch>=2.3.0.')
if self.fsdp_config is None:
raise ValueError(
'Tensor parallelism (TP) currently requires FSDP to be enabled. '
'An empty `fsdp_config` can be specified to enable FSDP with '
'default settings.',
'default settings. Additionally, PyTorch currently errors if FSDP '
'data_parallel_shard_degree is not at least 2.',
)
if not self.fsdp_config['use_orig_params']:
raise ValueError(
'Tensor parallelism (TP) currently requires FSDP with use_orig_params=True, '
'which is the default and recommended setting.',
)

# Load monolith rank0 only
if self.load_monolith_rank0_only:
if self.tp_config is not None:
raise ValueError('load_fsdp_monolith_rank0_only is not compatible with tensor parallelism (TP).')
@@ -573,7 +617,7 @@ def __init__(
)
# Broadcast rank 0 meta check to all ranks so error can be raised on all ranks
rank0_on_meta = 0
if dist.get_global_rank() == 0 and next(model.parameters()).device.type == 'meta':
if dist.get_global_rank() == 0 and next(self.model.parameters()).device.type == 'meta':
rank0_on_meta = 1
rank0_on_meta_tensor = self.device.tensor_to_device(torch.tensor([rank0_on_meta], dtype=torch.uint8))
dist.all_reduce(rank0_on_meta_tensor, reduce_operation='MAX')
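The hunk above relies on a small but useful distributed pattern: rank 0 checks whether the model sits on the meta device, and a MAX all-reduce shares that flag so every rank can raise the same error instead of deadlocking in a later collective. A minimal sketch of the pattern, using raw `torch.distributed` rather than Composer's `dist` wrapper and assuming an already-initialized process group:

```python
import torch
import torch.distributed as dist


def raise_everywhere_if_rank0(condition_on_rank0: bool, message: str) -> None:
    """Broadcast a rank-0 condition via MAX all-reduce so all ranks raise together."""
    flag_value = 1 if (dist.get_rank() == 0 and condition_on_rank0) else 0
    flag = torch.tensor([flag_value], dtype=torch.uint8)
    dist.all_reduce(flag, op=dist.ReduceOp.MAX)  # any rank seeing 1 means rank 0 saw it
    if flag.item() == 1:
        raise ValueError(message)
```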
@@ -586,10 +630,7 @@ def __init__(
if error_message != '':
raise ValueError(error_message)

self.sharded_ckpt_prefix_dir: Optional[str] = None
if self.fsdp_config is not None:
self.sharded_ckpt_prefix_dir = self.fsdp_config['sharded_ckpt_prefix_dir']

# Validate FSDP state dict type
if self.fsdp_state_dict_type not in [None, 'full', 'sharded']:
if self.fsdp_state_dict_type == 'local':
raise ValueError(
@@ -613,49 +654,6 @@ def __init__(
),
)

self.device_mesh: Optional[DeviceMesh] = _create_device_mesh(self.device, self.fsdp_config, self.tp_config)
if self.fsdp_config is not None and self.device_mesh is not None:
fsdp_mesh_dim_names = []
if self.device_mesh.mesh_dim_names is not None and ParallelismType.DATA_PARALLEL_REPLICATE.value in self.device_mesh.mesh_dim_names:
fsdp_mesh_dim_names.append(ParallelismType.DATA_PARALLEL_REPLICATE.value)
fsdp_mesh_dim_names.append(ParallelismType.DATA_PARALLEL_SHARD.value)
self.fsdp_config['device_mesh'] = self.device_mesh[tuple(fsdp_mesh_dim_names)] # type: ignore
if self.tp_config is not None and self.device_mesh is not None:
self.tp_config['device_mesh'] = self.device_mesh[ParallelismType.TENSOR_PARALLEL.value]

# Set defaults for transient variables (to make pyright happy)
self.batch: Any = None
self.loss: Union[torch.Tensor, Sequence[torch.Tensor], Dict[Any, torch.Tensor]] = torch.Tensor()
self.outputs: Union[torch.Tensor, Sequence[torch.Tensor]] = torch.Tensor()

# These attributes will be serialized using .state_dict(), and loaded with .load_state_dict()
# All other attributes will not be serialized.
# For simplicity, omit the leading underscore for private attributes.
# For example, even though the optimizers are stored on the state
# as the "_optimizers" attribute, here we specify just "optimizers"
self.serialized_attributes = [
'model',
'optimizers',
'schedulers',
'algorithms',
'callbacks',
'scaler',
'timestamp',
'rank_zero_seed',
'train_metrics',
'eval_metrics',
'run_name',
'dataset_state',
]

self.train_metrics: Optional[Dict[str, Metric]] = {}
self.eval_metrics: Dict[str, Dict[str, Metric]] = {}
self.train_metric_values: Dict[str, float] = {}
self.eval_metric_values: Dict[str, float] = {}
self.total_loss_dict: Dict[str, float] = {}

self.metric_outputs: Dict[str, Any] = {}

def _dataset_of(self, dataloader: Optional[Union[Evaluator, DataSpec, DataLoader, Iterable]]) -> Optional[Dataset]:
"""Get the dataset contained by the given dataloader-like object.

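For orientation, here is a hedged sketch of the mesh-slicing pattern the new `__init__` follows: build one named device mesh spanning the data-parallel and tensor-parallel dimensions, then hand FSDP and TP their own submeshes. The dimension names and degrees are illustrative assumptions (Composer's `_create_device_mesh` and `ParallelismType` may also include a replicate dimension for HSDP), and the script is meant to be launched under `torchrun --nproc_per_node=4`:

```python
# Sketch only; assumes torch>=2.3 (matching the TP version check) and 4 processes.
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

dist.init_process_group(backend='gloo')

# 2 data-parallel shards x 2 tensor-parallel ranks (assumed degrees).
mesh = init_device_mesh(
    'cpu',
    mesh_shape=(2, 2),
    mesh_dim_names=('data_parallel_shard', 'tensor_parallel'),
)

# FSDP gets the data-parallel slice and TP the tensor-parallel slice, mirroring how
# state.py assigns fsdp_config['device_mesh'] and tp_config['device_mesh'].
fsdp_mesh = mesh['data_parallel_shard']
tp_mesh = mesh['tensor_parallel']
print(f'rank {dist.get_rank()}: fsdp={fsdp_mesh}, tp={tp_mesh}')

dist.destroy_process_group()
```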
3 changes: 1 addition & 2 deletions composer/distributed/__init__.py
@@ -11,7 +11,7 @@
prepare_fsdp_module,
prepare_tp_module,
)
from composer.distributed.mosaic_fsdp import patch_pytorch, set_fsdp_default
from composer.distributed.mosaic_fsdp import set_fsdp_default

__all__ = [
'fix_batch_precision_for_deepspeed',
@@ -21,6 +21,5 @@
'prepare_ddp_module',
'prepare_fsdp_module',
'prepare_tp_module',
'patch_pytorch',
'set_fsdp_default',
]
3 changes: 0 additions & 3 deletions composer/distributed/dist_strategy.py
@@ -29,7 +29,6 @@
SHARDING_MAP,
get_cpu_offload,
get_mixed_precision,
patch_pytorch,
set_custom_fsdp_module_kwargs,
)
from composer.utils import StringEnum, dist, ensure_tuple
@@ -216,8 +215,6 @@ def prepare_fsdp_module(
auto_microbatching (bool, optional): Whether or not auto microbatching is enabled.
te_rng_seed(int): The seed to use for the Transformer Engine activation checkpointing RNG. Defaults to 1234.
"""
patch_pytorch()

# Check sync_module_states is True for mixed initialization or HSDP
if fsdp_config['sync_module_states'] == False:
rank_on_meta = 1 if next(model.parameters()).device.type == 'meta' else 0
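Finally, `prepare_fsdp_module` loses its `patch_pytorch()` call: per the comment shown in the `state.py` hunk, the PyTorch patches need to be in place before the device mesh is sliced, which now happens when `State` is constructed rather than when the model is wrapped. A toy ordering sketch with stub functions (not Composer's real call graph):

```python
# Stub functions only; they illustrate the call ordering, not Composer's actual APIs.
def patch_pytorch_stub() -> None:
    print('1) patches applied (now done early, at State construction)')


def create_and_slice_device_mesh_stub() -> str:
    print('2) device mesh created and sliced (requires the patches)')
    return 'device mesh'


def prepare_fsdp_module_stub(mesh: str) -> None:
    print(f'3) model wrapped with FSDP on {mesh} (no patch call needed here anymore)')


patch_pytorch_stub()
mesh = create_and_slice_device_mesh_stub()
prepare_fsdp_module_stub(mesh)
```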