This repository has been archived by the owner on Aug 7, 2024. It is now read-only.

Changes to enable fp8 on multi devices #149

Closed · wants to merge 5 commits
46 changes: 37 additions & 9 deletions float8_experimental/float8_linear.py
@@ -14,6 +14,8 @@

import dataclasses

from typing import Optional

import torch

from float8_experimental.float8_linear_utils import (
@@ -92,6 +94,8 @@ def __init__(self, *args, **kwargs):
delayed_scaling_recipe = kwargs.pop(
"delayed_scaling_recipe", DelayedScalingRecipe()
)
# Amax and scale buffers should always be kept as float32.
self.always_float32_buffers = set()
super().__init__(*args, **kwargs)

# TODO(future): have a unique recipe per buffer instead of one per
@@ -100,15 +104,23 @@ def __init__(self, *args, **kwargs):
self.recipe = delayed_scaling_recipe
history_len = self.recipe.history_len

self.register_buffer("fp8_amax_x", torch.tensor(E4M3_MAX_POS))
self.register_buffer("fp8_amax_history_x", torch.zeros(history_len))
self.register_buffer("fp8_scale_x", torch.tensor(1.0))
self.register_buffer("fp8_amax_w", torch.tensor(E4M3_MAX_POS))
self.register_buffer("fp8_amax_history_w", torch.zeros(history_len))
self.register_buffer("fp8_scale_w", torch.tensor(1.0))
self.register_buffer("fp8_amax_dL_dY", torch.tensor(E5M2_MAX_POS))
self.register_buffer("fp8_amax_history_dL_dY", torch.zeros(history_len))
self.register_buffer("fp8_scale_dL_dY", torch.tensor(1.0))
self.register_always_float32_buffer("fp8_amax_x", torch.tensor(E4M3_MAX_POS))
self.register_always_float32_buffer(
"fp8_amax_history_x", torch.zeros(history_len)
)
self.register_always_float32_buffer("fp8_scale_x", torch.tensor(1.0))
self.register_always_float32_buffer("fp8_amax_w", torch.tensor(E4M3_MAX_POS))
self.register_always_float32_buffer(
"fp8_amax_history_w", torch.zeros(history_len)
)
self.register_always_float32_buffer("fp8_scale_w", torch.tensor(1.0))
self.register_always_float32_buffer(
"fp8_amax_dL_dY", torch.tensor(E5M2_MAX_POS)
)
self.register_always_float32_buffer(
"fp8_amax_history_dL_dY", torch.zeros(history_len)
)
self.register_always_float32_buffer("fp8_scale_dL_dY", torch.tensor(1.0))
# Whether to emulate the fp8 matmul logic in float32
self.emulate = False

@@ -136,6 +148,22 @@ def __init__(self, *args, **kwargs):
# will access the scale when it has ensured that it is on GPU.
self._float8_tensor_ctor = lambda *args, **kwargs: Float8Tensor(*args, **kwargs)

def register_always_float32_buffer(
self, name: str, tensor: Optional[torch.Tensor], persistent: bool = True
) -> None:
self.register_buffer(name=name, tensor=tensor, persistent=persistent)
self.always_float32_buffers.add(name)

def _apply(self, fn, recurse=True):
ret = super()._apply(fn, recurse)
self.convert_amax_buffer_to_float32()
return ret

def convert_amax_buffer_to_float32(self):
for key in self.always_float32_buffers:
if self._buffers[key] is not None:
self._buffers[key] = self._buffers[key].to(torch.float32)

def cast_x_to_float8(
self, x: torch.Tensor, is_amax_initialized: bool
) -> torch.Tensor:
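
The change above registers the amax/scale bookkeeping tensors through register_always_float32_buffer and overrides nn.Module._apply, which .to(), .cuda(), and .half() all funnel through, so those buffers are cast back to float32 after any dtype or device move. Below is a minimal, self-contained sketch of the same pattern; the KeepFloat32Buffers class and its "amax" buffer are hypothetical stand-ins, not the library code.

import torch
import torch.nn as nn


class KeepFloat32Buffers(nn.Module):
    """Hypothetical sketch of the buffer-dtype pattern used by Float8Linear."""

    def __init__(self):
        super().__init__()
        self.always_float32_buffers = set()
        self.register_always_float32_buffer("amax", torch.tensor(1.0))

    def register_always_float32_buffer(self, name, tensor, persistent=True):
        self.register_buffer(name, tensor, persistent=persistent)
        self.always_float32_buffers.add(name)

    def _apply(self, fn, recurse=True):
        # _apply is the single funnel for .to()/.cuda()/.half(), so re-casting
        # here keeps the tracked buffers in float32 after any such move.
        ret = super()._apply(fn, recurse)
        for key in self.always_float32_buffers:
            if self._buffers[key] is not None:
                self._buffers[key] = self._buffers[key].to(torch.float32)
        return ret


m = KeepFloat32Buffers().to(torch.bfloat16)
assert m.amax.dtype == torch.float32  # the tracked buffer survives the bf16 cast
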
11 changes: 8 additions & 3 deletions float8_experimental/float8_linear_utils.py
@@ -117,7 +117,9 @@ def swap_linear_with_float8_linear(
swap_linear_with_float8_linear(child, module, emulate)


def sync_float8_amax_and_scale_history(model: torch.nn.Module) -> None:
def sync_float8_amax_and_scale_history(
model: torch.nn.Module, fp8_classes=None
) -> None:
"""
Manages the float8 amax and scale bookkeeping. In detail, it does the
following:
@@ -138,10 +140,13 @@ def sync_float8_amax_and_scale_history(model: torch.nn.Module) -> None:
# the reductions into one and probably make the history update faster.
# Lazy import to avoid circular dependency

from float8_experimental.float8_linear import Float8Linear
if fp8_classes is None:
from float8_experimental.float8_linear import Float8Linear

fp8_classes = Float8Linear

for name, child in model.named_modules():
if not isinstance(child, (Float8Linear)):
if not isinstance(child, fp8_classes):
continue

#
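
With the new optional fp8_classes argument, callers that subclass or wrap Float8Linear for multi-device setups can tell the sync helper which module classes to match; leaving it as None preserves the previous behavior of matching Float8Linear only. A hedged usage sketch under that assumption (the tiny CPU model and the emulate=True flag are illustrative choices, not requirements of the API):

import torch.nn as nn

from float8_experimental.float8_linear import Float8Linear
from float8_experimental.float8_linear_utils import sync_float8_amax_and_scale_history

# Build a small model whose linears are Float8Linear, as the tests below do.
model = nn.Sequential(
    Float8Linear.from_float(nn.Linear(16, 32), True),  # emulate=True
    Float8Linear.from_float(nn.Linear(32, 16), True),
)

# Default: only Float8Linear instances are synced.
sync_float8_amax_and_scale_history(model)

# New: pass a class, or a tuple of classes, since isinstance() accepts either form.
sync_float8_amax_and_scale_history(model, fp8_classes=Float8Linear)
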
54 changes: 54 additions & 0 deletions test/test_base.py
@@ -177,6 +177,60 @@ def test_linear_float8_weight_tag(self):
m_fp8 = Float8Linear.from_float(copy.deepcopy(m_ref))
assert m_fp8.weight._is_fp8_weight

@pytest.mark.parametrize("linear_type", [LinearType.DELAYED, LinearType.DYNAMIC])
@pytest.mark.parametrize(
"linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
)
def test_type_cast(self, linear_type: LinearType, linear_dtype: torch.dtype):
emulate = (
not torch.cuda.is_available() or torch.cuda.get_device_capability() < (9, 0)
)
x_shape = (16, 16)

x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype)
m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=linear_dtype)
self._test_linear_impl(x, m_ref, linear_type, emulate)

m = nn.Linear(32, 16, device="cuda", dtype=linear_dtype)
m = Float8Linear.from_float(m, emulate)

# Cast the module to dtype
m = m.to(dtype=linear_dtype)
# Check amax buffer types
for key in [
"fp8_amax_x",
"fp8_amax_history_x",
"fp8_scale_x",
"fp8_amax_w",
"fp8_amax_history_w",
"fp8_scale_w",
"fp8_amax_dL_dY",
"fp8_amax_history_dL_dY",
"fp8_scale_dL_dY",
]:
assert (
m._buffers[key].dtype == torch.float32
), f"{key}.dtype is {m._buffers[key].dtype}, expected torch.float32"

# autocast off
Contributor review comment on the line above: Nit: this test does cover it, but could we also assert that the buffer types are still fp32?

x = torch.randn(16, 32, device="cuda", dtype=linear_dtype)
sync_float8_amax_and_scale_history(m)
y = m(x)
assert y.dtype == linear_dtype, f"y.dtype is {y.dtype}, expected {linear_dtype}"

# autocast on
with torch.autocast("cuda"):
sync_float8_amax_and_scale_history(m)
y = m(x)
assert y.dtype == torch.half, f"y.dtype is {y.dtype}, expected {torch.half}"

with torch.autocast("cuda", dtype=torch.bfloat16):
sync_float8_amax_and_scale_history(m)
y = m(x)
assert (
y.dtype == torch.bfloat16
), f"y.dtype is {y.dtype}, expected {torch.bfloat16}"


class TestScaledMM:
@unittest.skipIf(