diff --git a/src/anemoi/models/layers/attention.py b/src/anemoi/models/layers/attention.py
index 9ee8e104..7df09629 100644
--- a/src/anemoi/models/layers/attention.py
+++ b/src/anemoi/models/layers/attention.py
@@ -288,7 +288,7 @@ def sliding_window_mask(b, h, q_idx, kv_idx):
             self.attention = torch.compile(self.attention)
             self.is_attn_compiled = True

-        # TODO test how this impacts scaling at large model counts
+        # TODO(Cathal): test how this impacts scaling at large model counts
         torch._dynamo.config.optimize_ddp = False
         out = self.attention(query, key, value)
         torch._dynamo.config.optimize_ddp = True
@@ -354,10 +354,10 @@ def get_alibi_slopes(num_heads: int) -> Tensor:
         aLiBi slopes
     """
     n = 2 ** math.floor(math.log2(num_heads))
-    slope_0 = 2.0 ** (-8.0 / n)
+    slope_0 = 2 ** (-8 / n)
     alibi_slopes = torch.pow(slope_0, torch.arange(1, 1 + n))
     if n < num_heads:
-        slope_hat_0 = 2.0 ** (-4.0 / n)
+        slope_hat_0 = 2 ** (-4 / n)
         alibi_slopes_hat = torch.pow(slope_hat_0, torch.arange(1, 1 + 2 * (num_heads - n), 2))
         alibi_slopes = torch.cat([alibi_slopes, alibi_slopes_hat])
     return alibi_slopes
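
For context on the second hunk, here is a self-contained sketch of the ALiBi slope computation as it reads after the change. It is an illustration, not the module itself: only `math` and `torch` are assumed, and the `__main__` usage check at the bottom is added here for demonstration.

```python
# Standalone sketch of the slope computation touched by the second hunk.
# Mirrors get_alibi_slopes as shown post-change; the usage check is illustrative only.
import math

import torch
from torch import Tensor


def get_alibi_slopes(num_heads: int) -> Tensor:
    """Return one ALiBi slope per attention head.

    For a power-of-two num_heads, the slopes form the geometric series
    2^(-8/n), 2^(-16/n), ...; any remaining heads take every other term of
    the series for 2n heads, starting at 2^(-4/n).
    """
    n = 2 ** math.floor(math.log2(num_heads))  # largest power of two <= num_heads
    slope_0 = 2 ** (-8 / n)
    alibi_slopes = torch.pow(slope_0, torch.arange(1, 1 + n))
    if n < num_heads:
        slope_hat_0 = 2 ** (-4 / n)
        alibi_slopes_hat = torch.pow(slope_hat_0, torch.arange(1, 1 + 2 * (num_heads - n), 2))
        alibi_slopes = torch.cat([alibi_slopes, alibi_slopes_hat])
    return alibi_slopes


if __name__ == "__main__":
    # Power-of-two head count: pure geometric series 2^-1 ... 2^-8.
    print(get_alibi_slopes(8))
    # Non-power-of-two head count: 8 base slopes plus 4 interleaved extras
    # (2^-0.5, 2^-1.5, 2^-2.5, 2^-3.5).
    print(get_alibi_slopes(12))
```

Replacing the float literals with `2 ** (-8 / n)` and `2 ** (-4 / n)` does not change the result in Python 3, since `/` already performs true division; the change is purely stylistic.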