Refactoring: in CachedMultiHeadAttention call MHA methods instead of recoding the attention calculation (keras-team#1684)

* Call "_compute_attention" instead of recoding the calculation

* Fix formatting

---------

Co-authored-by: Matt Watson <[email protected]>
apehex and mattdangerw authored Jul 8, 2024
1 parent 880c7c6 commit 29c85c0
Showing 1 changed file with 6 additions and 13 deletions.
keras_nlp/src/layers/modeling/cached_multi_head_attention.py
@@ -122,21 +122,14 @@ def call(
         key = self._key_dense(key)
         value = self._value_dense(value)
 
-        query = ops.multiply(
-            query,
-            1.0 / ops.sqrt(ops.cast(self._key_dim, query.dtype)),
-        )
-        attention_scores = ops.einsum(self._dot_product_equation, key, query)
-        attention_scores = self._masked_softmax(
-            attention_scores, attention_mask
-        )
-        attention_scores = self._dropout_layer(
-            attention_scores, training=training
+        attention_output, attention_scores = self._compute_attention(
+            query=query,
+            key=key,
+            value=value,
+            attention_mask=attention_mask,
+            training=training,
         )
 
-        attention_output = ops.einsum(
-            self._combine_equation, attention_scores, value
-        )
         attention_output = self._output_dense(attention_output)
 
         if cache is not None:
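For context, the deleted lines re-implemented scaled dot-product attention inline; `_compute_attention`, inherited from the base `MultiHeadAttention` layer, performs the same steps, so the refactor removes the duplication without changing behavior. Below is a minimal standalone sketch of that computation. It is a NumPy illustration with the einsum equations written out by hand; the real layer uses backend-agnostic `keras.ops`, builds its equations dynamically, and applies dropout to the scores, so treat the function name, shapes, and equations here as assumptions for illustration.

import numpy as np

def compute_attention_sketch(query, key, value, attention_mask=None):
    # query/key/value: (batch, seq, num_heads, key_dim), i.e. the outputs
    # of the layer's _query_dense/_key_dense/_value_dense projections.
    key_dim = query.shape[-1]
    # Scale queries by 1/sqrt(key_dim), as the removed ops.multiply did.
    query = query * (1.0 / np.sqrt(key_dim))
    # Dot-product scores (the layer's self._dot_product_equation):
    # shape (batch, heads, query_len, key_len).
    scores = np.einsum("bkhd,bqhd->bhqk", key, query)
    # Masked softmax: push masked positions toward -inf before normalizing.
    if attention_mask is not None:
        scores = np.where(attention_mask[:, None, :, :], scores, -1e9)
    scores = np.exp(scores - scores.max(axis=-1, keepdims=True))
    scores = scores / scores.sum(axis=-1, keepdims=True)
    # (The real layer also applies dropout to the scores at train time.)
    # Combine scores and values (the layer's self._combine_equation).
    output = np.einsum("bhqk,bkhd->bqhd", scores, value)
    return output, scores

For example, with query = key = value of shape (2, 5, 8, 16), the sketch returns an output of shape (2, 5, 8, 16) and scores of shape (2, 8, 5, 5), mirroring the (attention_output, attention_scores) pair returned by _compute_attention in the diff above.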
