From 0d9d14dcf616d41d187b1f9cd51b56eff9df1c40 Mon Sep 17 00:00:00 2001
From: "jag.Xu"
Date: Mon, 2 Dec 2024 12:57:36 +0800
Subject: [PATCH] [GPU] fix SDPA producing NaN after transpose_fusion pass. (#27629)

### Details:
- The issue only occurs with fp16 models, and also with fp32 models when an fp32 inference precision hint is not set.
- The NaN values started to appear after commit 27780a6, which reverted the fake alignment. *CVS-155861*
- Root cause: `use_index_calc_func` checked `params.input0_order` regardless of which input's `order` was passed in, so a non-default order on the fused key/value inputs was ignored by the SDPA kernel.

### Tickets:
- *CVS-156289*
---
 .../kernels/sdpa/sdpa_kernel_base.cpp         |  2 +-
 .../dynamic/scaled_dot_product_attention.cpp  | 93 ++++++++++++++++---
 2 files changed, 83 insertions(+), 12 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp
index e2a538750d1615..ed0ba87f8f22af 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp
@@ -107,7 +107,7 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const {
     };
 
     auto use_index_calc_func = [&](const std::vector<int64_t> order, bool is_query = false) {
-        if (!params.input0_order.empty() && !is_default_order(params.input0_order))
+        if (!order.empty() && !is_default_order(order))
             return true;
 
         if (params.conf.broadcast_axis != -1)
diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp
index 965313126d4362..89b3d38f5051d3 100644
--- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp
@@ -25,7 +25,8 @@ typedef std::tuple<ov::element::Type,              // netPrecision
                    std::vector<InputShape>,          // shape
                    bool,                             // is_causal
                    bool,                             // has_attn
-                   bool                              // has_scale
+                   bool,                             // has_scale
+                   std::vector<std::vector<int64_t>> // input_transpose
                    > ScaledAttnGPUTestParams;
 
 class ScaledAttnLayerGPUTest : public testing::WithParamInterface<ScaledAttnGPUTestParams>,
@@ -36,6 +37,7 @@ class ScaledAttnLayerGPUTest : public testing::WithParamInterface<ScaledAttnGPUTestParams>,
 protected:
     void SetUp() override;
     void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;
+    void transpose_prepare(std::vector<InputShape>& shapes, const std::vector<std::vector<int64_t>>& input_transpose);
     bool is_causal;
     bool has_attn;
     bool has_scale;
@@ -44,11 +46,14 @@ class ScaledAttnLayerGPUTest : public testing::WithParamInterface<ScaledAttnGPUTestParams>,
 
 std::string ScaledAttnLayerGPUTest::getTestCaseName(const testing::TestParamInfo<ScaledAttnGPUTestParams>& obj) {
     ov::element::Type inType;
     std::vector<InputShape> inputShapes;
+    std::vector<std::vector<int64_t>> input_transpose;
     bool is_causal;
     bool has_attn;
     bool has_scale;
-    std::tie(inType, inputShapes, is_causal, has_attn, has_scale) = obj.param;
+    bool transpose_enable;
+    std::tie(inType, inputShapes, is_causal, has_attn, has_scale, input_transpose) = obj.param;
+    transpose_enable = (input_transpose.size() != 0);
     std::ostringstream result;
     result << "netPRC=" << inType << "_";
     result << "IS=";
@@ -65,6 +70,7 @@ std::string ScaledAttnLayerGPUTest::getTestCaseName(const testing::TestParamInfo<ScaledAttnGPUTestParams>& obj) {
     result << "is_causal=" << is_causal << "_";
     result << "has_attn=" << has_attn << "_";
     result << "has_scale=" << has_scale << "_";
+    result << "with_transpose=" << transpose_enable << "_";
 
     return result.str();
 }
@@ -72,17 +78,19 @@ void ScaledAttnLayerGPUTest::SetUp() {
     ov::element::Type inType;
     std::vector<InputShape> inputShapes;
+    std::vector<std::vector<int64_t>> input_transpose;
 
     targetDevice = ov::test::utils::DEVICE_GPU;
 
-    std::tie(inType, inputShapes, is_causal, has_attn, has_scale) = this->GetParam();
+    std::tie(inType, inputShapes, is_causal, has_attn, has_scale, input_transpose) = this->GetParam();
 
+    transpose_prepare(inputShapes, input_transpose);
     init_input_shapes(inputShapes);
     ov::ParameterVector inputParams;
     // q, k, v
     inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]));
     inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[1]));
-    inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[1]));
+    inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[2]));
     inputParams[0]->set_friendly_name("q");
     inputParams[1]->set_friendly_name("k");
     inputParams[2]->set_friendly_name("v");
 
@@ -96,7 +104,7 @@ void ScaledAttnLayerGPUTest::SetUp() {
         inputParams.back()->set_friendly_name("scale");
     } else {
         if (has_attn) {
-            inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[2]));
+            inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[3]));
             inputParams.back()->set_friendly_name("attention_mask");
         }
         if (has_scale) {
@@ -106,9 +114,31 @@ void ScaledAttnLayerGPUTest::SetUp() {
         }
     }
 
-    ov::OutputVector inputs;
+    ov::OutputVector inputParams_transpose;
     for (size_t i = 0; i < inputParams.size(); i++) {
-        inputs.push_back(inputParams[i]);
+        inputParams_transpose.push_back(inputParams[i]);
+    }
+    if (input_transpose.size() != 0) {
+        // Insert Transpose nodes on q, k and v so that transpose_fusion folds them into SDPA.
+        auto transpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, input_transpose[0]);
+        auto transpose_a = std::make_shared<ov::op::v1::Transpose>(inputParams[0], transpose_a_const);
+        transpose_a->set_friendly_name("transpose_a");
+        inputParams_transpose[0] = transpose_a;
+
+        auto transpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, input_transpose[1]);
+        auto transpose_b = std::make_shared<ov::op::v1::Transpose>(inputParams[1], transpose_b_const);
+        transpose_b->set_friendly_name("transpose_b");
+        inputParams_transpose[1] = transpose_b;
+
+        auto transpose_c_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, input_transpose[2]);
+        auto transpose_c = std::make_shared<ov::op::v1::Transpose>(inputParams[2], transpose_c_const);
+        transpose_c->set_friendly_name("transpose_c");
+        inputParams_transpose[2] = transpose_c;
+    }
+
+    ov::OutputVector inputs;
+    for (size_t i = 0; i < inputParams_transpose.size(); i++) {
+        inputs.push_back(inputParams_transpose[i]);
     }
 
     auto sdp = std::make_shared<ov::op::v13::ScaledDotProductAttention>(inputs, is_causal);
@@ -141,17 +171,53 @@ void ScaledAttnLayerGPUTest::SetUp() {
     }
 }
 
+void ScaledAttnLayerGPUTest::transpose_prepare(std::vector<InputShape>& shapes,
+                                               const std::vector<std::vector<int64_t>>& input_transpose) {
+    auto transpose_pshape = [](InputShape& pshapes, const std::vector<int64_t>& order) {
+        auto transposed_pshape = ov::PartialShape::dynamic(pshapes.first.rank());
+        std::vector<ov::Shape> transposed_cshapes(pshapes.second);
+        auto& pshape = pshapes.first;
+        auto& cshape = pshapes.second;
+        for (size_t i = 0; i < order.size(); i++) {
+            transposed_pshape[i] = pshape[order[i]];
+            for (size_t j = 0; j < cshape.size(); j++) {
+                transposed_cshapes[j][i] = cshape[j][order[i]];
+            }
+        }
+
+        for (size_t i = 0; i < order.size(); i++) {
+            pshape[i] = transposed_pshape[i];
+            for (size_t j = 0; j < cshape.size(); j++) {
+                cshape[j][i] = transposed_cshapes[j][i];
+            }
+        }
+    };
+
+    if (shapes.empty()) {
+        return;
+    }
+
+    shapes.insert(shapes.begin() + 1, shapes[1]);
+    if (input_transpose.empty()) {
+        return;
+    }
+
+    for (size_t i = 0; i < input_transpose.size(); i++) {
+        transpose_pshape(shapes[i], input_transpose[i]);
+    }
+}
+
 void ScaledAttnLayerGPUTest::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {
     std::vector<ov::Shape> shapes(3);
     shapes[0] = targetInputStaticShapes[0];
     shapes[1] = targetInputStaticShapes[1];
-    shapes[2] = targetInputStaticShapes[1];
+    shapes[2] = targetInputStaticShapes[2];
     if (!has_attn && has_scale) {
         shapes.push_back(ov::Shape{});
         shapes.push_back(ov::Shape{1});
     } else {
         if (has_attn) {
-            shapes.push_back(targetInputStaticShapes[2]);
+            shapes.push_back(targetInputStaticShapes[3]);
         }
         if (has_scale) {
             shapes.push_back(ov::Shape{1});
@@ -163,10 +229,11 @@ void ScaledAttnLayerGPUTest::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {
 
 TEST_P(ScaledAttnLayerGPUTest, CompareWithRefs) {
     ov::element::Type inType;
     std::vector<InputShape> inputShapes;
+    std::vector<std::vector<int64_t>> input_transpose;
     bool is_causal;
     bool has_attn;
     bool has_scale;
-    std::tie(inType, inputShapes, is_causal, has_attn, has_scale) = this->GetParam();
+    std::tie(inType, inputShapes, is_causal, has_attn, has_scale, input_transpose) = this->GetParam();
     run();
 }
@@ -261,11 +328,15 @@ const std::vector<std::vector<InputShape>> shapes{
     },
 };
 
+const std::vector<std::vector<int64_t>> disable_transpose{};
+const std::vector<std::vector<int64_t>> enable_transpose{{0, 1, 2, 3}, {0, 1, 2, 3}, {0, 2, 1, 3}};
+
 const auto params = testing::Combine(testing::Values(ov::element::f16 /*, ov::element::f32 */),
                                      testing::ValuesIn(shapes),
                                      testing::Values(true, false),
                                      testing::Values(true, false),
-                                     testing::Values(true, false));
+                                     testing::Values(true, false),
+                                     testing::ValuesIn({disable_transpose, enable_transpose}));
 
 INSTANTIATE_TEST_SUITE_P(smoke_ScaledAttn_GPU,
                          ScaledAttnLayerGPUTest,