From 0d9d14dcf616d41d187b1f9cd51b56eff9df1c40 Mon Sep 17 00:00:00 2001
From: "jag.Xu"
Date: Mon, 2 Dec 2024 12:57:36 +0800
Subject: [PATCH] [GPU] fix SDPA producing NaN after transpose_fusion pass. (#27629)

### Details:
- The issue only occurs with fp16 models, and also with fp32 models when an fp32 inference precision hint is not set.
- The NaN values started to appear after commit 27780a6, which reverted the fake alignment. *CVS-155861*
- Root cause: `use_index_calc_func` checked `params.input0_order` regardless of which input's `order` was passed in, so a non-default order on the fused key/value inputs was ignored by the SDPA kernel.

### Tickets:
- *CVS-156289*
---
 .../kernels/sdpa/sdpa_kernel_base.cpp         |  2 +-
 .../dynamic/scaled_dot_product_attention.cpp  | 93 ++++++++++++++++---
 2 files changed, 83 insertions(+), 12 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp
index e2a538750d1615..ed0ba87f8f22af 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp
@@ -107,7 +107,7 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const {
     };
 
     auto use_index_calc_func = [&](const std::vector<int64_t> order, bool is_query = false) {
-        if (!params.input0_order.empty() && !is_default_order(params.input0_order))
+        if (!order.empty() && !is_default_order(order))
             return true;
 
         if (params.conf.broadcast_axis != -1)
diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp
index 965313126d4362..89b3d38f5051d3 100644
--- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp
+++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp
@@ -25,7 +25,8 @@ typedef std::tuple<ov::element::Type,              // netPrecision
                    std::vector<InputShape>,          // shape
                    bool,                             // is_causal
                    bool,                             // has_attn
-                   bool                              // has_scale
+                   bool,                             // has_scale
+                   std::vector<std::vector<int64_t>> // input_transpose
                    > ScaledAttnGPUTestParams;
 
 class ScaledAttnLayerGPUTest : public testing::WithParamInterface<ScaledAttnGPUTestParams>,
@@ -36,6 +37,7 @@ class ScaledAttnLayerGPUTest : public testing::WithParamInterface<ScaledAttnGPUTestParams>,
 protected:
     void SetUp() override;
     void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;
+    void transpose_prepare(std::vector<InputShape>& shapes, const std::vector<std::vector<int64_t>>& input_transpose);
     bool is_causal;
     bool has_attn;
     bool has_scale;
@@ -44,11 +46,14 @@ class ScaledAttnLayerGPUTest : public testing::WithParamInterface<ScaledAttnGPUTestParams>,
 
 std::string ScaledAttnLayerGPUTest::getTestCaseName(const testing::TestParamInfo<ScaledAttnGPUTestParams>& obj) {
     ov::element::Type inType;
     std::vector<InputShape> inputShapes;
+    std::vector<std::vector<int64_t>> input_transpose;
     bool is_causal;
     bool has_attn;
     bool has_scale;
-    std::tie(inType, inputShapes, is_causal, has_attn, has_scale) = obj.param;
+    bool transpose_enable;
+    std::tie(inType, inputShapes, is_causal, has_attn, has_scale, input_transpose) = obj.param;
+    transpose_enable = (input_transpose.size() != 0);
     std::ostringstream result;
     result << "netPRC=" << inType << "_";
     result << "IS=";
@@ -65,6 +70,7 @@ std::string ScaledAttnLayerGPUTest::getTestCaseName(const testing::TestParamInfo<ScaledAttnGPUTestParams>& obj) {
     result << "is_causal=" << is_causal << "_";
     result << "has_attn=" << has_attn << "_";
     result << "has_scale=" << has_scale << "_";
+    result << "with_transpose=" << transpose_enable << "_";
 
     return result.str();
 }
@@ -72,17 +78,19 @@ void ScaledAttnLayerGPUTest::SetUp() {
     ov::element::Type inType;
     std::vector<InputShape> inputShapes;
+    std::vector<std::vector<int64_t>> input_transpose;
 
     targetDevice = ov::test::utils::DEVICE_GPU;
 
-    std::tie(inType, inputShapes, is_causal, has_attn, has_scale) = this->GetParam();
+    std::tie(inType, inputShapes, is_causal, has_attn, has_scale, input_transpose) = this->GetParam();
 
+    transpose_prepare(inputShapes, input_transpose);
     init_input_shapes(inputShapes);
     ov::ParameterVector inputParams;
     // q, k, v
     inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]));
     inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[1]));
-    inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[1]));
+    inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[2]));
     inputParams[0]->set_friendly_name("q");
     inputParams[1]->set_friendly_name("k");
     inputParams[2]->set_friendly_name("v");
 
@@ -96,7 +104,7 @@ void ScaledAttnLayerGPUTest::SetUp() {
         inputParams.back()->set_friendly_name("scale");
     } else {
         if (has_attn) {
-            inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[2]));
+            inputParams.push_back(std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[3]));
             inputParams.back()->set_friendly_name("attention_mask");
         }
         if (has_scale) {
@@ -106,9 +114,31 @@ void ScaledAttnLayerGPUTest::SetUp() {
         }
     }
 
-    ov::OutputVector inputs;
+    ov::OutputVector inputParams_transpose;
     for (size_t i = 0; i < inputParams.size(); i++) {
-        inputs.push_back(inputParams[i]);
+        inputParams_transpose.push_back(inputParams[i]);
+    }
+    if (input_transpose.size() != 0) {
+        // Insert Transpose nodes on q, k and v so that transpose_fusion folds them into SDPA.
+        auto transpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, input_transpose[0]);
+        auto transpose_a = std::make_shared<ov::op::v1::Transpose>(inputParams[0], transpose_a_const);
+        transpose_a->set_friendly_name("transpose_a");
+        inputParams_transpose[0] = transpose_a;
+
+        auto transpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, input_transpose[1]);
+        auto transpose_b = std::make_shared<ov::op::v1::Transpose>(inputParams[1], transpose_b_const);
+        transpose_b->set_friendly_name("transpose_b");
+        inputParams_transpose[1] = transpose_b;
+
+        auto transpose_c_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, input_transpose[2]);
+        auto transpose_c = std::make_shared<ov::op::v1::Transpose>(inputParams[2], transpose_c_const);
+        transpose_c->set_friendly_name("transpose_c");
+        inputParams_transpose[2] = transpose_c;
+    }
+
+    ov::OutputVector inputs;
+    for (size_t i = 0; i < inputParams_transpose.size(); i++) {
+        inputs.push_back(inputParams_transpose[i]);
     }
 
     auto sdp = std::make_shared<ov::op::v13::ScaledDotProductAttention>(inputs, is_causal);
@@ -141,17 +171,53 @@ void ScaledAttnLayerGPUTest::SetUp() {
     }
 }
 
+void ScaledAttnLayerGPUTest::transpose_prepare(std::vector<InputShape>& shapes,
+                                               const std::vector<std::vector<int64_t>>& input_transpose) {
+    auto transpose_pshape = [](InputShape& pshapes, const std::vector<int64_t>& order) {
+        auto transposed_pshape = ov::PartialShape::dynamic(pshapes.first.rank());
+        std::vector<ov::Shape> transposed_cshapes(pshapes.second);
+        auto& pshape = pshapes.first;
+        auto& cshape = pshapes.second;
+        for (size_t i = 0; i < order.size(); i++) {
+            transposed_pshape[i] = pshape[order[i]];
+            for (size_t j = 0; j < cshape.size(); j++) {
+                transposed_cshapes[j][i] = cshape[j][order[i]];
+            }
+        }
+
+        for (size_t i = 0; i < order.size(); i++) {
+            pshape[i] = transposed_pshape[i];
+            for (size_t j = 0; j < cshape.size(); j++) {
+                cshape[j][i] = transposed_cshapes[j][i];
+            }
+        }
+    };
+
+    if (shapes.empty()) {
+        return;
+    }
+
+    shapes.insert(shapes.begin() + 1, shapes[1]);
+    if (input_transpose.empty()) {
+        return;
+    }
+
+    for (size_t i = 0; i < input_transpose.size(); i++) {
+        transpose_pshape(shapes[i], input_transpose[i]);
+    }
+}
+
 void ScaledAttnLayerGPUTest::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {
     std::vector<ov::Shape> shapes(3);
     shapes[0] = targetInputStaticShapes[0];
     shapes[1] = targetInputStaticShapes[1];
-    shapes[2] = targetInputStaticShapes[1];
+    shapes[2] = targetInputStaticShapes[2];
     if (!has_attn && has_scale) {
         shapes.push_back(ov::Shape{});
         shapes.push_back(ov::Shape{1});
     } else {
         if (has_attn) {
-            shapes.push_back(targetInputStaticShapes[2]);
+            shapes.push_back(targetInputStaticShapes[3]);
         }
         if (has_scale) {
             shapes.push_back(ov::Shape{1});
@@ -163,10 +229,11 @@ void ScaledAttnLayerGPUTest::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {
 
 TEST_P(ScaledAttnLayerGPUTest, CompareWithRefs) {
     ov::element::Type inType;
     std::vector<InputShape> inputShapes;
+    std::vector<std::vector<int64_t>> input_transpose;
     bool is_causal;
     bool has_attn;
     bool has_scale;
-    std::tie(inType, inputShapes, is_causal, has_attn, has_scale) = this->GetParam();
+    std::tie(inType, inputShapes, is_causal, has_attn, has_scale, input_transpose) = this->GetParam();
     run();
 }
@@ -261,11 +328,15 @@ const std::vector<std::vector<InputShape>> shapes{
     },
 };
 
+const std::vector<std::vector<int64_t>> disable_transpose{};
+const std::vector<std::vector<int64_t>> enable_transpose{{0, 1, 2, 3}, {0, 1, 2, 3}, {0, 2, 1, 3}};
+
 const auto params = testing::Combine(testing::Values(ov::element::f16 /*, ov::element::f32 */),
                                      testing::ValuesIn(shapes),
                                      testing::Values(true, false),
                                      testing::Values(true, false),
-                                     testing::Values(true, false));
+                                     testing::Values(true, false),
+                                     testing::ValuesIn({disable_transpose, enable_transpose}));
 
 INSTANTIATE_TEST_SUITE_P(smoke_ScaledAttn_GPU,
                          ScaledAttnLayerGPUTest,