From 4c5228b622497b8169f02bc07fa6495647e9dabe Mon Sep 17 00:00:00 2001
From: yanlan song
Date: Tue, 26 Nov 2024 14:16:04 +0800
Subject: [PATCH] [GPU] do not need to broadcast for eltwise post ops constant argument in onednn gemm (#27671)

### Details:
- observed a huge perf drop with onednn binary post-ops in per-tensor mode (e.g. a per_tensor POLICY input or an integer 15 input):
![image](https://github.com/user-attachments/assets/1d71f89b-2ec5-4afa-8f2d-b349ce7f8a49)
  In this model the argument of the binary op is a constant, but it is broadcast to match the gemm output tensor layout, and this broadcast causes the perf drop.
- remove the unnecessary broadcast for a scalar argument that is later fed to the onednn binary post-op (a minimal standalone sketch of the check is shown below)
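For illustration only, not part of the patch: a minimal standalone sketch of the scalar check, assuming the OpenVINO core helpers `ov::Shape` and `ov::shape_size`; `is_scalar_like` is a made-up name used here for clarity.

```cpp
#include "openvino/core/shape.hpp"

#include <iostream>

// Mirrors the condition added in reorder_inputs.cpp: a non-empty shape whose
// element count is 1 (e.g. {1,1,1,1}) is a per-tensor (scalar) argument and
// does not need to be broadcast before being fused as an onednn binary post-op.
static bool is_scalar_like(const ov::Shape& shape) {
    return !shape.empty() && ov::shape_size(shape) == 1ul;
}

int main() {
    std::cout << std::boolalpha
              << is_scalar_like(ov::Shape{1, 1, 1, 1}) << "\n"   // true  -> skip the broadcast
              << is_scalar_like(ov::Shape{2, 3, 4, 4}) << "\n";  // false -> broadcast as before
    return 0;
}
```

In the actual change the same condition simply skips insertion of the broadcast primitive in `reorder_inputs.cpp`, so single-element constants reach the onednn binary post-op as-is.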
### Tickets:
- 152186

---------

Signed-off-by: fishbell
---
 .../graph/graph_optimizer/reorder_inputs.cpp  |  4 +++
 .../tests/unit/fusions/gemm_fusion_test.cpp   | 30 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
index 213da8cb0ab606..4f15800c70970b 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp
@@ -1003,6 +1003,10 @@ void reorder_inputs::run(program& p, reorder_factory& rf) {
             if (gemm_dims[0] == data_dims[0])
                 continue;
 
+            auto data_shape = data_layout.get_shape();
+            if (data_shape.size() && shape_size(data_shape) == 1ul)
+                continue;
+
             static size_t idx = 0;
             const auto prim_id = "broadcast:" + data.id() + "_broadcasted" + std::to_string(idx++);
             auto broadcast_prim = std::make_shared<cldnn::broadcast>(prim_id, cldnn::input_info(data.id()), gemm_layout.get_shape(),
diff --git a/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp
index 659ccaf9d8a723..08b63e0a8326b8 100644
--- a/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/fusions/gemm_fusion_test.cpp
@@ -398,6 +398,36 @@ TEST_P(gemm_2in_add, eltwise_postop_cached) {
     execute(p, false, true);
 }
 
+TEST_P(gemm_2in_add, eltwise_postop_scalar) {
+    auto p = GetParam();
+
+    if (engine.get_device_info().supports_immad) {
+        ov::intel_gpu::ImplementationDesc gemmv_impl = { cldnn::format::type::any, "", impl_types::onednn };
+        cfg_fused.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "gemm_prim", gemmv_impl } }));
+    }
+
+    auto add_data_layout = get_output_layout(p);
+    auto add_data_size = add_data_layout.get_partial_shape();
+    for (size_t i = 0; i < add_data_size.size(); i++)
+        add_data_size[i] = 1;
+    add_data_layout.set_partial_shape(add_data_size);
+
+    auto in_layout0 = get_input_layout(p, 0);
+    auto in_layout1 = get_input_layout(p, 1);
+
+    create_topologies(
+        input_layout("input0", in_layout0),
+        input_layout("input1", in_layout1),
+        data("add_data", get_mem(add_data_layout, 0.5f)),
+        gemm("gemm_prim", { input_info("input0"), input_info("input1") }, data_types::f32, false, false, 1.f, 0.f, in_layout0.get_rank(), in_layout1.get_rank()),
+        eltwise("add_prim", { input_info("gemm_prim"), input_info("add_data") }, p.eltwise_m, p.default_type),
+        reorder("reorder_bfyx", input_info("add_prim"), p.default_format, data_types::f32)
+    );
+
+    tolerance = default_tolerance(p.default_type);
+    execute(p, false, true);
+}
+
 INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_add, ::testing::ValuesIn(std::vector<gemm_test_params>{
     // gemm_test_params{ CASE_GEMM_2IN_FP16_3, 3, 4, "", broadcast_kinds::none, eltwise_mode::sum }, // TODO: check why failed in eltwise_postop_dynamic
     gemm_test_params{ CASE_GEMM_2IN_FP16_4, 3, 4, "", broadcast_kinds::none, eltwise_mode::sum },