From db4362a43842852b035722c1753c813737ce1dc9 Mon Sep 17 00:00:00 2001 From: hyunback Date: Wed, 2 Oct 2024 10:58:10 +0900 Subject: [PATCH 1/7] [GPU] Fix sd1.5_controlnet_lora bad image. 1. Fixed a bug where dynamic tensor value would disappear when using dynamic and static inputs together. 2. Fixed onednn gemm post-op wrong dims in case spatial 1x1. 3. Fixed side effect for can_be_optimized condition in allocate_output. Signed-off-by: hyunback --- src/plugins/intel_gpu/src/graph/gemm.cpp | 3 +- .../intel_gpu/src/graph/primitive_inst.cpp | 2 +- .../intel_gpu/src/graph/program_node.cpp | 4 + .../tests/unit/test_cases/gemm_gpu_test.cpp | 74 +++++++++++++++++++ 4 files changed, 81 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/gemm.cpp b/src/plugins/intel_gpu/src/graph/gemm.cpp index a8b196bd45885f..25007cb93b18d5 100644 --- a/src/plugins/intel_gpu/src/graph/gemm.cpp +++ b/src/plugins/intel_gpu/src/graph/gemm.cpp @@ -229,7 +229,8 @@ layout gemm_inst::transform_output_layout(const std::shared_ptr prim (i == 1) ? 
transposed_input1_pshape : input_layouts[i].get_partial_shape(); for (size_t j = 0; j != input_pshape.size(); ++j) { - ov::Dimension::merge(output_pshape[j], output_pshape[j], input_pshape[j]); + if (input_pshape[j].get_max_length() != input_pshape[j].get_min_length()) + ov::Dimension::merge(output_pshape[j], output_pshape[j], input_pshape[j]); } } diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index c51b34d81cf153..c23616b8c87297 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -2122,7 +2122,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl; return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset); } - } else if (!_node.can_share_buffer() || impl_params.can_be_optimized() || _node.is_output()) { + } else if (!_node.can_share_buffer() || _node.can_be_optimized() || _node.is_output()) { GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl; return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset); } else { diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 21ba4e656fae0d..dc76bbc58351c6 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -1550,6 +1550,10 @@ void program_node::create_onednn_primitive_attributes( size_t rank = cldnn::format::dimension(in.format); size_t in_batched_size = in.count() / (in.spatial(0) * in.spatial(1)); dnnl::memory::dims dims = onednn::convert_gemm_tensor(in.get_tensor(), rank, in_batched_size == 1); + bool spatial_dims_can_be_removed = (in.spatial(0) * in.spatial(1) == 1); + if (dims.size() == 4 && spatial_dims_can_be_removed) { + dims.erase(dims.begin() + 2, dims.begin() + 4); + } 
dnnl::memory::data_type dt = onednn::convert_data_type(in.data_type); dnnl::memory::format_tag fmt = onednn::convert_gemm_data_format(dims, in.format); post_ops.append_binary(alg, dnnl::memory::desc(dims, dt, fmt)); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp index adaa572878bff4..51f66f3abb7bfe 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp @@ -1577,6 +1577,76 @@ class gemm_gpu_tests: public ::testing::Test { ASSERT_NEAR(output_ptr[i], ref_out_data[i], abs_error) << "at " << i; } } + + void test_dynamic_static_broadcast_3dim(std::vector BMKN, bool is_caching_test, const double abs_error = 0.0001) { + tests::random_generator rg; + rg.set_seed(GET_SUITE_NAME); + + auto& engine = get_test_engine(); + cldnn::layout input0_layout; + cldnn::layout input1_layout; + + std::vector input0_order = {0, 1, 2}; + std::vector input1_order = {0, 1, 2}; + std::vector output_order = {0, 1, 2}; + + size_t BATCH_SIZE = BMKN[0]; + size_t M_SIZE = BMKN[1]; + size_t K_SIZE = BMKN[2]; + size_t N_SIZE = BMKN[3]; + + ov::Shape input0_shape = { BATCH_SIZE, M_SIZE, K_SIZE }; + ov::Shape input1_shape = { 1, K_SIZE, N_SIZE }; + ov::Shape output_shape = { BATCH_SIZE, M_SIZE, N_SIZE }; + + input0_layout = layout{ov::PartialShape::dynamic(input0_shape.size()), data_types::f16, format::bfyx}; + input1_layout = layout{ov::PartialShape(input1_shape), data_types::f16, format::bfyx}; + + auto input0_mem = engine.allocate_memory(layout{ov::PartialShape(input0_shape), data_types::f16, format::bfyx}); + auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(input1_shape), data_types::f16, format::bfyx}); + + auto input_0_data = rg.generate_random_1d(ov::shape_size(input0_shape), -2, 2); + auto input_1_data = rg.generate_random_1d(ov::shape_size(input1_shape), -2, 2); + + set_values(input0_mem, 
input_0_data); + set_values(input1_mem, input_1_data); + + topology topology; + topology.add(input_layout("input0", input0_layout), + input_layout("input1", input1_layout), + gemm("gemm", { input_info("input0"), input_info("input1") }, data_types::f16, input0_order, input1_order, output_order) + ); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + network->set_input_data("input0", input0_mem); + network->set_input_data("input1", input1_mem); + + auto outputs = network->execute(); + + auto output_mem = outputs.at("gemm").get_memory(); + cldnn::mem_lock output_ptr(output_mem, get_test_stream()); + + std::vector ref_out_data; + ref_out_data.resize(ov::shape_size(output_shape)); + + ov::reference::matmul(input_0_data.data(), + input_1_data.data(), + ref_out_data.data(), + input0_shape, + input1_shape, + output_shape, + false, + false); + + ASSERT_EQ(output_ptr.size(), ref_out_data.size()); + + for (uint32_t i = 0; i < ref_out_data.size(); ++i) { + ASSERT_NEAR(output_ptr[i], ref_out_data[i], abs_error) << "at " << i; + } + } }; TEST_F(gemm_gpu_tests, basic_bfyx_t2_inplace_crop_with_pad) { @@ -1710,6 +1780,10 @@ TEST_F(gemm_gpu_tests, transpose_matmul_static_4d_f32_n_tile_32_input1_ylast) { this->test_transpose_matmul_f32(4, false, false, /*BMKN*/{19, 37, 23, 29}, /*input0_order*/{0, 1, 2, 3}, /*input1_order*/{0, 1, 3, 2}); } +TEST_F(gemm_gpu_tests, test_dynamic_static_broadcast_3dim) { + this->test_dynamic_static_broadcast_3dim(/*BMKN*/{2, 16, 2, 2}, false); +} + TEST_F(gemm_gpu_tests, transpose_matmul_in0_indirect) { this->test_transpose_indirect(false, true, false); } From 620b122a44d775d2d794b71b83c2dfecdcd36ad1 Mon Sep 17 00:00:00 2001 From: hyunback Date: Sun, 6 Oct 2024 11:19:43 +0900 Subject: [PATCH 2/7] Fix memory pool 
dependency issue. We should check the is_runtime_skippable condition when using the memory pool. Signed-off-by: hyunback --- src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index c23616b8c87297..2c73540a39faca 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -2122,7 +2122,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl; return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset); } - } else if (!_node.can_share_buffer() || _node.can_be_optimized() || _node.is_output()) { + } else if (!_node.can_share_buffer() || (impl_params.can_be_optimized() || !_node.is_runtime_skippable()) || _node.is_output()) { GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl; return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset); } else { From 8566cc5fb44dd4b2907e1c2367d33005e2d22fa0 Mon Sep 17 00:00:00 2001 From: hyunback Date: Mon, 7 Oct 2024 12:17:47 +0900 Subject: [PATCH 3/7] Fix to use mem pool in default layer case. 
Signed-off-by: hyunback --- src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 2c73540a39faca..7e18cb99f308ae 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -2122,7 +2122,8 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl; return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset); } - } else if (!_node.can_share_buffer() || (impl_params.can_be_optimized() || !_node.is_runtime_skippable()) || _node.is_output()) { + } else if (!_node.can_share_buffer() || _node.is_output() + || (((impl_params.can_be_optimized() || (_node.can_be_optimized() != impl_params.can_be_optimized())) && !_node.is_runtime_skippable()))) { GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl; return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset); } else { From ed0b51f547ffbcb054fe28b87d8d1734939cd5eb Mon Sep 17 00:00:00 2001 From: hyunback Date: Tue, 8 Oct 2024 16:52:49 +0900 Subject: [PATCH 4/7] Fix unit-test failure. 
Signed-off-by: hyunback --- .../intel_gpu/src/graph/primitive_inst.cpp | 2 ++ src/plugins/intel_gpu/src/graph/program_node.cpp | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 7e18cb99f308ae..0c69777c2487b9 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -2124,6 +2124,8 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, } } else if (!_node.can_share_buffer() || _node.is_output() || (((impl_params.can_be_optimized() || (_node.can_be_optimized() != impl_params.can_be_optimized())) && !_node.is_runtime_skippable()))) { + // To use a memory pool, skippable should always be true. + // Concat and Crop should not use a memory pool if optimized_out changes at runtime because skippable is always false. GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl; return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset); } else { diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index dc76bbc58351c6..48e00a68e77e78 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -1548,12 +1548,20 @@ void program_node::create_onednn_primitive_attributes( mem_desc.get_dims(), mem_desc.get_data_type()); } else if (is_type()) { size_t rank = cldnn::format::dimension(in.format); + auto in_pshape = in.get_partial_shape(); + auto out_pshape = get_output_layout().get_partial_shape(); + size_t ones_to_add = std::max(out_pshape.size(), static_cast(4)) - in_pshape.size(); + if (ones_to_add > 0) { + layout new_layout = in; + ov::PartialShape new_input_pshape; + std::vector dims(in_pshape.begin(), in_pshape.begin() + in_pshape.size()); + new_input_pshape = ov::PartialShape(dims); + 
new_input_pshape.insert(new_input_pshape.begin(), ones_to_add, 1ul); + new_layout.set_partial_shape(new_input_pshape); + in = new_layout; + } size_t in_batched_size = in.count() / (in.spatial(0) * in.spatial(1)); dnnl::memory::dims dims = onednn::convert_gemm_tensor(in.get_tensor(), rank, in_batched_size == 1); - bool spatial_dims_can_be_removed = (in.spatial(0) * in.spatial(1) == 1); - if (dims.size() == 4 && spatial_dims_can_be_removed) { - dims.erase(dims.begin() + 2, dims.begin() + 4); - } dnnl::memory::data_type dt = onednn::convert_data_type(in.data_type); dnnl::memory::format_tag fmt = onednn::convert_gemm_data_format(dims, in.format); post_ops.append_binary(alg, dnnl::memory::desc(dims, dt, fmt)); From 722ae156ef11f8512bef9b50d0538ec5ad051ce4 Mon Sep 17 00:00:00 2001 From: hyunback Date: Tue, 8 Oct 2024 17:17:47 +0900 Subject: [PATCH 5/7] Fix to use rank size instead of 4. Signed-off-by: hyunback --- src/plugins/intel_gpu/src/graph/program_node.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 48e00a68e77e78..fc9648b90e444c 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -1550,7 +1550,7 @@ void program_node::create_onednn_primitive_attributes( size_t rank = cldnn::format::dimension(in.format); auto in_pshape = in.get_partial_shape(); auto out_pshape = get_output_layout().get_partial_shape(); - size_t ones_to_add = std::max(out_pshape.size(), static_cast(4)) - in_pshape.size(); + size_t ones_to_add = std::max(out_pshape.size(), static_cast(rank)) - in_pshape.size(); if (ones_to_add > 0) { layout new_layout = in; ov::PartialShape new_input_pshape; From 6f1b5db80236ea6238f207dc3cc1ef11c530928f Mon Sep 17 00:00:00 2001 From: hyunback Date: Thu, 10 Oct 2024 16:57:04 +0900 Subject: [PATCH 6/7] Fix concat handling with skippable enable. 
Signed-off-by: hyunback --- .../graph/graph_optimizer/prepare_buffer_fusing.cpp | 3 +++ src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 10 ++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index b0c6758af7d909..b7017c414c505f 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -274,6 +274,9 @@ void concat_in_place_optimization::optimize_cascade(concatenation_node& node, st } node.set_output_layout(concat_layout); node.can_be_optimized(true); + if (node.is_dynamic()) { + node.set_runtime_skippable(true); + } GPU_DEBUG_TRACE_DETAIL << "[prepare_buffer_fusing] : " << node.id() << " can be optimized" << std::endl; } diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 0c69777c2487b9..3e35c9c25ebeb3 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -656,7 +656,7 @@ event::ptr primitive_inst::realloc_if_needed() { } // Clear out memory if if was previously reused, but now primitive can't be optimized - if (_node->is_runtime_skippable() || _node->is_type()) { + if (!_node->is_type() && (_node->is_runtime_skippable() || _node->is_type())) { if (can_be_optimized()) { _max_output_layout_count = _deps[0].first->_max_output_layout_count; GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized"); @@ -1351,7 +1351,8 @@ void primitive_inst::do_runtime_in_place_concat() { if (get_users().size() != 1) return; auto concat_inst = get_user_insts().front(); - if (!concat_inst->get_node().is_type() || !concat_inst->get_node().can_be_optimized()) + + if (!concat_inst->get_node().is_type() || !(concat_inst->get_node().can_be_optimized() && 
concat_inst->get_node().is_runtime_skippable())) return; if (has_subgraph_dependency(concat_inst->dependencies())) { @@ -2122,10 +2123,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl; return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset); } - } else if (!_node.can_share_buffer() || _node.is_output() - || (((impl_params.can_be_optimized() || (_node.can_be_optimized() != impl_params.can_be_optimized())) && !_node.is_runtime_skippable()))) { - // To use a memory pool, skippable should always be true. - // Concat and Crop should not use a memory pool if optimized_out changes at runtime because skippable is always false. + } else if (!_node.can_share_buffer() || impl_params.can_be_optimized() || _node.is_output()) { GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl; return ov::intel_gpu::allocate_memory_evenif_zero_bytes(_engine, layout, alloc_type, reset); } else { From 81cb67b039cec4c75cf1b78d518a076f0f35ddf3 Mon Sep 17 00:00:00 2001 From: hyunback Date: Fri, 11 Oct 2024 17:29:12 +0900 Subject: [PATCH 7/7] Apply code-review comment. 
Signed-off-by: hyunback --- src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 3e35c9c25ebeb3..13634b49fd9d96 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -502,7 +502,7 @@ event::ptr primitive_inst::realloc_if_needed() { event::ptr ev = nullptr; const auto& users = get_user_insts(); - if (users.size() == 1 && users.front()->get_node().is_type()) { + if (users.size() == 1 && users.front()->get_node().is_type() && users.front()->get_node().is_runtime_skippable()) { auto concat_inst = users.front(); if (concat_inst->can_be_optimized()) { if (!concat_inst->allocation_done_by_other) {