From 1b9e6bf42916f7793b14584bd119073625e69f7b Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Fri, 8 Dec 2023 20:26:46 +0400
Subject: [PATCH] [GPU] Add more tests for FC fake alignment and FC SLM
 optimization

---
 .../intel_gpu/src/graph/fully_connected.cpp    |   6 +
 .../intel_gpu/src/graph/primitive_inst.cpp     |   2 +-
 .../fully_connected_kernel_bf_tiled.cpp        |   4 +
 .../fake_alignment/fc_fake_alignment_test.cpp  |  56 +++++++++
 .../test_cases/fully_connected_gpu_test.cpp    | 107 ++++++++++++++++++
 5 files changed, 174 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/graph/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/fully_connected.cpp
index 3650fb4d0c4f89..7dd86bd52b3e6a 100644
--- a/src/plugins/intel_gpu/src/graph/fully_connected.cpp
+++ b/src/plugins/intel_gpu/src/graph/fully_connected.cpp
@@ -223,6 +223,12 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
                                                  orig_output_layout.data_type,
                                                  orig_output_layout.format,
                                                  orig_output_layout.data_padding);
+
+        GPU_DEBUG_TRACE_DETAIL << "Apply fake alignment: input(" << orig_input_layout.to_short_string() << " -> "
+                               << updated_param.input_layouts[0].to_short_string() << "), output("
+                               << orig_output_layout.to_short_string() << " -> "
+                               << updated_param.output_layouts[0].to_short_string() << ")\n";
+
         return updated_param;
     }
     return std::move(orig_impl_param);
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 8c3b430e3efe28..3b57af2ae6db76 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -455,7 +455,7 @@ event::ptr primitive_inst::realloc_if_needed() {
     auto updated_layout = actual_layout;
     for (auto user : get_user_insts()) {
         // Since fake alignment is applicable for input tensor as well, make sure we allocate enough memory
-        // to prevemt reading beyound the allocated memory bounds
+        // to prevent reading beyond the allocated memory bounds
         if (user->get_node().is_type<fully_connected>()) {
             user->update_shape();
             user->update_shape_done_by_other = true;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
index 5b9bb1c5716fb0..f6f1fd411811e3 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -196,6 +196,10 @@ bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params,
         if (params.engineInfo.deviceType != dev_type::integrated_gpu)
             return false;
 
+        const auto required_slm_size = tparams.tile_ofm * simd * tparams.tile_ifm * simd * 2; // 2 bytes per value (FP16 data type)
+        if (params.engineInfo.maxLocalMemSize < required_slm_size)
+            return false;
+
         return true;
     }
 
diff --git a/src/plugins/intel_gpu/tests/unit/fake_alignment/fc_fake_alignment_test.cpp b/src/plugins/intel_gpu/tests/unit/fake_alignment/fc_fake_alignment_test.cpp
index 56d5e59076f99a..157676cd8bea6c 100644
--- a/src/plugins/intel_gpu/tests/unit/fake_alignment/fc_fake_alignment_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/fake_alignment/fc_fake_alignment_test.cpp
@@ -134,6 +134,33 @@ INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
             layout{ov::PartialShape{56, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_dgpu
             layout{ov::PartialShape{56, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
         },
+        {
+            layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx},  // input_layout
+            layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},     // weight layout
+            data_types::f16,
+            layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_igpu
+            layout{ov::PartialShape{240, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+            layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_dgpu
+            layout{ov::PartialShape{240, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+        },
+        {
+            layout{ov::PartialShape{241, 1, 511}, data_types::f16, format::bfyx},  // input_layout
+            layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},     // weight layout
+            data_types::f16,
+            layout{ov::PartialShape{256, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_igpu
+            layout{ov::PartialShape{256, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+            layout{ov::PartialShape{248, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_dgpu
+            layout{ov::PartialShape{248, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+        },
+        {
+            layout{ov::PartialShape{257, 1, 511}, data_types::f16, format::bfyx},  // input_layout
+            layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},     // weight layout
+            data_types::f16,
+            layout{ov::PartialShape{272, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_igpu
+            layout{ov::PartialShape{272, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+            layout{ov::PartialShape{264, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_dgpu
+            layout{ov::PartialShape{264, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+        },
         {
             layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{2,0,1,0}, 0}},  // input_layout
             layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx},     // weight layout
@@ -152,6 +179,35 @@ INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
             layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{0,1,1,0}, 0}},  // fake_aligned input layout_dgpu
             layout{ov::PartialShape{55, 1, 800}, data_types::f16, format::bfyx}    // fake_aligned output layout_dgpu
         },
+
+        /* int4 compressed weights */
+        {
+            layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx},  // input_layout
+            layout{ov::PartialShape{800, 511}, data_types::u4, format::bfyx},      // weight layout
+            data_types::f16,
+            layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_igpu
+            layout{ov::PartialShape{240, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+            layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_dgpu
+            layout{ov::PartialShape{240, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+        },
+        {
+            layout{ov::PartialShape{241, 1, 511}, data_types::f16, format::bfyx},  // input_layout
+            layout{ov::PartialShape{800, 511}, data_types::u4, format::bfyx},      // weight layout
+            data_types::f16,
+            layout{ov::PartialShape{256, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_igpu
+            layout{ov::PartialShape{256, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+            layout{ov::PartialShape{248, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_dgpu
+            layout{ov::PartialShape{248, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+        },
+        {
+            layout{ov::PartialShape{257, 1, 511}, data_types::f16, format::bfyx},  // input_layout
+            layout{ov::PartialShape{800, 511}, data_types::u4, format::bfyx},      // weight layout
+            data_types::f16,
+            layout{ov::PartialShape{320, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_igpu
+            layout{ov::PartialShape{320, 1, 800}, data_types::f16, format::bfyx},  // fake_aligned output layout_igpu
+            layout{ov::PartialShape{264, 1, 511}, data_types::f16, format::bfyx},  // fake_aligned input layout_dgpu
+            layout{ov::PartialShape{264, 1, 800}, data_types::f16, format::bfyx}   // fake_aligned output layout_dgpu
+        },
     }));

 }  // fake_alignment_tests
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
index 173b14e1462244..5ac5dc06f1ed6f 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -1043,6 +1043,97 @@ class fully_connected_gpu_tests: public ::testing::Test {
         }
     }
 
+    void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic) {
+        tests::random_generator rg(GET_SUITE_NAME);
+        auto& engine = get_test_engine();
+
+        if (engine.get_device_info().dev_type == device_type::discrete_gpu)
+            GTEST_SKIP();
+
+        long int batch_num = is_dynamic ? 260 : 256;
+        long int ifm_num = 256;
+        long int ofm_num = 256;
+        long int scales_group_size = 128;
+
+        auto input_mem = engine.allocate_memory({ { batch_num, ifm_num}, data_types::f16, format::bfyx });
+        auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx });
+        auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx });
+
+        auto input_data = rg.generate_random_1d<ov::float16>(batch_num * ifm_num, -2.0f, 2.0f);
+        set_values(input_mem, input_data);
+
+        auto weights_data = rg.generate_random_1d<uint8_t>(ofm_num * ifm_num / 2, 0, 10);
+        set_values(weights_mem, weights_data);
+
+        auto scale_data = rg.generate_random_1d<ov::float16>(ofm_num * ifm_num / scales_group_size, -4.0f, 4.0f);
+        set_values(scale_mem, scale_data);
+
+        auto in_layout = is_dynamic ? layout{ {-1, ifm_num}, data_types::f16, format::bfyx }
+                                    : layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx };
+
+        auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16, padding(), 2, 2);
+        fc_prim.decompression_zero_point_scalar = 8;
+
+        auto get_ref_results = [&]() {
+            topology topology(
+                input_layout("input", in_layout),
+                data("weights", weights_mem),
+                data("scale", scale_mem),
+                fc_prim
+            );
+
+            auto config = get_test_default_config(engine);
+            config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+            network network(engine, topology, config);
+            network.set_input_data("input", input_mem);
+
+            auto outputs = network.execute();
+            OPENVINO_ASSERT(outputs.size() == 1);
+            OPENVINO_ASSERT(outputs.begin()->first == "fc_prim");
+
+            auto output_layout = outputs.begin()->second.get_layout();
+            auto output_mem = outputs.begin()->second.get_memory();
+
+            return engine.reinterpret_buffer(*output_mem, output_layout);
+        };
+
+        topology topology(
+            input_layout("input", in_layout),
+            data("weights", weights_mem),
+            data("scale", scale_mem),
+            fc_prim
+        );
+
+        auto config = get_test_default_config(engine);
+        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+        config.set_property(ov::intel_gpu::optimize_data(true));
+
+        network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
+
+        if (is_dynamic) {
+            auto inst = network->get_primitive("fc_prim");
+            auto impl = inst->get_impl();
+            ASSERT_EQ(impl->get_kernels().size(), size_t(2)); // Two shape-agnostic kernels
+        }
+
+        network->set_input_data("input", input_mem);
+
+        auto outputs = network->execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "fc_prim");
+
+        auto output_mem = outputs.begin()->second.get_memory();
+        cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());
+
+        auto ref_output_mem = get_ref_results();
+        cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_mem, get_test_stream());
+
+        for (size_t i = 0; i < output_ptr_ref.size(); i++) {
+            ASSERT_FLOAT_EQ(output_ptr_ref[i], output_ptr[i]) << "i = " << i;
+        }
+    }
+
     void test_compressed_scale_bias(bool is_caching_test) {
         auto& engine = get_test_engine();
 
@@ -2547,6 +2638,22 @@ TEST_F(fully_connected_gpu_tests, compressed_scale_zp_bias_cached) {
     this->test_compressed_scale_zp_bias(true);
 }
 
+TEST_F(fully_connected_gpu_tests, compressed_int4_scale) {
+    this->test_compressed_int4_scale(false, false);
+}
+
+TEST_F(fully_connected_gpu_tests, compressed_int4_scale_cached) {
+    this->test_compressed_int4_scale(true, false);
+}
+
+TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic) {
+    this->test_compressed_int4_scale(false, true);
+}
+
+TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_cached) {
+    this->test_compressed_int4_scale(true, true);
+}
+
 TEST_F(fully_connected_gpu_tests, compressed_scale_bias) {
     this->test_compressed_scale_bias(false);
 }
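
Note on the expected shapes in fc_fake_alignment_test.cpp: fake alignment pads only the batch dimension of the FC input and output layouts. Below is a minimal sketch of the round-up arithmetic, with the alignment multiples read off the expected values above (16 for iGPU and 8 for dGPU in the f16 cases, 64 for the 257-batch int4 iGPU case); the exact per-case conditions live in fully_connected_inst::get_fake_aligned_params, and the helper name here is hypothetical, not the plugin's API.

    #include <cstdint>
    #include <iostream>

    // Hypothetical helper: round the batch up to the next multiple of align_to.
    int64_t fake_align_batch(int64_t batch, int64_t align_to) {
        return (batch + align_to - 1) / align_to * align_to;
    }

    int main() {
        std::cout << fake_align_batch(241, 16) << "\n";  // 256, the iGPU expectation
        std::cout << fake_align_batch(241, 8)  << "\n";  // 248, the dGPU expectation
        std::cout << fake_align_batch(257, 64) << "\n";  // 320, the int4 iGPU expectation
        return 0;
    }

The 240-batch entries pass through unchanged, which suggests the larger multiple only applies above some batch threshold; the precise rule is in the plugin source rather than in this sketch.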
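
The new check in VerifyTuneParams stops the SLM kernel variant from being chosen when one work-group's shared-local-memory footprint would exceed what the device reports. A worked example of the added formula follows, using illustrative tile values (simd = 16, tile_ofm = 2 and tile_ifm = 2 are assumptions for the example, not values taken from the kernel selector).

    #include <cstddef>
    #include <iostream>

    int main() {
        // Illustrative tile configuration; the real values come from the selected tune params.
        const std::size_t simd = 16;
        const std::size_t tile_ofm = 2;
        const std::size_t tile_ifm = 2;

        // Same formula as the added check: a (tile_ofm * simd) x (tile_ifm * simd) FP16 tile,
        // 2 bytes per element.
        const std::size_t required_slm_size = tile_ofm * simd * tile_ifm * simd * 2;
        std::cout << required_slm_size << " bytes\n";  // prints 2048 for this configuration
        return 0;
    }

The computed value is compared against params.engineInfo.maxLocalMemSize, and the tuning parameters are rejected when the device cannot provide that much SLM.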
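
test_compressed_int4_scale exercises the compressed FC path with u4 weights packed two per byte (hence the ofm_num * ifm_num / 2 random bytes), one f16 scale per group of scales_group_size input channels, and a scalar decompression zero point of 8. Below is a sketch of the per-weight dequantization this setup implies; the nibble order and the helper itself are assumptions for illustration, not the plugin's kernel code.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Dequantize one u4 weight addressed by (ofm, ifm).
    // Assumed packing: element 2*i in the low nibble of byte i, element 2*i+1 in the high nibble.
    float dequantize_u4(const std::vector<uint8_t>& packed,
                        const std::vector<float>& scales,
                        std::size_t ofm, std::size_t ifm,
                        std::size_t ifm_num, std::size_t group_size,
                        float zero_point /* 8 in the test */) {
        const std::size_t flat = ofm * ifm_num + ifm;
        const uint8_t byte = packed[flat / 2];
        const uint8_t q = (flat % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
        const float scale = scales[ofm * (ifm_num / group_size) + ifm / group_size];
        return (static_cast<float>(q) - zero_point) * scale;
    }

The test then runs the same topology twice, once with optimize_data enabled and once as a plain reference network, and asserts the outputs match element by element.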