[GPU] Add more tests for FC fake alignment and FC SLM optimization
sshlyapn committed Dec 11, 2023
1 parent 9e2bd77 commit 1b9e6bf
Showing 5 changed files with 174 additions and 1 deletion.
6 changes: 6 additions & 0 deletions src/plugins/intel_gpu/src/graph/fully_connected.cpp
@@ -223,6 +223,12 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
orig_output_layout.data_type,
orig_output_layout.format,
orig_output_layout.data_padding);

GPU_DEBUG_TRACE_DETAIL << "Apply fake alignment: input(" << orig_input_layout.to_short_string() << " -> "
<< updated_param.input_layouts[0].to_short_string() << "), output("
<< orig_output_layout.to_short_string() << " -> "
<< updated_param.output_layouts[0].to_short_string() << ")\n";

return updated_param;
}
return std::move(orig_impl_param);
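In builds with GPU debug capabilities, these detail traces surface at the highest verbosity level (OV_GPU_Verbose=4, if memory serves). For the 241-batch iGPU case from the tests below, the new line would read roughly like this (to_short_string formatting approximated):

Apply fake alignment: input(f16:bfyx:241x1x511 -> f16:bfyx:256x1x511), output(f16:bfyx:241x1x800 -> f16:bfyx:256x1x800)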
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -455,7 +455,7 @@ event::ptr primitive_inst::realloc_if_needed() {
auto updated_layout = actual_layout;
for (auto user : get_user_insts()) {
// Since fake alignment is applicable for input tensor as well, make sure we allocate enough memory
-// to prevemt reading beyound the allocated memory bounds
+// to prevent reading beyond the allocated memory bounds
if (user->get_node().is_type<fully_connected>()) {
user->update_shape();
user->update_shape_done_by_other = true;
@@ -196,6 +196,10 @@ bool TuneParamsSelector::VerifyTuneParams(const fully_connected_params& params,
if (params.engineInfo.deviceType != dev_type::integrated_gpu)
return false;

const auto required_slm_size = tparams.tile_ofm * simd * tparams.tile_ifm * simd * 2; // 2 bytes per value (FP16 data type)
if (params.engineInfo.maxLocalMemSize < required_slm_size)
return false;

return true;
}

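The guard above can be checked with quick arithmetic. A minimal sketch, assuming illustrative tuning values (the selector picks tile_ofm/tile_ifm per shape, and simd is typically 16; all three numbers here are assumptions):

#include <cstddef>
#include <iostream>

int main() {
    const size_t simd = 16;           // assumed sub-group size
    const size_t tile_ofm = 2;        // assumed tuning parameters
    const size_t tile_ifm = 2;
    const size_t bytes_per_value = 2; // FP16
    // Mirrors the required_slm_size expression in VerifyTuneParams above.
    const size_t required_slm_size = tile_ofm * simd * tile_ifm * simd * bytes_per_value;
    std::cout << required_slm_size << " bytes\n"; // 2048, well under a typical 64 KiB maxLocalMemSize
    return 0;
}

The check therefore only rejects configurations whose weight tile cannot fit in the device's shared local memory.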
@@ -134,6 +134,33 @@ INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
layout{ov::PartialShape{56, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{56, 1, 800}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu
},
{
layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx}, // input_layout
layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_igpu
layout{ov::PartialShape{240, 1, 800}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu
layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{240, 1, 800}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu
},
{
layout{ov::PartialShape{241, 1, 511}, data_types::f16, format::bfyx}, // input_layout
layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{256, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_igpu
layout{ov::PartialShape{256, 1, 800}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu
layout{ov::PartialShape{248, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{248, 1, 800}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu
},
{
layout{ov::PartialShape{257, 1, 511}, data_types::f16, format::bfyx}, // input_layout
layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{272, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_igpu
layout{ov::PartialShape{272, 1, 800}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu
layout{ov::PartialShape{264, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{264, 1, 800}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu
},
{
layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{2,0,1,0}, 0}}, // input_layout
layout{ov::PartialShape{800, 511}, data_types::f16, format::bfyx}, // weight layout
@@ -152,6 +179,35 @@ INSTANTIATE_TEST_SUITE_P(smoke, fully_connected_fake_align_test,
layout{ov::PartialShape{55, 1, 511}, data_types::f16, format::bfyx, padding{{0,1,1,0}, 0}}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{55, 1, 800}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu
},

/* int4 compressed weights */
{
layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx}, // input_layout
layout{ov::PartialShape{800, 511}, data_types::u4, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_igpu
layout{ov::PartialShape{240, 1, 800}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu
layout{ov::PartialShape{240, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{240, 1, 800}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu
},
{
layout{ov::PartialShape{241, 1, 511}, data_types::f16, format::bfyx}, // input_layout
layout{ov::PartialShape{800, 511}, data_types::u4, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{256, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_igpu
layout{ov::PartialShape{256, 1, 800}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu
layout{ov::PartialShape{248, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{248, 1, 800}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu
},
{
layout{ov::PartialShape{257, 1, 511}, data_types::f16, format::bfyx}, // input_layout
layout{ov::PartialShape{800, 511}, data_types::u4, format::bfyx}, // weight layout
data_types::f16,
layout{ov::PartialShape{320, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_igpu
layout{ov::PartialShape{320, 1, 800}, data_types::f16, format::bfyx}, // fake_aligned output layout_igpu
layout{ov::PartialShape{264, 1, 511}, data_types::f16, format::bfyx}, // fake_aligned input layout_dgpu
layout{ov::PartialShape{264, 1, 800}, data_types::f16, format::bfyx} // fake_aligned output layout_dgpu
},
}));

} // fake_alignment_tests
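The expected layouts above pin down the batch rounding rule. A minimal sketch of that rule, with the alignment constants inferred from these test cases rather than taken from the plugin sources (dGPU rounds the batch up to 8; iGPU rounds up to 16, apparently switching to 64 for int4-compressed weights once the batch reaches 256):

#include <cassert>
#include <cstdint>

static int64_t round_up(int64_t v, int64_t a) { return (v + a - 1) / a * a; }

// Hypothetical helper reproducing the expectations above; not the plugin's API.
static int64_t fake_align_batch(int64_t batch, bool igpu, bool int4_weights) {
    if (!igpu)
        return round_up(batch, 8);
    const int64_t base = (int4_weights && batch >= 256) ? 64 : 16;
    return round_up(batch, base);
}

int main() {
    assert(fake_align_batch(241, /*igpu=*/true,  /*int4=*/false) == 256);
    assert(fake_align_batch(257, /*igpu=*/true,  /*int4=*/false) == 272);
    assert(fake_align_batch(240, /*igpu=*/true,  /*int4=*/true)  == 240);
    assert(fake_align_batch(257, /*igpu=*/true,  /*int4=*/true)  == 320);
    assert(fake_align_batch(241, /*igpu=*/false, /*int4=*/true)  == 248);
    assert(fake_align_batch(257, /*igpu=*/false, /*int4=*/true)  == 264);
    return 0;
}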
@@ -1043,6 +1043,97 @@ class fully_connected_gpu_tests: public ::testing::Test {
}
}

void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic) {
tests::random_generator rg(GET_SUITE_NAME);
auto& engine = get_test_engine();

if (engine.get_device_info().dev_type == device_type::discrete_gpu)
GTEST_SKIP();

long int batch_num = is_dynamic ? 260 : 256;
long int ifm_num = 256;
long int ofm_num = 256;
long int scales_group_size = 128;

auto input_mem = engine.allocate_memory({ { batch_num, ifm_num}, data_types::f16, format::bfyx });
auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx });
auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num / scales_group_size}, data_types::f16, format::bfyx });

auto input_data = rg.generate_random_1d<ov::float16>(batch_num * ifm_num, -2.0f, 2.0f);
set_values(input_mem, input_data);

auto weights_data = rg.generate_random_1d<uint8_t>(ofm_num * ifm_num / 2, 0, 10);
set_values(weights_mem, weights_data);

auto scale_data = rg.generate_random_1d<ov::float16>(ofm_num * ifm_num / scales_group_size, -4.0f, 4.0f);
set_values(scale_mem, scale_data);

auto in_layout = is_dynamic ? layout{ {-1, ifm_num}, data_types::f16, format::bfyx }
: layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx };

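// Empty ids leave the bias and decompression zero-point inputs unused (a scalar
// zero point of 8 is set just below); the trailing 2, 2 are assumed to be the
// input rank and weights rank in the primitive's constructor order.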
auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16, padding(), 2, 2);
fc_prim.decompression_zero_point_scalar = 8;

auto get_ref_results = [&]() {
topology topology(
input_layout("input", in_layout),
data("weights", weights_mem),
data("scale", scale_mem),
fc_prim
);

auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

network network(engine, topology, config);
network.set_input_data("input", input_mem);

auto outputs = network.execute();
OPENVINO_ASSERT(outputs.size() == 1);
OPENVINO_ASSERT(outputs.begin()->first == "fc_prim");

auto output_layout = outputs.begin()->second.get_layout();
auto output_mem = outputs.begin()->second.get_memory();

return engine.reinterpret_buffer(*output_mem, output_layout);
};

topology topology(
input_layout("input", in_layout),
data("weights", weights_mem),
data("scale", scale_mem),
fc_prim
);

auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));

network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);

if (is_dynamic) {
auto inst = network->get_primitive("fc_prim");
auto impl = inst->get_impl();
ASSERT_EQ(impl->get_kernels().size(), size_t(2)); // Two shape-agnostic kernels
}

network->set_input_data("input", input_mem);

auto outputs = network->execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "fc_prim");

auto output_mem = outputs.begin()->second.get_memory();
cldnn::mem_lock<ov::float16> output_ptr(output_mem, get_test_stream());

auto ref_output_mem = get_ref_results();
cldnn::mem_lock<ov::float16> output_ptr_ref(ref_output_mem, get_test_stream());

for (size_t i = 0; i < output_ptr_ref.size(); i++) {
ASSERT_FLOAT_EQ(output_ptr_ref[i], output_ptr[i]) << "i = " << i;
}
}
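For reference, a minimal sketch of the group-wise dequantization this test exercises, assuming the layout the test allocates (two u4 values per byte, scales grouped along IFM); the nibble order and the helper itself are the editor's assumptions, not plugin code:

#include <cstdint>
#include <vector>

// w[o][i] = (q - zero_point) * scale[o][i / group_size], matching
// scales_group_size = 128 and decompression_zero_point_scalar = 8 above.
std::vector<float> dequantize_u4_ref(const std::vector<uint8_t>& packed,
                                     const std::vector<float>& scales,
                                     int ofm, int ifm, int group_size,
                                     float zero_point) {
    std::vector<float> out(static_cast<size_t>(ofm) * ifm);
    for (int o = 0; o < ofm; ++o) {
        for (int i = 0; i < ifm; ++i) {
            const size_t idx = static_cast<size_t>(o) * ifm + i;
            const uint8_t byte = packed[idx / 2];
            const uint8_t q = (idx % 2) ? (byte >> 4) : (byte & 0x0F); // assumed nibble order
            const float scale = scales[static_cast<size_t>(o) * (ifm / group_size) + i / group_size];
            out[idx] = (static_cast<float>(q) - zero_point) * scale;
        }
    }
    return out;
}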

void test_compressed_scale_bias(bool is_caching_test) {
auto& engine = get_test_engine();

@@ -2547,6 +2638,22 @@ TEST_F(fully_connected_gpu_tests, compressed_scale_zp_bias_cached) {
this->test_compressed_scale_zp_bias(true);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale) {
this->test_compressed_int4_scale(false, false);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_cached) {
this->test_compressed_int4_scale(true, false);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic) {
this->test_compressed_int4_scale(false, true);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_cached) {
this->test_compressed_int4_scale(true, true);
}

TEST_F(fully_connected_gpu_tests, compressed_scale_bias) {
this->test_compressed_scale_bias(false);
}