From d575d2b5e3d39e72c42e8deff0d3d66e55d682db Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Wed, 13 Dec 2023 18:20:12 +0400
Subject: [PATCH] [GPU] Gemm tiled opt add dynamic padding support

---
 .../cl_kernels/gemm_tiled_opt.cl              |  26 ++-
 .../intel_gpu/src/kernel_selector/jitter.cpp  |   7 +-
 .../kernels/gemm/gemm_kernel_tiled_opt.cpp    |  44 ++++-
 .../tests/unit/test_cases/gemm_gpu_test.cpp   | 157 ++++++++++++++++++
 4 files changed, 222 insertions(+), 12 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
index fb607312739b1d..0fc29d9ace1fb6 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/gemm_tiled_opt.cl
@@ -117,7 +117,7 @@ KERNEL(gemm_tiled_opt)(
 
     // Start pointers offsets
 #if !TRANSPOSE_INPUT0
-    const __global INPUT0_TYPE* a_ptr = input0 + batch_offset_input0 + tile_m_offset * K;
+    const __global INPUT0_TYPE* a_ptr = input0 + batch_offset_input0 + tile_m_offset * K_PADDED_IN0;
 #else // !TRANSPOSE_INPUT0
     const __global INPUT0_TYPE* a_ptr = input0 + batch_offset_input0 + tile_m_offset;
 #endif // !TRANSPOSE_INPUT0
@@ -153,7 +153,13 @@ KERNEL(gemm_tiled_opt)(
         // Loading B tile
         unroll_for (uint b_load_id = 0; b_load_id < TILE_K; b_load_id++) {
 #if IS_DYNAMIC
+#if HAS_DYNAMIC_N_PADDING
+            // With dynamic padding we cannot guarantee the 4-byte memory access
+            // alignment required by block reads, so use a scattered read instead
+            b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
+#else
             b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0);
+#endif
 #else // IS_DYNAMIC
 #if TILE_N_NOT_DIVISIBLE
             b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
@@ -162,7 +168,7 @@ KERNEL(gemm_tiled_opt)(
 #endif // TILE_N_NOT_DIVISIBLE
 #endif // IS_DYNAMIC
 #if !TRANSPOSE_INPUT1
-            b_ptr += N;
+            b_ptr += N_PADDED;
 #else // !TRANSPOSE_INPUT1
             b_ptr += K;
 #endif // !TRANSPOSE_INPUT1
@@ -203,7 +209,13 @@ KERNEL(gemm_tiled_opt)(
         unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
 #if !TRANSPOSE_INPUT0
 #if IS_DYNAMIC
+#if HAS_DYNAMIC_K_PADDING
+            // With dynamic padding we cannot guarantee the 4-byte memory access
+            // alignment required by block reads, so use a scattered read instead
+            A_FLOATN a_read = a_ptr[dot_id * K_PADDED_IN0 + sglid];
+#else
             A_FLOATN a_read = TILE_K_NOT_DIVISIBLE ? a_ptr[dot_id * K_PADDED_IN0 + sglid] : BLOCK_READ_A(a_ptr, dot_id * K);
+#endif
 #else // IS_DYNAMIC
 #if TILE_K_NOT_DIVISIBLE
             A_FLOATN a_read = a_ptr[dot_id * K + sglid];
@@ -273,13 +285,17 @@ KERNEL(gemm_tiled_opt)(
     if (TILE_K_NOT_DIVISIBLE) {
         // Loading leftovers of the matrix B
         unroll_for (uint b_load_id = 0; b_load_id < TILE_K_LEFTOVER; b_load_id++) {
+#if HAS_DYNAMIC_N_PADDING
+            b_tile[b_load_id] = b_raw_global_id > N - 1 ? 0 : b_ptr[sglid];
+#else
             b_tile[b_load_id] = TILE_N_NOT_DIVISIBLE ? (b_raw_global_id > N - 1 ? 0 : b_ptr[sglid]) : BLOCK_READ_B(b_ptr, 0);
-            b_ptr += N;
+#endif
+            b_ptr += N_PADDED;
         } // Loading leftovers of the matrix B end
 
         // Loading leftovers of the matrix A and tile C calculation
         unroll_for (uint dot_id = 0; dot_id < tile_m_iterations; dot_id++) {
-            INPUT0_TYPE a_read = a_ptr[dot_id * K + sglid];
+            INPUT0_TYPE a_read = a_ptr[dot_id * K_PADDED_IN0 + sglid];
 
             unroll_for (uint simd_id = 0; simd_id < TILE_K_LEFTOVER; simd_id++) {
                 c_tile[dot_id] = mad((INPUT0_TYPE)(sub_group_broadcast(a_read, simd_id)), b_tile[simd_id], c_tile[dot_id]);
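The common thread in the kernel changes above: pointer arithmetic switches from the logical sizes K and N to the padded pitches K_PADDED_IN0 and N_PADDED, and every HAS_DYNAMIC_*_PADDING path drops the sub-group block read in favor of a per-lane ("scattered") read. The reason is the alignment precondition the in-kernel comments cite: block reads require a 4-byte-aligned address, and once the row pitch comes from shape_info at runtime, alignment can no longer be proven when the kernel is compiled. A standalone sketch of that argument with made-up pitch values (none of this code is part of the patch):

    #include <cstddef>
    #include <cstdio>

    // A sub-group block read is only legal when the row base address is
    // 4-byte aligned. For fp16 data (2 bytes/element) this holds for every
    // row only if the pitch in elements is even.
    static bool block_read_is_safe(std::size_t row, std::size_t pitch_elems,
                                   std::size_t elem_bytes) {
        return (row * pitch_elems * elem_bytes) % 4 == 0;
    }

    int main() {
        // Static shape: pitch == N == 64 elements is known at compile time,
        // so every row starts 4-byte aligned and block reads are safe.
        std::printf("pitch 64, row 3: %d\n", block_read_is_safe(3, 64, 2));
        // Dynamically padded shape: pitch == N_PADDED is a runtime value;
        // an odd pitch such as 51 breaks alignment on odd rows, which is
        // why the kernel falls back to scattered reads.
        std::printf("pitch 51, row 3: %d\n", block_read_is_safe(3, 51, 2));
        return 0;
    }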
diff --git a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp
index 3be5bb1483a46e..fb2d72875d4c60 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp
@@ -197,10 +197,13 @@ std::string toCodeString(const Tensor::Dim& dim, size_t offset, bool padded, boo
             pad_str = " + " + std::to_string(dim.pad.Total());
         }
     }
-    if (dim.is_dynamic || pad_is_dynamic) {
+    if (dim.is_dynamic) {
         snprintf(buf, sizeof(buf), "(shape_info[%zu] %s)", offset, pad_str.c_str());
     } else {
-        snprintf(buf, sizeof(buf), "%zu", dim.v + (padded ? dim.pad.Total() : 0));
+        if (pad_is_dynamic)
+            snprintf(buf, sizeof(buf), "(%zu %s)", dim.v, pad_str.c_str());
+        else
+            snprintf(buf, sizeof(buf), "%zu", dim.v + (padded ? dim.pad.Total() : 0));
     }
     return buf;
 }
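This jitter change is what lets a static dimension carry a runtime padding term. Previously `dim.is_dynamic || pad_is_dynamic` forced the whole dimension into a shape_info lookup; now only the pad expression stays runtime-resolved, and a static value such as K = 37 can be inlined into the generated code, e.g. as something like "(37 + <pad from shape_info>)". A condensed model of the branch structure (pad_str stands in for the dynamic-pad expression the real function builds earlier, which this hunk does not show):

    #include <cstddef>
    #include <cstdio>
    #include <string>

    // Mirrors the three cases of toCodeString() after the patch.
    std::string dim_to_code(bool dim_is_dynamic, bool pad_is_dynamic, bool padded,
                            std::size_t v, std::size_t static_pad,
                            std::size_t offset, const std::string& pad_str) {
        char buf[256];
        if (dim_is_dynamic) {
            // Value and padding both come from the runtime shape_info buffer.
            std::snprintf(buf, sizeof(buf), "(shape_info[%zu] %s)", offset, pad_str.c_str());
        } else if (pad_is_dynamic) {
            // New case: the value is a compile-time constant; only the
            // padding term is read from shape_info.
            std::snprintf(buf, sizeof(buf), "(%zu %s)", v, pad_str.c_str());
        } else {
            std::snprintf(buf, sizeof(buf), "%zu", v + (padded ? static_pad : 0));
        }
        return buf;
    }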
"(" + dims0_padded.y() + ")" + : "(" + dims0_padded.x() + ")"; const std::string leftover_m = "(" + m_size + "%" + std::to_string(tuning_data.tile_m_size) + ")"; const std::string leftover_n = "(" + n_size + "%" + std::to_string(tuning_data.tile_n_size) + ")"; const std::string leftover_k = "(" + k_size + "%" + std::to_string(tuning_data.tile_k_size) + ")"; @@ -129,6 +136,8 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons MakeJitConstant("M", m_size), MakeJitConstant("K", k_size), MakeJitConstant("N", n_size), + MakeJitConstant("K_PADDED_IN0", k_padded_size_in0), + MakeJitConstant("N_PADDED", n_padded_size), MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size), MakeJitConstant("TILE_M", tuning_data.tile_m_size), MakeJitConstant("TILE_K", tuning_data.tile_k_size), @@ -141,6 +150,15 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons MakeJitConstant("TILE_K_LEFTOVER", leftover_k), MakeJitConstant("TILE_N_LEFTOVER", leftover_n), }); + + bool has_dynamic_k_padding = params.transpose_input0 ? params.inputs[0].Y().pad.is_dynamic + : params.inputs[0].X().pad.is_dynamic; + bool has_dynamic_n_padding = params.transpose_input1 ? params.inputs[1].Y().pad.is_dynamic + : params.inputs[1].X().pad.is_dynamic; + if (has_dynamic_k_padding) + jit.AddConstant(MakeJitConstant("HAS_DYNAMIC_K_PADDING", 1)); + if (has_dynamic_n_padding) + jit.AddConstant(MakeJitConstant("HAS_DYNAMIC_N_PADDING", 1)); } else { auto m_size = output.Y().v; auto n_size = output.X().v; @@ -153,6 +171,8 @@ JitConstants GemmKernelTiledOpt::GetJitConstants(const gemm_params& params) cons MakeJitConstant("M", m_size), MakeJitConstant("K", k_size), MakeJitConstant("N", n_size), + MakeJitConstant("K_PADDED_IN0", k_size), + MakeJitConstant("N_PADDED", n_size), MakeJitConstant("SIMD_WIDTH", tuning_data.simd_size), MakeJitConstant("TILE_M", tuning_data.tile_m_size), MakeJitConstant("TILE_K", tuning_data.tile_k_size), @@ -235,10 +255,24 @@ bool GemmKernelTiledOpt::Validate(const Params& params, const optional_params& o return false; const auto& gmm_params = static_cast(params); - for (auto input : gmm_params.inputs) { - // Only supports outer padding as first element offset - if (input.X().pad.Total() != 0 || input.Y().pad.Total() != 0 || input.Z().pad.Total() != 0 || - input.Feature().pad.Total() != 0) + + if (gmm_params.outputs[0].PitchesDifferFromLogicalDims()) + return false; + + for (size_t input_idx = 0; input_idx < gmm_params.inputs.size(); ++input_idx) { + auto& input = gmm_params.inputs[input_idx]; + // Supports outer padding as first element offset and dynamic padding for Batch, Feature, X, Y dimensions for first and second inputs + // in case of shape agnostic kernel + bool proper_pad_f = input.Feature().pad.is_dynamic ? false : input.Feature().pad.Total() == 0; + bool proper_pad_x = input.X().pad.is_dynamic ? false : input.X().pad.Total() == 0; + bool proper_pad_y = input.Y().pad.is_dynamic ? 
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
index feccaf402e8ee6..502125e14bc1b1 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/gemm_gpu_test.cpp
@@ -359,6 +359,159 @@ class gemm_gpu_tests: public ::testing::Test {
         }
     }
 
+    void test_dynamic_padding(bool is_caching_test) {
+        tests::random_generator rg;
+        rg.set_seed(GET_SUITE_NAME);
+
+        auto& engine = get_test_engine();
+
+        const unsigned long BATCH_SIZE = 31;
+        const unsigned long M_SIZE = 11;
+        const unsigned long K_SIZE = 37;
+        const unsigned long N_SIZE = 49;
+
+        auto fill_mem = [&](cldnn::memory_ptr mem, std::vector<ov::float16>& data) {
+            cldnn::mem_lock<ov::float16> mem_ptr(mem, get_test_stream());
+            auto&& l = mem->get_layout();
+            auto data_idx = 0;
+            for (cldnn::tensor::value_type b = 0; b < l.batch(); ++b) {
+                for (cldnn::tensor::value_type f = 0; f < l.feature(); ++f) {
+                    for (cldnn::tensor::value_type y = 0; y < l.spatial(1); ++y) {
+                        for (cldnn::tensor::value_type x = 0; x < l.spatial(0); ++x) {
+                            auto tensor_coord = cldnn::tensor{{b, f, x, y}, 0};
+                            auto buffer_idx = l.get_linear_offset(tensor_coord);
+                            mem_ptr[buffer_idx] = data[data_idx++];
+                        }
+                    }
+                }
+            }
+        };
+
+        const auto align_size_m = 13;
+        const auto align_size_k = 16;
+        const auto align_size_n = 15;
+        const auto align_size_b1 = 3;
+        const auto align_size_b2 = 19;
+
+        const auto aligned_batch1_size = align_to(1ul, align_size_b1);
+        auto padding_size_batch1 = static_cast<int>(aligned_batch1_size - 1);
+
+        const auto aligned_batch2_size = align_to(BATCH_SIZE, align_size_b2);
+        auto padding_size_batch2 = static_cast<int>(aligned_batch2_size - BATCH_SIZE);
+
+        const auto aligned_m_size = align_to(M_SIZE, align_size_m);
+        auto padding_size_m = static_cast<int>(aligned_m_size - M_SIZE);
+        const auto aligned_k_size = align_to(K_SIZE, align_size_k);
+        auto padding_size_k = static_cast<int>(aligned_k_size - K_SIZE);
+        const auto aligned_n_size = align_to(N_SIZE, align_size_n);
+        auto padding_size_n = static_cast<int>(aligned_n_size - N_SIZE);
+
+        ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE };
+        ov::Shape in2_shape = { 1, BATCH_SIZE, K_SIZE, N_SIZE };
+        ov::Shape in1_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_m_size, aligned_k_size };
+        ov::Shape in2_shape_aligned = { aligned_batch1_size, aligned_batch2_size, aligned_k_size, aligned_n_size };
+
+        // Use dynamic padding for all BFYX dimensions
+        tensor dyn_pad_dims_input({1, 1, 1, 1}, 0);
+
+        auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input)};
+        auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx, padding({0, 0, 0, 0}, {0, 0, 0, 0}, 0.0f, dyn_pad_dims_input)};
+
+        auto aligned_input1_mem = engine.allocate_memory({ov::PartialShape(in1_shape_aligned), data_types::f16, format::bfyx});
+        auto aligned_input2_mem = engine.allocate_memory({ov::PartialShape(in2_shape_aligned), data_types::f16, format::bfyx});
+
+        auto input1_mem = engine.reinterpret_buffer(*aligned_input1_mem, layout{ov::PartialShape(in1_shape),
+                                                                                data_types::f16,
+                                                                                format::bfyx,
+                                                                                padding({padding_size_batch1, 0, 0, 0}, {0, padding_size_batch2, padding_size_k, padding_size_m}, 0.0f, dyn_pad_dims_input)});
+
+        auto input2_mem = engine.reinterpret_buffer(*aligned_input2_mem, layout{ov::PartialShape(in2_shape),
+                                                                                data_types::f16,
+                                                                                format::bfyx,
+                                                                                padding({0, padding_size_batch2, 0, 0}, {padding_size_batch1, 0, padding_size_n, padding_size_k}, 0.0f, dyn_pad_dims_input)});
+
+        auto input_1_data = rg.generate_random_1d<ov::float16>(ov::shape_size(in1_shape), -2, 2);
+        auto input_2_data = rg.generate_random_1d<ov::float16>(ov::shape_size(in2_shape), -2, 2);
+
+        fill_mem(input1_mem, input_1_data);
+        fill_mem(input2_mem, input_2_data);
+
+        auto get_ref_results = [&]() {
+            ov::Shape in1_shape = { 1, BATCH_SIZE, M_SIZE, K_SIZE };
+            ov::Shape in2_shape = { 1, BATCH_SIZE, K_SIZE, N_SIZE };
+            auto in1_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx};
+            auto in2_layout = layout{ {-1, -1, -1, -1}, data_types::f16, format::bfyx};
+
+            auto input1_mem = engine.allocate_memory(layout{ov::PartialShape(in1_shape), data_types::f16, format::bfyx});
+            auto input2_mem = engine.allocate_memory(layout{ov::PartialShape(in2_shape), data_types::f16, format::bfyx});
+
+            fill_mem(input1_mem, input_1_data);
+            fill_mem(input2_mem, input_2_data);
+
+            topology topology;
+            topology.add(input_layout("input1", in1_layout),
+                         input_layout("input2", in2_layout),
+                         gemm("gemm_ref", { input_info("input1"), input_info("input2") }, data_types::f16, false, false, 1.0f, 0.0f, 4, 4)
+            );
+
+            auto config = get_test_default_config(engine);
+            config.set_property(ov::intel_gpu::optimize_data(true));
+            config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+            config.set_property(ov::enable_profiling(true));
+
+            network network(engine, topology, config);
+            network.set_input_data("input1", input1_mem);
+            network.set_input_data("input2", input2_mem);
+
+            auto outputs = network.execute();
+            OPENVINO_ASSERT(outputs.size() == 1);
+            OPENVINO_ASSERT(outputs.begin()->first == "gemm_ref");
+
+            auto inst = network.get_primitive("gemm_ref");
+
+            auto output_mem = outputs.at("gemm_ref").get_memory();
+            auto output_layout = outputs.at("gemm_ref").get_layout();
+
+            return engine.reinterpret_buffer(*output_mem, output_layout);
+        };
+
+        topology topology;
+        topology.add(input_layout("input1", in1_layout),
+                     input_layout("input2", in2_layout),
+                     gemm("gemm", { input_info("input1"), input_info("input2") }, data_types::f16, false, false, 1.0f, 0.0f, 4, 4)
+        );
+
+        ExecutionConfig config = get_test_default_config(engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
+        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+        config.set_property(ov::enable_profiling(true));
+        network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);
+        network->set_input_data("input1", input1_mem);
+        network->set_input_data("input2", input2_mem);
+
+        auto inst = network->get_primitive("gemm");
+        auto impl = inst->get_impl();
+        ASSERT_TRUE(impl != nullptr);
+        ASSERT_TRUE(impl->is_dynamic());
+
+        auto outputs = network->execute();
+
+        auto output_mem = outputs.at("gemm").get_memory();
+        auto output_layout = outputs.at("gemm").get_layout();
+
+        auto res = engine.reinterpret_buffer(*output_mem, output_layout);
+
+        auto ref_res = get_ref_results();
+
+        mem_lock<ov::float16> res_lock(res, get_test_stream());
+        mem_lock<ov::float16> res_ref_lock(ref_res, get_test_stream());
+        for (size_t i = 0; i < res->count(); i++) {
+            ASSERT_EQ(res_lock[i], res_ref_lock[i]) << i;
+        }
+    }
+
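The alignment factors in the test above (13, 16, 15, 3, 19) are deliberately awkward so that every dimension of the padded buffers is misaligned relative to the logical shape. Assuming align_to() rounds up to the nearest multiple, the padding the test derives can be recomputed standalone (a worked example, not part of the test):

    #include <cstdio>

    // Round v up to the nearest multiple of a (the behavior the test
    // relies on from align_to).
    static unsigned long align_to(unsigned long v, unsigned long a) {
        return (v + a - 1) / a * a;
    }

    int main() {
        std::printf("M: 11 -> %lu (pad 2)\n", align_to(11, 13));   // 13
        std::printf("K: 37 -> %lu (pad 11)\n", align_to(37, 16));  // 48
        std::printf("N: 49 -> %lu (pad 11)\n", align_to(49, 15));  // 60
        std::printf("B: 31 -> %lu (pad 7)\n", align_to(31, 19));   // 38
        std::printf("b: 1  -> %lu (pad 2)\n", align_to(1, 3));     // 3
        return 0;
    }

Note also how reinterpret_buffer() is used: the data is allocated at the aligned (padded) size, then reinterpreted with the logical shape plus explicit lower/upper pads and the dyn_pad_dims_input mask, so the kernel sees a padded view of the same allocation rather than a separate packed copy.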
     void test_dynamic_multi_inference_same_shape(bool is_caching_test) {
         auto& engine = get_test_engine();
 
@@ -549,6 +702,10 @@ TEST_F(gemm_gpu_tests, dynamic) {
     this->test_dynamic(false);
 }
 
+TEST_F(gemm_gpu_tests, dynamic_padding) {
+    this->test_dynamic_padding(false);
+}
+
 TEST_F(gemm_gpu_tests, dynamic_multi_inference_same_shape) {
     this->test_dynamic_multi_inference_same_shape(false);
 }
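The reference path in get_ref_results() runs the same GEMM on densely packed copies of the inputs, so the element-wise ASSERT_EQ verifies that the dynamically padded path is exactly equal to the unpadded one rather than merely close. Since the new case is registered as TEST_F(gemm_gpu_tests, dynamic_padding), it can be run in isolation with the GTest filter gemm_gpu_tests.dynamic_padding.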