From e1fcc01a09922956bcca88aa41716387c91a449f Mon Sep 17 00:00:00 2001 From: Michal Miotk Date: Tue, 19 Nov 2024 21:52:03 +0100 Subject: [PATCH] wip Signed-off-by: Michal Miotk --- .../include/intel_gpu/primitives/lstm.hpp | 2 +- .../graph_optimizer/post_optimize_weights.cpp | 45 +-- .../src/graph/impls/ocl/lstm_cell.cpp | 4 +- .../intel_gpu/src/graph/impls/ocl/rnn_seq.cpp | 5 +- .../graph/impls/onednn/lstm_seq_onednn.cpp | 13 +- src/plugins/intel_gpu/src/graph/lstm_cell.cpp | 26 +- src/plugins/intel_gpu/src/graph/lstm_seq.cpp | 36 +- .../cl_kernels/lstm_cell_and_seq_bfyx.cl | 71 ++-- .../cl_kernels/lstm_cell_and_seq_ref.cl | 42 ++- ....cpp => lstm_cell_and_seq_kernel_bfyx.cpp} | 10 +- ..._ref.h => lstm_cell_and_seq_kernel_bfyx.h} | 6 +- ...f.cpp => lstm_cell_and_seq_kernel_ref.cpp} | 10 +- ..._bfyx.h => lstm_cell_and_seq_kernel_ref.h} | 6 +- .../lstm_cell_and_seq_kernel_selector.cpp | 18 + ....h => lstm_cell_and_seq_kernel_selector.h} | 10 +- .../kernels/lstm/lstm_cell_kernel_ref.cpp | 32 -- .../lstm/lstm_cell_kernel_selector.cpp | 14 - .../kernels/lstm/lstm_cell_kernel_selector.h | 23 -- .../kernels/lstm/lstm_kernel_base.cpp | 12 +- .../kernels/lstm/lstm_kernel_base.h | 5 +- .../kernels/lstm/lstm_seq_kernel_ref.h | 19 - .../kernels/lstm/lstm_seq_kernel_selector.cpp | 18 - src/plugins/intel_gpu/src/plugin/ops/rnn.cpp | 349 +++--------------- .../intel_gpu/src/plugin/program_builder.cpp | 7 + .../src/plugin/transformations_pipeline.cpp | 4 +- 25 files changed, 246 insertions(+), 541 deletions(-) rename src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/{lstm_seq_kernel_bfyx.cpp => lstm_cell_and_seq_kernel_bfyx.cpp} (66%) rename src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/{lstm_cell_kernel_ref.h => lstm_cell_and_seq_kernel_bfyx.h} (68%) rename src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/{lstm_seq_kernel_ref.cpp => lstm_cell_and_seq_kernel_ref.cpp} (67%) rename src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/{lstm_seq_kernel_bfyx.h => lstm_cell_and_seq_kernel_ref.h} (69%) create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_and_seq_kernel_selector.cpp rename src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/{lstm_seq_kernel_selector.h => lstm_cell_and_seq_kernel_selector.h} (51%) delete mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_ref.cpp delete mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_selector.cpp delete mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_selector.h delete mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_ref.h delete mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_selector.cpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp index 05e0b2be3210e7..87daa81f9f54bc 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp @@ -39,7 +39,7 @@ struct lstm_elt : public RNNParams { const std::vector activation_params = {}, const lstm_weights_order offset_order = lstm_weights_order::iofz, const uint32_t direction = 0) - : RNNParams(id, x, {}, cell, {}, {}, {}, {}, "", "", clip, input_forget, activations, activation_params, offset_order, \ + : RNNParams(id, x, {}, cell, {}, {}, {}, {}, "", "", clip, input_forget, activations, activation_params, offset_order, direction == 0 ? ov::op::RecurrentSequenceDirection::FORWARD : ov::op::RecurrentSequenceDirection::REVERSE) { if (!cell.empty()) input.pop_back(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp index dcad2475f7e633..2ac17f811f0055 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp @@ -160,8 +160,9 @@ void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std: cldnn::program_node& prev, cldnn::program_node& node, size_t i) { OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized."); std::string reorder_id = input_id + "_reo_" + std::to_string(i); - auto hiddenSize = reorder_params->get_output_layout().get_shape()[1] / 4; - auto inputSize = static_cast(reorder_params->get_output_layout().get_shape()[3]); + const auto dir_num = static_cast(reorder_params->get_input_layout().get_shape()[0]); + auto hiddenSize = reorder_params->get_input_layout().get_shape()[1] / 4; + auto inputSize = static_cast(reorder_params->get_input_layout().get_shape()[2]); int size_third; const int W_idx = 3; if (i == W_idx) { @@ -169,22 +170,11 @@ void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std: } else { size_third = static_cast(hiddenSize); } - auto cropSizeR = cldnn::tensor{1, static_cast(hiddenSize), 1, size_third, 1}; - cldnn::layout reorder_layout; - if (i == W_idx) { - reorder_layout = reorder_params->get_output_layout(); - } else { - reorder_layout = reorder_params->get_output_layout(); - auto reorder_layout_new_shape = reorder_layout.get_shape(); - reorder_layout_new_shape[3] = hiddenSize; - reorder_layout = reorder_layout.clone_with_other_shape(reorder_layout_new_shape); - } - auto reorder = std::make_shared(reorder_id, input_id, reorder_layout); - auto& reorder_node = p.get_or_create(reorder); + auto cropSizeR = cldnn::tensor{dir_num, static_cast(hiddenSize), 1, size_third}; std::string crop_id_b = input_id + "_c"; auto get_crop_node = [&](int cropNum) -> cldnn::program_node& { auto crop_id = primitive_id(crop_id_b + std::to_string(cropNum)); - auto crop_prim = std::make_shared(crop_id, reorder_id, cropSizeR, cldnn::tensor{0, static_cast(cropNum*hiddenSize), 0, 0, 0}); + auto crop_prim = std::make_shared(crop_id, reorder_id, cropSizeR, cldnn::tensor{0, static_cast(cropNum*hiddenSize), 0, 0}); return p.get_or_create(crop_prim); }; @@ -194,19 +184,18 @@ void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std: auto& crop3_node = get_crop_node(3); std::vector con_input{input_info(crop_id_b + "1"), input_info(crop_id_b + "0"), input_info(crop_id_b + "2"), input_info(crop_id_b + "3")}; cldnn::primitive_id concat_id{input_id + "cont"}; - auto con = std::make_shared(concat_id, con_input, 0); + auto con = std::make_shared(concat_id, con_input, 1); auto& con_node = p.get_or_create(con); p.add_intermediate(con_node, node, prev, true); - p.add_intermediate(reorder_node, con_node, prev, true); - p.add_intermediate(crop1_node, con_node, reorder_node, true); - p.add_connection(reorder_node, crop0_node, 0); - p.add_connection(reorder_node, crop2_node, 0); - p.add_connection(reorder_node, crop3_node, 0); + p.add_intermediate(crop1_node, con_node, prev, true); + p.add_connection(prev, crop0_node, 0); + p.add_connection(prev, crop2_node, 0); + p.add_connection(prev, crop3_node, 0); p.add_connection(crop0_node, con_node, 0); p.add_connection(crop2_node, con_node, 0); p.add_connection(crop3_node, con_node, 0); std::string permute_id = input_id + "_perx"; - std::vector ord{2, 4, 3, 0, 1}; + std::vector ord{0, 2, 1}; auto permute = std::make_shared(permute_id, input_info{concat_id}, ord); auto& permute_node = p.get_or_create(permute); p.add_intermediate(permute_node, node, con_node, true); @@ -216,7 +205,6 @@ void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std: p.mark_if_constant(node); node.recalc_output_layout(false); }; - set_implementation_and_output(reorder_node); set_implementation_and_output(crop1_node); set_implementation_and_output(crop0_node); set_implementation_and_output(crop2_node); @@ -228,8 +216,9 @@ void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std: void post_optimize_weights::add_lstm_bias_reorder(primitive_id input_id, std::shared_ptr reorder_params, program& p, \ cldnn::program_node& prev, cldnn::program_node& node) { OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized."); + const auto dir_num = static_cast(reorder_params->get_input_layout().get_shape()[0]); auto hiddenSize = reorder_params->get_output_layout().get_shape()[1] / 4; - auto cropSize = cldnn::tensor{1, static_cast(hiddenSize), 1, 1}; + auto cropSize = cldnn::tensor{dir_num, static_cast(hiddenSize), 1, 1}; std::string crop_id_b = input_id + "_c"; auto get_crop_node = [&](int cropNum) -> cldnn::program_node& { auto crop_id = primitive_id(crop_id_b + std::to_string(cropNum)); @@ -242,7 +231,7 @@ void post_optimize_weights::add_lstm_bias_reorder(primitive_id input_id, std::sh auto& crop3_node = get_crop_node(3); std::vector con_input{input_info(crop1_node.id()), input_info(crop0_node.id()), input_info(crop2_node.id()), input_info(crop3_node.id())}; cldnn::primitive_id concat_id{input_id + "concat"}; - auto con = std::make_shared(concat_id, con_input, 2); + auto con = std::make_shared(concat_id, con_input, 1); auto& con_node = p.get_or_create(con); p.add_intermediate(con_node, node, prev, true); p.add_intermediate(crop1_node, con_node, prev, true); @@ -252,11 +241,6 @@ void post_optimize_weights::add_lstm_bias_reorder(primitive_id input_id, std::sh p.add_connection(crop0_node, con_node, 0); p.add_connection(crop2_node, con_node, 0); p.add_connection(crop3_node, con_node, 0); - std::string permute_id = input_id + "_pex"; - std::vector ord{0, 3, 2, 1}; - auto permute = std::make_shared(permute_id, input_info{concat_id}, ord); - auto& permute_node = p.get_or_create(permute); - p.add_intermediate(permute_node, node, con_node, true); auto set_implementation_and_output = [this, &p](program_node& node) { node.get_output_layout(false); select_implementation(p, node); @@ -268,7 +252,6 @@ void post_optimize_weights::add_lstm_bias_reorder(primitive_id input_id, std::sh set_implementation_and_output(crop2_node); set_implementation_and_output(crop3_node); set_implementation_and_output(con_node); - set_implementation_and_output(permute_node); } void post_optimize_weights::run(program& p) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp index 8776ba508dbdba..199362c623d33a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp @@ -5,7 +5,7 @@ #include "primitive_base.hpp" #include "lstm_cell_inst.h" -#include "lstm/lstm_cell_kernel_selector.h" +#include "lstm/lstm_cell_and_seq_kernel_selector.h" #include "lstm/lstm_kernel_base.h" #include "openvino/op/lstm_cell.hpp" #include "lstm_cell.hpp" @@ -16,7 +16,7 @@ namespace ocl { struct lstm_cell_impl : typed_primitive_impl_ocl { using parent = typed_primitive_impl_ocl; using parent::parent; - using kernel_selector_t = kernel_selector::lstm_cell_kernel_selector; + using kernel_selector_t = kernel_selector::lstm_cell_and_seq_kernel_selector; using kernel_params_t = kernel_selector::lstm_params; DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::lstm_cell_impl) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp index e7876c03260698..ce160287d22924 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp @@ -6,7 +6,7 @@ #include "lstm_seq_inst.h" #include "rnn_seq.hpp" -#include "lstm/lstm_seq_kernel_selector.h" +#include "lstm/lstm_cell_and_seq_kernel_selector.h" #include "lstm/lstm_kernel_base.h" #include "openvino/op/lstm_sequence.hpp" #include "impls/registry/implementation_manager.hpp" @@ -17,7 +17,7 @@ namespace ocl { struct rnn_seq_impl : typed_primitive_impl_ocl { using parent = typed_primitive_impl_ocl; using parent::parent; - using kernel_selector_t = kernel_selector::lstm_seq_kernel_selector; + using kernel_selector_t = kernel_selector::lstm_cell_and_seq_kernel_selector; using kernel_params_t = kernel_selector::lstm_params; DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::rnn_seq_impl) @@ -47,6 +47,7 @@ struct rnn_seq_impl : typed_primitive_impl_ocl { static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) { const auto& primitive = impl_param.typed_desc(); auto params = get_default_params(impl_param); + params.sequential = true; for (size_t i = 1; i < impl_param.input_layouts.size(); ++i) { params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(i))); } diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp index 6d222b061b0fa8..9a2583d07e9d60 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp @@ -112,7 +112,7 @@ struct lstm_seq_onednn : typed_primitive_onednn_impl { auto weights_shape = impl_params.get_input_layout(layout_nr).get_shape(); auto target_weights_layout = impl_params.get_input_layout(layout_nr); target_weights_layout.format = cldnn::format::bfzyx; - auto layout = target_weights_layout.clone_with_other_shape(ov::Shape{weights_shape[0], weights_shape[1], 1, weights_shape[2], weights_shape[3]}); + auto layout = target_weights_layout.clone_with_other_shape(ov::Shape{weights_shape[0], weights_shape[1], weights_shape[2], 1, 1}); return layout; } @@ -168,11 +168,18 @@ struct lstm_seq_onednn : typed_primitive_onednn_impl { "[GPU] The format kind of the output memory descriptor of onednn lstm_seq cannot be 'any'."); auto eng = engine.get_onednn_engine(); + dnnl::rnn_direction lstm_desc_dir; + if (direction == ov::op::RecurrentSequenceDirection::FORWARD) { + lstm_desc_dir = dnnl::rnn_direction::unidirectional_left2right; + } else if (direction == ov::op::RecurrentSequenceDirection::REVERSE) { + lstm_desc_dir = dnnl::rnn_direction::unidirectional_right2left; + } else { + lstm_desc_dir = dnnl::rnn_direction::bidirectional_concat; + } return std::make_shared( eng, dnnl::prop_kind::forward_inference, - direction == ov::op::RecurrentSequenceDirection::FORWARD ? dnnl::rnn_direction::unidirectional_left2right : \ - dnnl::rnn_direction::unidirectional_right2left, + lstm_desc_dir, input_md, initial_hidden, initial_cell, diff --git a/src/plugins/intel_gpu/src/graph/lstm_cell.cpp b/src/plugins/intel_gpu/src/graph/lstm_cell.cpp index c720bedc3b67ef..9f2a2a7697f218 100644 --- a/src/plugins/intel_gpu/src/graph/lstm_cell.cpp +++ b/src/plugins/intel_gpu/src/graph/lstm_cell.cpp @@ -10,24 +10,24 @@ namespace cldnn { GPU_DEFINE_PRIMITIVE_TYPE_ID(lstm_cell) layout lstm_cell_inst::calc_output_layout(lstm_cell_node const& node, kernel_impl_params const& impl_param) { - auto input_layout = impl_param.get_input_layout(0); - auto input_pshape = input_layout.get_partial_shape(); - auto input_layout_hidden = impl_param.get_input_layout(1); - auto input_pshape_hidden = input_layout_hidden.get_partial_shape(); - auto lstm_batch_size = input_pshape[0]; - auto lstm_hidden_size = input_pshape_hidden[1]; + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_hidden_size = input_pshape_hidden[1]; return cldnn::layout{ov::PartialShape{lstm_batch_size, lstm_hidden_size}, input_layout.data_type, input_layout.format}; } template std::vector lstm_cell_inst::calc_output_layouts(lstm_cell_node const& node, kernel_impl_params const& impl_param) { - auto input_layout = impl_param.get_input_layout(0); - auto input_pshape = input_layout.get_partial_shape(); - auto input_layout_hidden = impl_param.get_input_layout(1); - auto input_pshape_hidden = input_layout_hidden.get_partial_shape(); - auto lstm_batch_size = input_pshape[0]; - auto lstm_hidden_size = input_pshape_hidden[1]; + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_hidden_size = input_pshape_hidden[1]; auto out_layout = cldnn::layout{ShapeType{lstm_batch_size, lstm_hidden_size}, input_layout.data_type, input_layout.format}; return {out_layout, out_layout}; @@ -36,7 +36,7 @@ std::vector lstm_cell_inst::calc_output_layouts(lstm_cell_node const& no template std::vector lstm_cell_inst::calc_output_layouts(lstm_cell_node const& node, const kernel_impl_params& impl_param); std::string lstm_cell_inst::to_string(lstm_cell_node const& node) { - auto desc = node.get_primitive(); + const auto& desc = node.get_primitive(); auto node_info = node.desc_to_json(); std::stringstream primitive_description; diff --git a/src/plugins/intel_gpu/src/graph/lstm_seq.cpp b/src/plugins/intel_gpu/src/graph/lstm_seq.cpp index 88405ee7b01471..16a756cc18a615 100644 --- a/src/plugins/intel_gpu/src/graph/lstm_seq.cpp +++ b/src/plugins/intel_gpu/src/graph/lstm_seq.cpp @@ -10,14 +10,14 @@ namespace cldnn { GPU_DEFINE_PRIMITIVE_TYPE_ID(lstm_seq) layout lstm_seq_inst::calc_output_layout(lstm_seq_node const& node, kernel_impl_params const& impl_param) { - auto desc = impl_param.typed_desc(); - auto input_layout = impl_param.get_input_layout(0); - auto input_pshape = input_layout.get_partial_shape(); - auto input_layout_hidden = impl_param.get_input_layout(1); - auto input_pshape_hidden = input_layout_hidden.get_partial_shape(); - auto lstm_batch_size = input_pshape[0]; - auto lstm_seq_length = input_pshape[1]; - auto lstm_hidden_size = input_pshape_hidden[2]; + const auto& desc = impl_param.typed_desc(); + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_seq_length = input_pshape[1]; + const auto& lstm_hidden_size = input_pshape_hidden[2]; auto first_out_fmt = cldnn::format::bfyx; if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) { @@ -29,22 +29,20 @@ layout lstm_seq_inst::calc_output_layout(lstm_seq_node const& node, kernel_impl_ template std::vector lstm_seq_inst::calc_output_layouts(lstm_seq_node const& node, kernel_impl_params const& impl_param) { - auto desc = impl_param.typed_desc(); - auto input_layout = impl_param.get_input_layout(0); - auto input_pshape = input_layout.get_partial_shape(); - auto input_layout_hidden = impl_param.get_input_layout(1); - auto input_pshape_hidden = input_layout_hidden.get_partial_shape(); - auto lstm_batch_size = input_pshape[0]; - auto lstm_seq_length = input_pshape[1]; - auto lstm_hidden_size = input_pshape_hidden[2]; + const auto& desc = impl_param.typed_desc(); + const auto& input_layout = impl_param.get_input_layout(0); + const auto& input_pshape = input_layout.get_partial_shape(); + const auto& input_layout_hidden = impl_param.get_input_layout(1); + const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape(); + const auto& lstm_batch_size = input_pshape[0]; + const auto& lstm_seq_length = input_pshape[1]; + const auto& lstm_hidden_size = input_pshape_hidden[2]; auto first_out_fmt = cldnn::format::bfyx; auto second_out_fmt = input_layout.format; auto third_out_fmt = input_layout.format; if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) { first_out_fmt = node.get_preferred_output_fmt(); - second_out_fmt = node.get_preferred_output_fmt(1); - third_out_fmt = node.get_preferred_output_fmt(2); } auto num_directions = desc->num_directions(); @@ -56,7 +54,7 @@ std::vector lstm_seq_inst::calc_output_layouts(lstm_seq_node const& node template std::vector lstm_seq_inst::calc_output_layouts(lstm_seq_node const& node, const kernel_impl_params& impl_param); std::string lstm_seq_inst::to_string(lstm_seq_node const& node) { - auto desc = node.get_primitive(); + const auto& desc = node.get_primitive(); auto node_info = node.desc_to_json(); std::stringstream primitive_description; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl index 219b8c4d8ab42a..70a881918f01a8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl @@ -50,14 +50,17 @@ KERNEL(lstm_cell_and_seq_bfyx)( #else const uint real_seq_length = 1; #endif - + #if DIRECTION == 2 + for(uint dir=0;dir0){ barrier(CLK_LOCAL_MEM_FENCE); } @@ -98,21 +101,22 @@ KERNEL(lstm_cell_and_seq_bfyx)( uint block_num = INPUT_SIZE/VEC_SIZE; unroll_for(uint j=0;j0){ barrier(CLK_LOCAL_MEM_FENCE); } @@ -49,7 +52,6 @@ KERNEL(lstm_cell_and_seq_ref)( } ACCUMULATOR_TYPE gate_output[GATE_NUM]; unroll_for(uint k=0;k(); + Attach(); +} + +KernelsData lstm_cell_and_seq_kernel_selector::GetBestKernels(const Params& params) const { + return GetNaiveBestKernel(params, KernelType::LSTM_SEQ_CELL); +} +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_and_seq_kernel_selector.h similarity index 51% rename from src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_selector.h rename to src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_and_seq_kernel_selector.h index 583de5907a72e6..09eba2ff7c9b2f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_selector.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_and_seq_kernel_selector.h @@ -7,16 +7,16 @@ #include "kernel_selector.h" namespace kernel_selector { -class lstm_seq_kernel_selector : public kernel_selector_base { +class lstm_cell_and_seq_kernel_selector : public kernel_selector_base { public: - static lstm_seq_kernel_selector& Instance() { - static lstm_seq_kernel_selector instance_; + static lstm_cell_and_seq_kernel_selector& Instance() { + static lstm_cell_and_seq_kernel_selector instance_; return instance_; } - lstm_seq_kernel_selector(); + lstm_cell_and_seq_kernel_selector(); - virtual ~lstm_seq_kernel_selector() {} + virtual ~lstm_cell_and_seq_kernel_selector() {} KernelsData GetBestKernels(const Params& params) const override; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_ref.cpp deleted file mode 100644 index 350114a854406a..00000000000000 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_ref.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "lstm_cell_kernel_ref.h" -#include "kernel_selector_utils.h" - -namespace kernel_selector { - -ParamsKey LSTMCellKernelRef::GetSupportedKey() const { - ParamsKey k; - k.EnableInputDataType(Datatype::F16); - k.EnableInputDataType(Datatype::F32); - k.EnableOutputDataType(Datatype::F16); - k.EnableOutputDataType(Datatype::F32); - k.EnableDifferentTypes(); - k.EnableAllInputLayout(); - k.EnableAllOutputLayout(); - k.EnableTensorOffset(); - k.EnableTensorPitches(); - k.EnableBatching(); - return k; -} - -KernelsData LSTMCellKernelRef::GetKernelsData(const Params& params) const { - return GetCommonKernelsData(params, false); -} - -KernelsPriority LSTMCellKernelRef::GetKernelsPriority(const Params& /*params*/) const { - return DONT_USE_IF_HAVE_SOMETHING_ELSE; -} -} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_selector.cpp deleted file mode 100644 index 58bfba4db0ff24..00000000000000 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_selector.cpp +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "lstm_cell_kernel_selector.h" -#include "lstm_cell_kernel_ref.h" - -namespace kernel_selector { -lstm_cell_kernel_selector::lstm_cell_kernel_selector() { Attach(); } - -KernelsData lstm_cell_kernel_selector::GetBestKernels(const Params& params) const { - return GetNaiveBestKernel(params, KernelType::LSTM_SEQ_CELL); -} -} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_selector.h deleted file mode 100644 index 1c28a452d085fd..00000000000000 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_cell_kernel_selector.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "kernel_selector.h" - -namespace kernel_selector { -class lstm_cell_kernel_selector : public kernel_selector_base { -public: - static lstm_cell_kernel_selector& Instance() { - static lstm_cell_kernel_selector instance_; - return instance_; - } - - lstm_cell_kernel_selector(); - - virtual ~lstm_cell_kernel_selector() {} - - KernelsData GetBestKernels(const Params& params) const override; -}; -} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_kernel_base.cpp index 2b193ac1888a9c..65f884f034e24f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_kernel_base.cpp @@ -10,15 +10,15 @@ namespace kernel_selector { -JitConstants LSTMKernelBase::GetJitConstants(const lstm_params& params, bool sequential) const { +JitConstants LSTMKernelBase::GetJitConstants(const lstm_params& params) const { JitConstants jit = MakeBaseParamsJitConstants(params); + bool sequential = params.sequential; auto out = params.outputs[0]; if (params.input_forget) { jit.AddConstants({MakeJitConstant("INPUT_FORGET", true)}); } jit.AddConstants({MakeJitConstant("VEC_SIZE", 4)}); - assert(params.direction == ov::op::RecurrentSequenceDirection::FORWARD || params.direction == ov::op::RecurrentSequenceDirection::REVERSE); - jit.AddConstants({MakeJitConstant("DIRECTION", params.direction == ov::op::RecurrentSequenceDirection::REVERSE ? 1 : 0)}); + jit.AddConstants({MakeJitConstant("DIRECTION", static_cast(params.direction))}); const unsigned int gate_num = 4; jit.AddConstants({MakeJitConstant("GATE_NUM", gate_num)}); if (sequential) { @@ -78,13 +78,13 @@ JitConstants LSTMKernelBase::GetJitConstants(const lstm_params& params, bool seq return jit; } -KernelsData LSTMKernelBase::GetCommonKernelsData(const Params& params, bool sequential) const { +KernelsData LSTMKernelBase::GetCommonKernelsData(const Params& params) const { if (!Validate(params)) { return {}; } const lstm_params& orgParams = static_cast(params); - + bool sequential = orgParams.sequential; KernelData kd = KernelData::Default(params, 1); auto out = orgParams.outputs[0]; @@ -104,7 +104,7 @@ KernelsData LSTMKernelBase::GetCommonKernelsData(const Params& params, bool sequ if (sequential) { kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 2}); } - auto cldnnJit = GetJitConstants(orgParams, sequential); + auto cldnnJit = GetJitConstants(orgParams); auto entryPoint = GetEntryPoint(kernelName, orgParams.layerID, params); auto jit = CreateJit(kernelName, cldnnJit, entryPoint); size_t num_hidden_kernels; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_kernel_base.h index 600196ee119124..bb1153960a0107 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_kernel_base.h @@ -24,6 +24,7 @@ struct lstm_params : public base_params { lstm_params() : base_params(KernelType::LSTM_SEQ_CELL) {} order_type gate_order = offset_iofz; float clip = 0; + bool sequential = false; bool input_forget = false; ov::op::RecurrentSequenceDirection direction = ov::op::RecurrentSequenceDirection::FORWARD; @@ -59,8 +60,8 @@ class LSTMKernelBase : public KernelBaseOpenCL { struct DispatchData : public CommonDispatchData {}; protected: - virtual JitConstants GetJitConstants(const lstm_params& params, bool) const; - KernelsData GetCommonKernelsData(const Params& params, bool) const; + virtual JitConstants GetJitConstants(const lstm_params& params) const; + KernelsData GetCommonKernelsData(const Params& params) const; bool Validate(const Params& p) const override { if (p.GetType() != KernelType::LSTM_SEQ_CELL) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_ref.h deleted file mode 100644 index 350a9b6b28a62d..00000000000000 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_ref.h +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "lstm_kernel_base.h" - -namespace kernel_selector { -class LSTMSeqKernelRef : public LSTMKernelBase { -public: - LSTMSeqKernelRef() : LSTMKernelBase("lstm_cell_and_seq_ref") {} - virtual ~LSTMSeqKernelRef() {} - - KernelsData GetKernelsData(const Params& params) const override; - KernelsPriority GetKernelsPriority(const Params& params) const override; - ParamsKey GetSupportedKey() const override; -}; -} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_selector.cpp deleted file mode 100644 index c41beffb090f2a..00000000000000 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/lstm/lstm_seq_kernel_selector.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "lstm_seq_kernel_selector.h" -#include "lstm_seq_kernel_ref.h" -#include "lstm_seq_kernel_bfyx.h" - -namespace kernel_selector { -lstm_seq_kernel_selector::lstm_seq_kernel_selector() { - Attach(); - Attach(); -} - -KernelsData lstm_seq_kernel_selector::GetBestKernels(const Params& params) const { - return GetNaiveBestKernel(params, KernelType::LSTM_SEQ_CELL); -} -} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/plugin/ops/rnn.cpp b/src/plugins/intel_gpu/src/plugin/ops/rnn.cpp index 4e64e7fadd80cd..16fb08432b940a 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/rnn.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/rnn.cpp @@ -68,161 +68,35 @@ void GetLSTMActivationParams(const std::shared_ptr& op, static void CreateLSTMCellOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {6}); - int lstm_batch_size, lstm_input_size, lstm_hidden_size; - auto inputs = p.GetInputInfo(op); - std::string layerName = layer_type_name_ID(op); - cldnn::input_info weight = inputs[3]; - cldnn::input_info recurrent = inputs[4]; - cldnn::input_info bias = inputs[5]; - - /* check incoming CNN layer and setup required variables */ - { - const auto in0_pshape = op->get_input_partial_shape(0); - const auto out0_pshape = op->get_output_partial_shape(0); - - if (in0_pshape[in0_pshape.size() - 1].is_static()) - lstm_input_size = in0_pshape[in0_pshape.size() - 1].get_length(); - else - lstm_input_size = -1; - - if (in0_pshape[in0_pshape.size() - 2].is_static()) - lstm_batch_size = in0_pshape[in0_pshape.size() - 2].get_length(); - else - lstm_batch_size = -1; - - if (out0_pshape[out0_pshape.size() - 1].is_static()) - lstm_hidden_size = out0_pshape[out0_pshape.size() - 1].get_length(); - else - lstm_hidden_size = -1; - } - + auto inputs = p.GetInputInfo(op); std::vector activations; std::vector activation_params; GetLSTMActivationParams(op, activations, activation_params); float clip = op->get_clip(); - + assert(!inputs[5].pid.empty()); if (p.use_new_shape_infer()) { - cldnn::primitive_id input_concatID = layerName + "_inputConcat"; - p.add_primitive(*op, cldnn::concatenation(input_concatID, { inputs[0], inputs[1] }, 1)); - - cldnn::primitive_id lstm_fc_id = layerName + "_fully_connected"; - cldnn::primitive_id lstm_elt_id = layerName + "_lstm_elt"; - cldnn::primitive_id wr_concat_id = layerName + "_WRconcat"; - p.add_primitive(*op, cldnn::concatenation(wr_concat_id, { inputs[3], inputs[4] }, 1)); - p.add_primitive(*op, cldnn::fully_connected(lstm_fc_id, cldnn::input_info(input_concatID), wr_concat_id, bias.pid)); - p.add_primitive(*op, cldnn::lstm_elt(lstm_elt_id, cldnn::input_info(lstm_fc_id), inputs[2].pid, clip, 0, activations, - activation_params, cldnn::lstm_weights_order::fizo, 0)); - - auto outSz = op->get_output_partial_shape(0); - std::vector outSzPt; - for (auto pshape : outSz) { - if (pshape.is_static()) - outSzPt.push_back(pshape.get_length()); - else - outSzPt.push_back(-1); - } - - cldnn::crop_ngraph_op_mode op_mode = cldnn::crop_ngraph_op_mode::split; - size_t num_splits = 2; - cldnn::tensor hiddenSz = cldnn::tensor{ lstm_batch_size, 1, lstm_hidden_size, 1 }; - - cldnn::primitive_id outputHiddenCropID = layerName + "_hc"; - cldnn::primitive_id outputHiddenID = layerName + ".out0"; - cldnn::primitive_id outputDataID = layerName + "_data"; - - cldnn::layout constLayout = cldnn::layout({}, cldnn::data_types::i64, cldnn::format::bfyx); - cldnn::memory::ptr data_mem = p.get_engine().allocate_memory(constLayout, false); - auto& stream = p.get_engine().get_service_stream(); - cldnn::mem_lock lock{data_mem, stream}; - auto buf = lock.data(); - const int64_t axis = 1; - std::memcpy(&buf[0], &axis, constLayout.bytes_count()); - p.add_primitive(*op, cldnn::data(outputDataID, data_mem)); - - p.add_primitive(*op, - cldnn::crop(outputHiddenCropID, - {cldnn::input_info(lstm_elt_id), cldnn::input_info(outputDataID)}, - hiddenSz, - cldnn::tensor{0, 0, 0, 0}, - op_mode, 0, axis, num_splits)); - p.add_primitive(*op, cldnn::reshape(outputHiddenID, cldnn::input_info(outputHiddenCropID), - false, outSzPt, op->get_output_partial_shape(0)), {layerName}); - - cldnn::primitive_id outputCellCropID = layerName + "_cc"; - cldnn::primitive_id outputCellID = layerName + ".out1"; - p.add_primitive(*op, - cldnn::crop(outputCellCropID, - {cldnn::input_info(lstm_elt_id), cldnn::input_info(outputDataID)}, - hiddenSz, - cldnn::tensor{0, 1, 0, 0}, - op_mode, 1, axis, num_splits)); - p.add_primitive(*op, cldnn::reshape(outputCellID, cldnn::input_info(outputCellCropID), - false, outSzPt, op->get_output_partial_shape(1))); + p.add_primitive(*op, cldnn::lstm_cell(layerName+".out0", inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5], \ + cldnn::input_info(), "", layerName + "_md_write.1", clip, false, activations, \ + activation_params, cldnn::lstm_weights_order::fizo, ov::op::RecurrentSequenceDirection::FORWARD, cldnn::padding(), \ + static_cast(op->get_output_size()))); } else { - // LSTM primitive works with single precision for all in/out/weights tensors - auto lstm_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(0)); - - cldnn::primitive_id inReshapeID = layerName + "_inReshape"; - cldnn::primitive_id permuteID = layerName + "_inputReorder"; - cldnn::primitive_id inHiddenReshapeID = layerName + "_inHiddenReshape"; - cldnn::primitive_id inHiddenReorderID = layerName + "_inHiddenReorder"; - cldnn::primitive_id gemmReshapeID = layerName + "_gemmReshape"; - cldnn::primitive_id gemmReorderID = layerName + "_gemmReorder"; - cldnn::primitive_id input_concatID = layerName + "_inputConcat"; - - cldnn::tensor inputShape = { lstm_batch_size, 1, lstm_input_size, 1 }; - cldnn::tensor inStateShape = { lstm_batch_size, 1, lstm_hidden_size, 1 }; - cldnn::layout inputLayout = cldnn::layout(lstm_dtype, cldnn::format::bfyx, inputShape); - cldnn::layout hiddenLayout = cldnn::layout(lstm_dtype, cldnn::format::bfyx, inStateShape); - p.add_primitive(*op, cldnn::reshape(inReshapeID, inputs[0], inputShape)); - p.add_primitive(*op, cldnn::reorder(permuteID, inReshapeID, inputLayout)); - - - std::string hiddenInResh = inHiddenReshapeID + "_1"; - std::string hiddenInStr = inHiddenReorderID + "_1"; - std::string cellInResh = inHiddenReshapeID + "_2"; - std::string cellInStr = inHiddenReorderID + "_2"; - p.add_primitive(*op, cldnn::reshape(hiddenInResh, inputs[1], inStateShape)); - p.add_primitive(*op, cldnn::reorder(hiddenInStr, cldnn::input_info(hiddenInResh), hiddenLayout)); - p.add_primitive(*op, cldnn::reshape(cellInResh, inputs[2], inStateShape)); - p.add_primitive(*op, cldnn::reorder(cellInStr, cldnn::input_info(cellInResh), hiddenLayout)); - p.add_primitive(*op, cldnn::concatenation(input_concatID, - { permuteID, hiddenInStr }, - 3)); - - cldnn::tensor gemmSz = cldnn::tensor{ lstm_batch_size, 1, 4 * lstm_hidden_size, 1 }; - cldnn::layout gemmLayout = cldnn::layout(lstm_dtype, cldnn::format::bfyx, gemmSz); - cldnn::tensor hiddenSz = cldnn::tensor{ lstm_batch_size, 1, lstm_hidden_size, 1 }; - cldnn::tensor cellCropSz = cldnn::tensor{0, 1, 0, 0}; - - std::string lstm_fc_id = layerName + "_fully_connected"; - std::string lstm_elt_id = layerName + "_lstm_elt"; - - cldnn::primitive_id WRconcatID = layerName + "_WRconcat"; - p.add_primitive(*op, cldnn::concatenation(WRconcatID, { weight, recurrent }, 1)); - - cldnn::primitive_id FCInputReshapeID = "Reshape_bf_" + lstm_fc_id + "_for_input"; - cldnn::tensor FCInputReshapeSz = { lstm_batch_size, inputShape.spatial[0] + inStateShape.spatial[0], 1, 1 }; - p.add_primitive(*op, cldnn::reshape(FCInputReshapeID, cldnn::input_info(input_concatID), FCInputReshapeSz)); - - p.add_primitive(*op, cldnn::fully_connected(lstm_fc_id, cldnn::input_info(FCInputReshapeID), WRconcatID, bias.pid)); - p.add_primitive(*op, cldnn::reshape(gemmReshapeID, cldnn::input_info(lstm_fc_id), gemmSz)); - p.add_primitive(*op, cldnn::reorder(gemmReorderID, cldnn::input_info(gemmReshapeID), gemmLayout)); - p.add_primitive(*op, cldnn::lstm_elt(lstm_elt_id, cldnn::input_info(gemmReorderID), cellInStr, clip, 0, activations, - activation_params, cldnn::lstm_weights_order::fizo, 0)); - + auto mutable_precision_first = op->get_output_element_type(1); + cldnn::layout outLayout = cldnn::layout( + cldnn::element_type_to_data_type(mutable_precision_first), + cldnn::format::get_default_format(op->get_output_shape(1).size()), + tensor_from_dims(op->get_output_shape(1))); + + cldnn::memory::ptr shared_memory = p.get_engine().allocate_memory(outLayout); + const cldnn::primitive_id mutable_id_1 = layerName + "_md_write.1"; + const cldnn::mutable_data mutable_prim_1{mutable_id_1, shared_memory}; + p.add_primitive(*op, mutable_prim_1); - cldnn::tensor outSz = cldnn::tensor{ lstm_batch_size, lstm_hidden_size, 1, 1 }; - cldnn::primitive_id outputHiddenCropID = layerName + "_hc"; - cldnn::primitive_id outputHiddenID = layerName + ".out0"; - p.add_primitive(*op, cldnn::crop(outputHiddenCropID, cldnn::input_info(lstm_elt_id), hiddenSz, cldnn::tensor{0, 0, 0, 0})); - p.add_primitive(*op, cldnn::reshape(outputHiddenID, cldnn::input_info(outputHiddenCropID), outSz), {layerName}); + p.add_primitive(*op, cldnn::lstm_cell(layerName+".out0", inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], inputs[5], \ + cldnn::input_info(), "", layerName + "_md_write.1", clip, false, activations, \ + activation_params, cldnn::lstm_weights_order::fizo, ov::op::RecurrentSequenceDirection::FORWARD, cldnn::padding(), 1)); - cldnn::primitive_id outputCellCropID = layerName + "_cc"; - cldnn::primitive_id outputCellID = layerName + ".out1"; - p.add_primitive(*op, cldnn::crop(outputCellCropID, cldnn::input_info(lstm_elt_id), hiddenSz, cellCropSz)); - p.add_primitive(*op, cldnn::reshape(outputCellID, cldnn::input_info(outputCellCropID), outSz)); + p.add_primitive(*op, cldnn::mutable_data(layerName + ".out1", {cldnn::input_info(layerName + ".out0")}, shared_memory)); } } @@ -234,157 +108,44 @@ static void CreateLSTMSequenceOp(ProgramBuilder& p, const std::shared_ptr activations; std::vector activation_params; GetLSTMActivationParams(op, activations, activation_params); - float clip = op->get_clip(); - if (max_seq_len.get_max_length() == 1) { - int lstm_batch_size, lstm_input_size, lstm_hidden_size, lstm_sequence_len; - cldnn::input_info weight = inputs[4]; - cldnn::input_info recurrent = inputs[5]; - cldnn::input_info bias = inputs[6]; - { - const auto in_dims0 = op->get_input_shape(0); - const auto out_dims0 = op->get_output_shape(0); - if (in_dims0.size() != 3 || - op->get_input_shape(1).size() != 3 || - op->get_input_shape(2).size() != 3) - OPENVINO_THROW("Wrong input shapes for LSTMSequence op ", op->get_friendly_name()); - - lstm_input_size = static_cast(in_dims0.back()); - lstm_sequence_len = static_cast(in_dims0.at(in_dims0.size() - 2)); - lstm_batch_size = static_cast(in_dims0.at(in_dims0.size() - 3)); - lstm_hidden_size = static_cast(out_dims0.back()); - } - - bool isForward = op->get_direction() == ov::op::RecurrentSequenceDirection::FORWARD; - - // LSTM primitive works with single precision for all in/out/weights tensors - auto lstm_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(0)); - - cldnn::primitive_id inReshapeID = layerName + "_inReshape"; - cldnn::primitive_id permuteID = layerName + "_inputReorder"; - cldnn::primitive_id inHiddenReshapeID = layerName + "_inHiddenReshape"; - cldnn::primitive_id inHiddenReorderID = layerName + "_inHiddenReorder"; - cldnn::primitive_id inHiddenStateID = inHiddenReshapeID + "_1"; - cldnn::primitive_id inCellStateID = inHiddenReshapeID + "_2"; - - std::vector output_ids_offsets; - - cldnn::tensor inputShape = { lstm_batch_size, lstm_sequence_len, lstm_input_size, 1 }; - cldnn::tensor inStateShape = { lstm_batch_size, 1, lstm_hidden_size, 1 }; - cldnn::layout inputLayout = cldnn::layout(lstm_dtype, cldnn::format::bfyx, inputShape); - p.add_primitive(*op, cldnn::reshape(inReshapeID, inputs[0], inputShape)); - p.add_primitive(*op, cldnn::reorder(permuteID, cldnn::input_info(inReshapeID), inputLayout)); - - p.add_primitive(*op, cldnn::reshape(inHiddenStateID, inputs[1], inStateShape)); - p.add_primitive(*op, cldnn::reshape(inCellStateID, inputs[2], inStateShape)); - - cldnn::tensor gemmSz = cldnn::tensor{ lstm_batch_size, 1, 4 * lstm_hidden_size, 1 }; - cldnn::layout gemmLayout = cldnn::layout(lstm_dtype, cldnn::format::bfyx, gemmSz); - cldnn::tensor hiddenSz = cldnn::tensor{ lstm_batch_size, 1, lstm_hidden_size, 1 }; - cldnn::tensor cellCropSz = cldnn::tensor{0, 1, 0, 0}; - cldnn::primitive_id hiddenStr = inHiddenReshapeID + "_1"; - cldnn::primitive_id cellStr = inHiddenReshapeID + "_2"; - cldnn::primitive_id inputCropID = layerName + "_inputCrop"; - - cldnn::primitive_id wr_concat_id = layerName + "_WRconcat"; - p.add_primitive(*op, cldnn::concatenation(wr_concat_id, { weight, recurrent }, 2)); - - std::vector WRreshapeSize = { 4 * size_t(lstm_hidden_size), size_t(lstm_input_size + lstm_hidden_size) }; - cldnn::primitive_id WRreshapeID = wr_concat_id + "_reshape"; - auto reshapeInPrim = cldnn::reshape(WRreshapeID, cldnn::input_info(wr_concat_id), tensor_from_dims(WRreshapeSize)); - p.add_primitive(*op, reshapeInPrim); - - for (int i = 0; i < lstm_sequence_len; ++i) { - const std::string id_str = std::to_string(i); - cldnn::primitive_id concatID = layerName + "_inputConcat" + id_str; - cldnn::primitive_id lstm_fc_id = layerName + "_fully_connected" + id_str; - cldnn::primitive_id fc_input_resh_id = "Reshape_bf_" + lstm_fc_id + "_for_input" + id_str; - cldnn::primitive_id lstm_fc_resh_id = layerName + "_gemmReshape" + id_str; - cldnn::primitive_id lstm_fc_reor_id = layerName + "_gemmReorder" + id_str; - cldnn::primitive_id lstm_elt_id = layerName + "_lstm_elt" + id_str; - cldnn::primitive_id crop_id = layerName + "_crop" + id_str; - - int seqIdx = isForward ? i : lstm_sequence_len - 1 - i; - const std::string seqIdx_str = std::to_string(seqIdx); - - cldnn::tensor crop_tensor{ inputShape.batch[0], 1, inputShape.spatial[0], inputShape.spatial[1] }; - cldnn::tensor offset_tensor{ 0, static_cast(seqIdx), 0, 0 }; - cldnn::primitive_id inputCrop_id = inputCropID + ":" + seqIdx_str; - p.add_primitive(*op, cldnn::crop(inputCrop_id, cldnn::input_info(permuteID), crop_tensor, offset_tensor)); - - p.add_primitive(*op, cldnn::concatenation(concatID, { cldnn::input_info(inputCrop_id), cldnn::input_info(hiddenStr) }, 3)); - - cldnn::tensor fc_input_resh_tensor = { crop_tensor.batch[0], crop_tensor.spatial[0] + inStateShape.spatial[0], - crop_tensor.feature[0], crop_tensor.spatial[1]}; - p.add_primitive(*op, cldnn::reshape(fc_input_resh_id, cldnn::input_info(concatID), fc_input_resh_tensor)); - - p.add_primitive(*op, cldnn::fully_connected(lstm_fc_id, fc_input_resh_id, WRreshapeID, bias.pid)); - - p.add_primitive(*op, cldnn::reshape(lstm_fc_resh_id, cldnn::input_info(lstm_fc_id), gemmSz)); - p.add_primitive(*op, cldnn::reorder(lstm_fc_reor_id, cldnn::input_info(lstm_fc_resh_id), gemmLayout)); - p.add_primitive(*op, cldnn::lstm_elt(lstm_elt_id, cldnn::input_info(lstm_fc_reor_id), cellStr, clip, 0, activations, - activation_params, cldnn::lstm_weights_order::fizo, 0)); - - hiddenStr = crop_id + ":hidden"; - cellStr = crop_id + ":cell"; - p.add_primitive(*op, cldnn::crop(hiddenStr, cldnn::input_info(lstm_elt_id), hiddenSz, cldnn::tensor{ 0, 0, 0, 0 })); - output_ids_offsets.push_back(cldnn::input_info(hiddenStr)); - - if (i < lstm_sequence_len - 1) { - p.add_primitive(*op, cldnn::crop(cellStr, cldnn::input_info(lstm_elt_id), hiddenSz, cellCropSz)); - } else { - // last hidden state crop (output 2) - - // last cell state crop (output 3) - p.add_primitive(*op, cldnn::crop(cellStr, cldnn::input_info(lstm_elt_id), hiddenSz, cellCropSz)); - } - } - - if (!isForward) std::reverse(output_ids_offsets.begin(), output_ids_offsets.end()); - // concatenated hidden state (output 1) - cldnn::primitive_id concatStr = layerName + ":hiddenConcat"; - p.add_primitive(*op, cldnn::concatenation(concatStr, output_ids_offsets, 1)); - - p.add_primitive(*op, cldnn::reshape(layerName + ".out0", concatStr, tensor_from_dims(op->get_output_shape(0))), {layerName}); - p.add_primitive(*op, cldnn::reshape(layerName + ".out1", hiddenStr, tensor_from_dims(op->get_output_shape(1)))); - p.add_primitive(*op, cldnn::reshape(layerName + ".out2", cellStr, tensor_from_dims(op->get_output_shape(2)))); - } else { - if (op->get_input_shape(2).size() != 3 || op->get_input_shape(3).size() != 1 \ - || op->get_input_shape(4).size() != 3 || op->get_input_shape(5).size() != 3 || op->get_input_shape(6).size() != 2) - OPENVINO_THROW("Wrong input shapes for LSTMSequence op ", op->get_friendly_name()); - auto mutable_precision_firstsecond = op->get_output_element_type(1); - auto direction = op->get_direction(); - - if (p.use_new_shape_infer()) { - cldnn::lstm_seq prim(layerName, inputs[0], inputs[1], \ - inputs[2], inputs[4], inputs[5], inputs[6], inputs[3], "", "", \ - clip, false, activations, activation_params, cldnn::lstm_weights_order::fizo, direction, cldnn::padding(), \ - static_cast(op->get_output_size())); - prim.output_data_types = get_output_data_types(op); - p.add_primitive(*op, prim); - return; - } - - cldnn::layout out12Layout = cldnn::layout( - cldnn::element_type_to_data_type(mutable_precision_firstsecond), - cldnn::format::bfyx, - tensor_from_dims(op->get_output_shape(1))); + const float clip = op->get_clip(); + if (op->get_input_shape(2).size() != 3 || op->get_input_shape(3).size() != 1 \ + || op->get_input_shape(4).size() != 3 || op->get_input_shape(5).size() != 3 || op->get_input_shape(6).size() != 2) { + OPENVINO_THROW("Wrong input shapes for LSTMSequence op ", op->get_friendly_name()); + } + auto mutable_precision_firstsecond = op->get_output_element_type(1); + auto direction = op->get_direction(); - std::vector shared_memories; - shared_memories.push_back(p.get_engine().allocate_memory(out12Layout)); - const cldnn::primitive_id mutable_id_1 = layerName + "_md_write1"; - const cldnn::mutable_data mutable_prim_1{mutable_id_1, shared_memories.front()}; - p.add_primitive(*op, mutable_prim_1); - shared_memories.push_back(p.get_engine().allocate_memory(out12Layout)); - const cldnn::primitive_id mutable_id_2 = layerName + "_md_write2"; - const cldnn::mutable_data mutable_prim_2{mutable_id_2, shared_memories.back()}; - p.add_primitive(*op, mutable_prim_2); - cldnn::lstm_seq prim(layerName + ".out0", inputs[0], inputs[1], \ - inputs[2], inputs[4], inputs[5], inputs[6], inputs[3], mutable_id_1, mutable_id_2, \ - clip, false, activations, activation_params, cldnn::lstm_weights_order::fizo, direction); + if (p.use_new_shape_infer()) { + cldnn::lstm_seq prim(layerName, inputs[0], inputs[1], \ + inputs[2], inputs[4], inputs[5], inputs[6], inputs[3], "", "", \ + clip, false, activations, activation_params, cldnn::lstm_weights_order::fizo, direction, cldnn::padding(), \ + static_cast(op->get_output_size())); + prim.output_data_types = get_output_data_types(op); p.add_primitive(*op, prim); - p.add_primitive(*op, cldnn::mutable_data(layerName + ".out1", {cldnn::input_info(layerName + ".out0")}, shared_memories.front())); - p.add_primitive(*op, cldnn::mutable_data(layerName + ".out2", {cldnn::input_info(layerName + ".out0")}, shared_memories.back())); + return; } + + cldnn::layout out12Layout = cldnn::layout( + cldnn::element_type_to_data_type(mutable_precision_firstsecond), + cldnn::format::bfyx, + tensor_from_dims(op->get_output_shape(1))); + + std::vector shared_memories; + shared_memories.push_back(p.get_engine().allocate_memory(out12Layout)); + const cldnn::primitive_id mutable_id_1 = layerName + "_md_write1"; + const cldnn::mutable_data mutable_prim_1{mutable_id_1, shared_memories.front()}; + p.add_primitive(*op, mutable_prim_1); + shared_memories.push_back(p.get_engine().allocate_memory(out12Layout)); + const cldnn::primitive_id mutable_id_2 = layerName + "_md_write2"; + const cldnn::mutable_data mutable_prim_2{mutable_id_2, shared_memories.back()}; + p.add_primitive(*op, mutable_prim_2); + cldnn::lstm_seq prim(layerName + ".out0", inputs[0], inputs[1], \ + inputs[2], inputs[4], inputs[5], inputs[6], inputs[3], mutable_id_1, mutable_id_2, \ + clip, false, activations, activation_params, cldnn::lstm_weights_order::fizo, direction); + p.add_primitive(*op, prim); + p.add_primitive(*op, cldnn::mutable_data(layerName + ".out1", {cldnn::input_info(layerName + ".out0")}, shared_memories.front())); + p.add_primitive(*op, cldnn::mutable_data(layerName + ".out2", {cldnn::input_info(layerName + ".out0")}, shared_memories.back())); } REGISTER_FACTORY_IMPL(v4, LSTMCell); diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 899110872ba633..ce1676713df9ac 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -7,6 +7,7 @@ #include "openvino/op/split.hpp" #include "openvino/op/variadic_split.hpp" #include "openvino/op/lstm_cell.hpp" +#include "openvino/op/lstm_sequence.hpp" #include "openvino/op/loop.hpp" #include "intel_gpu/plugin/common_utils.hpp" @@ -354,6 +355,12 @@ bool ProgramBuilder::requires_new_shape_infer(const std::shared_ptr& o if (body_function->is_dynamic()) return true; } + + /* + if (ov::is_type(op)) { + return true; + } + */ // When input node has dynamic shape with 4 dimension, this function return false // because op.is_dynamic() which only checks input shapes return false. // So, in the case of input data, we need to check output shape. diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index ae7a63bb24ab04..5b208c77b18c41 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -511,7 +511,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { } else if (std::dynamic_pointer_cast(node)) { return false; } else if (const auto &lstm_cell = std::dynamic_pointer_cast(node)) { - return lstm_cell->get_clip() == 0.0f && lstm_cell->get_activations() == std::vector{"sigmoid", "tanh", "tanh"}; + return false; } else if (const auto &lstm_cell_v1 = std::dynamic_pointer_cast(node)) { return lstm_cell_v1->get_clip() == 0.0f && lstm_cell_v1->get_activations() == std::vector{"sigmoid", "tanh", "tanh"}; } @@ -526,6 +526,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { auto isSequencePrimitiveSupported = [](const_node_ptr &node) -> bool { const auto& data = node->input(0); const auto& data_pshape = data.get_partial_shape(); + auto max_seq_len = data_pshape[1]; if (data_pshape.rank().is_static() && data_pshape.rank().get_length() > 1 && !data_pshape[1].is_static()) return false; if (std::dynamic_pointer_cast(node)) { @@ -535,6 +536,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { } else if (const auto &lstm_seq = std::dynamic_pointer_cast(node)) { return lstm_seq->get_clip() == 0.0f && lstm_seq->get_activations() == std::vector{"sigmoid", "tanh", "tanh"} && + max_seq_len != 1 && !ov::op::util::is_seq_len_provided(lstm_seq->get_input_node_shared_ptr(0), lstm_seq->get_input_node_shared_ptr(3)); }