From 9fa6fff10038815109cba9e8c2f3c351195a6e00 Mon Sep 17 00:00:00 2001 From: Michal Miotk Date: Wed, 20 Nov 2024 15:08:51 +0100 Subject: [PATCH] wip Signed-off-by: Michal Miotk --- .../include/intel_gpu/primitives/rnn.hpp | 1 - .../graph/impls/onednn/lstm_seq_onednn.cpp | 23 ++++--- .../cl_kernels/lstm_cell_and_seq_bfyx.cl | 66 +++++++++++------- .../cl_kernels/lstm_cell_and_seq_ref.cl | 69 +++++++++++-------- src/plugins/intel_gpu/src/plugin/ops/rnn.cpp | 24 ------- .../intel_gpu/src/plugin/program_builder.cpp | 2 - 6 files changed, 94 insertions(+), 91 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp index bbea0eb95ed1dc..daf75fece23e0b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/rnn.hpp @@ -67,7 +67,6 @@ struct RNNParams : public primitive_base { offset_order(offset_order), direction(direction) { std::vector pids{initial_hidden_state.pid, initial_cell_state.pid, W.pid, R.pid, B.pid, seq_lenghts.pid, out1_prim_id, out2_prim_id}; - assert(direction == ov::op::RecurrentSequenceDirection::FORWARD || direction == ov::op::RecurrentSequenceDirection::REVERSE); for (auto pid : pids) { if (!pid.empty()) { primitive_base::input.push_back(pid); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp index 9a2583d07e9d60..6638a476d00dbc 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp @@ -93,15 +93,15 @@ struct lstm_seq_onednn : typed_primitive_onednn_impl { } { - auto& output = instance.input_memory(7); - auto offset = onednn::get_offset(instance.get_input_layout(7), _pd.dnnl::primitive_desc_base::dst_desc(1)); + auto& output = instance.output_memory(1); + auto offset = onednn::get_offset(instance.get_output_layout(1), _pd.dnnl::primitive_desc_base::dst_desc(1)); auto mem = output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(1), offset); args.insert({DNNL_ARG_DST_ITER, mem}); } { - auto& output = instance.input_memory(8); - auto offset = onednn::get_offset(instance.get_input_layout(8), _pd.dnnl::primitive_desc_base::dst_desc(2)); + auto& output = instance.output_memory(2); + auto offset = onednn::get_offset(instance.get_output_layout(2), _pd.dnnl::primitive_desc_base::dst_desc(2)); auto mem = output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(2), offset); args.insert({DNNL_ARG_DST_ITER_C, mem}); } @@ -134,34 +134,35 @@ struct lstm_seq_onednn : typed_primitive_onednn_impl { const dnnl::primitive_attr& attr, ov::op::RecurrentSequenceDirection direction) { auto prim = impl_params.typed_desc(); + auto num_dir = static_cast(prim->num_directions()); const auto& src_shape = impl_params.get_input_layout(0).get_shape(); auto mod_src_shape = src_shape; std::swap(mod_src_shape[0], mod_src_shape[1]); auto input_md = onednn::layout_to_memory_desc(impl_params.get_input_layout(0).clone_with_other_shape(mod_src_shape), dnnl::memory::format_tag::abc); auto initial_hidden_shape_mod = impl_params.get_input_layout(1).get_shape(); - initial_hidden_shape_mod = { 1, 1, initial_hidden_shape_mod[0], initial_hidden_shape_mod[2] }; + initial_hidden_shape_mod = { 1, num_dir, initial_hidden_shape_mod[0], initial_hidden_shape_mod[2] }; auto initial_hidden = onednn::layout_to_memory_desc(impl_params.get_input_layout(1).clone_with_other_shape(initial_hidden_shape_mod)); auto initial_cell = onednn::layout_to_memory_desc(impl_params.get_input_layout(2).clone_with_other_shape(initial_hidden_shape_mod)); auto W_shape_mod = impl_params.get_input_layout(3).get_shape(); - W_shape_mod = {1, 1, W_shape_mod[2], 4, W_shape_mod[1]/4}; + W_shape_mod = {1, num_dir, W_shape_mod[2], 4, W_shape_mod[1]/4}; auto w_layout = impl_params.get_input_layout(3).clone_with_other_shape(W_shape_mod); w_layout.format = cldnn::format::bfzyx; auto W_md = onednn::layout_to_memory_desc(w_layout); auto R_shape_mod = impl_params.get_input_layout(4).get_shape(); - R_shape_mod = {1, 1, R_shape_mod[2], 4, R_shape_mod[1]/4}; + R_shape_mod = {1, num_dir, R_shape_mod[2], 4, R_shape_mod[1]/4}; auto r_layout = impl_params.get_input_layout(4).clone_with_other_shape(R_shape_mod); r_layout.format = cldnn::format::bfzyx; auto R_md = onednn::layout_to_memory_desc(r_layout); auto B_shape_mod = impl_params.get_input_layout(5).get_shape(); - B_shape_mod = {1, 1, 4, B_shape_mod[1]/4}; + B_shape_mod = {1, num_dir, 4, B_shape_mod[1]/4}; auto b_layout = impl_params.get_input_layout(5).clone_with_other_shape(B_shape_mod); b_layout.format = cldnn::format::bfyx; auto B_md = onednn::layout_to_memory_desc(b_layout); auto out_shape = impl_params.get_output_layout().get_shape(); - out_shape = {out_shape[2], out_shape[0], out_shape[3], 1}; + out_shape = {out_shape[2], out_shape[0], out_shape[3]*num_dir}; auto output_md = onednn::layout_to_memory_desc(impl_params.get_output_layout().clone_with_other_shape(out_shape), dnnl::memory::format_tag::abc); - auto output1_md = onednn::layout_to_memory_desc(impl_params.get_input_layout(7).clone_with_other_shape(initial_hidden_shape_mod)); - auto output2_md = onednn::layout_to_memory_desc(impl_params.get_input_layout(7).clone_with_other_shape(initial_hidden_shape_mod)); + auto output1_md = onednn::layout_to_memory_desc(impl_params.get_output_layout(1).clone_with_other_shape(initial_hidden_shape_mod)); + auto output2_md = onednn::layout_to_memory_desc(impl_params.get_output_layout(2).clone_with_other_shape(initial_hidden_shape_mod)); OPENVINO_ASSERT(input_md.get_format_kind() != dnnl::memory::format_kind::any, "[GPU] The format kind of the input memory descriptor of onednn lstm_seq cannot be 'any'."); OPENVINO_ASSERT(output_md.get_format_kind() != dnnl::memory::format_kind::any, diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl index 70a881918f01a8..289646fe8abb7a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/lstm_cell_and_seq_bfyx.cl @@ -14,14 +14,26 @@ #ifdef SEQUENCE #define GET_IN0_IDX(b, f, y) INPUT0_GET_INDEX(b, f, y, 0) -#define GET_IN1_IDX(b, f, y) INPUT1_GET_INDEX(b, f, y, 0) -#define GET_IN3_IDX(b, f) INPUT3_GET_INDEX(0, b, f, 0) -#define GET_IN4_IDX(b, f) INPUT4_GET_INDEX(0, b, f, 0) + #if DIRECTION == 2 + #define GET_IN1_IDX(b, f, y) INPUT1_GET_INDEX(b, f, y, 0) + #define GET_IN2_IDX(b, f, y) INPUT2_GET_INDEX(b, f, y, 0) + #define GET_IN3_IDX(b, f, y) INPUT3_GET_INDEX(b, f, y, 0) + #define GET_IN4_IDX(b, f, y) INPUT4_GET_INDEX(b, f, y, 0) + #define GET_IN5_IDX(b, f) INPUT5_GET_INDEX(b, f, 0, 0) + #else + #define GET_IN1_IDX(b, f, y) INPUT1_GET_INDEX(b, 0, y, 0) + #define GET_IN2_IDX(b, f, y) INPUT2_GET_INDEX(b, 0, y, 0) + #define GET_IN3_IDX(b, f, y) INPUT3_GET_INDEX(0, f, y, 0) + #define GET_IN4_IDX(b, f, y) INPUT4_GET_INDEX(0, f, y, 0) + #define GET_IN5_IDX(b, f) INPUT5_GET_INDEX(0, f, 0, 0) + #endif #else -#define GET_IN0_IDX(b, f, y) INPUT0_GET_INDEX(b, y, 0, 0) +#define GET_IN0_IDX(b, f, y) INPUT0_GET_INDEX(b, y, 0, 0) #define GET_IN1_IDX(b, f, y) INPUT1_GET_INDEX(b, y, 0, 0) -#define GET_IN3_IDX(b, f) INPUT3_GET_INDEX(b, f, 0, 0) -#define GET_IN4_IDX(b, f) INPUT4_GET_INDEX(b, f, 0, 0) +#define GET_IN2_IDX(b, f, y) INPUT2_GET_INDEX(b, y, 0, 0) +#define GET_IN3_IDX(b, f, y) INPUT3_GET_INDEX(f, y, 0, 0) +#define GET_IN4_IDX(b, f, y) INPUT4_GET_INDEX(f, y, 0, 0) +#define GET_IN5_IDX(b, f) INPUT5_GET_INDEX(f, 0, 0, 0) #endif KERNEL(lstm_cell_and_seq_bfyx)( @@ -51,7 +63,7 @@ KERNEL(lstm_cell_and_seq_bfyx)( const uint real_seq_length = 1; #endif #if DIRECTION == 2 - for(uint dir=0;dirget_input_partial_shape(0)[1]; std::vector activations; std::vector activation_params; GetLSTMActivationParams(op, activations, activation_params); @@ -113,7 +112,6 @@ static void CreateLSTMSequenceOp(ProgramBuilder& p, const std::shared_ptrget_input_shape(4).size() != 3 || op->get_input_shape(5).size() != 3 || op->get_input_shape(6).size() != 2) { OPENVINO_THROW("Wrong input shapes for LSTMSequence op ", op->get_friendly_name()); } - auto mutable_precision_firstsecond = op->get_output_element_type(1); auto direction = op->get_direction(); if (p.use_new_shape_infer()) { @@ -123,29 +121,7 @@ static void CreateLSTMSequenceOp(ProgramBuilder& p, const std::shared_ptr(op->get_output_size())); prim.output_data_types = get_output_data_types(op); p.add_primitive(*op, prim); - return; } - - cldnn::layout out12Layout = cldnn::layout( - cldnn::element_type_to_data_type(mutable_precision_firstsecond), - cldnn::format::bfyx, - tensor_from_dims(op->get_output_shape(1))); - - std::vector shared_memories; - shared_memories.push_back(p.get_engine().allocate_memory(out12Layout)); - const cldnn::primitive_id mutable_id_1 = layerName + "_md_write1"; - const cldnn::mutable_data mutable_prim_1{mutable_id_1, shared_memories.front()}; - p.add_primitive(*op, mutable_prim_1); - shared_memories.push_back(p.get_engine().allocate_memory(out12Layout)); - const cldnn::primitive_id mutable_id_2 = layerName + "_md_write2"; - const cldnn::mutable_data mutable_prim_2{mutable_id_2, shared_memories.back()}; - p.add_primitive(*op, mutable_prim_2); - cldnn::lstm_seq prim(layerName + ".out0", inputs[0], inputs[1], \ - inputs[2], inputs[4], inputs[5], inputs[6], inputs[3], mutable_id_1, mutable_id_2, \ - clip, false, activations, activation_params, cldnn::lstm_weights_order::fizo, direction); - p.add_primitive(*op, prim); - p.add_primitive(*op, cldnn::mutable_data(layerName + ".out1", {cldnn::input_info(layerName + ".out0")}, shared_memories.front())); - p.add_primitive(*op, cldnn::mutable_data(layerName + ".out2", {cldnn::input_info(layerName + ".out0")}, shared_memories.back())); } REGISTER_FACTORY_IMPL(v4, LSTMCell); diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index ce1676713df9ac..4fcf67ef616b8d 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -356,11 +356,9 @@ bool ProgramBuilder::requires_new_shape_infer(const std::shared_ptr& o return true; } - /* if (ov::is_type(op)) { return true; } - */ // When input node has dynamic shape with 4 dimension, this function return false // because op.is_dynamic() which only checks input shapes return false. // So, in the case of input data, we need to check output shape.