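WIP: share one kernel selector (lstm_cell_and_seq_kernel_selector) between the LSTM cell and LSTM sequence OCL implementations, make the LSTM weights/bias crop-and-concat reordering direction-aware (dir_num), and select bidirectional_concat in the oneDNN LSTM sequence descriptor when the direction is neither FORWARD nor REVERSE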

Signed-off-by: Michal Miotk <[email protected]>
openvinotoolkit · Nov 20, 2024 · e1fcc01 · e1fcc01
1 parent 2d58b17
commit e1fcc01
Show file tree

Hide file tree

Showing 25 changed files with 246 additions and 541 deletions.
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/lstm.hpp
@@ -39,7 +39,7 @@ struct lstm_elt : public RNNParams<lstm_elt> {
              const std::vector<activation_additional_params> activation_params = {},
              const lstm_weights_order offset_order = lstm_weights_order::iofz,
              const uint32_t direction = 0)
-        : RNNParams(id, x, {}, cell, {}, {}, {}, {}, "", "", clip, input_forget, activations, activation_params, offset_order, \
+        : RNNParams(id, x, {}, cell, {}, {}, {}, {}, "", "", clip, input_forget, activations, activation_params, offset_order,
           direction == 0 ? ov::op::RecurrentSequenceDirection::FORWARD : ov::op::RecurrentSequenceDirection::REVERSE) {
         if (!cell.empty())
             input.pop_back();

diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
@@ -160,31 +160,21 @@ void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std:
                                                      cldnn::program_node& prev, cldnn::program_node& node, size_t i) {
     OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized.");
     std::string reorder_id = input_id + "_reo_" + std::to_string(i);
-    auto hiddenSize = reorder_params->get_output_layout().get_shape()[1] / 4;
-    auto inputSize = static_cast<int>(reorder_params->get_output_layout().get_shape()[3]);
+    const auto dir_num = static_cast<int>(reorder_params->get_input_layout().get_shape()[0]);
+    auto hiddenSize = reorder_params->get_input_layout().get_shape()[1] / 4;
+    auto inputSize = static_cast<int>(reorder_params->get_input_layout().get_shape()[2]);
     int size_third;
     const int W_idx = 3;
     if (i == W_idx) {
         size_third = inputSize;
     } else {
         size_third = static_cast<int>(hiddenSize);
     }
-    auto cropSizeR = cldnn::tensor{1, static_cast<int>(hiddenSize), 1, size_third, 1};
-    cldnn::layout reorder_layout;
-    if (i ==  W_idx) {
-        reorder_layout = reorder_params->get_output_layout();
-    } else {
-        reorder_layout = reorder_params->get_output_layout();
-        auto reorder_layout_new_shape = reorder_layout.get_shape();
-        reorder_layout_new_shape[3] = hiddenSize;
-        reorder_layout = reorder_layout.clone_with_other_shape(reorder_layout_new_shape);
-    }
-    auto reorder = std::make_shared<cldnn::reorder>(reorder_id, input_id, reorder_layout);
-    auto& reorder_node = p.get_or_create(reorder);
+    auto cropSizeR = cldnn::tensor{dir_num, static_cast<int>(hiddenSize), 1, size_third};
     std::string crop_id_b = input_id + "_c";
     auto get_crop_node = [&](int cropNum) -> cldnn::program_node& {
         auto crop_id = primitive_id(crop_id_b + std::to_string(cropNum));
-        auto crop_prim = std::make_shared<cldnn::crop>(crop_id, reorder_id, cropSizeR, cldnn::tensor{0, static_cast<int>(cropNum*hiddenSize), 0, 0, 0});
+        auto crop_prim = std::make_shared<cldnn::crop>(crop_id, reorder_id, cropSizeR, cldnn::tensor{0, static_cast<int>(cropNum*hiddenSize), 0, 0});
         return p.get_or_create(crop_prim);
     };
 
@@ -194,19 +184,18 @@ void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std:
     auto& crop3_node = get_crop_node(3);
     std::vector<input_info> con_input{input_info(crop_id_b + "1"), input_info(crop_id_b + "0"), input_info(crop_id_b + "2"), input_info(crop_id_b + "3")};
     cldnn::primitive_id concat_id{input_id + "cont"};
-    auto con = std::make_shared<cldnn::concatenation>(concat_id, con_input, 0);
+    auto con = std::make_shared<cldnn::concatenation>(concat_id, con_input, 1);
     auto& con_node = p.get_or_create(con);
     p.add_intermediate(con_node, node, prev, true);
-    p.add_intermediate(reorder_node, con_node, prev, true);
-    p.add_intermediate(crop1_node, con_node, reorder_node, true);
-    p.add_connection(reorder_node, crop0_node, 0);
-    p.add_connection(reorder_node, crop2_node, 0);
-    p.add_connection(reorder_node, crop3_node, 0);
+    p.add_intermediate(crop1_node, con_node, prev, true);
+    p.add_connection(prev, crop0_node, 0);
+    p.add_connection(prev, crop2_node, 0);
+    p.add_connection(prev, crop3_node, 0);
     p.add_connection(crop0_node, con_node, 0);
     p.add_connection(crop2_node, con_node, 0);
     p.add_connection(crop3_node, con_node, 0);
     std::string permute_id = input_id + "_perx";
-    std::vector<uint16_t> ord{2, 4, 3, 0, 1};
+    std::vector<uint16_t> ord{0, 2, 1};
     auto permute = std::make_shared<cldnn::permute>(permute_id, input_info{concat_id}, ord);
     auto& permute_node = p.get_or_create(permute);
     p.add_intermediate(permute_node, node, con_node,  true);
@@ -216,7 +205,6 @@ void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std:
         p.mark_if_constant(node);
         node.recalc_output_layout(false);
     };
-    set_implementation_and_output(reorder_node);
     set_implementation_and_output(crop1_node);
     set_implementation_and_output(crop0_node);
     set_implementation_and_output(crop2_node);
@@ -228,8 +216,9 @@ void post_optimize_weights::add_lstm_weights_reorder(primitive_id input_id, std:
 void post_optimize_weights::add_lstm_bias_reorder(primitive_id input_id, std::shared_ptr<WeightsReorderParams> reorder_params, program& p, \
                                                   cldnn::program_node& prev, cldnn::program_node& node) {
     OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized.");
+    const auto dir_num = static_cast<int>(reorder_params->get_input_layout().get_shape()[0]);
     auto hiddenSize = reorder_params->get_output_layout().get_shape()[1] / 4;
-    auto cropSize = cldnn::tensor{1, static_cast<int>(hiddenSize), 1, 1};
+    auto cropSize = cldnn::tensor{dir_num, static_cast<int>(hiddenSize), 1, 1};
     std::string crop_id_b = input_id + "_c";
     auto get_crop_node = [&](int cropNum) -> cldnn::program_node& {
         auto crop_id = primitive_id(crop_id_b + std::to_string(cropNum));
@@ -242,7 +231,7 @@ void post_optimize_weights::add_lstm_bias_reorder(primitive_id input_id, std::sh
     auto& crop3_node = get_crop_node(3);
     std::vector<input_info> con_input{input_info(crop1_node.id()), input_info(crop0_node.id()), input_info(crop2_node.id()), input_info(crop3_node.id())};
     cldnn::primitive_id concat_id{input_id + "concat"};
-    auto con = std::make_shared<cldnn::concatenation>(concat_id, con_input, 2);
+    auto con = std::make_shared<cldnn::concatenation>(concat_id, con_input, 1);
     auto& con_node = p.get_or_create(con);
     p.add_intermediate(con_node, node, prev, true);
     p.add_intermediate(crop1_node, con_node, prev, true);
@@ -252,11 +241,6 @@ void post_optimize_weights::add_lstm_bias_reorder(primitive_id input_id, std::sh
     p.add_connection(crop0_node, con_node, 0);
     p.add_connection(crop2_node, con_node, 0);
     p.add_connection(crop3_node, con_node, 0);
-    std::string permute_id = input_id + "_pex";
-    std::vector<uint16_t> ord{0, 3, 2, 1};
-    auto permute = std::make_shared<cldnn::permute>(permute_id, input_info{concat_id}, ord);
-    auto& permute_node = p.get_or_create(permute);
-    p.add_intermediate(permute_node, node, con_node,  true);
     auto set_implementation_and_output = [this, &p](program_node& node) {
         node.get_output_layout(false);
         select_implementation(p, node);
@@ -268,7 +252,6 @@ void post_optimize_weights::add_lstm_bias_reorder(primitive_id input_id, std::sh
     set_implementation_and_output(crop2_node);
     set_implementation_and_output(crop3_node);
     set_implementation_and_output(con_node);
-    set_implementation_and_output(permute_node);
 }
 
 void post_optimize_weights::run(program& p) {

diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/lstm_cell.cpp
@@ -5,7 +5,7 @@
 #include "primitive_base.hpp"
 
 #include "lstm_cell_inst.h"
-#include "lstm/lstm_cell_kernel_selector.h"
+#include "lstm/lstm_cell_and_seq_kernel_selector.h"
 #include "lstm/lstm_kernel_base.h"
 #include "openvino/op/lstm_cell.hpp"
 #include "lstm_cell.hpp"
@@ -16,7 +16,7 @@ namespace ocl {
 struct lstm_cell_impl : typed_primitive_impl_ocl<lstm_cell> {
     using parent = typed_primitive_impl_ocl<lstm_cell>;
     using parent::parent;
-    using kernel_selector_t = kernel_selector::lstm_cell_kernel_selector;
+    using kernel_selector_t = kernel_selector::lstm_cell_and_seq_kernel_selector;
     using kernel_params_t = kernel_selector::lstm_params;
 
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::lstm_cell_impl)

diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/rnn_seq.cpp
@@ -6,7 +6,7 @@
 
 #include "lstm_seq_inst.h"
 #include "rnn_seq.hpp"
-#include "lstm/lstm_seq_kernel_selector.h"
+#include "lstm/lstm_cell_and_seq_kernel_selector.h"
 #include "lstm/lstm_kernel_base.h"
 #include "openvino/op/lstm_sequence.hpp"
 #include "impls/registry/implementation_manager.hpp"
@@ -17,7 +17,7 @@ namespace ocl {
 struct rnn_seq_impl : typed_primitive_impl_ocl<lstm_seq> {
     using parent = typed_primitive_impl_ocl<lstm_seq>;
     using parent::parent;
-    using kernel_selector_t = kernel_selector::lstm_seq_kernel_selector;
+    using kernel_selector_t = kernel_selector::lstm_cell_and_seq_kernel_selector;
     using kernel_params_t = kernel_selector::lstm_params;
 
     DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::rnn_seq_impl)
@@ -47,6 +47,7 @@ struct rnn_seq_impl : typed_primitive_impl_ocl<lstm_seq> {
     static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param) {
         const auto& primitive = impl_param.typed_desc<lstm_seq>();
         auto params = get_default_params<kernel_selector::lstm_params>(impl_param);
+        params.sequential = true;
         for (size_t i = 1; i < impl_param.input_layouts.size(); ++i) {
             params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(i)));
         }

diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/lstm_seq_onednn.cpp
@@ -112,7 +112,7 @@ struct lstm_seq_onednn : typed_primitive_onednn_impl<lstm_seq> {
         auto weights_shape = impl_params.get_input_layout(layout_nr).get_shape();
         auto target_weights_layout = impl_params.get_input_layout(layout_nr);
         target_weights_layout.format = cldnn::format::bfzyx;
-        auto layout = target_weights_layout.clone_with_other_shape(ov::Shape{weights_shape[0], weights_shape[1], 1, weights_shape[2], weights_shape[3]});
+        auto layout = target_weights_layout.clone_with_other_shape(ov::Shape{weights_shape[0], weights_shape[1], weights_shape[2], 1, 1});
         return layout;
     }
 
@@ -168,11 +168,18 @@ struct lstm_seq_onednn : typed_primitive_onednn_impl<lstm_seq> {
                         "[GPU] The format kind of the output memory descriptor of onednn lstm_seq cannot be 'any'.");
 
         auto eng = engine.get_onednn_engine();
+        dnnl::rnn_direction lstm_desc_dir;
+        if (direction == ov::op::RecurrentSequenceDirection::FORWARD) {
+            lstm_desc_dir = dnnl::rnn_direction::unidirectional_left2right;
+        } else if (direction == ov::op::RecurrentSequenceDirection::REVERSE) {
+            lstm_desc_dir = dnnl::rnn_direction::unidirectional_right2left;
+        } else {
+            lstm_desc_dir = dnnl::rnn_direction::bidirectional_concat;
+        }
         return std::make_shared<dnnl::lstm_forward::primitive_desc>(
             eng,
             dnnl::prop_kind::forward_inference,
-            direction == ov::op::RecurrentSequenceDirection::FORWARD ? dnnl::rnn_direction::unidirectional_left2right : \
-            dnnl::rnn_direction::unidirectional_right2left,
+            lstm_desc_dir,
             input_md,
             initial_hidden,
             initial_cell,

diff --git a/src/plugins/intel_gpu/src/graph/lstm_cell.cpp b/src/plugins/intel_gpu/src/graph/lstm_cell.cpp
@@ -10,24 +10,24 @@ namespace cldnn {
 GPU_DEFINE_PRIMITIVE_TYPE_ID(lstm_cell)
 
 layout lstm_cell_inst::calc_output_layout(lstm_cell_node const& node, kernel_impl_params const& impl_param) {
-    auto input_layout = impl_param.get_input_layout(0);
-    auto input_pshape = input_layout.get_partial_shape();
-    auto input_layout_hidden = impl_param.get_input_layout(1);
-    auto input_pshape_hidden = input_layout_hidden.get_partial_shape();
-    auto lstm_batch_size = input_pshape[0];
-    auto lstm_hidden_size = input_pshape_hidden[1];
+    const auto& input_layout = impl_param.get_input_layout(0);
+    const auto& input_pshape = input_layout.get_partial_shape();
+    const auto& input_layout_hidden = impl_param.get_input_layout(1);
+    const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape();
+    const auto& lstm_batch_size = input_pshape[0];
+    const auto& lstm_hidden_size = input_pshape_hidden[1];
 
     return cldnn::layout{ov::PartialShape{lstm_batch_size, lstm_hidden_size}, input_layout.data_type, input_layout.format};
 }
 
 template<typename ShapeType>
 std::vector<layout> lstm_cell_inst::calc_output_layouts(lstm_cell_node const& node, kernel_impl_params const& impl_param) {
-    auto input_layout = impl_param.get_input_layout(0);
-    auto input_pshape = input_layout.get_partial_shape();
-    auto input_layout_hidden = impl_param.get_input_layout(1);
-    auto input_pshape_hidden = input_layout_hidden.get_partial_shape();
-    auto lstm_batch_size = input_pshape[0];
-    auto lstm_hidden_size = input_pshape_hidden[1];
+    const auto& input_layout = impl_param.get_input_layout(0);
+    const auto& input_pshape = input_layout.get_partial_shape();
+    const auto& input_layout_hidden = impl_param.get_input_layout(1);
+    const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape();
+    const auto& lstm_batch_size = input_pshape[0];
+    const auto& lstm_hidden_size = input_pshape_hidden[1];
 
     auto out_layout = cldnn::layout{ShapeType{lstm_batch_size, lstm_hidden_size}, input_layout.data_type, input_layout.format};
     return {out_layout, out_layout};
@@ -36,7 +36,7 @@ std::vector<layout> lstm_cell_inst::calc_output_layouts(lstm_cell_node const& no
 template std::vector<layout> lstm_cell_inst::calc_output_layouts<ov::PartialShape>(lstm_cell_node const& node, const kernel_impl_params& impl_param);
 
 std::string lstm_cell_inst::to_string(lstm_cell_node const& node) {
-    auto desc = node.get_primitive();
+    const auto& desc = node.get_primitive();
     auto node_info = node.desc_to_json();
 
     std::stringstream primitive_description;

diff --git a/src/plugins/intel_gpu/src/graph/lstm_seq.cpp b/src/plugins/intel_gpu/src/graph/lstm_seq.cpp
@@ -10,14 +10,14 @@ namespace cldnn {
 GPU_DEFINE_PRIMITIVE_TYPE_ID(lstm_seq)
 
 layout lstm_seq_inst::calc_output_layout(lstm_seq_node const& node, kernel_impl_params const& impl_param) {
-    auto desc = impl_param.typed_desc<lstm_seq>();
-    auto input_layout = impl_param.get_input_layout(0);
-    auto input_pshape = input_layout.get_partial_shape();
-    auto input_layout_hidden = impl_param.get_input_layout(1);
-    auto input_pshape_hidden = input_layout_hidden.get_partial_shape();
-    auto lstm_batch_size = input_pshape[0];
-    auto lstm_seq_length = input_pshape[1];
-    auto lstm_hidden_size = input_pshape_hidden[2];
+    const auto& desc = impl_param.typed_desc<lstm_seq>();
+    const auto& input_layout = impl_param.get_input_layout(0);
+    const auto& input_pshape = input_layout.get_partial_shape();
+    const auto& input_layout_hidden = impl_param.get_input_layout(1);
+    const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape();
+    const auto& lstm_batch_size = input_pshape[0];
+    const auto& lstm_seq_length = input_pshape[1];
+    const auto& lstm_hidden_size = input_pshape_hidden[2];
 
     auto first_out_fmt = cldnn::format::bfyx;
     if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) {
@@ -29,22 +29,20 @@ layout lstm_seq_inst::calc_output_layout(lstm_seq_node const& node, kernel_impl_
 
 template<typename ShapeType>
 std::vector<layout> lstm_seq_inst::calc_output_layouts(lstm_seq_node const& node, kernel_impl_params const& impl_param) {
-    auto desc = impl_param.typed_desc<lstm_seq>();
-    auto input_layout = impl_param.get_input_layout(0);
-    auto input_pshape = input_layout.get_partial_shape();
-    auto input_layout_hidden = impl_param.get_input_layout(1);
-    auto input_pshape_hidden = input_layout_hidden.get_partial_shape();
-    auto lstm_batch_size = input_pshape[0];
-    auto lstm_seq_length = input_pshape[1];
-    auto lstm_hidden_size = input_pshape_hidden[2];
+    const auto& desc = impl_param.typed_desc<lstm_seq>();
+    const auto& input_layout = impl_param.get_input_layout(0);
+    const auto& input_pshape = input_layout.get_partial_shape();
+    const auto& input_layout_hidden = impl_param.get_input_layout(1);
+    const auto& input_pshape_hidden = input_layout_hidden.get_partial_shape();
+    const auto& lstm_batch_size = input_pshape[0];
+    const auto& lstm_seq_length = input_pshape[1];
+    const auto& lstm_hidden_size = input_pshape_hidden[2];
 
     auto first_out_fmt = cldnn::format::bfyx;
     auto second_out_fmt = input_layout.format;
     auto third_out_fmt = input_layout.format;
     if (node.get_preferred_impl_type() == impl_types::onednn && node.get_preferred_output_fmt() != format::any) {
         first_out_fmt = node.get_preferred_output_fmt();
-        second_out_fmt = node.get_preferred_output_fmt(1);
-        third_out_fmt = node.get_preferred_output_fmt(2);
     }
     auto num_directions = desc->num_directions();
 
@@ -56,7 +54,7 @@ std::vector<layout> lstm_seq_inst::calc_output_layouts(lstm_seq_node const& node
 template std::vector<layout> lstm_seq_inst::calc_output_layouts<ov::PartialShape>(lstm_seq_node const& node, const kernel_impl_params& impl_param);
 
 std::string lstm_seq_inst::to_string(lstm_seq_node const& node) {
-    auto desc = node.get_primitive();
+    const auto& desc = node.get_primitive();
     auto node_info = node.desc_to_json();
 
     std::stringstream primitive_description;