From de94a33a6b6d352fbc4517184b44edc8126b0506 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Mon, 7 Oct 2024 19:01:01 +0400 Subject: [PATCH 1/6] [GPU] network code cleanup (#26908) ### Details: - Removed few unnecessary methods - Refactor `network::execute()` --- .../include/intel_gpu/graph/network.hpp | 32 ------- src/plugins/intel_gpu/src/graph/kv_cache.cpp | 3 +- src/plugins/intel_gpu/src/graph/network.cpp | 89 +++++-------------- src/plugins/intel_gpu/src/plugin/graph.cpp | 1 - .../tests/unit/passes/reorder_inputs_test.cpp | 8 +- .../test_cases/concatenation_gpu_test.cpp | 24 ++--- .../unit/test_cases/condition_gpu_test.cpp | 14 +-- .../unit/test_cases/convolution_gpu_test.cpp | 82 ++++++++--------- .../tests/unit/test_cases/crop_gpu_test.cpp | 5 +- .../tests/unit/test_cases/reduce_gpu_test.cpp | 12 +-- .../unit/test_cases/reorder_gpu_test.cpp | 1 - 11 files changed, 96 insertions(+), 175 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index 63adae28ddabf3..f4e09a51513085 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -116,34 +116,10 @@ struct network { std::vector> const& get_outputs() { return _outputs; } - const std::vector>& get_outputs() const { - return reinterpret_cast>&>(_outputs); - } - - network_output get_output(const primitive_id& output_id) { - event::ptr evt; - if (get_stream().get_queue_type() == QueueTypes::out_of_order || _enable_profiling) - evt = get_primitive_event(output_id); - return network_output(evt, get_output_memory(output_id), get_stream_ptr(), get_output_layout(output_id)); - } - layout get_node_output_layout(const primitive_id& output_id) const; memory::ptr get_output_memory(const primitive_id& output_id); layout get_output_layout(const primitive_id& output_id) const; std::vector get_input_layouts() const; - /// @brief Returns the list of primitive ids before and after graph optimization. - /// @details If primitive was not optimized, the old and actual id will be the same. - /// @n If primitive was optimized during graph optimization, the actual id will be "_optimized_". - std::map get_all_primitives() const { - auto primitive_ids = get_all_primitive_ids(); - auto primitive_org_ids = get_all_primitive_org_ids(); - std::map result; - for (decltype(primitive_org_ids.size()) i = 0; i < primitive_org_ids.size(); i++) { - result.emplace(primitive_org_ids[i], primitive_ids[i]); - } - return result; - } - /// @brief Returns the list of @ref event for the primitives that were executed in network. 
std::map get_executed_primitives() const { auto primitive_ids = get_executed_primitive_ids(); @@ -201,7 +177,6 @@ struct network { void configure_primitives_second_output(); void build_insts_deps(); uint32_t get_id() const { return net_id; } - uint32_t get_local_id() const { return _local_net_id; } stream& get_stream() const { return *_stream; } stream::ptr get_stream_ptr() const { return _stream; } bool is_internal() const { return _internal; } @@ -219,7 +194,6 @@ struct network { const ov::intel_gpu::VariableStateInfo& get_variable_info(const std::string &variable_id) const; const ov::intel_gpu::VariablesMap& get_variables() const; const ov::intel_gpu::VariablesInfoMap& get_variables_info() const; - std::vector get_kv_cache_ids() const { return kv_cache_ids; } const ExecutionConfig& get_config() const { return _config; } @@ -243,8 +217,6 @@ struct network { bool _is_dynamic = false; bool _enable_profiling = false; bool _reset_arguments; - uint32_t _local_net_id = 0; // This is for thread-safe deserialization. 'net_id' is globally unique, - // but '_local_net_id' is unique only in each intel_gpu::Graph. std::unordered_map> _primitives; std::vector _in_out_shared_mem_types; @@ -255,10 +227,8 @@ struct network { ov::intel_gpu::VariablesMap _variables_states; ov::intel_gpu::VariablesInfoMap _variables_state_info; - std::vector kv_cache_ids; program::primitives_info _prims_info; - std::map _ext_id_mapping; size_t _weights_cache_capacity = 1; std::unordered_map _events; @@ -272,9 +242,7 @@ struct network { void allocate_primitive_instance(program_node const& node); void transfer_memory_to_device(std::shared_ptr instance, program_node const& node); void add_to_exec_order(const primitive_id& id); - std::shared_ptr find_in_internal_networks(const primitive_id& id) const; std::shared_ptr find_primitive(const primitive_id& id) const; - void check_names(); void add_default_output_chains(); void calculate_weights_cache_capacity(); output_chains_map::iterator add_output_chain(std::shared_ptr& p_inst); diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 95cdd587cdf175..66a874b9b153ec 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -17,7 +17,8 @@ GPU_DEFINE_PRIMITIVE_TYPE_ID(kv_cache) kv_cache_inst::typed_primitive_inst(network& network, const kv_cache_node& node) : parent{network, node, false}, memory_state::variable{node.get_primitive()->variable_info.variable_id} { - kv_cache_id = network.get_kv_cache_ids().size(); + thread_local size_t kv_cache_counter = 0; + kv_cache_id = kv_cache_counter++; } layout kv_cache_inst::calc_output_layout(const kv_cache_node& node, kernel_impl_params const& impl_param) { diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 8f0e97dd51ee12..0af0e957df4ea8 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -203,8 +203,6 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo calculate_weights_cache_capacity(); allocate_primitives(); configure_primitives_second_output(); - if (!_program->is_loaded_from_cache()) - check_names(); build_insts_deps(); build_exec_order(); validate_primitives(); @@ -333,11 +331,7 @@ void network::reset_execution(bool wait) { event::ptr network::set_input_data(const primitive_id& id, memory::ptr data) { GPU_DEBUG_TRACE_DETAIL << "Set input " << id << " " << 
data->get_layout().to_short_string() << std::endl; - std::shared_ptr primitive_inst; - - primitive_inst = find_primitive(id); - - OPENVINO_ASSERT(primitive_inst != nullptr, "[GPU] topology doesn't contain primitive: ", id); + auto primitive_inst = find_primitive(id); if (primitive_inst->type() != input_layout::type_id()) { CLDNN_ERROR_MESSAGE(id, "primitive " + id + " is not an input"); @@ -481,11 +475,8 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr
std::vector<event::ptr>
network::set_output_memory(const primitive_id& id, memory::ptr mem_new) { GPU_DEBUG_TRACE_DETAIL << "Set output " << id << " " << mem_new->get_layout().to_short_string() << std::endl; - std::shared_ptr p_inst; std::vector ret_ev; - p_inst = find_primitive(id); - - OPENVINO_ASSERT(p_inst != nullptr, "[GPU] topology doesn't contain primitive: ", id); + std::shared_ptr p_inst = find_primitive(id); auto iter = std::find(_outputs.begin(), _outputs.end(), p_inst); if (iter == _outputs.end()) @@ -513,35 +504,10 @@ std::vector network::set_output_memory(const primitive_id& id, memor return ret_ev; } -void cldnn::network::check_names() { - for (auto const& prim : _primitives) { - if (find_in_internal_networks(prim.first) != nullptr) - CLDNN_ERROR_MESSAGE("Network", "Found primitive with id: " + prim.first + "in anotother network."); - } -} - std::shared_ptr cldnn::network::find_primitive(const primitive_id& id) const { - if (_primitives.find(id) != _primitives.end()) - return _primitives.at(id); - - return find_in_internal_networks(id); -} - -std::shared_ptr cldnn::network::find_in_internal_networks(const primitive_id& id) const { - std::shared_ptr ret; - - for (auto const& prim : _primitives) { - if (prim.second->type() == condition::type_id()) { // currently only condition inst contains mini networks - auto cond_inst = std::static_pointer_cast(prim.second); - ret = cond_inst->get_net_true()->find_primitive(id); - if (ret != nullptr) - return ret; - ret = cond_inst->get_net_false()->find_primitive(id); - if (ret != nullptr) - return ret; - } - } - return nullptr; + auto it = _primitives.find(id); + OPENVINO_ASSERT(it != _primitives.end(), "[GPU] Network doesn't contain primitive ", id); + return it->second; } std::string network::get_primitive_info(const primitive_id& id) const { @@ -552,9 +518,6 @@ std::string network::get_primitive_info(const primitive_id& id) const { bool network::does_node_need_lockable_output(const primitive_id& id) const { auto prim_inst = find_primitive(id); - OPENVINO_ASSERT(prim_inst, "[GPU] Can't get implementation type, since topology ", - "doesn't contain primitive with requested id: ", id); - const auto& node = prim_inst->get_node(); if (node.is_type()) { for (const auto& user : node.get_users()) { @@ -574,15 +537,6 @@ std::string network::get_implementation_info(const primitive_id& id) const { return _program->get_implementation_info(id); } -layout network::get_node_output_layout(const primitive_id& output_id) const { - auto res = std::find_if(_outputs.begin(), _outputs.end(), [&](const std::shared_ptr& v) { - return v->id() == output_id; - }); - OPENVINO_ASSERT(res != _outputs.end(), "[GPU] Couldn't get output layout for ", output_id, ". Output with such name is not found in the outputs list"); - - return (*res)->get_node_output_layout(); -} - memory::ptr network::get_output_memory(const primitive_id& output_id) { return get_primitive(output_id)->output_memory_ptr(); } @@ -729,17 +683,6 @@ void network::add_to_exec_order(const primitive_id& id) { } std::map network::execute(const std::vector& dependencies) { - execute_impl(dependencies); - - auto output_ids = get_output_ids(); - std::map result; - for (auto& id : output_ids) { - result.emplace(id, get_output(id)); - } - return result; -} - -void network::execute_impl(const std::vector& events) { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "NetworkImpl::Execute"); NETWORK_DEBUG(*this); @@ -779,6 +722,21 @@ void network::execute_impl(const std::vector& events) { // in some cases. 
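// Illustrative caller-side sketch (ids such as "conv" are placeholders): with execute()
// now returning the output map directly, the updated tests read
//
//   auto outputs = network.execute();
//   auto out_mem = outputs.at("conv").get_memory();  // instead of the removed network.get_output("conv")
//
// so no separate get_output() helper is needed after this refactor.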
auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream()); + execute_impl(dependencies); + + std::map result; + for (auto& inst : _outputs) { + event::ptr ev = nullptr; + const auto& id = inst->id(); + if (get_stream().get_queue_type() == QueueTypes::out_of_order || _enable_profiling) + ev = _events.at(id); + + result.emplace(id, network_output(ev, inst->output_memory_ptr(0), get_stream_ptr(), inst->get_output_layout(0))); + } + return result; +} + +void network::execute_impl(const std::vector& events) { set_arguments(); // This extra flush command is needed for dynamic models in both cases of out_of_order / in_order operating mode @@ -904,10 +862,6 @@ const program::graph_optimizer_info& network::get_optimizer_passes_info() const } std::map network::get_ext_id_mapping() const { - if (_program == nullptr) { - return _ext_id_mapping; - } - std::map result; for (auto& prim : _primitives) { result.emplace(prim.first, prim.second->get_node().get_primitive()->origin_op_name); @@ -1008,9 +962,6 @@ void network::allocate_primitive_instance(program_node const& node) { if (node.is_type()) _data_outputs.push_back(inst); } - if (node.is_type()) { - kv_cache_ids.push_back(node.id()); - } if (auto state_prim = std::dynamic_pointer_cast(inst)) { auto prim = inst->get_node().get_primitive(); set_variables_state_info(state_prim->variable_id(), node.get_output_layout(0), state_prim->get_user_specified_type(), prim.get()); diff --git a/src/plugins/intel_gpu/src/plugin/graph.cpp b/src/plugins/intel_gpu/src/plugin/graph.cpp index 2a3bd5dc0ff239..22f616e3d39818 100644 --- a/src/plugins/intel_gpu/src/plugin/graph.cpp +++ b/src/plugins/intel_gpu/src/plugin/graph.cpp @@ -558,7 +558,6 @@ void Graph::update_profiling_info() { }; std::map executedPrimitives = get_network()->get_executed_primitives(); - auto allPrimitives = get_network()->get_all_primitives(); // Get profiling info for all layers for (auto &profiledID : profilingIDs) { diff --git a/src/plugins/intel_gpu/tests/unit/passes/reorder_inputs_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/reorder_inputs_test.cpp index 7be7f74e6e96e5..cd5c2fdd1681fc 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/reorder_inputs_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/reorder_inputs_test.cpp @@ -183,7 +183,7 @@ TEST(reorder_inputs, impl_forcing_basic_format) { 7.f, 3.f, -2.f, -1.f }); network.set_input_data("input", input); - network.execute(); + auto outputs = network.execute(); const auto& prog = network.get_program(); auto& pool_node = prog->get_node("pool"); @@ -191,7 +191,7 @@ TEST(reorder_inputs, impl_forcing_basic_format) { ASSERT_EQ(pool_layout.format.value, format::yxfb); - auto out_mem = network.get_output("pool").get_memory(); + auto out_mem = outputs.at("pool").get_memory(); cldnn::mem_lock out_mem_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem_ptr.size(), 4u); @@ -239,7 +239,7 @@ TEST(reorder_inputs, impl_forcing_basic_format_kernel) { 7.f, 3.f, -2.f, -1.f }); network.set_input_data("input", input); - network.execute(); + auto outputs = network.execute(); auto prog = network.get_program(); auto& node = prog->get_node("actv"); @@ -250,7 +250,7 @@ TEST(reorder_inputs, impl_forcing_basic_format_kernel) { ASSERT_EQ(actv_layout.format.value, format::yxfb); ASSERT_EQ(kernel_name, actv_impl.kernel_name); - auto out_mem = network.get_output("actv").get_memory(); + auto out_mem = outputs.at("actv").get_memory(); cldnn::mem_lock out_mem_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem_ptr.size(), 8u); diff --git 
a/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp index 8e3da9692dcb45..f640b02afa99cb 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp @@ -1031,9 +1031,9 @@ struct concat_gpu_4d : public concat_gpu { network.set_input_data(input_ids[i].pid, in_memory[i]); } - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("concat").get_memory(); + auto out_mem = outputs.at("concat").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); for (size_t bi = 0; bi < batch_num; bi++) { @@ -1117,9 +1117,9 @@ struct concat_gpu_4d_axis3 : public concat_axis3_gpu { network.set_input_data(input_ids[i].pid, in_memory[i]); } - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("concat").get_memory(); + auto out_mem = outputs.at("concat").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); for (size_t bi = 0; bi < batch_num; bi++) { @@ -1283,9 +1283,9 @@ struct concat_id_conv_gpu_4d : public concat_gpu { network.set_input_data(input_ids[i].pid, in_memory[i]); } - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv").get_memory(); + auto out_mem = outputs.at("conv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, fmt); @@ -1420,13 +1420,13 @@ struct concat_gpu_4d_implicit : public concat_gpu { for (size_t i = 0; i < in_features.size(); i++) { concat_network->set_input_data(input_ids[i], in_memory[i]); } - concat_network->execute(); + auto outputs = concat_network->execute(); bool concat_opt_enabled = config.get_property(ov::intel_gpu::optimize_data); bool concat_opt_result = std::static_pointer_cast(concat_network->get_primitive("concat"))->can_be_optimized(); EXPECT_EQ(concat_opt_enabled, concat_opt_result); - return concat_network->get_output("reorder").get_memory(); + return outputs.at("reorder").get_memory(); } std::vector>>>> generate_input() { @@ -1640,13 +1640,13 @@ struct concat_gpu_4d_implicit_onednn : public concat_gpu { for (size_t i = 0; i < in_features.size(); i++) { concat_network.set_input_data(input_ids[i], in_memory[i]); } - concat_network.execute(); + auto outputs = concat_network.execute(); bool concat_opt_enabled = config.get_property(ov::intel_gpu::optimize_data); bool concat_opt_result = std::static_pointer_cast(concat_network.get_primitive("concat"))->node->can_be_optimized(); EXPECT_EQ(concat_opt_enabled, concat_opt_result); - return concat_network.get_output("reorder").get_memory(); + return outputs.at("reorder").get_memory(); } std::vector>>>> generate_input() { @@ -1803,7 +1803,7 @@ struct concat_gpu_4d_explicit : public concat_gpu { for (size_t i = 0; i < 4; i++) { concat_network.set_input_data(input_ids[i], in_memory[i]); } - concat_network.execute(); + auto outputs = concat_network.execute(); bool concat_opt_enabled = config.get_property(ov::intel_gpu::optimize_data); bool concat_opt_result = std::static_pointer_cast(concat_network.get_primitive("concat"))->node->can_be_optimized(); @@ -1813,7 +1813,7 @@ struct concat_gpu_4d_explicit : public concat_gpu { if (concat_opt_enabled && batch_num > 1) concat_opt_result = !concat_opt_result; EXPECT_EQ(concat_opt_enabled, concat_opt_result); - return concat_network.get_output("reorder").get_memory(); + return 
outputs.at("reorder").get_memory(); } std::vector>>>> generate_input() { diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp index d5d7798ff4ce79..7fd439ecac5728 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "intel_gpu/graph/network.hpp" #include "intel_gpu/primitives/permute.hpp" #include "intel_gpu/runtime/internal_properties.hpp" #include "random_generator.hpp" @@ -577,6 +578,7 @@ class condition_gpu_tests: public ::testing::Test { ); branch_true.inner_program = program::build_program(engine, branch_true_topology, config, false, false, true); branch_true.input_map.insert({"input", "branch_input3"}); + branch_true.input_map.insert({"predicate2", "predicate2"}); branch_true.output_map.insert({0, "condi_nested"}); } @@ -598,11 +600,12 @@ class condition_gpu_tests: public ::testing::Test { ); topology.add( - input_layout("predicate", predicate->get_layout()) + input_layout("predicate", predicate->get_layout()), + input_layout("predicate2", predicate2->get_layout()) ); topology.add( - condition("condi", {input_info("predicate"), input_info("input")}, branch_true, branch_false) + condition("condi", {input_info("predicate"), input_info("predicate2"), input_info("input")}, branch_true, branch_false) ); std::vector input_data = { @@ -773,7 +776,7 @@ class condition_gpu_tests: public ::testing::Test { pooling(duplicated_id, input_info(cond_id), cldnn::pooling_mode::max, { 2, 1 }, { 2, 1 }) ); - EXPECT_ANY_THROW(network::ptr net = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);); + EXPECT_NO_THROW(network::ptr net = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);); } void test_empty_body(bool is_caching_test) { @@ -1038,6 +1041,7 @@ TEST(condition_gpu, set_empty_tensor) { net.set_input_data(empty_input_id, empty_input_mem); net.set_input_data(input_id, input_mem); - OV_ASSERT_NO_THROW(net.execute()); - OV_ASSERT_NO_THROW(net.get_output(cond_id).get_memory()); + std::map outputs; + OV_ASSERT_NO_THROW(outputs = net.execute()); + OV_ASSERT_NO_THROW(outputs.at(cond_id).get_memory()); } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp index 421941296e58ab..4155ac0b420e66 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp @@ -5439,9 +5439,9 @@ TEST_P(convolution_gpu_fs_byx_fsv32, fs_byx_fsv32) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::fs_b_yx_fsv32); @@ -5549,9 +5549,9 @@ TEST(convolution_f16_fsv_gpu, convolution_f16_fsv_gpu_padding) { network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::fs_b_yx_fsv32); @@ -5773,9 +5773,9 @@ 
TEST_P(convolution_gpu_fs_byx_fsv32_crop, fs_byx_fsv32_crop) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("concat").get_memory(); + auto out_mem = outputs.at("concat").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::bfyx); @@ -6020,9 +6020,9 @@ TEST(convolution_gpu, bfyx_iyxo_5x5_fp16) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("out").get_memory(); + auto out_mem = outputs.at("out").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); auto output_layout = out_mem->get_layout(); @@ -6254,12 +6254,12 @@ TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp32) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfzyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); blockedFormatZeroCheck(out_mem); @@ -6394,12 +6394,12 @@ TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp16) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfzyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); blockedFormatZeroCheck(out_mem); @@ -6531,12 +6531,12 @@ TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp32_fused_ops) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfzyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); blockedFormatZeroCheck(out_mem); @@ -6695,12 +6695,12 @@ TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp32) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::bs_fs_yx_bsv16_fsv16); @@ -6836,12 +6836,12 @@ TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp16) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock 
out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::bs_fs_yx_bsv16_fsv16); @@ -6975,12 +6975,12 @@ TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp32_fused_ops) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::bs_fs_yx_bsv16_fsv16); @@ -7113,9 +7113,9 @@ TEST_P(convolution_depthwise_gpu, depthwise_conv_fs_b_yx_fsv32) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::fs_b_yx_fsv32); @@ -7257,9 +7257,9 @@ TEST_P(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::b_fs_yx_fsv16); @@ -7395,9 +7395,9 @@ TEST_P(convolution_depthwise_gpu_fsv16_xy, depthwise_conv_b_fs_yx_fsv16) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("out").get_memory(); + auto out_mem = outputs.at("out").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::b_fs_yx_fsv16); @@ -7602,9 +7602,9 @@ TEST_P(convolution_depthwise_gpu_bfyx, depthwise_conv_bfyx) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv").get_memory(); + auto out_mem = outputs.at("conv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::bfyx); @@ -7924,9 +7924,9 @@ TEST_P(convolution_grouped_gpu, base) { cldnn::network network(engine, topology, config); network.set_input_data("input", input); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv").get_memory(); + auto out_mem = outputs.at("conv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); auto out_lay = out_mem->get_layout(); @@ -8092,9 +8092,9 @@ TEST_P(convolution_general_gpu, conv_fp16_cases) { network network(engine, topology, config); network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); auto out_lay = out_mem->get_layout(); @@ -9669,7 +9669,7 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) { std::cerr << p.original_id << " 
" << p.kernel_id << std::endl; auto out_ptr = get_output_values_to_float(network, outputs.find("conv_fsv")->second); - auto out_lay = network.get_node_output_layout("conv_fsv"); + auto out_lay = network.get_primitive("conv_fsv")->get_node_output_layout(); ASSERT_EQ(out_lay.batch(), expected_result.size()); ASSERT_EQ(out_lay.feature(), expected_result[0].size()); ASSERT_EQ(out_lay.spatial(1), expected_result[0][0].size()); @@ -10330,9 +10330,9 @@ void test_convolution_f32_gpu_convolution_gpu_bfyx_f16_depthwise_x_block_size_1( network->set_input_data("input", input_mem); - network->execute(); + auto outputs = network->execute(); - auto out_mem = network->get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::b_fs_yx_fsv16); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/crop_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/crop_gpu_test.cpp index 1b9e52d1e7ef2b..20d42e85d0c301 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/crop_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/crop_gpu_test.cpp @@ -1569,9 +1569,8 @@ TEST(crop_gpu, optimized_out_crop) { for (size_t i = 0; i < out_vec.size(); i++) ASSERT_EQ(output_ptr[i], out_vec[i]); - auto all_primitives = network.get_all_primitives(); - ASSERT_TRUE(all_primitives["crop1"] == "_optimized_"); - ASSERT_TRUE(all_primitives["crop2"] == "_optimized_"); + ASSERT_TRUE(network.get_primitive("crop1")->can_be_optimized()); + ASSERT_TRUE(network.get_primitive("crop2")->can_be_optimized()); } TEST(crop_single_axis, simple_Baxis) { diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp index 2dd46fe7598b5a..a0e1d307e373c0 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp @@ -538,9 +538,9 @@ class ReduceTestBase : public ::testing::TestWithParamset_input_data("input", input_mem); - network->execute(); + auto outputs = network->execute(); - auto out_mem = network->get_output("reduce").get_memory(); + auto out_mem = outputs.at("reduce").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); auto out_lay = out_mem->get_layout(); @@ -1972,9 +1972,9 @@ class ReduceXYWithBigTensorTestBase : public ::testing::TestWithParamset_input_data("input", input_mem); - network->execute(); + auto outputs = network->execute(); - auto out_mem = network->get_output("reduce").get_memory(); + auto out_mem = outputs.at("reduce").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); auto out_lay = out_mem->get_layout(); @@ -2132,9 +2132,9 @@ class ReduceOnednnTestBase : public ::testing::TestWithParam out_ptr(out_mem, get_test_stream()); auto out_lay = out_mem->get_layout(); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp index 5d99607c5efac5..d43273e2a1508d 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp @@ -1916,7 +1916,6 @@ TEST(reorder_gpu_opt, non_trivial_remove_redundant) net.set_input_data("in", in); auto outputs = net.execute(); auto executed_primitives = net.get_executed_primitives(); - auto all_primitives = net.get_all_primitives(); if (engine.get_device_info().supports_immad) { // Currently, 
oneDNN only supports in_order_queue From 8151e006b771130920d0e7a3d1d8a43d94470fc3 Mon Sep 17 00:00:00 2001 From: Karol Blaszczak Date: Mon, 7 Oct 2024 17:32:03 +0200 Subject: [PATCH 2/6] [DOCS] technical adjustments for the build mstr (#26883) --- CONTRIBUTING.md | 2 +- .../install-openvino/install-openvino-pip.rst | 50 +++++++++++++------ docs/sphinx_setup/conf.py | 4 +- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9557603de5f06..7169ebc2ba2c9b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -52,7 +52,7 @@ product better. Since the market of computing devices is constantly evolving, OpenVINO is always open to extending its support for new hardware. If you want to run inference on a device that is currently not supported, you can see how to develop a new plugin for it in the - [Plugin Developer Guide](https://docs.openvino.ai/canonical/openvino_docs_ie_plugin_dg_overview.html). + [Plugin Developer Guide](https://docs.openvino.ai/2024/documentation/openvino-extensibility/openvino-plugin-library.html). ### Improve documentation diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst b/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst index b9f2664b050282..c079f167761ada 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst @@ -15,19 +15,7 @@ Install Intel® Distribution of OpenVINO™ Toolkit from PyPI Repository * is dedicated to users of all major OSes: Windows, Linux, and macOS (all x86_64 / arm64 architectures) * macOS offers support only for CPU inference - -| **Simplified Build and Integration** -| The package includes CMake configurations, precompiled static libraries, and headers, which - can be easily accessed through the Python API. You can use the `get_cmake_path()` method to - retrieve the paths to the CMake configurations and libraries: - -.. code-block:: python - from openvino import get_cmake_path - cmake_path = get_cmake_path() - -For detailed instructions on how to use these configurations in your build setup, check out the -:ref:`Create a library with extensions ` section. .. tab-set:: @@ -42,10 +30,13 @@ For detailed instructions on how to use these configurations in your build setup .. tab-item:: Processor Notes :sync: processor-notes - | To see if your processor includes the integrated graphics technology and supports iGPU inference, refer to: + | To see if your processor includes the integrated graphics technology and supports iGPU + inference, refer to: | `Product Specifications `__ + + Installing OpenVINO Runtime ########################### @@ -137,20 +128,47 @@ to see if your case needs any of them. + + +| **Simplified Build and Integration** +| The package includes CMake configurations, precompiled static libraries, and headers, which + can be easily accessed through the Python API. You can use the `get_cmake_path()` method to + retrieve the paths to the CMake configurations and libraries: + +.. code-block:: python + + from openvino import get_cmake_path + cmake_path = get_cmake_path() + +For detailed instructions on how to use these configurations in your build setup, check out the +:ref:`Create a library with extensions ` section. + + + + + + + What's Next? #################### -Now that you've installed OpenVINO Runtime, you're ready to run your own machine learning applications! 
Learn more about how to integrate a model in OpenVINO applications by trying out the following tutorials. +Now that you've installed OpenVINO Runtime, you're ready to run your own machine learning +applications! Learn more about how to integrate a model in OpenVINO applications by trying out +the following tutorials. .. image:: https://user-images.githubusercontent.com/15709723/127752390-f6aa371f-31b5-4846-84b9-18dd4f662406.gif :width: 400 -Try the `Python Quick Start Example `__ to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside your web browser. +Try the `Python Quick Start Example `__ +to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside +your web browser. + Get started with Python +++++++++++++++++++++++ -Visit the :doc:`Tutorials <../../../learn-openvino/interactive-tutorials-python>` page for more Jupyter Notebooks to get you started with OpenVINO, such as: +Visit the :doc:`Tutorials <../../../learn-openvino/interactive-tutorials-python>` page for more +Jupyter Notebooks to get you started with OpenVINO, such as: * `OpenVINO Python API Tutorial `__ * `Basic image classification program with Hello Image Classification `__ diff --git a/docs/sphinx_setup/conf.py b/docs/sphinx_setup/conf.py index 8fa38d90442ad3..351a6d6c5ea8b9 100644 --- a/docs/sphinx_setup/conf.py +++ b/docs/sphinx_setup/conf.py @@ -55,7 +55,9 @@ '.md': 'markdown', } -# html_baseurl = 'https://docs.openvino.ai/canonical/' + +# html_baseurl = 'https://docs.openvino.ai/2024/' + # -- Sitemap configuration --------------------------------------------------- From 339a956bd17b0e6cad7e2c383e0ac0f575588fce Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Mon, 7 Oct 2024 17:24:45 +0100 Subject: [PATCH 3/6] [NPUW] Better utilize threads on compilation (#26864) Co-authored-by: Dmitry Matveev --- .../src/plugin/npuw/compiled_model.cpp | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 563e99fcf2bad9..3213be04ec3a33 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -323,13 +323,25 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, std::map forced_sub_devices{}; const std::string fsd_opt = m_cfg.get<::intel_npu::NPUW_SUBMODEL_DEVICE>(); forced_sub_devices = ::intel_npu ::OptionParser>::parse(fsd_opt); + + // Exclude optimized out subgraphs from compilation target beforehand - otherwise we might get head and repeated + // block in the same chunk + std::vector idx_subgraph_to_compile; + for (std::size_t i = 0u; i < orderedSubgraphs.size(); i++) { + if (orderedSubgraphs[i]._optimized_out || m_compiled_submodels[i].replaced_by.value_or(i) != i) { + continue; // do nothing here + } else { + idx_subgraph_to_compile.push_back(i); + } + } + // Compile submodels. 
Some of them can be functions: track which model will be // used as function(s): function name -> index of the compiled subgraph - auto compile = [&](size_t id) { + auto compile = [&](size_t i) { + const auto& id = idx_subgraph_to_compile[i]; const auto& subgraph = orderedSubgraphs[id]; - if (subgraph._optimized_out) { - return; - } + + NPUW_ASSERT(!subgraph._optimized_out); const std::size_t real_id = m_compiled_submodels[id].replaced_by.value_or(id); if (!orderedSubgraphs[real_id]._avoid_list.empty()) { @@ -385,10 +397,10 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, // Parallel compilation is unstable so is disabled by default. const bool par_opt = m_cfg.get<::intel_npu::NPUW_PARALLEL_COMPILE>(); if (par_opt) { - ov::parallel_for(orderedSubgraphs.size(), compile); + ov::parallel_for(idx_subgraph_to_compile.size(), compile); } else { // TODO: Introduce npuw::serial(i, f) instead where f is a _funcall - for (std::size_t i = 0u; i < orderedSubgraphs.size(); i++) { + for (std::size_t i = 0u; i < idx_subgraph_to_compile.size(); i++) { compile(i); } } From e2c3982a2374d97eee00b63418e86198ec130abb Mon Sep 17 00:00:00 2001 From: Ivan Tikhonov Date: Mon, 7 Oct 2024 20:57:18 +0400 Subject: [PATCH 4/6] Eliminate nop Convert at the beginning of the MOC pipeline (#26872) ### Details: Added NopElimination at the beginning of the MOC After pytorch conversion, we can see useless Convert (fp32 to fp32) in the graph ``` Constant (fp32) -> Node1 -> Convert( to fp32) -> Node2 ``` So after ConstantFolding, we will get ``` Constant (fp32) -> Node1 Constant (fp32) -> Node2 ``` Deletion of the convert above fixes the duplication of the constant ### Tickets: - *CVS-151490* --------- Co-authored-by: Ilya Lavrenov --- .../common_optimizations/moc_transformations.cpp | 1 + .../common_optimizations/moc_transformations.cpp | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp b/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp index 3cf542377d5adc..282fc69486b923 100644 --- a/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp @@ -143,6 +143,7 @@ bool ov::pass::MOCTransformations::run_on_model(const std::shared_ptr // In particular, if zero dim tensor is consumed in body of MultiSubGraphOp // RemoveConcatZeroDimInput and RemoveMultiSubGraphOpDanglingParamsResults should be called together. 
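// Illustrative sketch of the pattern this pass addition targets (values and shapes are arbitrary):
// a Convert that keeps the element type is a no-op, and constant folding through it would
// otherwise duplicate the source constant:
//
//   auto c   = ov::opset12::Constant::create(ov::element::f32, ov::Shape{1}, {2});
//   auto cvt = std::make_shared<ov::opset12::Convert>(c, ov::element::f32);  // f32 -> f32 no-op
//
// Running EliminateConvert up front removes such nodes, so a single constant remains after
// folding; this is the behavior the MOCConvertElimination test added below verifies.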
using namespace ov::pass; + REGISTER_PASS(manager, EliminateConvert) REGISTER_PASS(manager, EliminateScatterUpdate) REGISTER_PASS(manager, RemoveConcatZeroDimInput) REGISTER_PASS(manager, EliminateLoopInputsOutputs); diff --git a/src/common/transformations/tests/common_optimizations/moc_transformations.cpp b/src/common/transformations/tests/common_optimizations/moc_transformations.cpp index d054605fba726e..32cd330ca9ab75 100644 --- a/src/common/transformations/tests/common_optimizations/moc_transformations.cpp +++ b/src/common/transformations/tests/common_optimizations/moc_transformations.cpp @@ -39,6 +39,22 @@ TEST(TransformationTests, TestModelTensorsConsistencyUseShapesTrue) { EXPECT_TRUE(model->outputs()[0].get_names() == new_tensors); } +TEST(TransformationTests, MOCConvertElimination) { + auto input = std::make_shared(element::f32, Shape{1}); + auto const_val = opset12::Constant::create(element::f32, Shape{1}, {2}); + + auto add1 = std::make_shared(input, const_val); + auto convert_fp32 = std::make_shared(const_val, element::f32); + auto mul = std::make_shared(add1, convert_fp32); + + auto model = std::make_shared(NodeVector{mul}, ParameterVector{input}); + ov::pass::Manager m; + m.register_pass(false); + m.run_passes(model); + + EXPECT_EQ(count_ops_of_type(model), 1); +} + TEST(TransformationTests, TestModelTensorsConsistencyUseShapesFalse) { auto input = std::make_shared(element::f32, Shape{1}); auto const1 = opset12::Constant::create(element::f32, Shape{1}, {1}); From 9027e1d6d1162dbc445b937dbca22e9a4b2880ef Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Mon, 7 Oct 2024 20:59:16 +0400 Subject: [PATCH 5/6] [PT FE] Fix Bitwise translators with 3 inputs (#26930) **Details:** Fix Bitwise translators with 3 inputs **Ticket:** 154082 --------- Signed-off-by: Kazantsev, Roman --- src/frontends/pytorch/src/op/bitwise.cpp | 19 ++++++++++++++++--- .../pytorch_tests/test_bitwise_ops.py | 2 -- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/frontends/pytorch/src/op/bitwise.cpp b/src/frontends/pytorch/src/op/bitwise.cpp index ba602c86b7bc82..ef66e15c5bcc37 100644 --- a/src/frontends/pytorch/src/op/bitwise.cpp +++ b/src/frontends/pytorch/src/op/bitwise.cpp @@ -7,6 +7,7 @@ #include "openvino/op/bitwise_not.hpp" #include "openvino/op/bitwise_or.hpp" #include "openvino/op/bitwise_xor.hpp" +#include "openvino/op/convert_like.hpp" #include "utils.hpp" namespace ov { @@ -29,8 +30,12 @@ OutputVector translate_bitwise_and(const NodeContext& context) { Output x; Output y; std::tie(x, y) = get_inputs_with_promoted_types(context, 0, 1); - auto and_x = context.mark_node(std::make_shared(x, y)); + auto and_x = context.mark_node(std::make_shared(x, y))->output(0); if (!context.input_is_none(2)) { + auto out = context.get_input(2); + if (out.get_element_type().is_dynamic() || and_x.get_element_type() != out.get_element_type()) { + and_x = context.mark_node(std::make_shared(and_x, out)); + } context.mutate_input(2, and_x); } return {and_x}; @@ -41,8 +46,12 @@ OutputVector translate_bitwise_or(const NodeContext& context) { Output x; Output y; std::tie(x, y) = get_inputs_with_promoted_types(context, 0, 1); - auto or_x = context.mark_node(std::make_shared(x, y)); + auto or_x = context.mark_node(std::make_shared(x, y))->output(0); if (!context.input_is_none(2)) { + auto out = context.get_input(2); + if (out.get_element_type().is_dynamic() || or_x.get_element_type() != out.get_element_type()) { + or_x = context.mark_node(std::make_shared(or_x, out)); + } context.mutate_input(2, or_x); } 
return {or_x}; @@ -53,8 +62,12 @@ OutputVector translate_bitwise_xor(const NodeContext& context) { Output x; Output y; std::tie(x, y) = get_inputs_with_promoted_types(context, 0, 1); - auto xor_x = context.mark_node(std::make_shared(x, y)); + auto xor_x = context.mark_node(std::make_shared(x, y))->output(0); if (!context.input_is_none(2)) { + auto out = context.get_input(2); + if (out.get_element_type().is_dynamic() || xor_x.get_element_type() != out.get_element_type()) { + xor_x = context.mark_node(std::make_shared(xor_x, out)); + } context.mutate_input(2, xor_x); } return {xor_x}; diff --git a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py index e55a86f279de21..a400f6dcd76d17 100644 --- a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py +++ b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py @@ -75,8 +75,6 @@ def test_bitwise_mixed_dtypes( ): if ie_device == "GPU" and (lhs_dtype != "bool" or rhs_dtype != "bool"): pytest.xfail(reason="bitwise ops are not supported on GPU") - if out and version.parse(np.__version__) >= version.parse("2.0.0"): - pytest.xfail(reason="CVS-154082: incorrect handling out type") self._test( *self.create_model(op_type, out), ie_device, From 46a6ccd4ed93d36dd24183b69bd64204c634debe Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Mon, 7 Oct 2024 19:12:12 +0200 Subject: [PATCH 6/6] List decompositions for torch.export (#26878) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .../openvino/frontend/pytorch/fx_decoder.py | 8 +- .../pytorch/torchdynamo/decompositions.py | 205 +++++++++++++++++- src/frontends/pytorch/src/op_table.cpp | 1 + .../pytorch_tests/pytorch_layer_test_class.py | 19 +- .../layer_tests/pytorch_tests/test_col2im.py | 1 + tests/layer_tests/pytorch_tests/test_eye.py | 20 +- tests/model_hub_tests/pytorch/torch_utils.py | 5 +- .../moc_frontend/pytorch_frontend_utils.py | 11 +- 8 files changed, 231 insertions(+), 39 deletions(-) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py index d9dae251aa64e7..a7e9f895b5334b 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py @@ -4,14 +4,14 @@ # flake8: noqa # mypy: ignore-errors +import logging +import torch + from openvino.frontend.pytorch.py_pytorch_frontend import _FrontEndPytorchDecoder as Decoder from openvino.frontend.pytorch.py_pytorch_frontend import _Type as DecoderType -from openvino.runtime import op, PartialShape, Type as OVType, OVAny, Shape +from openvino.runtime import PartialShape, Type as OVType, OVAny, Shape from openvino.frontend.pytorch.utils import make_constant, fetch_attr, pt_to_ov_type_map, torch_tensor_to_ov_const -import torch - -import logging logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/decompositions.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/decompositions.py index 368dbc4cbfa358..eb117f56ab167d 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/decompositions.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/decompositions.py @@ -46,7 +46,9 @@ def convolution_backward( return grad_input, grad_weight, grad_bias + if len(get_decompositions([aten._scaled_dot_product_flash_attention.default])) == 0: + 
@register_decomposition(aten._scaled_dot_product_flash_attention.default) def scaled_dot_product_flash_attention( query, @@ -101,16 +103,197 @@ def scaled_dot_product_flash_attention( def get_aot_decomposition_list(): - return ([torch.ops.aten._scaled_dot_product_flash_attention.default, - torch.ops.aten._softmax.default, - torch.ops.aten._softmax_backward_data.default, - torch.ops.aten.convolution_backward.default, - torch.ops.aten.gelu_backward.default, - torch.ops.aten.native_group_norm.default, - torch.ops.aten.native_group_norm_backward.default, - torch.ops.aten.native_layer_norm.default, - torch.ops.aten.native_layer_norm_backward.default, - torch.ops.aten.slice_backward.default]) + return [ + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten._softmax.default, + torch.ops.aten._softmax_backward_data.default, + torch.ops.aten.convolution_backward.default, + torch.ops.aten.gelu_backward.default, + torch.ops.aten.native_group_norm.default, + torch.ops.aten.native_group_norm_backward.default, + torch.ops.aten.native_layer_norm.default, + torch.ops.aten.native_layer_norm_backward.default, + torch.ops.aten.slice_backward.default, + ] + def get_inf_decomposition_list(): - return ([torch.ops.aten.nll_loss_forward.default]) + return [torch.ops.aten.nll_loss_forward.default] + + +def get_export_decomposition_list(): + # List of decompositions from torch._decomp.core_aten_decompositions + # removed _backward ops and ops supported without decomposition + decomp = [ + torch.ops.aten.addcdiv, + torch.ops.aten.addcdiv_, + torch.ops.aten.addcmul, + torch.ops.aten.addcmul_, + torch.ops.aten.addr, + torch.ops.aten.affine_grid_generator, + torch.ops.aten.all, + torch.ops.aten.aminmax, + torch.ops.aten.arange.default, + torch.ops.aten.arange.start, + torch.ops.aten.baddbmm, + torch.ops.aten.binary_cross_entropy, + torch.ops.aten.binary_cross_entropy_with_logits, + torch.ops.aten.block_diag, + torch.ops.aten.celu, + torch.ops.aten.celu_, + torch.ops.aten.clamp_max, + torch.ops.aten.clamp_min, + torch.ops.aten.count_nonzero, + torch.ops.aten.linalg_cross, + torch.ops.aten.cudnn_batch_norm, + torch.ops.aten.deg2rad, + torch.ops.aten.deg2rad_, + torch.ops.aten.detach, + torch.ops.aten.diag_embed, + torch.ops.aten.dot, + torch.ops.aten.vdot, + torch.ops.aten.elu, + torch.ops.aten.elu_, + torch.ops.aten._embedding_bag, + torch.ops.aten.empty_like, + torch.ops.aten._euclidean_dist.default, + torch.ops.aten.expand_as, + torch.ops.aten.eye, + torch.ops.aten.fill, + torch.ops.aten.fill_, + torch.ops.aten.floor_divide, + torch.ops.aten.frac, + torch.ops.aten.frac_, + torch.ops.aten._fused_moving_avg_obs_fq_helper, + torch.ops.aten.gelu_, + torch.ops.aten.glu, + torch.ops.aten.hardshrink, + torch.ops.aten.hardsigmoid, + torch.ops.aten.hardsigmoid_, + torch.ops.aten.hardswish, + torch.ops.aten.hardswish_, + torch.ops.aten.hardtanh_, + torch.ops.aten.heaviside, + torch.ops.aten.heaviside_, + torch.ops.aten.huber_loss, + torch.ops.aten.im2col, + torch.ops.aten.index_add, + torch.ops.aten.index_add_, + torch.ops.aten.index_copy, + torch.ops.aten.index_copy_, + torch.ops.aten.index_fill, + torch.ops.aten.index_fill_, + torch.ops.aten.isin, + torch.ops.aten.isneginf, + torch.ops.aten.isposinf, + torch.ops.aten.l1_loss, + torch.ops.aten.leaky_relu_, + torch.ops.aten.lerp, + torch.ops.aten.lerp_, + torch.ops.aten.linspace, + torch.ops.aten.logaddexp, + torch.ops.aten.logaddexp2, + torch.ops.aten.logit, + torch.ops.aten.logit_, + torch.ops.aten.log_sigmoid_forward, + torch.ops.aten.logspace, + 
torch.ops.aten.logsumexp.default, + torch.ops.aten.masked_fill, + torch.ops.aten.masked_fill_, + torch.ops.aten.mish, + torch.ops.aten.mish_, + torch.ops.aten.mse_loss, + torch.ops.aten.multi_margin_loss, + torch.ops.aten.multilabel_margin_loss_forward, + torch.ops.aten.mv, + torch.ops.aten.mvlgamma, + torch.ops.aten.mvlgamma_, + torch.ops.aten.nansum, + torch.ops.aten.nan_to_num, + torch.ops.aten.nan_to_num_, + torch.ops.aten.narrow, + torch.ops.aten.new_empty, + torch.ops.aten.new_full, + torch.ops.aten.new_ones, + torch.ops.aten.new_zeros, + torch.ops.aten.nll_loss_forward, + torch.ops.aten.norm, + torch.ops.aten.ones, + torch.ops.aten.ones_like, + torch.ops.aten._prelu_kernel, + torch.ops.aten._reshape_alias, + torch.ops.aten.rad2deg, + torch.ops.aten.rad2deg_, + torch.ops.aten.reflection_pad1d, + torch.ops.aten.reflection_pad2d, + torch.ops.aten.reflection_pad3d, + torch.ops.aten.replication_pad1d, + torch.ops.aten.replication_pad2d, + torch.ops.aten.replication_pad3d, + torch.ops.aten.renorm, + torch.ops.aten.renorm_, + torch.ops.aten.resize_as, + torch.ops.aten.roll, + torch.ops.aten.rot90, + torch.ops.aten.rrelu_with_noise, + torch.ops.aten.rrelu_with_noise_, + torch.ops.aten.rsub, + torch.ops.aten.select_scatter, + torch.ops.aten.sgn, + torch.ops.aten.sgn_, + torch.ops.aten.silu, + torch.ops.aten.silu_, + torch.ops.aten.sinc, + torch.ops.aten.sinc_, + torch.ops.aten.smooth_l1_loss, + torch.ops.aten.soft_margin_loss, + torch.ops.aten.softplus, + torch.ops.aten.softshrink, + torch.ops.aten.special_entr, + torch.ops.aten.special_log_ndtr, + torch.ops.aten.special_xlog1py, + torch.ops.aten.split.Tensor, + torch.ops.aten.split_with_sizes_copy, + torch.ops.aten.squeeze.default, + torch.ops.aten.squeeze.dim, + torch.ops.aten.std, + torch.ops.aten.std_mean, + torch.ops.aten.stack, + torch.ops.aten.sum.default, + torch.ops.aten.sum.out, + torch.ops.aten.t, + torch.ops.aten.take, + torch.ops.aten.threshold, + torch.ops.aten.threshold_, + torch.ops.aten.trace, + torch.ops.aten.transpose.int, + torch.ops.aten.tril, + torch.ops.aten.tril_, + torch.ops.aten.triu, + torch.ops.aten.triu_, + torch.ops.aten.unbind, + torch.ops.aten.unfold_copy, + torch.ops.aten._unsafe_index, + torch.ops.aten.unsafe_split.Tensor, + torch.ops.aten.unsafe_split_with_sizes, + torch.ops.aten._unsafe_view, + torch.ops.aten.view_as_complex, + torch.ops.aten.xlogy, + torch.ops.aten.xlogy_, + torch.ops.aten.zero, + torch.ops.aten.zero_, + torch.ops.aten.zeros, + torch.ops.aten.zeros_like, + torch.ops.aten._weight_norm_interface, + ] + try: + from packaging import version + if version.parse(torch.__version__) >= version.parse("2.3"): + decomp += [ + torch.ops.aten._lazy_clone, + torch.ops.aten._test_parallel_materialize, + torch.ops.aten._chunk_cat, + ] + except ImportError: + pass + return decomp diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 1e4ecfc1e1367f..31cf99a2e1b9d7 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -787,6 +787,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten.clamp_min.default", op::translate_1to1_match_2_inputs_align_types}, {"aten.clamp_min.Tensor", op::translate_1to1_match_2_inputs_align_types}, {"aten.clone.default", op::skip_node}, // ignore clone operators that are inserted by PyTorch autograd + {"aten.col2im.default", op::translate_col2im}, {"aten.constant_pad_nd.default", op::translate_constant_pad_nd_fx}, {"aten.convolution.default", op::translate_convolution}, 
{"aten.copy.default", op::translate_copy_fx}, diff --git a/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py b/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py index a2f54076de9d7f..5bf019db3c131e 100644 --- a/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py +++ b/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py @@ -5,17 +5,18 @@ import warnings from copy import deepcopy import os - +import torch +import pytest +import logging import numpy as np + from common.constants import test_device, test_precision from openvino.frontend.pytorch.ts_decoder import TorchScriptPythonDecoder - from openvino.frontend import FrontEndManager from openvino.runtime import Core, Type, PartialShape import openvino.properties.hint as hints -import torch -from packaging import version -import pytest + +logging.basicConfig(level=logging.DEBUG) def skip_check(param): @@ -124,13 +125,9 @@ def numpy_to_torch_recursively(x): from torch.export import export em = export(model, tuple(torch_inputs)) - if version.parse(torch.__version__) >= version.parse("2.3"): - em = em.run_decompositions() - gm = em.module() - print(gm.code) converted_model = convert_model( - em, example_input=torch_inputs) + em, example_input=torch_inputs, verbose=True) self._resolve_input_shape_dtype( converted_model, ov_inputs, dynamic_shapes) smodel = model @@ -242,7 +239,7 @@ def convert_via_mo(self, model, example_input, trace_model, dynamic_shapes, ov_i if not dynamic_shapes: input_shapes = [inp.shape for inp in ov_inputs] kwargs["input"] = input_shapes - om = convert_model(decoder, **kwargs) + om = convert_model(decoder, verbose=True, **kwargs) self._resolve_input_shape_dtype(om, ov_inputs, dynamic_shapes) return smodel, om diff --git a/tests/layer_tests/pytorch_tests/test_col2im.py b/tests/layer_tests/pytorch_tests/test_col2im.py index 8cb7ea96cb8391..1dc44557c359fb 100644 --- a/tests/layer_tests/pytorch_tests/test_col2im.py +++ b/tests/layer_tests/pytorch_tests/test_col2im.py @@ -40,6 +40,7 @@ def forward(self, x): @pytest.mark.nightly @pytest.mark.precommit + @pytest.mark.precommit_torch_export @pytest.mark.parametrize("output_size,kernel_size", [([4, 5], [2, 2])]) @pytest.mark.parametrize("dilation", [1, 2, [1, 2]]) @pytest.mark.parametrize("padding", [0, 5, [2, 3]]) diff --git a/tests/layer_tests/pytorch_tests/test_eye.py b/tests/layer_tests/pytorch_tests/test_eye.py index 37b850088844cd..f93e77a8b2844a 100644 --- a/tests/layer_tests/pytorch_tests/test_eye.py +++ b/tests/layer_tests/pytorch_tests/test_eye.py @@ -3,6 +3,7 @@ import pytest import torch +from packaging import version from pytorch_layer_test_class import PytorchLayerTest @@ -14,7 +15,6 @@ def _prepare_input(self, m, n=None): return (np.array(m, dtype="int32"), ) return (np.array(m, dtype="int32"), np.array(n, dtype="int32")) - def create_model(self, num_inputs, dtype): import torch dtype_map = { @@ -45,29 +45,31 @@ def __init__(self, dtype): def forward(self, x, y): return torch.eye(x, y, dtype=self.dtype) - - ref_net = None - - return aten_eye_1_input(pt_dtype) if num_inputs == 1 else aten_eye_2_inputs(pt_dtype), ref_net, ("aten::eye", "aten::IntImplicit") + model = aten_eye_1_input(pt_dtype) if num_inputs == 1 else aten_eye_2_inputs(pt_dtype) + return model, None, ["aten::eye", "aten::IntImplicit"] @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export @pytest.mark.parametrize("dtype", ["bool", "int8", "uint8", "int32", "int64", "float32", "float64"]) @pytest.mark.parametrize("m", [2, 3, 4, 5]) - 
@pytest.mark.skipif(torch.__version__ < '2.3.0', reason="`aten.eye` is not supported in PyTorch versions earlier than 2.3.") def test_eye_square(self, dtype, m, ie_device, precision, ir_version): + if PytorchLayerTest.use_torch_export() and version.parse(torch.__version__) < version.parse("2.3"): + pytest.skip("Not supported in PyTorch versions earlier than 2.3.") if ie_device == "GPU": pytest.xfail(reason="eye is not supported on GPU") - self._test(*self.create_model(1, dtype), ie_device, precision, ir_version, kwargs_to_prepare_input={"m": m}) + self._test(*self.create_model(1, dtype), ie_device, precision, + ir_version, kwargs_to_prepare_input={"m": m}) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export @pytest.mark.parametrize("dtype", ["bool", "int8", "uint8", "int32", "int64", "float32", "float64"]) @pytest.mark.parametrize(("m", "n"), [[2, 2], [3, 4], [5, 3]]) - @pytest.mark.skipif(torch.__version__ < '2.3.0', reason="`aten.eye` is not supported in PyTorch versions earlier than 2.3.") def test_eye(self, dtype, m, n, ie_device, precision, ir_version): + if (PytorchLayerTest.use_torch_export() and version.parse(torch.__version__) < version.parse("2.3")): + pytest.skip("Not supported in PyTorch versions earlier than 2.3.") if ie_device == "GPU": pytest.xfail(reason="eye is not supported on GPU") - self._test(*self.create_model(2, dtype), ie_device, precision, ir_version, kwargs_to_prepare_input={"m": m, "n": n}) + self._test(*self.create_model(2, dtype), ie_device, precision, + ir_version, kwargs_to_prepare_input={"m": m, "n": n}) diff --git a/tests/model_hub_tests/pytorch/torch_utils.py b/tests/model_hub_tests/pytorch/torch_utils.py index 09826b058c7855..5b351c6317e9bd 100644 --- a/tests/model_hub_tests/pytorch/torch_utils.py +++ b/tests/model_hub_tests/pytorch/torch_utils.py @@ -75,7 +75,10 @@ def convert_model_impl(self, model_obj): pt_res = model_obj(**self.example) graph = export(model_obj, tuple(), self.example) if version.parse(torch.__version__) >= version.parse("2.2"): - graph = graph.run_decompositions() + from torch._decomp import get_decompositions + from openvino.frontend.pytorch.torchdynamo.decompositions import get_export_decomposition_list + decomp = get_decompositions(get_export_decomposition_list()) + graph = graph.run_decompositions(decomp_table=decomp) gm = graph.module() print(gm.code) diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py index b79b24e9ce76a3..dfe25f27d13d7d 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py @@ -40,15 +40,20 @@ def extract_module_extensions(args): except: pass if not is_good_version: - raise RuntimeError( - "NNCF models produced by nncf<2.6 are not supported directly. Please upgrade nncf or export to ONNX first.") + raise RuntimeError("NNCF models produced by nncf<2.6 are not " + "supported directly. 
Please upgrade nncf or " + "export to ONNX first.") inputs = prepare_torch_inputs(example_inputs) if not isinstance(model, (TorchScriptPythonDecoder, TorchFXPythonDecoder)): if hasattr(torch, "export") and isinstance(model, (torch.export.ExportedProgram)): from packaging import version if version.parse(torch.__version__) >= version.parse("2.2"): - model = model.run_decompositions() + from torch._decomp import get_decompositions + from openvino.frontend.pytorch.torchdynamo.decompositions import get_export_decomposition_list + decomp = get_decompositions(get_export_decomposition_list()) + model = model.run_decompositions(decomp_table=decomp) gm = model.module() + log.debug(gm.code) decoder = TorchFXPythonDecoder(gm) else: decoder = TorchScriptPythonDecoder(