From de94a33a6b6d352fbc4517184b44edc8126b0506 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Mon, 7 Oct 2024 19:01:01 +0400 Subject: [PATCH 1/6] [GPU] network code cleanup (#26908) ### Details: - Removed few unnecessary methods - Refactor `network::execute()` --- .../include/intel_gpu/graph/network.hpp | 32 ------- src/plugins/intel_gpu/src/graph/kv_cache.cpp | 3 +- src/plugins/intel_gpu/src/graph/network.cpp | 89 +++++-------------- src/plugins/intel_gpu/src/plugin/graph.cpp | 1 - .../tests/unit/passes/reorder_inputs_test.cpp | 8 +- .../test_cases/concatenation_gpu_test.cpp | 24 ++--- .../unit/test_cases/condition_gpu_test.cpp | 14 +-- .../unit/test_cases/convolution_gpu_test.cpp | 82 ++++++++--------- .../tests/unit/test_cases/crop_gpu_test.cpp | 5 +- .../tests/unit/test_cases/reduce_gpu_test.cpp | 12 +-- .../unit/test_cases/reorder_gpu_test.cpp | 1 - 11 files changed, 96 insertions(+), 175 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index 63adae28ddabf3..f4e09a51513085 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -116,34 +116,10 @@ struct network { std::vector> const& get_outputs() { return _outputs; } - const std::vector>& get_outputs() const { - return reinterpret_cast>&>(_outputs); - } - - network_output get_output(const primitive_id& output_id) { - event::ptr evt; - if (get_stream().get_queue_type() == QueueTypes::out_of_order || _enable_profiling) - evt = get_primitive_event(output_id); - return network_output(evt, get_output_memory(output_id), get_stream_ptr(), get_output_layout(output_id)); - } - layout get_node_output_layout(const primitive_id& output_id) const; memory::ptr get_output_memory(const primitive_id& output_id); layout get_output_layout(const primitive_id& output_id) const; std::vector get_input_layouts() const; - /// @brief Returns the list of primitive ids before and after graph optimization. - /// @details If primitive was not optimized, the old and actual id will be the same. - /// @n If primitive was optimized during graph optimization, the actual id will be "_optimized_". - std::map get_all_primitives() const { - auto primitive_ids = get_all_primitive_ids(); - auto primitive_org_ids = get_all_primitive_org_ids(); - std::map result; - for (decltype(primitive_org_ids.size()) i = 0; i < primitive_org_ids.size(); i++) { - result.emplace(primitive_org_ids[i], primitive_ids[i]); - } - return result; - } - /// @brief Returns the list of @ref event for the primitives that were executed in network. 
std::map get_executed_primitives() const { auto primitive_ids = get_executed_primitive_ids(); @@ -201,7 +177,6 @@ struct network { void configure_primitives_second_output(); void build_insts_deps(); uint32_t get_id() const { return net_id; } - uint32_t get_local_id() const { return _local_net_id; } stream& get_stream() const { return *_stream; } stream::ptr get_stream_ptr() const { return _stream; } bool is_internal() const { return _internal; } @@ -219,7 +194,6 @@ struct network { const ov::intel_gpu::VariableStateInfo& get_variable_info(const std::string &variable_id) const; const ov::intel_gpu::VariablesMap& get_variables() const; const ov::intel_gpu::VariablesInfoMap& get_variables_info() const; - std::vector get_kv_cache_ids() const { return kv_cache_ids; } const ExecutionConfig& get_config() const { return _config; } @@ -243,8 +217,6 @@ struct network { bool _is_dynamic = false; bool _enable_profiling = false; bool _reset_arguments; - uint32_t _local_net_id = 0; // This is for thread-safe deserialization. 'net_id' is globally unique, - // but '_local_net_id' is unique only in each intel_gpu::Graph. std::unordered_map> _primitives; std::vector _in_out_shared_mem_types; @@ -255,10 +227,8 @@ struct network { ov::intel_gpu::VariablesMap _variables_states; ov::intel_gpu::VariablesInfoMap _variables_state_info; - std::vector kv_cache_ids; program::primitives_info _prims_info; - std::map _ext_id_mapping; size_t _weights_cache_capacity = 1; std::unordered_map _events; @@ -272,9 +242,7 @@ struct network { void allocate_primitive_instance(program_node const& node); void transfer_memory_to_device(std::shared_ptr instance, program_node const& node); void add_to_exec_order(const primitive_id& id); - std::shared_ptr find_in_internal_networks(const primitive_id& id) const; std::shared_ptr find_primitive(const primitive_id& id) const; - void check_names(); void add_default_output_chains(); void calculate_weights_cache_capacity(); output_chains_map::iterator add_output_chain(std::shared_ptr& p_inst); diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 95cdd587cdf175..66a874b9b153ec 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -17,7 +17,8 @@ GPU_DEFINE_PRIMITIVE_TYPE_ID(kv_cache) kv_cache_inst::typed_primitive_inst(network& network, const kv_cache_node& node) : parent{network, node, false}, memory_state::variable{node.get_primitive()->variable_info.variable_id} { - kv_cache_id = network.get_kv_cache_ids().size(); + thread_local size_t kv_cache_counter = 0; + kv_cache_id = kv_cache_counter++; } layout kv_cache_inst::calc_output_layout(const kv_cache_node& node, kernel_impl_params const& impl_param) { diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 8f0e97dd51ee12..0af0e957df4ea8 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -203,8 +203,6 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo calculate_weights_cache_capacity(); allocate_primitives(); configure_primitives_second_output(); - if (!_program->is_loaded_from_cache()) - check_names(); build_insts_deps(); build_exec_order(); validate_primitives(); @@ -333,11 +331,7 @@ void network::reset_execution(bool wait) { event::ptr network::set_input_data(const primitive_id& id, memory::ptr data) { GPU_DEBUG_TRACE_DETAIL << "Set input " << id << " " << 
data->get_layout().to_short_string() << std::endl; - std::shared_ptr primitive_inst; - - primitive_inst = find_primitive(id); - - OPENVINO_ASSERT(primitive_inst != nullptr, "[GPU] topology doesn't contain primitive: ", id); + auto primitive_inst = find_primitive(id); if (primitive_inst->type() != input_layout::type_id()) { CLDNN_ERROR_MESSAGE(id, "primitive " + id + " is not an input"); @@ -481,11 +475,8 @@ network::output_chains_map::iterator network::add_output_chain(std::shared_ptr
std::vector<event::ptr>
network::set_output_memory(const primitive_id& id, memory::ptr mem_new) { GPU_DEBUG_TRACE_DETAIL << "Set output " << id << " " << mem_new->get_layout().to_short_string() << std::endl; - std::shared_ptr p_inst; std::vector ret_ev; - p_inst = find_primitive(id); - - OPENVINO_ASSERT(p_inst != nullptr, "[GPU] topology doesn't contain primitive: ", id); + std::shared_ptr p_inst = find_primitive(id); auto iter = std::find(_outputs.begin(), _outputs.end(), p_inst); if (iter == _outputs.end()) @@ -513,35 +504,10 @@ std::vector network::set_output_memory(const primitive_id& id, memor return ret_ev; } -void cldnn::network::check_names() { - for (auto const& prim : _primitives) { - if (find_in_internal_networks(prim.first) != nullptr) - CLDNN_ERROR_MESSAGE("Network", "Found primitive with id: " + prim.first + "in anotother network."); - } -} - std::shared_ptr cldnn::network::find_primitive(const primitive_id& id) const { - if (_primitives.find(id) != _primitives.end()) - return _primitives.at(id); - - return find_in_internal_networks(id); -} - -std::shared_ptr cldnn::network::find_in_internal_networks(const primitive_id& id) const { - std::shared_ptr ret; - - for (auto const& prim : _primitives) { - if (prim.second->type() == condition::type_id()) { // currently only condition inst contains mini networks - auto cond_inst = std::static_pointer_cast(prim.second); - ret = cond_inst->get_net_true()->find_primitive(id); - if (ret != nullptr) - return ret; - ret = cond_inst->get_net_false()->find_primitive(id); - if (ret != nullptr) - return ret; - } - } - return nullptr; + auto it = _primitives.find(id); + OPENVINO_ASSERT(it != _primitives.end(), "[GPU] Network doesn't contain primitive ", id); + return it->second; } std::string network::get_primitive_info(const primitive_id& id) const { @@ -552,9 +518,6 @@ std::string network::get_primitive_info(const primitive_id& id) const { bool network::does_node_need_lockable_output(const primitive_id& id) const { auto prim_inst = find_primitive(id); - OPENVINO_ASSERT(prim_inst, "[GPU] Can't get implementation type, since topology ", - "doesn't contain primitive with requested id: ", id); - const auto& node = prim_inst->get_node(); if (node.is_type()) { for (const auto& user : node.get_users()) { @@ -574,15 +537,6 @@ std::string network::get_implementation_info(const primitive_id& id) const { return _program->get_implementation_info(id); } -layout network::get_node_output_layout(const primitive_id& output_id) const { - auto res = std::find_if(_outputs.begin(), _outputs.end(), [&](const std::shared_ptr& v) { - return v->id() == output_id; - }); - OPENVINO_ASSERT(res != _outputs.end(), "[GPU] Couldn't get output layout for ", output_id, ". Output with such name is not found in the outputs list"); - - return (*res)->get_node_output_layout(); -} - memory::ptr network::get_output_memory(const primitive_id& output_id) { return get_primitive(output_id)->output_memory_ptr(); } @@ -729,17 +683,6 @@ void network::add_to_exec_order(const primitive_id& id) { } std::map network::execute(const std::vector& dependencies) { - execute_impl(dependencies); - - auto output_ids = get_output_ids(); - std::map result; - for (auto& id : output_ids) { - result.emplace(id, get_output(id)); - } - return result; -} - -void network::execute_impl(const std::vector& events) { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "NetworkImpl::Execute"); NETWORK_DEBUG(*this); @@ -779,6 +722,21 @@ void network::execute_impl(const std::vector& events) { // in some cases. 
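// Illustrative caller-side sketch (ids such as "conv" are placeholders): with execute()
// now returning the output map directly, the updated tests read
//
//   auto outputs = network.execute();
//   auto out_mem = outputs.at("conv").get_memory();  // instead of the removed network.get_output("conv")
//
// so no separate get_output() helper is needed after this refactor.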
auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream()); + execute_impl(dependencies); + + std::map result; + for (auto& inst : _outputs) { + event::ptr ev = nullptr; + const auto& id = inst->id(); + if (get_stream().get_queue_type() == QueueTypes::out_of_order || _enable_profiling) + ev = _events.at(id); + + result.emplace(id, network_output(ev, inst->output_memory_ptr(0), get_stream_ptr(), inst->get_output_layout(0))); + } + return result; +} + +void network::execute_impl(const std::vector& events) { set_arguments(); // This extra flush command is needed for dynamic models in both cases of out_of_order / in_order operating mode @@ -904,10 +862,6 @@ const program::graph_optimizer_info& network::get_optimizer_passes_info() const } std::map network::get_ext_id_mapping() const { - if (_program == nullptr) { - return _ext_id_mapping; - } - std::map result; for (auto& prim : _primitives) { result.emplace(prim.first, prim.second->get_node().get_primitive()->origin_op_name); @@ -1008,9 +962,6 @@ void network::allocate_primitive_instance(program_node const& node) { if (node.is_type()) _data_outputs.push_back(inst); } - if (node.is_type()) { - kv_cache_ids.push_back(node.id()); - } if (auto state_prim = std::dynamic_pointer_cast(inst)) { auto prim = inst->get_node().get_primitive(); set_variables_state_info(state_prim->variable_id(), node.get_output_layout(0), state_prim->get_user_specified_type(), prim.get()); diff --git a/src/plugins/intel_gpu/src/plugin/graph.cpp b/src/plugins/intel_gpu/src/plugin/graph.cpp index 2a3bd5dc0ff239..22f616e3d39818 100644 --- a/src/plugins/intel_gpu/src/plugin/graph.cpp +++ b/src/plugins/intel_gpu/src/plugin/graph.cpp @@ -558,7 +558,6 @@ void Graph::update_profiling_info() { }; std::map executedPrimitives = get_network()->get_executed_primitives(); - auto allPrimitives = get_network()->get_all_primitives(); // Get profiling info for all layers for (auto &profiledID : profilingIDs) { diff --git a/src/plugins/intel_gpu/tests/unit/passes/reorder_inputs_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/reorder_inputs_test.cpp index 7be7f74e6e96e5..cd5c2fdd1681fc 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/reorder_inputs_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/reorder_inputs_test.cpp @@ -183,7 +183,7 @@ TEST(reorder_inputs, impl_forcing_basic_format) { 7.f, 3.f, -2.f, -1.f }); network.set_input_data("input", input); - network.execute(); + auto outputs = network.execute(); const auto& prog = network.get_program(); auto& pool_node = prog->get_node("pool"); @@ -191,7 +191,7 @@ TEST(reorder_inputs, impl_forcing_basic_format) { ASSERT_EQ(pool_layout.format.value, format::yxfb); - auto out_mem = network.get_output("pool").get_memory(); + auto out_mem = outputs.at("pool").get_memory(); cldnn::mem_lock out_mem_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem_ptr.size(), 4u); @@ -239,7 +239,7 @@ TEST(reorder_inputs, impl_forcing_basic_format_kernel) { 7.f, 3.f, -2.f, -1.f }); network.set_input_data("input", input); - network.execute(); + auto outputs = network.execute(); auto prog = network.get_program(); auto& node = prog->get_node("actv"); @@ -250,7 +250,7 @@ TEST(reorder_inputs, impl_forcing_basic_format_kernel) { ASSERT_EQ(actv_layout.format.value, format::yxfb); ASSERT_EQ(kernel_name, actv_impl.kernel_name); - auto out_mem = network.get_output("actv").get_memory(); + auto out_mem = outputs.at("actv").get_memory(); cldnn::mem_lock out_mem_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem_ptr.size(), 8u); diff --git 
a/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp index 8e3da9692dcb45..f640b02afa99cb 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/concatenation_gpu_test.cpp @@ -1031,9 +1031,9 @@ struct concat_gpu_4d : public concat_gpu { network.set_input_data(input_ids[i].pid, in_memory[i]); } - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("concat").get_memory(); + auto out_mem = outputs.at("concat").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); for (size_t bi = 0; bi < batch_num; bi++) { @@ -1117,9 +1117,9 @@ struct concat_gpu_4d_axis3 : public concat_axis3_gpu { network.set_input_data(input_ids[i].pid, in_memory[i]); } - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("concat").get_memory(); + auto out_mem = outputs.at("concat").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); for (size_t bi = 0; bi < batch_num; bi++) { @@ -1283,9 +1283,9 @@ struct concat_id_conv_gpu_4d : public concat_gpu { network.set_input_data(input_ids[i].pid, in_memory[i]); } - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv").get_memory(); + auto out_mem = outputs.at("conv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, fmt); @@ -1420,13 +1420,13 @@ struct concat_gpu_4d_implicit : public concat_gpu { for (size_t i = 0; i < in_features.size(); i++) { concat_network->set_input_data(input_ids[i], in_memory[i]); } - concat_network->execute(); + auto outputs = concat_network->execute(); bool concat_opt_enabled = config.get_property(ov::intel_gpu::optimize_data); bool concat_opt_result = std::static_pointer_cast(concat_network->get_primitive("concat"))->can_be_optimized(); EXPECT_EQ(concat_opt_enabled, concat_opt_result); - return concat_network->get_output("reorder").get_memory(); + return outputs.at("reorder").get_memory(); } std::vector>>>> generate_input() { @@ -1640,13 +1640,13 @@ struct concat_gpu_4d_implicit_onednn : public concat_gpu { for (size_t i = 0; i < in_features.size(); i++) { concat_network.set_input_data(input_ids[i], in_memory[i]); } - concat_network.execute(); + auto outputs = concat_network.execute(); bool concat_opt_enabled = config.get_property(ov::intel_gpu::optimize_data); bool concat_opt_result = std::static_pointer_cast(concat_network.get_primitive("concat"))->node->can_be_optimized(); EXPECT_EQ(concat_opt_enabled, concat_opt_result); - return concat_network.get_output("reorder").get_memory(); + return outputs.at("reorder").get_memory(); } std::vector>>>> generate_input() { @@ -1803,7 +1803,7 @@ struct concat_gpu_4d_explicit : public concat_gpu { for (size_t i = 0; i < 4; i++) { concat_network.set_input_data(input_ids[i], in_memory[i]); } - concat_network.execute(); + auto outputs = concat_network.execute(); bool concat_opt_enabled = config.get_property(ov::intel_gpu::optimize_data); bool concat_opt_result = std::static_pointer_cast(concat_network.get_primitive("concat"))->node->can_be_optimized(); @@ -1813,7 +1813,7 @@ struct concat_gpu_4d_explicit : public concat_gpu { if (concat_opt_enabled && batch_num > 1) concat_opt_result = !concat_opt_result; EXPECT_EQ(concat_opt_enabled, concat_opt_result); - return concat_network.get_output("reorder").get_memory(); + return 
outputs.at("reorder").get_memory(); } std::vector>>>> generate_input() { diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp index d5d7798ff4ce79..7fd439ecac5728 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/condition_gpu_test.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "intel_gpu/graph/network.hpp" #include "intel_gpu/primitives/permute.hpp" #include "intel_gpu/runtime/internal_properties.hpp" #include "random_generator.hpp" @@ -577,6 +578,7 @@ class condition_gpu_tests: public ::testing::Test { ); branch_true.inner_program = program::build_program(engine, branch_true_topology, config, false, false, true); branch_true.input_map.insert({"input", "branch_input3"}); + branch_true.input_map.insert({"predicate2", "predicate2"}); branch_true.output_map.insert({0, "condi_nested"}); } @@ -598,11 +600,12 @@ class condition_gpu_tests: public ::testing::Test { ); topology.add( - input_layout("predicate", predicate->get_layout()) + input_layout("predicate", predicate->get_layout()), + input_layout("predicate2", predicate2->get_layout()) ); topology.add( - condition("condi", {input_info("predicate"), input_info("input")}, branch_true, branch_false) + condition("condi", {input_info("predicate"), input_info("predicate2"), input_info("input")}, branch_true, branch_false) ); std::vector input_data = { @@ -773,7 +776,7 @@ class condition_gpu_tests: public ::testing::Test { pooling(duplicated_id, input_info(cond_id), cldnn::pooling_mode::max, { 2, 1 }, { 2, 1 }) ); - EXPECT_ANY_THROW(network::ptr net = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);); + EXPECT_NO_THROW(network::ptr net = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test);); } void test_empty_body(bool is_caching_test) { @@ -1038,6 +1041,7 @@ TEST(condition_gpu, set_empty_tensor) { net.set_input_data(empty_input_id, empty_input_mem); net.set_input_data(input_id, input_mem); - OV_ASSERT_NO_THROW(net.execute()); - OV_ASSERT_NO_THROW(net.get_output(cond_id).get_memory()); + std::map outputs; + OV_ASSERT_NO_THROW(outputs = net.execute()); + OV_ASSERT_NO_THROW(outputs.at(cond_id).get_memory()); } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp index 421941296e58ab..4155ac0b420e66 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp @@ -5439,9 +5439,9 @@ TEST_P(convolution_gpu_fs_byx_fsv32, fs_byx_fsv32) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::fs_b_yx_fsv32); @@ -5549,9 +5549,9 @@ TEST(convolution_f16_fsv_gpu, convolution_f16_fsv_gpu_padding) { network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::fs_b_yx_fsv32); @@ -5773,9 +5773,9 @@ 
TEST_P(convolution_gpu_fs_byx_fsv32_crop, fs_byx_fsv32_crop) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("concat").get_memory(); + auto out_mem = outputs.at("concat").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::bfyx); @@ -6020,9 +6020,9 @@ TEST(convolution_gpu, bfyx_iyxo_5x5_fp16) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("out").get_memory(); + auto out_mem = outputs.at("out").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); auto output_layout = out_mem->get_layout(); @@ -6254,12 +6254,12 @@ TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp32) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfzyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); blockedFormatZeroCheck(out_mem); @@ -6394,12 +6394,12 @@ TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp16) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfzyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); blockedFormatZeroCheck(out_mem); @@ -6531,12 +6531,12 @@ TEST_P(convolution_gpu_block_layout3D, bfzyx_bsv16_fsv16_fp32_fused_ops) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfzyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfzyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); blockedFormatZeroCheck(out_mem); @@ -6695,12 +6695,12 @@ TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp32) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::bs_fs_yx_bsv16_fsv16); @@ -6836,12 +6836,12 @@ TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp16) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock 
out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::bs_fs_yx_bsv16_fsv16); @@ -6975,12 +6975,12 @@ TEST_P(convolution_gpu_block_layout, bfyx_bsv16_fsv16_fp32_fused_ops) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_bsv16_fsv16").get_memory(); + auto out_mem = outputs.at("conv_bsv16_fsv16").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); - auto out_mem_bfyx = network.get_output("reorder_bfyx").get_memory(); + auto out_mem_bfyx = outputs.at("reorder_bfyx").get_memory(); cldnn::mem_lock out_ptr_bfyx(out_mem_bfyx, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::bs_fs_yx_bsv16_fsv16); @@ -7113,9 +7113,9 @@ TEST_P(convolution_depthwise_gpu, depthwise_conv_fs_b_yx_fsv32) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::fs_b_yx_fsv32); @@ -7257,9 +7257,9 @@ TEST_P(convolution_depthwise_gpu_fsv16, depthwise_conv_b_fs_yx_fsv16) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::b_fs_yx_fsv16); @@ -7395,9 +7395,9 @@ TEST_P(convolution_depthwise_gpu_fsv16_xy, depthwise_conv_b_fs_yx_fsv16) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("out").get_memory(); + auto out_mem = outputs.at("out").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::b_fs_yx_fsv16); @@ -7602,9 +7602,9 @@ TEST_P(convolution_depthwise_gpu_bfyx, depthwise_conv_bfyx) network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv").get_memory(); + auto out_mem = outputs.at("conv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::bfyx); @@ -7924,9 +7924,9 @@ TEST_P(convolution_grouped_gpu, base) { cldnn::network network(engine, topology, config); network.set_input_data("input", input); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv").get_memory(); + auto out_mem = outputs.at("conv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); auto out_lay = out_mem->get_layout(); @@ -8092,9 +8092,9 @@ TEST_P(convolution_general_gpu, conv_fp16_cases) { network network(engine, topology, config); network.set_input_data("input", input_mem); - network.execute(); + auto outputs = network.execute(); - auto out_mem = network.get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); auto out_lay = out_mem->get_layout(); @@ -9669,7 +9669,7 @@ TEST_P(convolution_gpu_onednn, conv_onednn_cases) { std::cerr << p.original_id << " 
" << p.kernel_id << std::endl; auto out_ptr = get_output_values_to_float(network, outputs.find("conv_fsv")->second); - auto out_lay = network.get_node_output_layout("conv_fsv"); + auto out_lay = network.get_primitive("conv_fsv")->get_node_output_layout(); ASSERT_EQ(out_lay.batch(), expected_result.size()); ASSERT_EQ(out_lay.feature(), expected_result[0].size()); ASSERT_EQ(out_lay.spatial(1), expected_result[0][0].size()); @@ -10330,9 +10330,9 @@ void test_convolution_f32_gpu_convolution_gpu_bfyx_f16_depthwise_x_block_size_1( network->set_input_data("input", input_mem); - network->execute(); + auto outputs = network->execute(); - auto out_mem = network->get_output("conv_fsv").get_memory(); + auto out_mem = outputs.at("conv_fsv").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); ASSERT_EQ(out_mem->get_layout().format, format::b_fs_yx_fsv16); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/crop_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/crop_gpu_test.cpp index 1b9e52d1e7ef2b..20d42e85d0c301 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/crop_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/crop_gpu_test.cpp @@ -1569,9 +1569,8 @@ TEST(crop_gpu, optimized_out_crop) { for (size_t i = 0; i < out_vec.size(); i++) ASSERT_EQ(output_ptr[i], out_vec[i]); - auto all_primitives = network.get_all_primitives(); - ASSERT_TRUE(all_primitives["crop1"] == "_optimized_"); - ASSERT_TRUE(all_primitives["crop2"] == "_optimized_"); + ASSERT_TRUE(network.get_primitive("crop1")->can_be_optimized()); + ASSERT_TRUE(network.get_primitive("crop2")->can_be_optimized()); } TEST(crop_single_axis, simple_Baxis) { diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp index 2dd46fe7598b5a..a0e1d307e373c0 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp @@ -538,9 +538,9 @@ class ReduceTestBase : public ::testing::TestWithParamset_input_data("input", input_mem); - network->execute(); + auto outputs = network->execute(); - auto out_mem = network->get_output("reduce").get_memory(); + auto out_mem = outputs.at("reduce").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); auto out_lay = out_mem->get_layout(); @@ -1972,9 +1972,9 @@ class ReduceXYWithBigTensorTestBase : public ::testing::TestWithParamset_input_data("input", input_mem); - network->execute(); + auto outputs = network->execute(); - auto out_mem = network->get_output("reduce").get_memory(); + auto out_mem = outputs.at("reduce").get_memory(); cldnn::mem_lock out_ptr(out_mem, get_test_stream()); auto out_lay = out_mem->get_layout(); @@ -2132,9 +2132,9 @@ class ReduceOnednnTestBase : public ::testing::TestWithParam out_ptr(out_mem, get_test_stream()); auto out_lay = out_mem->get_layout(); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp index 5d99607c5efac5..d43273e2a1508d 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp @@ -1916,7 +1916,6 @@ TEST(reorder_gpu_opt, non_trivial_remove_redundant) net.set_input_data("in", in); auto outputs = net.execute(); auto executed_primitives = net.get_executed_primitives(); - auto all_primitives = net.get_all_primitives(); if (engine.get_device_info().supports_immad) { // Currently, 
oneDNN only supports in_order_queue From 8151e006b771130920d0e7a3d1d8a43d94470fc3 Mon Sep 17 00:00:00 2001 From: Karol Blaszczak Date: Mon, 7 Oct 2024 17:32:03 +0200 Subject: [PATCH 2/6] [DOCS] technical adjustments for the build mstr (#26883) --- CONTRIBUTING.md | 2 +- .../install-openvino/install-openvino-pip.rst | 50 +++++++++++++------ docs/sphinx_setup/conf.py | 4 +- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9557603de5f06..7169ebc2ba2c9b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -52,7 +52,7 @@ product better. Since the market of computing devices is constantly evolving, OpenVINO is always open to extending its support for new hardware. If you want to run inference on a device that is currently not supported, you can see how to develop a new plugin for it in the - [Plugin Developer Guide](https://docs.openvino.ai/canonical/openvino_docs_ie_plugin_dg_overview.html). + [Plugin Developer Guide](https://docs.openvino.ai/2024/documentation/openvino-extensibility/openvino-plugin-library.html). ### Improve documentation diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst b/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst index b9f2664b050282..c079f167761ada 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-pip.rst @@ -15,19 +15,7 @@ Install Intel® Distribution of OpenVINO™ Toolkit from PyPI Repository * is dedicated to users of all major OSes: Windows, Linux, and macOS (all x86_64 / arm64 architectures) * macOS offers support only for CPU inference - -| **Simplified Build and Integration** -| The package includes CMake configurations, precompiled static libraries, and headers, which - can be easily accessed through the Python API. You can use the `get_cmake_path()` method to - retrieve the paths to the CMake configurations and libraries: - -.. code-block:: python - from openvino import get_cmake_path - cmake_path = get_cmake_path() - -For detailed instructions on how to use these configurations in your build setup, check out the -:ref:`Create a library with extensions ` section. .. tab-set:: @@ -42,10 +30,13 @@ For detailed instructions on how to use these configurations in your build setup .. tab-item:: Processor Notes :sync: processor-notes - | To see if your processor includes the integrated graphics technology and supports iGPU inference, refer to: + | To see if your processor includes the integrated graphics technology and supports iGPU + inference, refer to: | `Product Specifications `__ + + Installing OpenVINO Runtime ########################### @@ -137,20 +128,47 @@ to see if your case needs any of them. + + +| **Simplified Build and Integration** +| The package includes CMake configurations, precompiled static libraries, and headers, which + can be easily accessed through the Python API. You can use the `get_cmake_path()` method to + retrieve the paths to the CMake configurations and libraries: + +.. code-block:: python + + from openvino import get_cmake_path + cmake_path = get_cmake_path() + +For detailed instructions on how to use these configurations in your build setup, check out the +:ref:`Create a library with extensions ` section. + + + + + + + What's Next? #################### -Now that you've installed OpenVINO Runtime, you're ready to run your own machine learning applications! 
Learn more about how to integrate a model in OpenVINO applications by trying out the following tutorials. +Now that you've installed OpenVINO Runtime, you're ready to run your own machine learning +applications! Learn more about how to integrate a model in OpenVINO applications by trying out +the following tutorials. .. image:: https://user-images.githubusercontent.com/15709723/127752390-f6aa371f-31b5-4846-84b9-18dd4f662406.gif :width: 400 -Try the `Python Quick Start Example `__ to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside your web browser. +Try the `Python Quick Start Example `__ +to estimate depth in a scene using an OpenVINO monodepth model in a Jupyter Notebook inside +your web browser. + Get started with Python +++++++++++++++++++++++ -Visit the :doc:`Tutorials <../../../learn-openvino/interactive-tutorials-python>` page for more Jupyter Notebooks to get you started with OpenVINO, such as: +Visit the :doc:`Tutorials <../../../learn-openvino/interactive-tutorials-python>` page for more +Jupyter Notebooks to get you started with OpenVINO, such as: * `OpenVINO Python API Tutorial `__ * `Basic image classification program with Hello Image Classification `__ diff --git a/docs/sphinx_setup/conf.py b/docs/sphinx_setup/conf.py index 8fa38d90442ad3..351a6d6c5ea8b9 100644 --- a/docs/sphinx_setup/conf.py +++ b/docs/sphinx_setup/conf.py @@ -55,7 +55,9 @@ '.md': 'markdown', } -# html_baseurl = 'https://docs.openvino.ai/canonical/' + +# html_baseurl = 'https://docs.openvino.ai/2024/' + # -- Sitemap configuration --------------------------------------------------- From 339a956bd17b0e6cad7e2c383e0ac0f575588fce Mon Sep 17 00:00:00 2001 From: Alexey Smirnov Date: Mon, 7 Oct 2024 17:24:45 +0100 Subject: [PATCH 3/6] [NPUW] Better utilize threads on compilation (#26864) Co-authored-by: Dmitry Matveev --- .../src/plugin/npuw/compiled_model.cpp | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 563e99fcf2bad9..3213be04ec3a33 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -323,13 +323,25 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, std::map forced_sub_devices{}; const std::string fsd_opt = m_cfg.get<::intel_npu::NPUW_SUBMODEL_DEVICE>(); forced_sub_devices = ::intel_npu ::OptionParser>::parse(fsd_opt); + + // Exclude optimized out subgraphs from compilation target beforehand - otherwise we might get head and repeated + // block in the same chunk + std::vector idx_subgraph_to_compile; + for (std::size_t i = 0u; i < orderedSubgraphs.size(); i++) { + if (orderedSubgraphs[i]._optimized_out || m_compiled_submodels[i].replaced_by.value_or(i) != i) { + continue; // do nothing here + } else { + idx_subgraph_to_compile.push_back(i); + } + } + // Compile submodels. 
Some of them can be functions: track which model will be // used as function(s): function name -> index of the compiled subgraph - auto compile = [&](size_t id) { + auto compile = [&](size_t i) { + const auto& id = idx_subgraph_to_compile[i]; const auto& subgraph = orderedSubgraphs[id]; - if (subgraph._optimized_out) { - return; - } + + NPUW_ASSERT(!subgraph._optimized_out); const std::size_t real_id = m_compiled_submodels[id].replaced_by.value_or(id); if (!orderedSubgraphs[real_id]._avoid_list.empty()) { @@ -385,10 +397,10 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, // Parallel compilation is unstable so is disabled by default. const bool par_opt = m_cfg.get<::intel_npu::NPUW_PARALLEL_COMPILE>(); if (par_opt) { - ov::parallel_for(orderedSubgraphs.size(), compile); + ov::parallel_for(idx_subgraph_to_compile.size(), compile); } else { // TODO: Introduce npuw::serial(i, f) instead where f is a _funcall - for (std::size_t i = 0u; i < orderedSubgraphs.size(); i++) { + for (std::size_t i = 0u; i < idx_subgraph_to_compile.size(); i++) { compile(i); } } From e2c3982a2374d97eee00b63418e86198ec130abb Mon Sep 17 00:00:00 2001 From: Ivan Tikhonov Date: Mon, 7 Oct 2024 20:57:18 +0400 Subject: [PATCH 4/6] Eliminate nop Convert at the beginning of the MOC pipeline (#26872) ### Details: Added NopElimination at the beginning of the MOC After pytorch conversion, we can see useless Convert (fp32 to fp32) in the graph ``` Constant (fp32) -> Node1 -> Convert( to fp32) -> Node2 ``` So after ConstantFolding, we will get ``` Constant (fp32) -> Node1 Constant (fp32) -> Node2 ``` Deletion of the convert above fixes the duplication of the constant ### Tickets: - *CVS-151490* --------- Co-authored-by: Ilya Lavrenov --- .../common_optimizations/moc_transformations.cpp | 1 + .../common_optimizations/moc_transformations.cpp | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp b/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp index 3cf542377d5adc..282fc69486b923 100644 --- a/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp @@ -143,6 +143,7 @@ bool ov::pass::MOCTransformations::run_on_model(const std::shared_ptr // In particular, if zero dim tensor is consumed in body of MultiSubGraphOp // RemoveConcatZeroDimInput and RemoveMultiSubGraphOpDanglingParamsResults should be called together. 
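// Illustrative sketch of the pattern this pass addition targets (values and shapes are arbitrary):
// a Convert that keeps the element type is a no-op, and constant folding through it would
// otherwise duplicate the source constant:
//
//   auto c   = ov::opset12::Constant::create(ov::element::f32, ov::Shape{1}, {2});
//   auto cvt = std::make_shared<ov::opset12::Convert>(c, ov::element::f32);  // f32 -> f32 no-op
//
// Running EliminateConvert up front removes such nodes, so a single constant remains after
// folding; this is the behavior the MOCConvertElimination test added below verifies.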
using namespace ov::pass; + REGISTER_PASS(manager, EliminateConvert) REGISTER_PASS(manager, EliminateScatterUpdate) REGISTER_PASS(manager, RemoveConcatZeroDimInput) REGISTER_PASS(manager, EliminateLoopInputsOutputs); diff --git a/src/common/transformations/tests/common_optimizations/moc_transformations.cpp b/src/common/transformations/tests/common_optimizations/moc_transformations.cpp index d054605fba726e..32cd330ca9ab75 100644 --- a/src/common/transformations/tests/common_optimizations/moc_transformations.cpp +++ b/src/common/transformations/tests/common_optimizations/moc_transformations.cpp @@ -39,6 +39,22 @@ TEST(TransformationTests, TestModelTensorsConsistencyUseShapesTrue) { EXPECT_TRUE(model->outputs()[0].get_names() == new_tensors); } +TEST(TransformationTests, MOCConvertElimination) { + auto input = std::make_shared(element::f32, Shape{1}); + auto const_val = opset12::Constant::create(element::f32, Shape{1}, {2}); + + auto add1 = std::make_shared(input, const_val); + auto convert_fp32 = std::make_shared(const_val, element::f32); + auto mul = std::make_shared(add1, convert_fp32); + + auto model = std::make_shared(NodeVector{mul}, ParameterVector{input}); + ov::pass::Manager m; + m.register_pass(false); + m.run_passes(model); + + EXPECT_EQ(count_ops_of_type(model), 1); +} + TEST(TransformationTests, TestModelTensorsConsistencyUseShapesFalse) { auto input = std::make_shared(element::f32, Shape{1}); auto const1 = opset12::Constant::create(element::f32, Shape{1}, {1}); From 9027e1d6d1162dbc445b937dbca22e9a4b2880ef Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Mon, 7 Oct 2024 20:59:16 +0400 Subject: [PATCH 5/6] [PT FE] Fix Bitwise translators with 3 inputs (#26930) **Details:** Fix Bitwise translators with 3 inputs **Ticket:** 154082 --------- Signed-off-by: Kazantsev, Roman --- src/frontends/pytorch/src/op/bitwise.cpp | 19 ++++++++++++++++--- .../pytorch_tests/test_bitwise_ops.py | 2 -- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/frontends/pytorch/src/op/bitwise.cpp b/src/frontends/pytorch/src/op/bitwise.cpp index ba602c86b7bc82..ef66e15c5bcc37 100644 --- a/src/frontends/pytorch/src/op/bitwise.cpp +++ b/src/frontends/pytorch/src/op/bitwise.cpp @@ -7,6 +7,7 @@ #include "openvino/op/bitwise_not.hpp" #include "openvino/op/bitwise_or.hpp" #include "openvino/op/bitwise_xor.hpp" +#include "openvino/op/convert_like.hpp" #include "utils.hpp" namespace ov { @@ -29,8 +30,12 @@ OutputVector translate_bitwise_and(const NodeContext& context) { Output x; Output y; std::tie(x, y) = get_inputs_with_promoted_types(context, 0, 1); - auto and_x = context.mark_node(std::make_shared(x, y)); + auto and_x = context.mark_node(std::make_shared(x, y))->output(0); if (!context.input_is_none(2)) { + auto out = context.get_input(2); + if (out.get_element_type().is_dynamic() || and_x.get_element_type() != out.get_element_type()) { + and_x = context.mark_node(std::make_shared(and_x, out)); + } context.mutate_input(2, and_x); } return {and_x}; @@ -41,8 +46,12 @@ OutputVector translate_bitwise_or(const NodeContext& context) { Output x; Output y; std::tie(x, y) = get_inputs_with_promoted_types(context, 0, 1); - auto or_x = context.mark_node(std::make_shared(x, y)); + auto or_x = context.mark_node(std::make_shared(x, y))->output(0); if (!context.input_is_none(2)) { + auto out = context.get_input(2); + if (out.get_element_type().is_dynamic() || or_x.get_element_type() != out.get_element_type()) { + or_x = context.mark_node(std::make_shared(or_x, out)); + } context.mutate_input(2, or_x); } 
return {or_x}; @@ -53,8 +62,12 @@ OutputVector translate_bitwise_xor(const NodeContext& context) { Output x; Output y; std::tie(x, y) = get_inputs_with_promoted_types(context, 0, 1); - auto xor_x = context.mark_node(std::make_shared(x, y)); + auto xor_x = context.mark_node(std::make_shared(x, y))->output(0); if (!context.input_is_none(2)) { + auto out = context.get_input(2); + if (out.get_element_type().is_dynamic() || xor_x.get_element_type() != out.get_element_type()) { + xor_x = context.mark_node(std::make_shared(xor_x, out)); + } context.mutate_input(2, xor_x); } return {xor_x}; diff --git a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py index e55a86f279de21..a400f6dcd76d17 100644 --- a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py +++ b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py @@ -75,8 +75,6 @@ def test_bitwise_mixed_dtypes( ): if ie_device == "GPU" and (lhs_dtype != "bool" or rhs_dtype != "bool"): pytest.xfail(reason="bitwise ops are not supported on GPU") - if out and version.parse(np.__version__) >= version.parse("2.0.0"): - pytest.xfail(reason="CVS-154082: incorrect handling out type") self._test( *self.create_model(op_type, out), ie_device, From 46a6ccd4ed93d36dd24183b69bd64204c634debe Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Mon, 7 Oct 2024 19:12:12 +0200 Subject: [PATCH 6/6] List decompositions for torch.export (#26878) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- .../openvino/frontend/pytorch/fx_decoder.py | 8 +- .../pytorch/torchdynamo/decompositions.py | 205 +++++++++++++++++- src/frontends/pytorch/src/op_table.cpp | 1 + .../pytorch_tests/pytorch_layer_test_class.py | 19 +- .../layer_tests/pytorch_tests/test_col2im.py | 1 + tests/layer_tests/pytorch_tests/test_eye.py | 20 +- tests/model_hub_tests/pytorch/torch_utils.py | 5 +- .../moc_frontend/pytorch_frontend_utils.py | 11 +- 8 files changed, 231 insertions(+), 39 deletions(-) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py index d9dae251aa64e7..a7e9f895b5334b 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/fx_decoder.py @@ -4,14 +4,14 @@ # flake8: noqa # mypy: ignore-errors +import logging +import torch + from openvino.frontend.pytorch.py_pytorch_frontend import _FrontEndPytorchDecoder as Decoder from openvino.frontend.pytorch.py_pytorch_frontend import _Type as DecoderType -from openvino.runtime import op, PartialShape, Type as OVType, OVAny, Shape +from openvino.runtime import PartialShape, Type as OVType, OVAny, Shape from openvino.frontend.pytorch.utils import make_constant, fetch_attr, pt_to_ov_type_map, torch_tensor_to_ov_const -import torch - -import logging logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/decompositions.py b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/decompositions.py index 368dbc4cbfa358..eb117f56ab167d 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/decompositions.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/torchdynamo/decompositions.py @@ -46,7 +46,9 @@ def convolution_backward( return grad_input, grad_weight, grad_bias + if len(get_decompositions([aten._scaled_dot_product_flash_attention.default])) == 0: + 
@register_decomposition(aten._scaled_dot_product_flash_attention.default) def scaled_dot_product_flash_attention( query, @@ -101,16 +103,197 @@ def scaled_dot_product_flash_attention( def get_aot_decomposition_list(): - return ([torch.ops.aten._scaled_dot_product_flash_attention.default, - torch.ops.aten._softmax.default, - torch.ops.aten._softmax_backward_data.default, - torch.ops.aten.convolution_backward.default, - torch.ops.aten.gelu_backward.default, - torch.ops.aten.native_group_norm.default, - torch.ops.aten.native_group_norm_backward.default, - torch.ops.aten.native_layer_norm.default, - torch.ops.aten.native_layer_norm_backward.default, - torch.ops.aten.slice_backward.default]) + return [ + torch.ops.aten._scaled_dot_product_flash_attention.default, + torch.ops.aten._softmax.default, + torch.ops.aten._softmax_backward_data.default, + torch.ops.aten.convolution_backward.default, + torch.ops.aten.gelu_backward.default, + torch.ops.aten.native_group_norm.default, + torch.ops.aten.native_group_norm_backward.default, + torch.ops.aten.native_layer_norm.default, + torch.ops.aten.native_layer_norm_backward.default, + torch.ops.aten.slice_backward.default, + ] + def get_inf_decomposition_list(): - return ([torch.ops.aten.nll_loss_forward.default]) + return [torch.ops.aten.nll_loss_forward.default] + + +def get_export_decomposition_list(): + # List of decompositions from torch._decomp.core_aten_decompositions + # removed _backward ops and ops supported without decomposition + decomp = [ + torch.ops.aten.addcdiv, + torch.ops.aten.addcdiv_, + torch.ops.aten.addcmul, + torch.ops.aten.addcmul_, + torch.ops.aten.addr, + torch.ops.aten.affine_grid_generator, + torch.ops.aten.all, + torch.ops.aten.aminmax, + torch.ops.aten.arange.default, + torch.ops.aten.arange.start, + torch.ops.aten.baddbmm, + torch.ops.aten.binary_cross_entropy, + torch.ops.aten.binary_cross_entropy_with_logits, + torch.ops.aten.block_diag, + torch.ops.aten.celu, + torch.ops.aten.celu_, + torch.ops.aten.clamp_max, + torch.ops.aten.clamp_min, + torch.ops.aten.count_nonzero, + torch.ops.aten.linalg_cross, + torch.ops.aten.cudnn_batch_norm, + torch.ops.aten.deg2rad, + torch.ops.aten.deg2rad_, + torch.ops.aten.detach, + torch.ops.aten.diag_embed, + torch.ops.aten.dot, + torch.ops.aten.vdot, + torch.ops.aten.elu, + torch.ops.aten.elu_, + torch.ops.aten._embedding_bag, + torch.ops.aten.empty_like, + torch.ops.aten._euclidean_dist.default, + torch.ops.aten.expand_as, + torch.ops.aten.eye, + torch.ops.aten.fill, + torch.ops.aten.fill_, + torch.ops.aten.floor_divide, + torch.ops.aten.frac, + torch.ops.aten.frac_, + torch.ops.aten._fused_moving_avg_obs_fq_helper, + torch.ops.aten.gelu_, + torch.ops.aten.glu, + torch.ops.aten.hardshrink, + torch.ops.aten.hardsigmoid, + torch.ops.aten.hardsigmoid_, + torch.ops.aten.hardswish, + torch.ops.aten.hardswish_, + torch.ops.aten.hardtanh_, + torch.ops.aten.heaviside, + torch.ops.aten.heaviside_, + torch.ops.aten.huber_loss, + torch.ops.aten.im2col, + torch.ops.aten.index_add, + torch.ops.aten.index_add_, + torch.ops.aten.index_copy, + torch.ops.aten.index_copy_, + torch.ops.aten.index_fill, + torch.ops.aten.index_fill_, + torch.ops.aten.isin, + torch.ops.aten.isneginf, + torch.ops.aten.isposinf, + torch.ops.aten.l1_loss, + torch.ops.aten.leaky_relu_, + torch.ops.aten.lerp, + torch.ops.aten.lerp_, + torch.ops.aten.linspace, + torch.ops.aten.logaddexp, + torch.ops.aten.logaddexp2, + torch.ops.aten.logit, + torch.ops.aten.logit_, + torch.ops.aten.log_sigmoid_forward, + torch.ops.aten.logspace, + 
torch.ops.aten.logsumexp.default, + torch.ops.aten.masked_fill, + torch.ops.aten.masked_fill_, + torch.ops.aten.mish, + torch.ops.aten.mish_, + torch.ops.aten.mse_loss, + torch.ops.aten.multi_margin_loss, + torch.ops.aten.multilabel_margin_loss_forward, + torch.ops.aten.mv, + torch.ops.aten.mvlgamma, + torch.ops.aten.mvlgamma_, + torch.ops.aten.nansum, + torch.ops.aten.nan_to_num, + torch.ops.aten.nan_to_num_, + torch.ops.aten.narrow, + torch.ops.aten.new_empty, + torch.ops.aten.new_full, + torch.ops.aten.new_ones, + torch.ops.aten.new_zeros, + torch.ops.aten.nll_loss_forward, + torch.ops.aten.norm, + torch.ops.aten.ones, + torch.ops.aten.ones_like, + torch.ops.aten._prelu_kernel, + torch.ops.aten._reshape_alias, + torch.ops.aten.rad2deg, + torch.ops.aten.rad2deg_, + torch.ops.aten.reflection_pad1d, + torch.ops.aten.reflection_pad2d, + torch.ops.aten.reflection_pad3d, + torch.ops.aten.replication_pad1d, + torch.ops.aten.replication_pad2d, + torch.ops.aten.replication_pad3d, + torch.ops.aten.renorm, + torch.ops.aten.renorm_, + torch.ops.aten.resize_as, + torch.ops.aten.roll, + torch.ops.aten.rot90, + torch.ops.aten.rrelu_with_noise, + torch.ops.aten.rrelu_with_noise_, + torch.ops.aten.rsub, + torch.ops.aten.select_scatter, + torch.ops.aten.sgn, + torch.ops.aten.sgn_, + torch.ops.aten.silu, + torch.ops.aten.silu_, + torch.ops.aten.sinc, + torch.ops.aten.sinc_, + torch.ops.aten.smooth_l1_loss, + torch.ops.aten.soft_margin_loss, + torch.ops.aten.softplus, + torch.ops.aten.softshrink, + torch.ops.aten.special_entr, + torch.ops.aten.special_log_ndtr, + torch.ops.aten.special_xlog1py, + torch.ops.aten.split.Tensor, + torch.ops.aten.split_with_sizes_copy, + torch.ops.aten.squeeze.default, + torch.ops.aten.squeeze.dim, + torch.ops.aten.std, + torch.ops.aten.std_mean, + torch.ops.aten.stack, + torch.ops.aten.sum.default, + torch.ops.aten.sum.out, + torch.ops.aten.t, + torch.ops.aten.take, + torch.ops.aten.threshold, + torch.ops.aten.threshold_, + torch.ops.aten.trace, + torch.ops.aten.transpose.int, + torch.ops.aten.tril, + torch.ops.aten.tril_, + torch.ops.aten.triu, + torch.ops.aten.triu_, + torch.ops.aten.unbind, + torch.ops.aten.unfold_copy, + torch.ops.aten._unsafe_index, + torch.ops.aten.unsafe_split.Tensor, + torch.ops.aten.unsafe_split_with_sizes, + torch.ops.aten._unsafe_view, + torch.ops.aten.view_as_complex, + torch.ops.aten.xlogy, + torch.ops.aten.xlogy_, + torch.ops.aten.zero, + torch.ops.aten.zero_, + torch.ops.aten.zeros, + torch.ops.aten.zeros_like, + torch.ops.aten._weight_norm_interface, + ] + try: + from packaging import version + if version.parse(torch.__version__) >= version.parse("2.3"): + decomp += [ + torch.ops.aten._lazy_clone, + torch.ops.aten._test_parallel_materialize, + torch.ops.aten._chunk_cat, + ] + except ImportError: + pass + return decomp diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 1e4ecfc1e1367f..31cf99a2e1b9d7 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -787,6 +787,7 @@ const std::unordered_map get_supported_ops_fx() { {"aten.clamp_min.default", op::translate_1to1_match_2_inputs_align_types}, {"aten.clamp_min.Tensor", op::translate_1to1_match_2_inputs_align_types}, {"aten.clone.default", op::skip_node}, // ignore clone operators that are inserted by PyTorch autograd + {"aten.col2im.default", op::translate_col2im}, {"aten.constant_pad_nd.default", op::translate_constant_pad_nd_fx}, {"aten.convolution.default", op::translate_convolution}, 
{"aten.copy.default", op::translate_copy_fx}, diff --git a/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py b/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py index a2f54076de9d7f..5bf019db3c131e 100644 --- a/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py +++ b/tests/layer_tests/pytorch_tests/pytorch_layer_test_class.py @@ -5,17 +5,18 @@ import warnings from copy import deepcopy import os - +import torch +import pytest +import logging import numpy as np + from common.constants import test_device, test_precision from openvino.frontend.pytorch.ts_decoder import TorchScriptPythonDecoder - from openvino.frontend import FrontEndManager from openvino.runtime import Core, Type, PartialShape import openvino.properties.hint as hints -import torch -from packaging import version -import pytest + +logging.basicConfig(level=logging.DEBUG) def skip_check(param): @@ -124,13 +125,9 @@ def numpy_to_torch_recursively(x): from torch.export import export em = export(model, tuple(torch_inputs)) - if version.parse(torch.__version__) >= version.parse("2.3"): - em = em.run_decompositions() - gm = em.module() - print(gm.code) converted_model = convert_model( - em, example_input=torch_inputs) + em, example_input=torch_inputs, verbose=True) self._resolve_input_shape_dtype( converted_model, ov_inputs, dynamic_shapes) smodel = model @@ -242,7 +239,7 @@ def convert_via_mo(self, model, example_input, trace_model, dynamic_shapes, ov_i if not dynamic_shapes: input_shapes = [inp.shape for inp in ov_inputs] kwargs["input"] = input_shapes - om = convert_model(decoder, **kwargs) + om = convert_model(decoder, verbose=True, **kwargs) self._resolve_input_shape_dtype(om, ov_inputs, dynamic_shapes) return smodel, om diff --git a/tests/layer_tests/pytorch_tests/test_col2im.py b/tests/layer_tests/pytorch_tests/test_col2im.py index 8cb7ea96cb8391..1dc44557c359fb 100644 --- a/tests/layer_tests/pytorch_tests/test_col2im.py +++ b/tests/layer_tests/pytorch_tests/test_col2im.py @@ -40,6 +40,7 @@ def forward(self, x): @pytest.mark.nightly @pytest.mark.precommit + @pytest.mark.precommit_torch_export @pytest.mark.parametrize("output_size,kernel_size", [([4, 5], [2, 2])]) @pytest.mark.parametrize("dilation", [1, 2, [1, 2]]) @pytest.mark.parametrize("padding", [0, 5, [2, 3]]) diff --git a/tests/layer_tests/pytorch_tests/test_eye.py b/tests/layer_tests/pytorch_tests/test_eye.py index 37b850088844cd..f93e77a8b2844a 100644 --- a/tests/layer_tests/pytorch_tests/test_eye.py +++ b/tests/layer_tests/pytorch_tests/test_eye.py @@ -3,6 +3,7 @@ import pytest import torch +from packaging import version from pytorch_layer_test_class import PytorchLayerTest @@ -14,7 +15,6 @@ def _prepare_input(self, m, n=None): return (np.array(m, dtype="int32"), ) return (np.array(m, dtype="int32"), np.array(n, dtype="int32")) - def create_model(self, num_inputs, dtype): import torch dtype_map = { @@ -45,29 +45,31 @@ def __init__(self, dtype): def forward(self, x, y): return torch.eye(x, y, dtype=self.dtype) - - ref_net = None - - return aten_eye_1_input(pt_dtype) if num_inputs == 1 else aten_eye_2_inputs(pt_dtype), ref_net, ("aten::eye", "aten::IntImplicit") + model = aten_eye_1_input(pt_dtype) if num_inputs == 1 else aten_eye_2_inputs(pt_dtype) + return model, None, ["aten::eye", "aten::IntImplicit"] @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export @pytest.mark.parametrize("dtype", ["bool", "int8", "uint8", "int32", "int64", "float32", "float64"]) @pytest.mark.parametrize("m", [2, 3, 4, 5]) - 
@pytest.mark.skipif(torch.__version__ < '2.3.0', reason="`aten.eye` is not supported in PyTorch versions earlier than 2.3.") def test_eye_square(self, dtype, m, ie_device, precision, ir_version): + if PytorchLayerTest.use_torch_export() and version.parse(torch.__version__) < version.parse("2.3"): + pytest.skip("Not supported in PyTorch versions earlier than 2.3.") if ie_device == "GPU": pytest.xfail(reason="eye is not supported on GPU") - self._test(*self.create_model(1, dtype), ie_device, precision, ir_version, kwargs_to_prepare_input={"m": m}) + self._test(*self.create_model(1, dtype), ie_device, precision, + ir_version, kwargs_to_prepare_input={"m": m}) @pytest.mark.nightly @pytest.mark.precommit @pytest.mark.precommit_torch_export @pytest.mark.parametrize("dtype", ["bool", "int8", "uint8", "int32", "int64", "float32", "float64"]) @pytest.mark.parametrize(("m", "n"), [[2, 2], [3, 4], [5, 3]]) - @pytest.mark.skipif(torch.__version__ < '2.3.0', reason="`aten.eye` is not supported in PyTorch versions earlier than 2.3.") def test_eye(self, dtype, m, n, ie_device, precision, ir_version): + if (PytorchLayerTest.use_torch_export() and version.parse(torch.__version__) < version.parse("2.3")): + pytest.skip("Not supported in PyTorch versions earlier than 2.3.") if ie_device == "GPU": pytest.xfail(reason="eye is not supported on GPU") - self._test(*self.create_model(2, dtype), ie_device, precision, ir_version, kwargs_to_prepare_input={"m": m, "n": n}) + self._test(*self.create_model(2, dtype), ie_device, precision, + ir_version, kwargs_to_prepare_input={"m": m, "n": n}) diff --git a/tests/model_hub_tests/pytorch/torch_utils.py b/tests/model_hub_tests/pytorch/torch_utils.py index 09826b058c7855..5b351c6317e9bd 100644 --- a/tests/model_hub_tests/pytorch/torch_utils.py +++ b/tests/model_hub_tests/pytorch/torch_utils.py @@ -75,7 +75,10 @@ def convert_model_impl(self, model_obj): pt_res = model_obj(**self.example) graph = export(model_obj, tuple(), self.example) if version.parse(torch.__version__) >= version.parse("2.2"): - graph = graph.run_decompositions() + from torch._decomp import get_decompositions + from openvino.frontend.pytorch.torchdynamo.decompositions import get_export_decomposition_list + decomp = get_decompositions(get_export_decomposition_list()) + graph = graph.run_decompositions(decomp_table=decomp) gm = graph.module() print(gm.code) diff --git a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py index b79b24e9ce76a3..dfe25f27d13d7d 100644 --- a/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py +++ b/tools/ovc/openvino/tools/ovc/moc_frontend/pytorch_frontend_utils.py @@ -40,15 +40,20 @@ def extract_module_extensions(args): except: pass if not is_good_version: - raise RuntimeError( - "NNCF models produced by nncf<2.6 are not supported directly. Please upgrade nncf or export to ONNX first.") + raise RuntimeError("NNCF models produced by nncf<2.6 are not " + "supported directly. 
Please upgrade nncf or " + "export to ONNX first.") inputs = prepare_torch_inputs(example_inputs) if not isinstance(model, (TorchScriptPythonDecoder, TorchFXPythonDecoder)): if hasattr(torch, "export") and isinstance(model, (torch.export.ExportedProgram)): from packaging import version if version.parse(torch.__version__) >= version.parse("2.2"): - model = model.run_decompositions() + from torch._decomp import get_decompositions + from openvino.frontend.pytorch.torchdynamo.decompositions import get_export_decomposition_list + decomp = get_decompositions(get_export_decomposition_list()) + model = model.run_decompositions(decomp_table=decomp) gm = model.module() + log.debug(gm.code) decoder = TorchFXPythonDecoder(gm) else: decoder = TorchScriptPythonDecoder(