From 41c140056a31ccff4e5a312a7f0f8cd189abd76f Mon Sep 17 00:00:00 2001
From: "jag.Xu"
Date: Wed, 13 Nov 2024 14:48:13 +0800
Subject: [PATCH 01/28] fix assertion failed on lockable_mem when using devicemem (#27297)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Details:
- *The issue is caused by the memory selection for the input: with the changed behavior, lockable memory places the input into host memory if any of its users has a CPU implementation.*
- *When the input is set to "use_device_mem" by benchmark_app, the lockable-memory path ignores that memory, which triggers the assertion failure.*
- *The fix simply ignores the "need_lockable_mem" option for remote tensors, because the data can be cloned implicitly when the device buffer is accessed from the host side.*

### Tickets:
- *CVS-152365*
---
 src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
index 985336b801b9d3..6d48849102765e 100644
--- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -782,7 +782,7 @@ std::vector SyncInferRequest::prepare_input(const std::string
     auto device_tensor_et = convert_to_supported_device_type(element_type);
     bool convert_needed = is_convert_required(element_type, device_tensor_et);
-    if (is_remote_tensor_impl && !need_lockable_mem) {
+    if (is_remote_tensor_impl) {
         if (convert_needed) {
             m_plugin_inputs[input_idx] = { create_device_tensor(pshape, cldnn::element_type_to_data_type(element_type),

From 79dd358b756999aabfb4c140318d6a97da61a650 Mon Sep 17 00:00:00 2001
From: Ooi Boon Sin
Date: Wed, 13 Nov 2024 16:39:39 +0800
Subject: [PATCH 02/28] Add parameter to execute inference requests at a fixed frequency (#26820)

### Details:
- Add parameter to execute inference requests at a fixed frequency

### Tickets:
- *ticket-id*

---------

Signed-off-by: Maciej Falkowski
Signed-off-by: Ooi, Boon Sin
Co-authored-by: Maciej Falkowski
---
 .../openvino-samples/benchmark-tool.rst       | 12 +++++++++-
 samples/cpp/benchmark_app/benchmark_app.hpp   | 10 ++++++++
 samples/cpp/benchmark_app/main.cpp            |  7 ++++++
 .../smoke_tests/test_benchmark_app.py         | 12 +++++++++-
 .../openvino/tools/benchmark/benchmark.py     | 23 ++++++++++++++++---
 .../openvino/tools/benchmark/main.py          |  3 ++-
 .../openvino/tools/benchmark/parameters.py    |  4 ++++
 7 files changed, 65 insertions(+), 6 deletions(-)
 mode change 100644 => 100755 tests/samples_tests/smoke_tests/test_benchmark_app.py
 mode change 100644 => 100755 tools/benchmark_tool/openvino/tools/benchmark/main.py

diff --git a/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst b/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst
index 19c4a013c54aae..390fe00605f2c6 100644
--- a/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst
+++ b/docs/articles_en/learn-openvino/openvino-samples/benchmark-tool.rst
@@ -245,6 +245,13 @@ There are several options for setting the number of inference iterations:
 The more iterations a model runs, the better the statistics will be for determining average latency and throughput.
+Maximum inference rate
+++++++++++++++++++++++
+
+By default, the benchmarking app will run inference at the maximum rate based on device capabilities.
+The maximum inference rate can be configured by the ``-max_irate `` option.
+Tweaking this value allow better accuracy in power usage measurement by limiting the number of executions. + Inputs ++++++++++++++++++++ @@ -337,7 +344,7 @@ following usage message: [Step 1/11] Parsing and validating input arguments [ INFO ] Parsing input parameters usage: benchmark_app.py [-h [HELP]] [-i PATHS_TO_INPUT [PATHS_TO_INPUT ...]] -m PATH_TO_MODEL [-d TARGET_DEVICE] - [-hint {throughput,cumulative_throughput,latency,none}] [-niter NUMBER_ITERATIONS] [-t TIME] [-b BATCH_SIZE] [-shape SHAPE] + [-hint {throughput,cumulative_throughput,latency,none}] [-niter NUMBER_ITERATIONS] [-max_irate MAXIMUM_INFERENCE_RATE] [-t TIME] [-b BATCH_SIZE] [-shape SHAPE] [-data_shape DATA_SHAPE] [-layout LAYOUT] [-extensions EXTENSIONS] [-c PATH_TO_CLDNN_CONFIG] [-cdir CACHE_DIR] [-lfile [LOAD_FROM_FILE]] [-api {sync,async}] [-nireq NUMBER_INFER_REQUESTS] [-nstreams NUMBER_STREAMS] [-inference_only [INFERENCE_ONLY]] [-infer_precision INFER_PRECISION] [-ip {bool,f16,f32,f64,i8,i16,i32,i64,u8,u16,u32,u64}] @@ -536,6 +543,9 @@ following usage message: 'none': no device performance mode will be set. Using explicit 'nstreams' or other device-specific options, please set hint to 'none' -niter Optional. Number of iterations. If not specified, the number of iterations is calculated depending on a device. + -max_irate Optional. Maximum inference rate by frame per second. + If not specified, default value is 0, the inference will run at maximium rate depending on a device capabilities. + Tweaking this value allow better accuracy in power usage measurement by limiting the execution. -t Optional. Time in seconds to execute topology. Input shapes diff --git a/samples/cpp/benchmark_app/benchmark_app.hpp b/samples/cpp/benchmark_app/benchmark_app.hpp index 99cbd7edff8856..cf38ff6708ad29 100644 --- a/samples/cpp/benchmark_app/benchmark_app.hpp +++ b/samples/cpp/benchmark_app/benchmark_app.hpp @@ -65,6 +65,12 @@ static const char cache_dir_message[] = "Optional. Enables caching of loaded mod static const char load_from_file_message[] = "Optional. Loads model from file directly without read_model." " All CNNNetwork options (like re-shape) will be ignored"; +/// @brief message for maximum inference rate +static const char maximum_inference_rate_message[] = + "Optional. Maximum inference rate by frame per second" + "If not specified, default value is 0, the inference will run at maximium rate depending on a device capabilities. " + "Tweaking this value allow better accuracy in power usage measurement by limiting the execution."; + /// @brief message for execution time static const char execution_time_message[] = "Optional. 
Time in seconds to execute topology."; @@ -307,6 +313,9 @@ DEFINE_string(api, "async", api_message); /// @brief Number of infer requests in parallel DEFINE_uint64(nireq, 0, infer_requests_count_message); +/// @brief Execute infer requests at a fixed frequency +DEFINE_double(max_irate, 0, maximum_inference_rate_message); + /// @brief Number of streams to use for inference on the CPU (also affects Hetero cases) DEFINE_string(nstreams, "", infer_num_streams_message); @@ -388,6 +397,7 @@ static void show_usage() { std::cout << " -hint (latency or throughput or cumulative_throughput or none) " << hint_message << std::endl; std::cout << " -niter " << iterations_count_message << std::endl; + std::cout << " -max_irate \"\" " << maximum_inference_rate_message << std::endl; std::cout << " -t " << execution_time_message << std::endl; std::cout << std::endl; std::cout << "Input shapes" << std::endl; diff --git a/samples/cpp/benchmark_app/main.cpp b/samples/cpp/benchmark_app/main.cpp index 4dcc1e82924efd..1f1b89c2427e67 100644 --- a/samples/cpp/benchmark_app/main.cpp +++ b/samples/cpp/benchmark_app/main.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -1157,6 +1158,12 @@ int main(int argc, char* argv[]) { execTime = std::chrono::duration_cast(Time::now() - startTime).count(); processedFramesN += batchSize; + + if (FLAGS_max_irate > 0) { + auto nextRunFinishTime = 1 / FLAGS_max_irate * processedFramesN * 1.0e9; + std::this_thread::sleep_for( + std::chrono::nanoseconds(static_cast(nextRunFinishTime - execTime))); + } } // wait the latest inference executions diff --git a/tests/samples_tests/smoke_tests/test_benchmark_app.py b/tests/samples_tests/smoke_tests/test_benchmark_app.py old mode 100644 new mode 100755 index f9b37e87614d42..3be4f4b88eaab8 --- a/tests/samples_tests/smoke_tests/test_benchmark_app.py +++ b/tests/samples_tests/smoke_tests/test_benchmark_app.py @@ -38,13 +38,16 @@ def create_random_4bit_bin_file(tmp_path, shape, name): f.write(raw_data) -def verify(sample_language, device, api=None, nireq=None, shape=None, data_shape=None, nstreams=None, layout=None, pin=None, cache=None, tmp_path=None, model='bvlcalexnet-12.onnx', inp='dog-224x224.bmp', batch='1', niter='10', tm=None): +def verify(sample_language, device, api=None, nireq=None, shape=None, data_shape=None, nstreams=None, + layout=None, pin=None, cache=None, tmp_path=None, model='bvlcalexnet-12.onnx', + inp='dog-224x224.bmp', batch='1', niter='10', max_irate=None, tm=None): output = get_cmd_output( get_executable(sample_language), *prepend(cache, inp, model, tmp_path), *('-nstreams', nstreams) if nstreams else '', *('-layout', layout) if layout else '', *('-nireq', nireq) if nireq else '', + *('-max_irate', max_irate) if max_irate else '', *('-shape', shape) if shape else '', *('-data_shape', data_shape) if data_shape else '', *('-hint', 'none') if nstreams or pin else '', @@ -84,6 +87,13 @@ def test_nireq(sample_language, api, nireq, device, cache, tmp_path): verify(sample_language, device, api=api, nireq=nireq, cache=cache, tmp_path=tmp_path) +@pytest.mark.parametrize('sample_language', ['C++', 'Python']) +@pytest.mark.parametrize('max_irate', ['', '0', '10']) +@pytest.mark.parametrize('device', get_devices()) +def test_max_irate(sample_language, device, max_irate, cache, tmp_path): + verify(sample_language, device, max_irate=max_irate, cache=cache, tmp_path=tmp_path) + + @pytest.mark.skipif('CPU' not in get_devices(), reason='affinity is a CPU property') @pytest.mark.parametrize('sample_language', 
['C++', 'Python']) @pytest.mark.parametrize('pin', ['YES', 'NO', 'NUMA', 'HYBRID_AWARE']) diff --git a/tools/benchmark_tool/openvino/tools/benchmark/benchmark.py b/tools/benchmark_tool/openvino/tools/benchmark/benchmark.py index adba697b598b4a..fb6f5a8ecd7a6d 100644 --- a/tools/benchmark_tool/openvino/tools/benchmark/benchmark.py +++ b/tools/benchmark_tool/openvino/tools/benchmark/benchmark.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import os +import time from datetime import datetime from math import ceil from openvino.runtime import Core, get_version, AsyncInferQueue @@ -15,7 +16,8 @@ def percentile(values, percent): class Benchmark: def __init__(self, device: str, number_infer_requests: int = 0, number_iterations: int = None, - duration_seconds: int = None, api_type: str = 'async', inference_only = None): + duration_seconds: int = None, api_type: str = 'async', inference_only = None, + maximum_inference_rate: float = 0): self.device = device self.core = Core() self.nireq = number_infer_requests if api_type == 'async' else 1 @@ -24,6 +26,7 @@ def __init__(self, device: str, number_infer_requests: int = 0, number_iteration self.api_type = api_type self.inference_only = inference_only self.latency_groups = [] + self.max_irate = maximum_inference_rate def __del__(self): del self.core @@ -83,13 +86,21 @@ def first_infer(self, requests): requests.wait_all() return requests[id].latency + def inference_rate_delay(self, processed_frames, exec_time): + if self.max_irate > 0: + nextRunFinishTime = 1 / self.max_irate * processed_frames + delay = nextRunFinishTime - exec_time + time.sleep(delay if delay > 0 else 0) + def sync_inference(self, request, data_queue): + processed_frames = 0 exec_time = 0 iteration = 0 times = [] start_time = datetime.utcnow() while (self.niter and iteration < self.niter) or \ (self.duration_seconds and exec_time < self.duration_seconds): + processed_frames += data_queue.get_next_batch_size() if self.inference_only == False: request.set_input_tensors(data_queue.get_next_input()) request.infer() @@ -97,10 +108,12 @@ def sync_inference(self, request, data_queue): iteration += 1 exec_time = (datetime.utcnow() - start_time).total_seconds() + self.inference_rate_delay(processed_frames, exec_time) total_duration_sec = (datetime.utcnow() - start_time).total_seconds() return sorted(times), total_duration_sec, iteration - def async_inference_only(self, infer_queue): + def async_inference_only(self, infer_queue, data_queue): + processed_frames = 0 exec_time = 0 iteration = 0 times = [] @@ -109,6 +122,7 @@ def async_inference_only(self, infer_queue): while (self.niter and iteration < self.niter) or \ (self.duration_seconds and exec_time < self.duration_seconds) or \ (iteration % self.nireq): + processed_frames += data_queue.get_next_batch_size() idle_id = infer_queue.get_idle_request_id() if idle_id in in_fly: times.append(infer_queue[idle_id].latency) @@ -118,6 +132,8 @@ def async_inference_only(self, infer_queue): iteration += 1 exec_time = (datetime.utcnow() - start_time).total_seconds() + self.inference_rate_delay(processed_frames, exec_time) + infer_queue.wait_all() total_duration_sec = (datetime.utcnow() - start_time).total_seconds() for infer_request_id in in_fly: @@ -149,6 +165,7 @@ def async_inference_full_mode(self, infer_queue, data_queue, pcseq): iteration += 1 exec_time = (datetime.utcnow() - start_time).total_seconds() + self.inference_rate_delay(processed_frames, exec_time) infer_queue.wait_all() total_duration_sec = (datetime.utcnow() - 
start_time).total_seconds() @@ -164,7 +181,7 @@ def main_loop(self, requests, data_queue, batch_size, latency_percentile, pcseq) times, total_duration_sec, iteration = self.sync_inference(requests[0], data_queue) fps = len(batch_size) * iteration / total_duration_sec elif self.inference_only: - times, total_duration_sec, iteration = self.async_inference_only(requests) + times, total_duration_sec, iteration = self.async_inference_only(requests, data_queue) fps = len(batch_size) * iteration / total_duration_sec else: times, total_duration_sec, processed_frames, iteration = self.async_inference_full_mode(requests, data_queue, pcseq) diff --git a/tools/benchmark_tool/openvino/tools/benchmark/main.py b/tools/benchmark_tool/openvino/tools/benchmark/main.py old mode 100644 new mode 100755 index c77b50a7fd4721..acec4d17bdc377 --- a/tools/benchmark_tool/openvino/tools/benchmark/main.py +++ b/tools/benchmark_tool/openvino/tools/benchmark/main.py @@ -88,7 +88,8 @@ def is_flag_set_in_command_line(flag): next_step(step_id=2) benchmark = Benchmark(args.target_device, args.number_infer_requests, - args.number_iterations, args.time, args.api_type, args.inference_only) + args.number_iterations, args.time, args.api_type, + args.inference_only, args.maximum_inference_rate) if args.extensions: benchmark.add_extension(path_to_extensions=args.extensions) diff --git a/tools/benchmark_tool/openvino/tools/benchmark/parameters.py b/tools/benchmark_tool/openvino/tools/benchmark/parameters.py index aa79767cecc397..dac2b1490bf534 100644 --- a/tools/benchmark_tool/openvino/tools/benchmark/parameters.py +++ b/tools/benchmark_tool/openvino/tools/benchmark/parameters.py @@ -72,6 +72,10 @@ def parse_args(): args.add_argument('-niter', '--number_iterations', type=check_positive, required=False, default=None, help='Optional. Number of iterations. ' 'If not specified, the number of iterations is calculated depending on a device.') + args.add_argument('-max_irate', '--maximum_inference_rate', type=float, required=False, default=0, + help='Optional. Maximum inference rate by frame per second. ' + 'If not specified, default value is 0, the inference will run at maximium rate depending on a device capabilities. ' + 'Tweaking this value allow better accuracy in power usage measurement by limiting the execution.') args.add_argument('-t', '--time', type=check_positive, required=False, default=None, help='Optional. 
Time in seconds to execute topology.') From a58202ff10f9f1b8b6bf3345cba09a8dc163724d Mon Sep 17 00:00:00 2001 From: Tomasz Krupa Date: Wed, 13 Nov 2024 08:50:45 +0000 Subject: [PATCH 03/28] Enable weightless caching with compile_model() API without providing weights_path property (#27407) Requested in https://jira.devtools.intel.com/browse/CVS-126606?focusedId=25564110&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-25564110 --- src/inference/src/dev/core_impl.cpp | 12 +++++++ src/plugins/intel_gpu/src/plugin/plugin.cpp | 3 +- .../tests/functional/behavior/model_cache.cpp | 31 ++++++++++++++++--- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/src/inference/src/dev/core_impl.cpp b/src/inference/src/dev/core_impl.cpp index 32b43f346e9e44..244d27b5eebb67 100644 --- a/src/inference/src/dev/core_impl.cpp +++ b/src/inference/src/dev/core_impl.cpp @@ -1447,6 +1447,18 @@ ov::SoPtr ov::CoreImpl::load_model_from_cache( ov::AnyMap update_config = config; update_config[ov::loaded_from_cache.name()] = true; + + if (util::contains(plugin.get_property(ov::supported_properties), ov::weights_path)) { + std::string weights_path = cacheContent.modelPath; + auto pos = weights_path.rfind('.'); + if (pos != weights_path.npos && weights_path.substr(pos) == ".xml") { + weights_path = weights_path.substr(0, pos); + weights_path += ".bin"; + } + if (ov::util::file_exists(weights_path)) { + update_config[ov::weights_path.name()] = weights_path; + } + } compiled_model = context ? plugin.import_model(networkStream, context, update_config) : plugin.import_model(networkStream, update_config); }); diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index 7d010a9b590e2e..b1cc946559ee94 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -596,7 +596,8 @@ std::vector Plugin::get_supported_properties() const { ov::PropertyName{ov::hint::enable_cpu_pinning.name(), PropertyMutability::RW}, ov::PropertyName{ov::device::id.name(), PropertyMutability::RW}, ov::PropertyName{ov::hint::dynamic_quantization_group_size.name(), PropertyMutability::RW}, - ov::PropertyName{ov::hint::activations_scale_factor.name(), PropertyMutability::RW} + ov::PropertyName{ov::hint::activations_scale_factor.name(), PropertyMutability::RW}, + ov::PropertyName{ov::weights_path.name(), PropertyMutability::RO}, }; return supported_properties; diff --git a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp index 880868d8666560..839b2640ca180c 100644 --- a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp @@ -34,12 +34,22 @@ #include "openvino/pass/serialize.hpp" namespace { -class CheckWeightlessCacheAccuracy : public ::testing::Test { +class CheckWeightlessCacheAccuracy : public ::testing::Test, + public ::testing::WithParamInterface { +public: + static std::string get_test_case_name(::testing::TestParamInfo obj) { + bool use_compile_model_api = obj.param; + + std::ostringstream result; + result << "use_compile_model_api=" << use_compile_model_api; + return result.str(); + } protected: std::shared_ptr model; std::string xml_path; std::string bin_path; std::string cache_path; + bool use_compile_model_api; // for loading from cache void SetUp() override; void TearDown() override; @@ -51,6 +61,7 @@ void CheckWeightlessCacheAccuracy::SetUp() { 
xml_path = filePrefix + ".xml"; bin_path = filePrefix + ".bin"; cache_path = filePrefix + ".blob"; + use_compile_model_api = GetParam(); } void CheckWeightlessCacheAccuracy::TearDown() { @@ -74,7 +85,13 @@ void CheckWeightlessCacheAccuracy::run() { auto ifstr = std::ifstream(cache_path, std::ifstream::binary); ov::CompiledModel imported_model; - OV_ASSERT_NO_THROW(imported_model = core->import_model(ifstr, ov::test::utils::DEVICE_GPU, config_with_weights_path)); + if (use_compile_model_api) { + OV_ASSERT_NO_THROW(imported_model = + core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config)); + } else { + OV_ASSERT_NO_THROW(imported_model = + core->import_model(ifstr, ov::test::utils::DEVICE_GPU, config_with_weights_path)); + } ifstr.close(); auto orig_req = compiled_model.create_infer_request(); @@ -98,19 +115,23 @@ void CheckWeightlessCacheAccuracy::run() { } } -TEST_F(CheckWeightlessCacheAccuracy, ReadConcatSplitAssign) { +TEST_P(CheckWeightlessCacheAccuracy, ReadConcatSplitAssign) { model = ov::test::utils::make_read_concat_split_assign({1, 1, 2, 4}, ov::element::f16); run(); } -TEST_F(CheckWeightlessCacheAccuracy, SingleConcatWithConstant) { +TEST_P(CheckWeightlessCacheAccuracy, SingleConcatWithConstant) { model = ov::test::utils::make_single_concat_with_constant({1, 1, 2, 4}, ov::element::f16); run(); } -TEST_F(CheckWeightlessCacheAccuracy, TiWithLstmCell) { +TEST_P(CheckWeightlessCacheAccuracy, TiWithLstmCell) { model = ov::test::utils::make_ti_with_lstm_cell(ov::element::f16); run(); } +INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracy, CheckWeightlessCacheAccuracy, + ::testing::Bool(), + CheckWeightlessCacheAccuracy::get_test_case_name); + } // namespace From 291d3e1a5e8cba2e1ed0638d5440af40ef1f9cab Mon Sep 17 00:00:00 2001 From: Tomasz Jankowski Date: Wed, 13 Nov 2024 10:21:04 +0100 Subject: [PATCH 04/28] [Templ test] GroupConvolutionBackpropData: Enable whole Tensor comparison (#27379) ### Details: - Enabled whole tensor comparison. 
### Tickets: - CVS-137167 Signed-off-by: Tomasz Jankowski --- .../op_reference/group_convolution_backprop.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/plugins/template/tests/functional/op_reference/group_convolution_backprop.cpp b/src/plugins/template/tests/functional/op_reference/group_convolution_backprop.cpp index cc2162d4d95829..0e3c5de6289f99 100644 --- a/src/plugins/template/tests/functional/op_reference/group_convolution_backprop.cpp +++ b/src/plugins/template/tests/functional/op_reference/group_convolution_backprop.cpp @@ -104,14 +104,13 @@ class ReferenceGroupConvolutionBackpropDataLayerTest public CommonReferenceTest { public: void SetUp() override { - legacy_compare = true; - auto params = GetParam(); + const auto& params = GetParam(); function = CreateFunction(params); inputData = {params.inputData, params.filterData}; refOutData = {params.refData}; } static std::string getTestCaseName(const testing::TestParamInfo& obj) { - auto param = obj.param; + const auto& param = obj.param; std::ostringstream result; result << "inputShape=" << param.inputShape << "_"; result << "filterShape=" << param.filterShape << "_"; @@ -163,14 +162,13 @@ class ReferenceGroupConvolutionBackpropDataLayerOutShapeTest public CommonReferenceTest { public: void SetUp() override { - legacy_compare = true; - auto params = GetParam(); + const auto& params = GetParam(); function = CreateFunction(params); inputData = {params.inputData, params.filterData}; refOutData = {params.refData}; } static std::string getTestCaseName(const testing::TestParamInfo& obj) { - auto param = obj.param; + const auto& param = obj.param; std::ostringstream result; result << "inputShape=" << param.inputShape << "_"; result << "filterShape=" << param.filterShape << "_"; From 431202e0b253ec18c5b8d664a1badff98fbe2eb0 Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Wed, 13 Nov 2024 13:28:26 +0400 Subject: [PATCH 05/28] [GPU] Minor state fixes (#27508) ### Tickets: - *[152882](https://jira.devtools.intel.com/browse/CVS-152882)* --- src/plugins/intel_gpu/src/graph/network.cpp | 3 ++- src/plugins/intel_gpu/src/graph/reshape.cpp | 3 ++- src/plugins/intel_gpu/src/plugin/common_utils.cpp | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 0af0e957df4ea8..24a103379a025c 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -581,7 +581,8 @@ void network::allocate_primitives() { // Update the output memory address of optimized-out layer if it is not valid. for (auto const& node : po) { - if (node->can_be_optimized() && !node->is_dynamic()) { + if (node->can_be_optimized() && !node->is_dynamic() && + (node->get_dependencies().empty() || !node->get_dependency(0).is_type())) { auto opt_inst = _primitives.at(node->id()); // build deps when prim_inst does not update dependencies yet. 
if (!node->get_dependencies().empty() && opt_inst->dependencies().empty()) { diff --git a/src/plugins/intel_gpu/src/graph/reshape.cpp b/src/plugins/intel_gpu/src/graph/reshape.cpp index 213a0aa175f5d2..e5e33f4ad87b14 100644 --- a/src/plugins/intel_gpu/src/graph/reshape.cpp +++ b/src/plugins/intel_gpu/src/graph/reshape.cpp @@ -11,6 +11,7 @@ #include "openvino/core/validation_util.hpp" #include "primitive_type_base.h" #include "reshape_inst.h" +#include "read_value_inst.h" #include "reshape_shape_inference.hpp" #include "squeeze_shape_inference.hpp" #include "unsqueeze_shape_inference.hpp" @@ -286,7 +287,7 @@ reshape_inst::typed_primitive_inst(network& network, reshape_node const& node) : // if reshape operated in-place, postpone creation of the output until network run, // then create new memory object as the reinterpreted output of the previous primitive - if (input_layout.is_static() && output_layout.is_static()) { + if (input_layout.is_static() && output_layout.is_static() && !node.get_dependency(0).is_type()) { if (!node.can_be_optimized()) { _outputs = allocate_outputs(); _mem_allocated = true; diff --git a/src/plugins/intel_gpu/src/plugin/common_utils.cpp b/src/plugins/intel_gpu/src/plugin/common_utils.cpp index ddd6b5677adc45..8a5e47279d10a0 100644 --- a/src/plugins/intel_gpu/src/plugin/common_utils.cpp +++ b/src/plugins/intel_gpu/src/plugin/common_utils.cpp @@ -88,6 +88,7 @@ void convert_and_copy(const void* src_ptr, ov::element::Type src_et, void* dst_p CASE(ov::element::f16, ov::element::f16, ov::float16, ov::float16); CASE(ov::element::bf16, ov::element::f32, ov::bfloat16, float); CASE(ov::element::bf16, ov::element::f16, ov::bfloat16, ov::float16); + CASE(ov::element::boolean, ov::element::u8, bool, uint8_t); OPENVINO_THROW("[GPU] Unsupported element types combination for copy: ", src_et, " -> ", dst_et); } From acccb227fba18aa4ef4536f8774563f63cd596b1 Mon Sep 17 00:00:00 2001 From: Andrii Staikov Date: Wed, 13 Nov 2024 10:54:35 +0100 Subject: [PATCH 06/28] [TRANSFORMATIONS | GPU] Add a Validate pass after MoveEltwiseUpThroughDataMovScalar for element type resolution (#27464) [TRANSFORMATIONS | GPU] Add a Validate pass after MoveEltwiseUpThroughDataMovScalar After executing the MoveEltwiseUpThroughDataMovScalar transforamtion some node's element types may appear to be in inconsistent/corrupted state. Fix it by inserting a Validate pass after the transformation for resolving the node's element types. This commit is a workaround until the 141764 is fixed which resolves the issue of Validate passes. Ticket: * CVS-151111 Signed-off-by: Andrii Staikov Signed-off-by: Andrii Staikov --- .../intel_gpu/src/plugin/transformations_pipeline.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 158dee2ee7ac05..db93696865a971 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -824,6 +824,11 @@ void TransformationsPipeline::apply(std::shared_ptr func) { ov::op::v3::Broadcast::get_type_info_static(), }; manager.register_pass(allowed_data_movement_ops); + // FIXME (151111): this Validate is added as a workaround for resolving element + // types after MoveEltwiseUpThroughDataMovScalar. It has to be removed + // after 141764 is fixed as there's a clear issue with Validate passes + // not working properly. 
+ manager.register_pass(); manager.register_pass(); manager.register_pass(); From d5dcb0469b305003fa47164361067e5e179b887d Mon Sep 17 00:00:00 2001 From: Sebastian Golebiewski Date: Wed, 13 Nov 2024 12:07:02 +0100 Subject: [PATCH 07/28] [DOCS] Fixing reference in documentation (#27531) Fixing broken links in documentation. Signed-off-by: Sebastian Golebiewski --- .../performance-benchmarks-faq.rst | 8 +-- .../about-openvino/release-notes-openvino.rst | 62 +++++++++---------- .../documentation/legacy-features.rst | 2 +- .../[legacy]-supported-model-formats.rst | 4 +- .../convert-onnx-faster-r-cnn.rst | 2 +- .../convert-onnx-gpt-2.rst | 2 +- .../convert-pytorch-quartz-net.rst | 2 +- .../convert-pytorch-rnn-t.rst | 2 +- .../[legacy]-convert-tensorflow.rst | 2 +- .../legacy-model-optimizer-extensibility.rst | 2 +- .../openvino-security-add-on.rst | 2 +- .../openvino-training-extensions.rst | 4 +- .../low-precision-transformations.rst | 2 +- .../operation-specs/infrastructure/loop-5.rst | 4 +- .../operation-specs/sequence/gru-cell-3.rst | 2 +- .../sequence/gru-sequence-5.rst | 4 +- .../sequence/lstm-sequence-5.rst | 2 +- .../sequence/rnn-sequence-5.rst | 2 +- .../configurations-intel-gpu.rst | 2 +- .../install-openvino-yocto.rst | 2 +- .../llm-inference-native-ov.rst | 4 +- .../llm_inference_guide/ov-tokenizers.rst | 2 +- .../filter-pruning.rst | 2 +- .../inference-devices-and-modes.rst | 8 +-- .../gpu-device.rst | 2 +- .../advanced_throughput_options.rst | 8 +-- .../running-inference/stateful-models.rst | 2 +- .../running-inference/string-tensors.rst | 4 +- .../openvino-workflow/torch-compile.rst | 4 +- 29 files changed, 75 insertions(+), 75 deletions(-) diff --git a/docs/articles_en/about-openvino/performance-benchmarks/performance-benchmarks-faq.rst b/docs/articles_en/about-openvino/performance-benchmarks/performance-benchmarks-faq.rst index c55d3f44451f1c..4bf0b3a0acb19a 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks/performance-benchmarks-faq.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks/performance-benchmarks-faq.rst @@ -58,11 +58,11 @@ Performance Information F.A.Q. - Hugginface - Causal Decoder-only - 2048 - * - `Llama-2-7b-chat `__ + * - `Llama-2-7b-chat `__ - Meta AI - Auto regressive language - 4096 - * - `Llama-3-8b `__ + * - `Llama-3-8b `__ - Meta AI - Auto regressive language - 8192 @@ -74,7 +74,7 @@ Performance Information F.A.Q. - Huggingface - Auto regressive language - 4096 - * - `Stable-Diffusion-V1-5 `__ + * - `Stable-Diffusion-V1-5 `__ - Hugginface - Latent Diffusion Model - 77 @@ -118,7 +118,7 @@ Performance Information F.A.Q. - YOLO V5 Medium - object detection - 640x640 - * - `yolov8n `__ + * - `yolov8n `__ - Yolov8nano - object detection - 608x608 diff --git a/docs/articles_en/about-openvino/release-notes-openvino.rst b/docs/articles_en/about-openvino/release-notes-openvino.rst index 4bd0b5d32c0f0e..6685a4325d57fe 100644 --- a/docs/articles_en/about-openvino/release-notes-openvino.rst +++ b/docs/articles_en/about-openvino/release-notes-openvino.rst @@ -943,7 +943,7 @@ Previous 2024 releases deployed in an arbitrary path without any code changes. * KServe REST API support has been extended to properly handle the string format in JSON body, just like the binary format compatible with NVIDIA Triton™. - * `A demo showcasing a full RAG algorithm `__ + * `A demo showcasing a full RAG algorithm `__ fully delegated to the model server has been added. 
**Neural Network Compression Framework** @@ -1000,7 +1000,7 @@ Previous 2024 releases * `RMBG background removal `__ * `AnimateAnyone: pose guided image to video generation `__ * `LLaVA-Next visual-language assistant `__ - * `TripoSR: single image 3d reconstruction `__ + * `TripoSR: single image 3d reconstruction `__ * `RAG system with OpenVINO and LangChain `__ *Known Issues* @@ -1309,7 +1309,7 @@ Discontinued in 2024 * `Accuracy Checker `__. * `Post-Training Optimization Tool `__ (POT). Neural Network Compression Framework (NNCF) should be used instead. - * A `Git patch `__ + * A `Git patch `__ for NNCF integration with `huggingface/transformers `__. The recommended approach is to use `huggingface/optimum-intel `__ for applying NNCF optimization on top of models from Hugging Face. @@ -1360,25 +1360,25 @@ Deprecated and to be removed in the future * See alternative: `PaddleOCR with OpenVINO™ `__, * See alternative: `Handwritten Text Recognition Demo `__ - * `Image In-painting with OpenVINO™ `__ + * `Image In-painting with OpenVINO™ `__ * See alternative: `Image Inpainting Python Demo `__ - * `Interactive Machine Translation with OpenVINO `__ + * `Interactive Machine Translation with OpenVINO `__ * See alternative: `Machine Translation Python* Demo `__ - * `Open Model Zoo Tools Tutorial `__ + * `Open Model Zoo Tools Tutorial `__ * No alternatives, demonstrates deprecated tools. - * `Super Resolution with OpenVINO™ `__ + * `Super Resolution with OpenVINO™ `__ * See alternative: `Super Resolution with PaddleGAN and OpenVINO `__ * See alternative: `Image Processing C++ Demo `__ - * `Image Colorization with OpenVINO Tutorial `__ - * `Interactive Question Answering with OpenVINO™ `__ + * `Image Colorization with OpenVINO Tutorial `__ + * `Interactive Question Answering with OpenVINO™ `__ * See alternative: `BERT Question Answering Embedding Python* Demo `__ * See alternative: `BERT Question Answering Python* Demo `__ @@ -1387,37 +1387,37 @@ Deprecated and to be removed in the future * See alternative: `Security Barrier Camera C++ Demo `__ - * `The attention center model with OpenVINO™ `_ - * `Image Generation with DeciDiffusion `_ - * `Image generation with DeepFloyd IF and OpenVINO™ `_ - * `Depth estimation using VI-depth with OpenVINO™ `_ + * `The attention center model with OpenVINO™ `_ + * `Image Generation with DeciDiffusion `_ + * `Image generation with DeepFloyd IF and OpenVINO™ `_ + * `Depth estimation using VI-depth with OpenVINO™ `_ * `Instruction following using Databricks Dolly 2.0 and OpenVINO™ `_ * See alternative: `LLM Instruction-following pipeline with OpenVINO `__ - * `Image generation with FastComposer and OpenVINO™ `__ + * `Image generation with FastComposer and OpenVINO™ `__ * `Video Subtitle Generation with OpenAI Whisper `__ * See alternative: `Automatic speech recognition using Distil-Whisper and OpenVINO `__ - * `Introduction to Performance Tricks in OpenVINO™ `__ - * `Speaker Diarization with OpenVINO™ `__ - * `Subject-driven image generation and editing using BLIP Diffusion and OpenVINO `__ - * `Text Prediction with OpenVINO™ `__ - * `Training to Deployment with TensorFlow and OpenVINO™ `__ - * `Speech to Text with OpenVINO™ `__ - * `Convert and Optimize YOLOv7 with OpenVINO™ `__ - * `Quantize Data2Vec Speech Recognition Model using NNCF PTQ API `__ + * `Introduction to Performance Tricks in OpenVINO™ `__ + * `Speaker Diarization with OpenVINO™ `__ + * `Subject-driven image generation and editing using BLIP Diffusion and OpenVINO `__ + * `Text Prediction with 
OpenVINO™ `__ + * `Training to Deployment with TensorFlow and OpenVINO™ `__ + * `Speech to Text with OpenVINO™ `__ + * `Convert and Optimize YOLOv7 with OpenVINO™ `__ + * `Quantize Data2Vec Speech Recognition Model using NNCF PTQ API `__ * See alternative: `Quantize Speech Recognition Models with accuracy control using NNCF PTQ API `__ - * `Semantic segmentation with LRASPP MobileNet v3 and OpenVINO `__ - * `Video Recognition using SlowFast and OpenVINO™ `__ + * `Semantic segmentation with LRASPP MobileNet v3 and OpenVINO `__ + * `Video Recognition using SlowFast and OpenVINO™ `__ * See alternative: `Live Action Recognition with OpenVINO™ `__ - * `Semantic Segmentation with OpenVINO™ using Segmenter `__ - * `Programming Language Classification with OpenVINO `__ + * `Semantic Segmentation with OpenVINO™ using Segmenter `__ + * `Programming Language Classification with OpenVINO `__ * `Stable Diffusion Text-to-Image Demo `__ * See alternative: `Stable Diffusion v2.1 using Optimum-Intel OpenVINO and multiple Intel Hardware `__ @@ -1426,10 +1426,10 @@ Deprecated and to be removed in the future * See alternative: `Stable Diffusion v2.1 using Optimum-Intel OpenVINO and multiple Intel Hardware `__ - * `Image generation with Segmind Stable Diffusion 1B (SSD-1B) model and OpenVINO `__ - * `Data Preparation for 2D Medical Imaging `__ - * `Train a Kidney Segmentation Model with MONAI and PyTorch Lightning `__ - * `Live Inference and Benchmark CT-scan Data with OpenVINO™ `__ + * `Image generation with Segmind Stable Diffusion 1B (SSD-1B) model and OpenVINO `__ + * `Data Preparation for 2D Medical Imaging `__ + * `Train a Kidney Segmentation Model with MONAI and PyTorch Lightning `__ + * `Live Inference and Benchmark CT-scan Data with OpenVINO™ `__ * See alternative: `Quantize a Segmentation Model and Show Live Inference `__ @@ -1458,7 +1458,7 @@ are available on request. Intel technologies' features and benefits depend on system configuration and may require enabled hardware, software or service activation. Learn more at -`www.intel.com `__ +`www.intel.com `__ or from the OEM or retailer. No computer system can be absolutely secure. diff --git a/docs/articles_en/documentation/legacy-features.rst b/docs/articles_en/documentation/legacy-features.rst index f859a3a4572f88..2457d28cf24c15 100644 --- a/docs/articles_en/documentation/legacy-features.rst +++ b/docs/articles_en/documentation/legacy-features.rst @@ -96,7 +96,7 @@ Discontinued: | *New solution:* API 2.0 launched in OpenVINO 2022.1 | *Old solution:* discontinued with OpenVINO 2024.0 - | `The last version supporting API 1.0 `__ + | `2023.2 is the last version supporting API 1.0 `__ .. 
dropdown:: Compile tool diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats.rst index b5d3c08b39f480..fb9f41c755d4fb 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats.rst @@ -120,7 +120,7 @@ Here are code examples of how to use these methods with different model formats: For more details on conversion, refer to the :doc:`guide <[legacy]-supported-model-formats/[legacy]-convert-tensorflow>` - and an example `tutorial `__ + and an example `tutorial `__ on this topic. * The ``read_model()`` and ``compile_model()`` methods: @@ -592,7 +592,7 @@ to OpenVINO IR or ONNX before running inference should be considered the default OpenVINO versions of 2023 are mostly compatible with the old instructions, through a deprecated MO tool, installed with the deprecated OpenVINO Developer Tools package. - `OpenVINO 2023.0 `__ is the last + `OpenVINO 2023.0 `__ is the last release officially supporting the MO conversion process for the legacy formats. diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-faster-r-cnn.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-faster-r-cnn.rst index 711a060b7467b8..7880b261c80b81 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-faster-r-cnn.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-faster-r-cnn.rst @@ -14,7 +14,7 @@ Converting an ONNX Faster R-CNN Model The instructions below are applicable **only** to the Faster R-CNN model converted to the ONNX file format from the `maskrcnn-benchmark model `__: -1. Download the pretrained model file from `onnx/models `__ (commit-SHA: 8883e49e68de7b43e263d56b9ed156dfa1e03117). +1. Download the pretrained model file from `onnx/models `__ (commit-SHA: 8883e49e68de7b43e263d56b9ed156dfa1e03117). 2. 
Generate the Intermediate Representation of the model, by changing your current working directory to the model conversion API installation directory, and running model conversion with the following parameters: diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-gpt-2.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-gpt-2.rst index 84392e92e620d2..4c10c941c7fb47 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-gpt-2.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-onnx-gpt-2.rst @@ -12,7 +12,7 @@ Converting an ONNX GPT-2 Model This guide describes a deprecated conversion method. The guide on the new and recommended method can be found in the :doc:`Python tutorials <../../../../../../learn-openvino/interactive-tutorials-python>`. -`Public pre-trained GPT-2 model `__ is a large +`Public pre-trained GPT-2 model `__ is a large transformer-based language model with a simple objective: predict the next word, given all of the previous words within some text. Downloading the Pre-Trained Base GPT-2 Model diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-quartz-net.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-quartz-net.rst index de3af8ce5175f0..f1ee885dae0b26 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-quartz-net.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-quartz-net.rst @@ -20,7 +20,7 @@ Downloading the Pre-trained QuartzNet Model To download the pre-trained model, refer to the `NeMo Speech Models Catalog `__. Here are the instructions on how to obtain QuartzNet in ONNX format. -1. Install the NeMo toolkit, using the `instructions `__. +1. Install the NeMo toolkit, using the `instructions `__. 2. 
Run the following code: diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-rnn-t.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-rnn-t.rst index 4f33e510a40267..ad646568aed598 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-rnn-t.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-conversion-tutorials/convert-pytorch-rnn-t.rst @@ -44,7 +44,7 @@ For UNIX-like systems, you can use ``wget``: The link was taken from ``setup.sh`` in the ``speech_recoginitin/rnnt`` subfolder. You will get exactly the same weights as -if you were following the `guide `__. +if you were following the `guide `__. **Step 4**. Install required Python packages: diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-tensorflow.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-tensorflow.rst index 955d5418d37270..2bcb6fde9b833b 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-tensorflow.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-conversion-api/[legacy]-supported-model-formats/[legacy]-convert-tensorflow.rst @@ -59,7 +59,7 @@ To convert such TensorFlow model, run the `mo` script with a path to the MetaGra 3. **SavedModel format**. In this case, a model consists of a special directory with a ``.pb`` file -and several subfolders: ``variables``, ``assets``, and ``assets.extra``. For more information about the SavedModel directory, refer to the `README `__ file in the TensorFlow repository. +and several subfolders: ``variables``, ``assets``, and ``assets.extra``. For more information about the SavedModel directory, refer to the `README `__ file in the TensorFlow repository. To convert such TensorFlow model, run the ``mo`` script with a path to the SavedModel directory: .. code-block:: sh diff --git a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst index fc78b12640771a..3d2365f45ffe3b 100644 --- a/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst +++ b/docs/articles_en/documentation/legacy-features/transition-legacy-conversion-api/legacy-model-optimizer-extensibility.rst @@ -160,7 +160,7 @@ It is important to mention that sometimes it seems like transformation cannot be because the actual values of inputs or shapes are needed. In fact, manipulations of shapes or values can be implemented using operations that are added to the graph. 
Consider the ``extensions/front/onnx/flattenONNX_to_reshape.py`` transformation, which replaces an ONNX -`Flatten `__ operation with a sub-graph of operations performing +`Flatten `__ operation with a sub-graph of operations performing the following (when ``axis`` is not equal to 0 and 1): 1. Calculate a shape of the ``Flatten`` input tensor, using the :doc:`ShapeOf <../../openvino-ir-format/operation-sets/operation-specs/shape/shape-of-3>` operation. diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst index 2d5598a5eb8e9d..3959ebefb09a4a 100644 --- a/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-security-add-on.rst @@ -580,7 +580,7 @@ Building OpenVINO™ Security Add-on depends on OpenVINO™ Model Server docker 1. Download the `OpenVINO™ Model Server software `__ -2. Build the `OpenVINO™ Model Server Docker images `__ +2. Build the `OpenVINO™ Model Server Docker images `__ .. code-block:: sh diff --git a/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst b/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst index a7a81acd9ba3a7..8a5bd91f9c1b7b 100644 --- a/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst +++ b/docs/articles_en/documentation/openvino-ecosystem/openvino-training-extensions.rst @@ -32,9 +32,9 @@ If the results are unsatisfactory, add datasets and perform the same steps, star OpenVINO Training Extensions Components ####################################### -* `OpenVINO Training Extensions API `__ +* `OpenVINO Training Extensions API `__ * `OpenVINO Training Extensions CLI `__ -* `OpenVINO Training Extensions Algorithms `__ +* `OpenVINO Training Extensions Algorithms `__ Tutorials ######### diff --git a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst index 6ba9e0a9b60f52..9451fabd6219d8 100644 --- a/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst +++ b/docs/articles_en/documentation/openvino-extensibility/openvino-plugin-library/advanced-guides/low-precision-transformations.rst @@ -35,7 +35,7 @@ The goal of Low Precision Transformations (LPT) is to transform a quantized mode As result, operation input tensor precisions will be changed from original to low precision and operations can be inferred by OpenVINO™ plugin in low precision. -For a more detailed description on how to quantize a model, see the `Low precision tools <#low-precision-tools>`__ section below. For more information about model quantization, refer to **Brief History of Lower Precision in Deep Learning** section in `this whitepaper `__. +For a more detailed description on how to quantize a model, see the `Low precision tools <#low-precision-tools>`__ section below. For more information about model quantization, refer to **Brief History of Lower Precision in Deep Learning** section in `this whitepaper `__. 
Input model requirements ######################## diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/infrastructure/loop-5.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/infrastructure/loop-5.rst index 5cc1b024f158b1..f02c5414ac4369 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/infrastructure/loop-5.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/infrastructure/loop-5.rst @@ -11,7 +11,7 @@ Loop **Category**: *Infrastructure* **Short description**: *Loop* operation performs recurrent execution of the network, which is described in the ``body``, iterating through the data. -The operation has similar semantic to the ONNX Loop `operation `__. +The operation has similar semantic to the ONNX Loop `operation `__. **Detailed description** @@ -73,7 +73,7 @@ Loop operation description in the IR also has several special sections: ``body`` 1. The body operation getting an input from the main graph should have an entry in the ``port_map`` section of the Loop operation. These edges connect input ports of the Loop with the body ``Parameter``\ s. 2. Input tensors to the Loop can be sliced along a specified axis, the Loop can iterates over all sliced parts. The corresponding ``input`` entry in the ``port_map`` should have ``axis`` attribute specifying the axis to slice. Therefore, inputs to the Loop operation corresponding to ``input`` entries in the ``port_map`` without ``axis`` attribute are used "as is" (without slicing). 3. The body operation producing tensor to be used in the subsequent iterations (like in RNN models) should have a back edge described in the ``back_edges`` section of the operation. The back edge connects the respective body ``Parameter`` and ``Result`` operations. For such a case the Loop operation node provides input for the first iteration, while corresponding Loop operation output produces the tensor computed during the last iteration. -4. Output tensors produced by a particular body operation across all iterations can be concatenated and returned as a Loop operation output (this is a "scan output" according to the ONNX* Loop operation `specification `__ ). The corresponding ``output`` entry in the ``port_map`` should have ``axis`` attribute specifying the axis to concatenate. Therefore, outputs from operations corresponding to ``output`` entries in the ``port_map`` without ``axis`` attribute are returned "as is" (without concatenation). +4. Output tensors produced by a particular body operation across all iterations can be concatenated and returned as a Loop operation output (this is a "scan output" according to the ONNX* Loop operation `specification `__ ). The corresponding ``output`` entry in the ``port_map`` should have ``axis`` attribute specifying the axis to concatenate. Therefore, outputs from operations corresponding to ``output`` entries in the ``port_map`` without ``axis`` attribute are returned "as is" (without concatenation). 5. There is one body ``Parameter`` operation not connected through the ``port_map``. This is a "current iteration" input. The Loop operation is responsible for providing the appropriate value for each iteration. 6. Connection of nodes inside the Loop body with the main graph should be done through ``Parameter`` and ``Result`` body operations. No other ways to connect graphs are allowed. 
diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/gru-cell-3.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/gru-cell-3.rst index 28dbec46289f89..f58418ee923a8b 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/gru-cell-3.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/gru-cell-3.rst @@ -64,7 +64,7 @@ GRUCell * *linear_before_reset* * **Description**: *linear_before_reset* flag denotes if the layer behaves according to the modification - of *GRUCell* described in the formula in the `ONNX documentation `__. + of *GRUCell* described in the formula in the `ONNX documentation `__. * **Range of values**: true or false * **Type**: ``boolean`` * **Default value**: false diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/gru-sequence-5.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/gru-sequence-5.rst index 37c70087e121ea..f9b9a5ece850ec 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/gru-sequence-5.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/gru-sequence-5.rst @@ -19,7 +19,7 @@ represents a sequence of GRU cells. The sequence can be connected differently de ``direction`` attribute that specifies the direction of traversing of input data along sequence dimension or specifies whether it should be a bidirectional sequence. The most of the attributes are in sync with the specification of ONNX GRU operator defined -`GRUCell `__ +`GRUCell `__ **Attributes** @@ -69,7 +69,7 @@ are in sync with the specification of ONNX GRU operator defined * *linear_before_reset* * **Description**: *linear_before_reset* flag denotes if the layer behaves according to the modification - of *GRUCell* described in the formula in the `ONNX documentation `__. + of *GRUCell* described in the formula in the `ONNX documentation `__. * **Range of values**: True or False * **Type**: ``boolean`` * **Default value**: False diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/lstm-sequence-5.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/lstm-sequence-5.rst index c00b4c819cc66a..164033bdd2831c 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/lstm-sequence-5.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/lstm-sequence-5.rst @@ -14,7 +14,7 @@ LSTMSequence **Detailed description** -A single cell in the sequence is implemented in the same way as in :doc:`LSTM Cell ` operation. *LSTMSequence* represents a sequence of LSTM cells. The sequence can be connected differently depending on ``direction`` attribute that specifies the direction of traversing of input data along sequence dimension or specifies whether it should be a bidirectional sequence. The most of the attributes are in sync with the specification of ONNX LSTM operator defined `LSTMCell `__ . +A single cell in the sequence is implemented in the same way as in :doc:`LSTM Cell ` operation. *LSTMSequence* represents a sequence of LSTM cells. 
The sequence can be connected differently depending on ``direction`` attribute that specifies the direction of traversing of input data along sequence dimension or specifies whether it should be a bidirectional sequence. The most of the attributes are in sync with the specification of ONNX LSTM operator defined `LSTMCell `__ . **Attributes** diff --git a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/rnn-sequence-5.rst b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/rnn-sequence-5.rst index fc9829dd999bda..a3dfc062de2dcd 100644 --- a/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/rnn-sequence-5.rst +++ b/docs/articles_en/documentation/openvino-ir-format/operation-sets/operation-specs/sequence/rnn-sequence-5.rst @@ -14,7 +14,7 @@ RNNSequence **Detailed description** -A single cell in the sequence is implemented in the same way as in :doc:`RNNCell ` operation. *RNNSequence* represents a sequence of RNN cells. The sequence can be connected differently depending on `direction` attribute that specifies the direction of traversing of input data along sequence dimension or specifies whether it should be a bidirectional sequence. The most of the attributes are in sync with the specification of ONNX RNN operator defined `RNNCell `__. +A single cell in the sequence is implemented in the same way as in :doc:`RNNCell ` operation. *RNNSequence* represents a sequence of RNN cells. The sequence can be connected differently depending on `direction` attribute that specifies the direction of traversing of input data along sequence dimension or specifies whether it should be a bidirectional sequence. The most of the attributes are in sync with the specification of ONNX RNN operator defined `RNNCell `__. **Attributes** diff --git a/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst b/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst index dc43881780b1e6..e10a67fddadb53 100644 --- a/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst +++ b/docs/articles_en/get-started/configurations/configurations-intel-gpu.rst @@ -37,7 +37,7 @@ Below are the instructions on how to install the OpenCL packages on supported Li and install the apt package `ocl-icd-libopencl1` with the OpenCl ICD loader. Alternatively, you can add the apt repository by following the - `installation guide `__. + `installation guide `__. 
Then install the `ocl-icd-libopencl1`, `intel-opencl-icd`, `intel-level-zero-gpu` and `level-zero` apt packages: diff --git a/docs/articles_en/get-started/install-openvino/install-openvino-yocto.rst b/docs/articles_en/get-started/install-openvino/install-openvino-yocto.rst index 0ff1b95c8eb212..475f623ef86598 100644 --- a/docs/articles_en/get-started/install-openvino/install-openvino-yocto.rst +++ b/docs/articles_en/get-started/install-openvino/install-openvino-yocto.rst @@ -108,6 +108,6 @@ Additional Resources - `Official Yocto Project documentation `__ - `BitBake Tool `__ - `Poky `__ -- `Meta-intel `__ +- `Meta-intel `__ - `Meta-openembedded `__ - `Meta-clang `__ \ No newline at end of file diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst index 7f220111f64b98..2476a0423e30e1 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst @@ -31,8 +31,8 @@ some examples of popular Generative AI scenarios: To write such pipelines, you can follow the examples provided as part of OpenVINO: -* `OpenVINO Latent Consistency Model C++ image generation pipeline `__ -* `OpenVINO Stable Diffusion (with LoRA) C++ image generation pipeline `__ +* `OpenVINO Latent Consistency Model C++ image generation pipeline `__ +* `OpenVINO Stable Diffusion (with LoRA) C++ image generation pipeline `__ To perform inference, models must be first converted to OpenVINO IR format using Hugging Face Optimum-Intel API. diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst b/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst index d6e23b3791d001..2064aa843a93d8 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/ov-tokenizers.rst @@ -336,7 +336,7 @@ Additional Resources * `OpenVINO Tokenizers repo `__ * `OpenVINO Tokenizers Notebook `__ -* `Text generation C++ samples that support most popular models like LLaMA 2 `__ +* `Text generation C++ samples that support most popular models like LLaMA 3 `__ * `OpenVINO GenAI Repo `__ diff --git a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/filter-pruning.rst b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/filter-pruning.rst index 5033d24ba3785a..2a551d7aa44eb5 100644 --- a/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/filter-pruning.rst +++ b/docs/articles_en/openvino-workflow/model-optimization-guide/compressing-models-during-training/filter-pruning.rst @@ -76,7 +76,7 @@ of optimization methods (`"compression"` section). :fragment: [nncf_congig] Here is a brief description of the required parameters of the Filter Pruning method. For a full description refer to the -`GitHub `__ page. +`GitHub `__ page. * ``pruning_init`` - initial pruning rate target. For example, value ``0.1`` means that at the begging of training, convolutions that can be pruned will have 10% of their filters set to zero. 
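For context on the ``pruning_init`` parameter described in the filter-pruning documentation touched above, a minimal, assumption-based sketch of the NNCF ``compression`` section is shown below; the parameter names under ``params`` (``schedule``, ``pruning_target``, ``pruning_steps``) are assumptions based on typical NNCF filter-pruning configurations and should be verified against the NNCF GitHub page referenced in the diff.

    # Hypothetical NNCF configuration dict with a filter-pruning "compression" section.
    # Only "algorithm" and "pruning_init" come from the documentation above; the keys
    # under "params" are assumptions -- check the NNCF documentation before relying on them.
    nncf_config_dict = {
        "input_info": {"sample_size": [1, 3, 224, 224]},
        "compression": {
            "algorithm": "filter_pruning",
            "pruning_init": 0.1,        # 10% of prunable filters set to zero at the start of training
            "params": {
                "schedule": "exponential",   # assumed: how the pruning rate grows over epochs
                "pruning_target": 0.4,       # assumed: final share of filters to prune
                "pruning_steps": 15,         # assumed: epochs over which the rate ramps up
            },
        },
    }

Such a dict would typically be loaded into an NNCF config object and passed to the compression-aware training entry point; the exact API call depends on the NNCF version in use.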
diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst index 41d43f7eea37d6..aa8e9cdabfda64 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes.rst @@ -83,7 +83,7 @@ Accordingly, the code that loops over all available devices of the "GPU" type on Additional Resources #################### -* `OpenVINO™ Runtime API Tutorial <./../../notebooks/openvino-api-with-output.html>`__ -* `AUTO Device Tutorial <./../../notebooks/auto-device-with-output.html>`__ -* `GPU Device Tutorial <./../../notebooks/gpu-device-with-output.html>`__ -* `NPU Device Tutorial <./../../notebooks/hello-npu-with-output.html>`__ \ No newline at end of file +* `OpenVINO™ Runtime API Tutorial <../../notebooks/openvino-api-with-output.html>`__ +* `AUTO Device Tutorial <../../notebooks/auto-device-with-output.html>`__ +* `GPU Device Tutorial <../../notebooks/gpu-device-with-output.html>`__ +* `NPU Device Tutorial <../../notebooks/hello-npu-with-output.html>`__ \ No newline at end of file diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst index 78cf0632f61b2b..b4e1c7ac15afcc 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.rst @@ -19,7 +19,7 @@ For an in-depth description of the GPU plugin, see: - `GPU plugin developer documentation `__ - `OpenVINO Runtime GPU plugin source files `__ -- `Accelerate Deep Learning Inference with Intel® Processor Graphics `__ +- `Start AI Development with Intel `__ The GPU plugin is a part of the Intel® Distribution of OpenVINO™ toolkit. For more information on how to configure a system to use it, see the :doc:`GPU configuration <../../../get-started/configurations/configurations-intel-gpu>`. diff --git a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-throughput/advanced_throughput_options.rst b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-throughput/advanced_throughput_options.rst index 7466d00efe5eb7..cad5633e11f85b 100644 --- a/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-throughput/advanced_throughput_options.rst +++ b/docs/articles_en/openvino-workflow/running-inference/optimize-inference/optimizing-throughput/advanced_throughput_options.rst @@ -85,12 +85,12 @@ Number of Streams Considerations * Select the number of streams that is **less or equal** to the number of requests that the application would be able to run simultaneously. * To avoid wasting resources, the number of streams should be enough to meet the *average* parallel slack rather than the peak load. -* Use the `ov::streams::AUTO `__ as a more portable option (that also respects the underlying hardware configuration). +* Use the `ov::streams::AUTO <../../../../api/c_cpp_api/group__ov__runtime__cpp__prop__api.html#_CPPv44AUTO>`__ as a more portable option (that also respects the underlying hardware configuration). 
* It is very important to keep these streams busy, by running as many inference requests as possible (for example, start the newly-arrived inputs immediately): - * A bare minimum of requests to saturate the device can be queried as the `ov::optimal_number_of_infer_requests `__ of the ``ov:Compiled_Model``. + * A bare minimum of requests to saturate the device can be queried as the `ov::optimal_number_of_infer_requests <../../../../api/c_cpp_api/group__ov__runtime__cpp__prop__api.html#_CPPv432optimal_number_of_infer_requests>`__ of the ``ov:Compiled_Model``. -* *The maximum number of streams* for the device (per model) can be queried as the `ov::range_for_streams `__. +* *The maximum number of streams* for the device (per model) can be queried as the `ov::range_for_streams <../../../../api/c_cpp_api/group__ov__runtime__cpp__prop__api.html#_CPPv417range_for_streams>`__. Batch Size Considerations +++++++++++++++++++++++++ @@ -99,7 +99,7 @@ Batch Size Considerations * Otherwise (or if the number of "available" requests fluctuates), you may need to keep several instances of the network (reshaped to the different batch size) and select the properly sized instance in the runtime accordingly. -* For OpenVINO devices that implement a dedicated heuristic internally, the `ov::optimal_batch_size `__ is a *device* property (that accepts the actual model as a parameter) to query the recommended batch size for the model. +* For OpenVINO devices that implement a dedicated heuristic internally, the `ov::optimal_batch_size <../../../../api/c_cpp_api/group__ov__runtime__cpp__prop__api.html#_CPPv418optimal_batch_size>`__ is a *device* property (that accepts the actual model as a parameter) to query the recommended batch size for the model. A Few Device-specific Details diff --git a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst index 86788b20249a3f..d00fd19c4d636d 100644 --- a/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst +++ b/docs/articles_en/openvino-workflow/running-inference/stateful-models.rst @@ -139,5 +139,5 @@ sequences. You can find more examples demonstrating how to work with states in other articles: -* `LLM Chatbot notebook <../../notebooks/stable-zephyr-3b-chatbot-with-output.html>`__ +* `LLaVA-NeXT Multimodal Chatbot notebook <../../notebooks/llava-next-multimodal-chatbot-with-output.html>`__ * :doc:`Serving Stateful Models with OpenVINO Model Server <../../openvino-workflow/model-server/ovms_docs_stateful_models>` diff --git a/docs/articles_en/openvino-workflow/running-inference/string-tensors.rst b/docs/articles_en/openvino-workflow/running-inference/string-tensors.rst index 438c9ea9ec0bd3..3032add547f8a8 100644 --- a/docs/articles_en/openvino-workflow/running-inference/string-tensors.rst +++ b/docs/articles_en/openvino-workflow/running-inference/string-tensors.rst @@ -201,6 +201,6 @@ Additional Resources * Learn about the :doc:`basic steps to integrate inference in your application `. -* Use `OpenVINO tokenizers `__ to produce models that use string tensors to work with textual information as pre- and post-processing for the large language models. +* Use `OpenVINO tokenizers `__ to produce models that use string tensors to work with textual information as pre- and post-processing for the large language models. -* Check out `GenAI Samples `__ to see how string tensors are used in real-life applications. 
+* Check out `GenAI Samples `__ to see how string tensors are used in real-life applications. diff --git a/docs/articles_en/openvino-workflow/torch-compile.rst b/docs/articles_en/openvino-workflow/torch-compile.rst index 5bdb51a596d5d8..e5bc0ca901a5aa 100644 --- a/docs/articles_en/openvino-workflow/torch-compile.rst +++ b/docs/articles_en/openvino-workflow/torch-compile.rst @@ -288,7 +288,7 @@ PyTorch supports ``torch.compile`` officially on Windows from version 2.3.0 onwa For PyTorch versions below 2.3.0, the ``torch.compile`` feature is not supported on Windows officially. However, it can be accessed by running the following instructions: -1. Install the PyTorch nightly wheel file - `2.1.0.dev20230713 `__ , +1. Install the PyTorch nightly wheel file - `2.1.0.dev20230713 `__ , 2. Update the file at ``/Lib/site-packages/torch/_dynamo/eval_frames.py`` 3. Find the function called ``check_if_dynamo_supported()``: @@ -374,7 +374,7 @@ The ``torch.compile`` feature is part of PyTorch 2.0, and is based on: (PEP 523) to dynamically modify Python bytecode right before it is executed (PyTorch operators that cannot be extracted to FX graph are executed in the native Python environment). It maintains the eager-mode capabilities using - `Guards `__ to ensure the + `Guards `__ to ensure the generated graphs are valid. * **AOTAutograd** - generates the backward graph corresponding to the forward graph captured by TorchDynamo. From a97ff61747661ba362c1b390a12c41809306858e Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Wed, 13 Nov 2024 13:51:18 +0100 Subject: [PATCH 08/28] [CI][GHA] Manylinux x86 build (#27430) ### Details: - Enabled manilinux x86 build based on manylinux 2014 image - OpenVINO tarball package - Wheels for 3.9-3.13 Pythons ### Tickets: - *148719* --------- Co-authored-by: Alina Kladieva Co-authored-by: Ilya Lavrenov --- .github/dockerfiles/docker_tag | 2 +- .../ov_build/manylinux2014_x86_64/Dockerfile | 20 ++ .../ubuntu_22_04_x64_docker/Dockerfile | 42 ++++ .github/workflows/manylinux_2014.yml | 191 ++++++++++++++++++ 4 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 .github/dockerfiles/ov_build/manylinux2014_x86_64/Dockerfile create mode 100644 .github/dockerfiles/ov_build/ubuntu_22_04_x64_docker/Dockerfile create mode 100644 .github/workflows/manylinux_2014.yml diff --git a/.github/dockerfiles/docker_tag b/.github/dockerfiles/docker_tag index 5a4f7795ea4a44..3783a7e8d5600a 100644 --- a/.github/dockerfiles/docker_tag +++ b/.github/dockerfiles/docker_tag @@ -1 +1 @@ -pr-27384 +pr-27430 diff --git a/.github/dockerfiles/ov_build/manylinux2014_x86_64/Dockerfile b/.github/dockerfiles/ov_build/manylinux2014_x86_64/Dockerfile new file mode 100644 index 00000000000000..59239575be329c --- /dev/null +++ b/.github/dockerfiles/ov_build/manylinux2014_x86_64/Dockerfile @@ -0,0 +1,20 @@ +ARG REGISTRY="quay.io" +FROM openvinogithubactions.azurecr.io/quayio/pypa/manylinux2014_x86_64 + +USER root + +# Install build dependencies +ADD install_build_dependencies.sh /install_build_dependencies.sh +RUN chmod +x /install_build_dependencies.sh && /install_build_dependencies.sh + +# Install sscache +ARG SCCACHE_VERSION="v0.7.5" +ENV SCCACHE_HOME="/opt/sccache" \ + SCCACHE_PATH="/opt/sccache/sccache" + +RUN mkdir ${SCCACHE_HOME} && cd ${SCCACHE_HOME} && \ + SCCACHE_ARCHIVE="sccache-${SCCACHE_VERSION}-x86_64-unknown-linux-musl.tar.gz" && \ + curl -SLO https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/${SCCACHE_ARCHIVE} && \ + tar -xzf ${SCCACHE_ARCHIVE} 
--strip-components=1 && rm ${SCCACHE_ARCHIVE} + +ENV PATH="$SCCACHE_HOME:$PATH" diff --git a/.github/dockerfiles/ov_build/ubuntu_22_04_x64_docker/Dockerfile b/.github/dockerfiles/ov_build/ubuntu_22_04_x64_docker/Dockerfile new file mode 100644 index 00000000000000..2d5bc1c878069a --- /dev/null +++ b/.github/dockerfiles/ov_build/ubuntu_22_04_x64_docker/Dockerfile @@ -0,0 +1,42 @@ +ARG REGISTRY="docker.io" +FROM ${REGISTRY}/library/ubuntu:22.04 + +USER root + +# APT configuration +RUN echo 'Acquire::Retries "10";' > /etc/apt/apt.conf && \ + echo 'APT::Get::Assume-Yes "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::Fix-Broken "true";' >> /etc/apt/apt.conf && \ + echo 'APT::Get::no-install-recommends "true";' >> /etc/apt/apt.conf + +ENV DEBIAN_FRONTEND="noninteractive" \ + TZ="Europe/London" + +RUN apt-get update && \ + apt-get install software-properties-common && \ + add-apt-repository --yes --no-update ppa:git-core/ppa && \ + add-apt-repository --yes --no-update ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install \ + curl \ + git \ + gpg-agent \ + tzdata \ + # parallel gzip + pigz \ + python3 \ + python3-pip \ + && \ + rm -rf /var/lib/apt/lists/* + +# Install docker +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | \ + gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg && \ + echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] \ + https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + +RUN apt-get update && \ + apt-get install -y docker-ce docker-ce-cli containerd.io + +ENV DOCKER_BUILDKIT=1 \ No newline at end of file diff --git a/.github/workflows/manylinux_2014.yml b/.github/workflows/manylinux_2014.yml new file mode 100644 index 00000000000000..ed375fb868459f --- /dev/null +++ b/.github/workflows/manylinux_2014.yml @@ -0,0 +1,191 @@ +name: Manylinux 2014 +on: + workflow_dispatch: + pull_request: + merge_group: + push: + branches: + - master + - 'releases/**' + +concurrency: + # github.ref is not unique in post-commit + group: ${{ github.event_name == 'push' && github.run_id || github.ref }}-manylinux-2014 + cancel-in-progress: true + +permissions: read-all + +env: + PIP_CACHE_PATH: /mount/caches/pip/linux + +jobs: + Smart_CI: + runs-on: ubuntu-latest + outputs: + affected_components: "${{ steps.smart_ci.outputs.affected_components }}" + changed_components: "${{ steps.smart_ci.outputs.changed_components }}" + skip_workflow: "${{ steps.smart_ci.outputs.skip_workflow }}" + steps: + - name: checkout action + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + sparse-checkout: .github/actions/smart-ci + + - name: Get affected components + id: smart_ci + uses: ./.github/actions/smart-ci + with: + repository: ${{ github.repository }} + pr: ${{ github.event.number }} + commit_sha: ${{ github.sha }} + ref_name: ${{ github.ref_name }} + component_pattern: "category: (.*)" + repo_token: ${{ secrets.GITHUB_TOKEN }} + skip_when_only_listed_labels_set: 'docs' + skip_when_only_listed_files_changed: '*.md,*.rst,*.png,*.jpg,*.svg' + + - name: Show affected components + run: | + echo "${{ toJSON(steps.smart_ci.outputs.affected_components) }}" + shell: bash + + Docker: + needs: Smart_CI + if: "!needs.smart_ci.outputs.skip_workflow" + runs-on: aks-linux-4-cores-16gb-docker-build + container: + image: openvinogithubactions.azurecr.io/docker_build:0.2 + volumes: + - /mount:/mount + outputs: + images: "${{ 
steps.handle_docker.outputs.images }}" + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - uses: ./.github/actions/handle_docker + id: handle_docker + with: + images: | + ov_build/ubuntu_22_04_x64_docker + ov_build/manylinux2014_x86_64 + registry: 'openvinogithubactions.azurecr.io' + dockerfiles_root_dir: '.github/dockerfiles' + changed_components: ${{ needs.smart_ci.outputs.changed_components }} + + Build: + needs: [Docker] + timeout-minutes: 120 + defaults: + run: + shell: bash + runs-on: aks-linux-16-cores-32gb-manylinux + if: ${{ github.repository_owner == 'openvinotoolkit' }} + container: + image: ${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_22_04_x64_docker }} + volumes: + - /mount:/mount + options: -e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING -e DOCKER_CONFIG -v ${{ github.workspace }}:${{ github.workspace }} + env: + CMAKE_BUILD_TYPE: 'Release' + OPENVINO_REPO: ${{ github.workspace }}/src + INSTALL_DIR: ${{ github.workspace }}/install/openvino + INSTALL_WHEELS_DIR: ${{ github.workspace }}/install/wheels + BUILD_DIR: ${{ github.workspace }}/build + DOCKER_CONFIG: "/mount/.docker" + CMAKE_CXX_COMPILER_LAUNCHER: sccache + CMAKE_C_COMPILER_LAUNCHER: sccache + SCCACHE_IGNORE_SERVER_IO_ERROR: 1 + SCCACHE_SERVER_PORT: 35555 + SCCACHE_CACHE_SIZE: 50G + SCCACHE_AZURE_KEY_PREFIX: manylinux_2014 + + steps: + - name: Clone OpenVINO + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: ${{ env.OPENVINO_REPO }} + submodules: 'true' + + - name: System info + uses: ./src/.github/actions/system_info + + - name: Create docker build cache + run: | + docker volume create ov_build_cache + + - name: Build OpenVINO + run: | + docker run --rm \ + -v ${{ env.OPENVINO_REPO }}:/work/src \ + -v ov_build_cache:/work/build \ + -v ${{ env.INSTALL_DIR }}:/work/install \ + -e SCCACHE_AZURE_BLOB_CONTAINER \ + -e SCCACHE_AZURE_CONNECTION_STRING \ + -e SCCACHE_SERVER_PORT \ + -e SCCACHE_IGNORE_SERVER_IO_ERROR \ + -e SCCACHE_CACHE_SIZE \ + -e SCCACHE_AZURE_KEY_PREFIX \ + -e CMAKE_CXX_COMPILER_LAUNCHER \ + -e CMAKE_C_COMPILER_LAUNCHER \ + -w /work/src \ + ${{ fromJSON(needs.docker.outputs.images).ov_build.manylinux2014_x86_64 }} \ + /bin/bash -c " + cmake -DENABLE_CPPLINT=OFF -DENABLE_NCC_STYLE=OFF -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_PYTHON=OFF -DENABLE_WHEEL=OFF -S /work/src -B /work/build && + cmake --build /work/build --parallel $(nproc) --config ${{ env.CMAKE_BUILD_TYPE }} && + cmake --install /work/build --config ${{ env.CMAKE_BUILD_TYPE }} --prefix /work/install + " + + - name: Pack Artifacts + run: mkdir -p ${{ env.BUILD_DIR }} && tar -cvf - * | pigz > ${{ env.BUILD_DIR }}/openvino_package.tar.gz + working-directory: ${{ env.INSTALL_DIR }} + + - name: Build Python API(Python 3.9-3.13) + run: | + SUPPORTED_PYTHON_VERSIONS=("39" "310" "311" "312" "313") + for PY_VER in "${SUPPORTED_PYTHON_VERSIONS[@]}"; do + python_path=/opt/python/cp${PY_VER}-cp${PY_VER}/bin + docker run --rm \ + -v ${{ env.OPENVINO_REPO }}:/work/src \ + -v ${{ env.INSTALL_WHEELS_DIR }}:/work/wheels \ + -v ${{ env.PIP_CACHE_PATH }}:/work/pip_cache \ + -v ov_build_cache:/work/build \ + -e SCCACHE_AZURE_BLOB_CONTAINER \ + -e SCCACHE_AZURE_CONNECTION_STRING \ + -e SCCACHE_SERVER_PORT \ + -e SCCACHE_IGNORE_SERVER_IO_ERROR \ + -e SCCACHE_CACHE_SIZE \ + -e SCCACHE_AZURE_KEY_PREFIX \ + -e CMAKE_CXX_COMPILER_LAUNCHER \ + -e CMAKE_C_COMPILER_LAUNCHER \ + -w /work/src \ + ${{ 
fromJSON(needs.docker.outputs.images).ov_build.manylinux2014_x86_64 }} \ + /bin/bash -c " + export PATH=${python_path}:\$PATH + PIP_VER=$(python3 -c "import pip; print(pip.__version__)") + export "PIP_CACHE_DIR=/work/pip_cache/${PIP_VER}" + python3 -m pip install -r /work/src/src/bindings/python/wheel/requirements-dev.txt && + cmake -DOpenVINODeveloperPackage_DIR=/work/build -DENABLE_PYTHON=ON -DENABLE_WHEEL=ON -S /work/src/src/bindings/python -B /work/build_py${PY_VER} && + cmake --build /work/build_py${PY_VER} --parallel $(nproc) --target ie_wheel --config ${{ env.CMAKE_BUILD_TYPE }} && + cmake --install /work/build_py${PY_VER} --config ${{ env.CMAKE_BUILD_TYPE }} --prefix /work/wheels --component python_wheels + " + done + + # + # Upload build artifacts + # + - name: Upload openvino package + if: ${{ always() }} + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + with: + name: openvino_package + path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz + if-no-files-found: 'error' + + - name: Upload openvino wheels + if: ${{ always() }} + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 + with: + name: openvino_wheels + path: ${{ env.INSTALL_WHEELS_DIR }}/wheels/*.whl + if-no-files-found: 'error' \ No newline at end of file From a0940bbbf7a4bad9e9e337a0c17b30708ce1f064 Mon Sep 17 00:00:00 2001 From: Anastasia Kuporosova Date: Wed, 13 Nov 2024 14:22:06 +0100 Subject: [PATCH 09/28] [PyOV] Restrict changing data in const (#27431) ### Details: - Const op meant to be non-changeable ### Tickets: - CVS-124319 --- .../python/src/pyopenvino/core/common.cpp | 8 +- .../python/tests/test_graph/test_constant.py | 120 +----------------- 2 files changed, 12 insertions(+), 116 deletions(-) diff --git a/src/bindings/python/src/pyopenvino/core/common.cpp b/src/bindings/python/src/pyopenvino/core/common.cpp index a202c3a3801001..10ae0ed0ea6042 100644 --- a/src/bindings/python/src/pyopenvino/core/common.cpp +++ b/src/bindings/python/src/pyopenvino/core/common.cpp @@ -358,10 +358,14 @@ py::array array_from_constant_copy(ov::op::v0::Constant&& c, py::dtype& dst_dtyp py::array array_from_constant_view(ov::op::v0::Constant&& c) { const auto& ov_type = c.get_element_type(); const auto dtype = Common::type_helpers::get_dtype(ov_type); + py::array data; if (ov_type.bitwidth() < Common::values::min_bitwidth) { - return py::array(dtype, c.get_byte_size(), c.get_data_ptr(), py::cast(c)); + data = py::array(dtype, c.get_byte_size(), c.get_data_ptr(), py::cast(c)); + } else { + data = py::array(dtype, c.get_shape(), constant_helpers::_get_strides(c), c.get_data_ptr(), py::cast(c)); } - return py::array(dtype, c.get_shape(), constant_helpers::_get_strides(c), c.get_data_ptr(), py::cast(c)); + data.attr("flags").attr("writeable") = false; + return data; } }; // namespace array_helpers diff --git a/src/bindings/python/tests/test_graph/test_constant.py b/src/bindings/python/tests/test_graph/test_constant.py index 131654855b380a..7b349ad7cd94b1 100644 --- a/src/bindings/python/tests/test_graph/test_constant.py +++ b/src/bindings/python/tests/test_graph/test_constant.py @@ -205,53 +205,12 @@ def test_init_with_scalar(init_value, src_dtype, dst_dtype, shared_flag, data_ge assert np.allclose(const_data, expected_result) -@pytest.mark.parametrize( - ("src_dtype"), - [ - (np.float16), - (np.uint16), - ], -) -@pytest.mark.parametrize( - ("shared_flag"), - [ - (True), - (False), - ], -) -@pytest.mark.parametrize( - ("data_getter"), - [ - (DataGetter.COPY), - 
(DataGetter.VIEW), - ], -) -def test_init_bf16_populate(src_dtype, shared_flag, data_getter): - data = np.random.rand(1, 2, 16, 8) + 0.5 - data = data.astype(src_dtype) - - # To create bf16 constant, allocate memory and populate it: - init_data = np.zeros(shape=data.shape, dtype=src_dtype) - ov_const = ops.constant(init_data, dtype=Type.bf16, shared_memory=shared_flag) - ov_const.data[:] = data - - # Check shape and element type of Constant class - assert isinstance(ov_const, Constant) - assert np.all(list(ov_const.shape) == [1, 2, 16, 8]) - assert ov_const.get_element_type() == Type.bf16 - - _dst_dtype = Type.bf16.to_dtype() - - assert ov_const.get_element_type().to_dtype() == _dst_dtype - # Compare values to Constant - if data_getter == DataGetter.COPY: - const_data = ov_const.get_data() - elif data_getter == DataGetter.VIEW: - const_data = ov_const.data - else: - raise AttributeError("Unknown DataGetter passed!") - assert const_data.dtype == _dst_dtype - assert np.allclose(const_data, data) +def test_cant_change_data_in_const(): + arr_0 = np.ones([1, 3, 32, 32]) + ov_const = ops.constant(arr_0) + arr_1 = np.ones([1, 3, 32, 32]) + 1 + with pytest.raises(ValueError, match="assignment destination is read-only"): + ov_const.data[:] = arr_1 @pytest.mark.parametrize( @@ -286,58 +245,6 @@ def test_init_bf16_direct(ov_type, numpy_dtype, shared_flag): assert np.allclose(data, result, rtol=0.01) -@pytest.mark.parametrize( - "shape", - [ - ([1, 3, 28, 28]), - ([1, 3, 27, 27]), - ], -) -@pytest.mark.parametrize( - ("low", "high", "ov_type", "src_dtype"), - [ - (0, 2, Type.u1, np.uint8), - (0, 16, Type.u4, np.uint8), - (-8, 7, Type.i4, np.int8), - (0, 16, Type.nf4, np.uint8), - ], -) -@pytest.mark.parametrize( - ("shared_flag"), - [ - (True), - (False), - ], -) -@pytest.mark.parametrize( - ("data_getter"), - [ - (DataGetter.COPY), - (DataGetter.VIEW), - ], -) -def test_constant_helper_packing(shape, low, high, ov_type, src_dtype, shared_flag, data_getter): - data = np.random.uniform(low, high, shape).astype(src_dtype) - - # Allocate memory first: - ov_const = ops.constant(np.zeros(shape=data.shape, dtype=src_dtype), - dtype=ov_type, - shared_memory=shared_flag) - # Fill data with packed values - packed_data = pack_data(data, ov_const.get_element_type()) - ov_const.data[:] = packed_data - - # Always unpack the data! 
- if data_getter == DataGetter.COPY: - unpacked = unpack_data(ov_const.get_data(), ov_const.get_element_type(), ov_const.shape) - elif data_getter == DataGetter.VIEW: - unpacked = unpack_data(ov_const.data, ov_const.get_element_type(), ov_const.shape) - else: - raise AttributeError("Unknown DataGetter passed!") - - assert np.array_equal(unpacked, data) - - @pytest.mark.parametrize( ("ov_type", "src_dtype"), [ @@ -380,21 +287,6 @@ def test_constant_direct_packing(ov_type, src_dtype, shared_flag, data_getter): assert not np.shares_memory(unpacked, data) -@pytest.mark.parametrize( - ("shared_flag"), - [ - (True), - (False), - ], -) -def test_write_to_buffer(shared_flag): - arr_0 = np.ones([1, 3, 32, 32]) - ov_const = ops.constant(arr_0, shared_memory=shared_flag) - arr_1 = np.ones([1, 3, 32, 32]) + 1 - ov_const.data[:] = arr_1 - assert np.array_equal(ov_const.data, arr_1) - - @pytest.mark.parametrize( ("shared_flag"), [ From eb38e67be6e6992445e7f4f1ea8014422d015512 Mon Sep 17 00:00:00 2001 From: Andrzej Kopytko Date: Wed, 13 Nov 2024 15:02:22 +0100 Subject: [PATCH 10/28] [DOCS] Preselection and sorting (#27538) ### Details: - *item1* - *...* ### Tickets: - *ticket-id* --- docs/sphinx_setup/_static/html/modal.html | 3 - docs/sphinx_setup/_static/html/modalLLM.html | 3 - docs/sphinx_setup/_static/js/graphs.js | 76 +++++--------------- 3 files changed, 17 insertions(+), 65 deletions(-) diff --git a/docs/sphinx_setup/_static/html/modal.html b/docs/sphinx_setup/_static/html/modal.html index ac425599b821ce..38eb673824f97e 100644 --- a/docs/sphinx_setup/_static/html/modal.html +++ b/docs/sphinx_setup/_static/html/modal.html @@ -11,9 +11,6 @@

Configure Graphs

-
- Clear All -
diff --git a/docs/sphinx_setup/_static/html/modalLLM.html b/docs/sphinx_setup/_static/html/modalLLM.html index e3395a16931188..37b569d0bd4078 100644 --- a/docs/sphinx_setup/_static/html/modalLLM.html +++ b/docs/sphinx_setup/_static/html/modalLLM.html @@ -11,9 +11,6 @@

Configure Graphs

-
- Clear All -
diff --git a/docs/sphinx_setup/_static/js/graphs.js b/docs/sphinx_setup/_static/js/graphs.js index 168c1c348e7a08..7171aed374dd99 100644 --- a/docs/sphinx_setup/_static/js/graphs.js +++ b/docs/sphinx_setup/_static/js/graphs.js @@ -9,7 +9,6 @@ class Filter { .forEach(item => optionMap.set(item.Platform, item)); return Array.from(optionMap.values()); } - // param: GraphData[], ieType static ByIeTypes(graphDataArr, ieTypes) { const optionMap = new Map(); @@ -18,7 +17,6 @@ class Filter { .forEach(item => optionMap.set(item.Platform, item)); return Array.from(optionMap.values()); } - // param: GraphData[], ieType, networkModels static ByTypesAndModels(graphDataArr, ieTypes, models) { return Array.from( @@ -26,9 +24,8 @@ class Filter { .filter(({ PlatformType, Model }) => ieTypes.includes(PlatformType) && models.includes(Model)) .reduce((map, item) => map.set(item.Platform, item), new Map()) .values() - ).sort((a, b) => a.Platform.localeCompare(b.Platform)); + ); } - // param: GraphData[], clientPlatforms static ByIeKpis(graphDataArr, clientPlatforms) { return Array.from( @@ -40,7 +37,6 @@ class Filter { }, new Set()) ); } - // param: GraphData[] static getParameters(graphDataArr) { var parameters = [] @@ -51,7 +47,6 @@ class Filter { }) return parameters; } - // param: GraphData[] static getIeTypes(graphDataArr) { var kpis = [] @@ -62,21 +57,12 @@ class Filter { }) return kpis; } - // param: GraphData[], clientPlatforms[] static ByClientPlatforms(graphDataArr, platformsArr) { return graphDataArr.filter((data) => { return platformsArr.includes(data.Platform) }); } - - // param: GraphData[], coreTypes[] - static FilterByCoreTypes(graphDataArr, coreTypes) { - if (coreTypes) { - return graphDataArr.filter((data) => coreTypes.includes(data.PlatformType)); - } - return graphDataArr; - } } class Modal { @@ -114,15 +100,13 @@ class Graph { .sort((a, b) => a.localeCompare(b)); } static getIeTypes(graphDataArr) { - return Array.from(new Set(graphDataArr.map((obj) => obj.PlatformType))); - } - static getCoreTypes(graphDataArr) { - return Array.from(new Set(graphDataArr.map((obj) => obj.ieType))); + return Array.from(new Set(graphDataArr.map((obj) => obj.PlatformType))).sort((a, b) => a.localeCompare(b)); } // param: GraphData[] static getPlatformNames(graphDataArr) { - return graphDataArr.map((data) => data.Platform); + return graphDataArr.map((data) => data.Platform) + .sort((a, b) => a.localeCompare(b)); } // param: GraphData[], engine: string, precisions: list @@ -297,13 +281,13 @@ $(document).ready(function () { const models = networkModels.map((networkModel) => createCheckMark(networkModel, 'networkmodel')); modal.find('.models-column').append(models); - const selectAllModelsButton = createCheckMark('', 'networkmodel'); + const selectAllModelsButton = createCheckMark('', 'networkmodel', false , false); modal.find('.models-selectall').append(selectAllModelsButton); - const selectAllPlatformsButton = createCheckMark('', 'platform'); + const selectAllPlatformsButton = createCheckMark('', 'platform', false , false); modal.find('.platforms-selectall').append(selectAllPlatformsButton); - const precisions = Modal.getPrecisionsLabels(graph).map((precision) => createCheckMark(precision, 'precision', false)); + const precisions = Modal.getPrecisionsLabels(graph).map((precision) => createCheckMark(precision, 'precision', false , false)); modal.find('.precisions-column').append(precisions); selectAllCheckboxes(precisions); @@ -318,21 +302,17 @@ $(document).ready(function () { 
modal.find('#modal-display-graphs').hide(); modal.find('.ietype-column input').first().prop('checked', true); - const kpiLabels = Filter.getParameters(graph).map((parameter) => createCheckMark(parameter, 'kpi', false)); + const kpiLabels = Filter.getParameters(graph).map((parameter) => createCheckMark(parameter, 'kpi', false , true)); modal.find('.kpi-column').append(kpiLabels); $('body').prepend(modal); - preselectDefaultSettings(graph, modal, appConfig); - - //is not generic solution :( if (appConfig.DefaultSelections.platformTypes?.data?.includes('Select All')) { selectAllCheckboxes(iefilter); - }; + preselectDefaultSettings(graph, modal, appConfig); renderClientPlatforms(graph, modal); - $('.clear-all-btn').on('click', clearAll); $('#build-graphs-btn').on('click', () => { $('#modal-configure-graphs').hide(); clickBuildGraphs(graph, appConfig, getSelectedNetworkModels(), getSelectedIeTypes(), getSelectedClientPlatforms(), getSelectedKpis(), Modal.getPrecisions(appConfig, getSelectedPrecisions()), isLLM); @@ -409,19 +389,9 @@ $(document).ready(function () { precisions.prop('disabled', false); } - function clearAll() { - $('.modal-content-grid-container input:checkbox').each((index, object) => $(object).prop('checked', false)); - validatePrecisionSelection(); - validateSelections(); - } - function preselectDefaultSettings(graph, modal, appConfig) { - - const defaultSelections = appConfig.DefaultSelections; - selectDefaultPlatformType(defaultSelections.platformTypes, graph, modal); - applyPlatformFilters(defaultSelections.platformFilters, modal, graph); - clearAllSettings(defaultSelections); - + selectDefaultPlatformType(appConfig.DefaultSelections.platformTypes, graph, modal); + clearAllSettings(appConfig.DefaultSelections); validateSelections(); validatePrecisionSelection(); } @@ -431,17 +401,8 @@ $(document).ready(function () { $(`input[data-ietype="${type}"]`).prop('checked', true); renderClientPlatforms(graph, modal); } - function applyPlatformFilters(platformFilters, modal, graph) { - if (!platformFilters) return; - const filters = modal.find('.selectable-box-container').children('.selectable-box'); - filters.removeClass('selected'); - platformFilters.data.forEach(selection => { - filters.filter(`[data-${platformFilters.name}="${selection}"]`).addClass('selected'); - }); - renderClientPlatforms(graph, modal); - } + function clearAllSettings(defaultSelections) { - clearAll(); Object.keys(defaultSelections).forEach(setting => { const { name, data } = defaultSelections[setting]; data.forEach(selection => { @@ -463,7 +424,7 @@ $(document).ready(function () { var platformNames = Graph.getPlatformNames(fPlatforms); $('.platforms-column .checkmark-container').remove(); - const clientPlatforms = platformNames.map((platform) => createCheckMark(platform, 'platform', true)); + const clientPlatforms = platformNames.map((platform) => createCheckMark(platform, 'platform', true, false)); var enabledPlatforms = filterPlatforms(graph, getSelectedIeTypes(), getSelectedNetworkModels()); enableCheckBoxes(clientPlatforms, enabledPlatforms); @@ -471,6 +432,7 @@ $(document).ready(function () { enableParmeters(graph, getSelectedClientPlatforms()); modal.find('.platforms-column input').on('click', validateSelections); + validateSelections(); } function enableParmeters(graph, clientPlatforms) { @@ -486,11 +448,12 @@ $(document).ready(function () { }) } - function createCheckMark(itemLabel, modelLabel, disabled) { + function createCheckMark(itemLabel, modelLabel, disabled, checked = false) { const item = $('