From 17ecf037b62aa8524c11fb5f7cc7791b82c5dba9 Mon Sep 17 00:00:00 2001
From: Andrew Kwangwoong Park
Date: Thu, 3 Oct 2024 14:24:50 +0900
Subject: [PATCH 1/9] [GPU] Support large N FC optimization for dynamic quantization case (#26848)

### Details:
- Update `fc_bf_tiled_kernel_dyn_quan` for os_is_yx_osv64_isv2 support (offset math sketched below)
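For reference, the new weight-offset computation can be sanity-checked on the host. The snippet below is only an illustrative sketch, not part of the patch: `osv_weight_stride` is an arbitrary stand-in for the kernel's `INPUT_ELEMENTS_COUNT >> 1`, and the two shifts correspond to `power_of_two_for_simd` (5) and `power_of_two_for_osv` (6).

```
#include <cstdio>

// Mirrors the kernel's offset math for the os_is_yx_osv64_isv2 layout:
// every 64-wide block of output channels (OSV64) is stored as two
// interleaved 32-wide (SIMD) halves, so out_f decomposes into a
// 64-aligned base times the per-channel stride, plus a 0/32 shift.
int main() {
    const unsigned osv_weight_stride = 2048;  // assumed value of INPUT_ELEMENTS_COUNT >> 1
    for (unsigned out_f : {0u, 32u, 64u, 96u}) {
        const unsigned osv64_weight_base = (out_f >> 6) << 6;     // multiple of 64
        const unsigned out_f_offset = ((out_f >> 5) & 0x1) << 5;  // 0 or 32
        std::printf("out_f(%3u) : %3u * osv_weight_stride + %2u = %u\n",
                    out_f, osv64_weight_base, out_f_offset,
                    osv64_weight_base * osv_weight_stride + out_f_offset);
    }
    return 0;
}
```

Running it reproduces the pattern documented in the kernel comment, e.g. `out_f(32) : 0 * osv_weight_stride + 32` and `out_f(64) : 64 * osv_weight_stride + 0`.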
### Tickets:
- 153232
---
 .../fully_connected_gpu_bf_tiled.cl           | 34 ++++++++++++++++---
 .../fully_connected_kernel_bf_tiled.cpp       |  3 +-
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
index 29d322d432dd35..57545b0df37cff 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
@@ -809,7 +809,20 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
     uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET;
 #endif
 
+#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
+    const int power_of_two_for_simd = 5;
+    const int power_of_two_for_osv = 6;
+    const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv);
+    const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1);
+    const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd;
+    // out_f(32)  : 0  * osv_weight_stride + 32;
+    // out_f(64)  : 64 * osv_weight_stride + 0;
+    // out_f(128) : 64 * osv_weight_stride + 32;
+    // ...
+    uint weights_offset = osv64_weight_base * osv_weight_stride + out_f_offset;
+#else
     uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2);
+#endif
 
     ACCUMULATOR_VEC_TYPE acc[TILE_B] = { };
 
@@ -905,7 +918,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
 
         __local int* char_slm_weight = (__local int*)wei_local_mem;
 
+    #if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
+        uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2;
+    #else
         uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE;
+    #endif
         uint wei_local_idx = local_id * SIMD * FILTER_LOAD_ITERS * (FILTER_LOAD_BLOCK_SIZE/2) + sglid * 2;
 
         // DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : scale is ACCUMULATOR_VAL_ONE
@@ -917,6 +934,17 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
             // loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. So no need to transpose while unpacking
             dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
             dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
+          #elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
+            SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx);
+            SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD)));
+            DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked;
+            DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked_tmp;
+            dq_wei_unpacked_tmp.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
+            dq_wei_unpacked_tmp.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
+            dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01;
+            dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45;
+            dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23;
+            dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67;
           #else
            SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx);
            DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed));
@@ -996,11 +1024,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
                     acc_tmp[1][bi] = imad_SW(acc_tmp[1][bi], input_val, second_weight);
                 }
 
-            #if FILTER_LAYOUT_OS_IYX_OSV16 && TILE_OFM == 2
-                weights_offset += (TILE_K_OFM_PACKED/2) * SIMD;
-            #else
-                weights_offset += TILE_K_OFM_PACKED * SIMD;
-            #endif
+            weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD;
 
 #if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
             unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
index 5377387c8b497e..24641f3eb6aab0 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -781,8 +781,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
     auto output_f = get_output_aligned_bf_size(fc_params, false).second;
 
     WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16;
-    // TODO: Update may also be required to fc_bf_tiled_kernel_dyn_quan kernel to support os_is_yx_osv64_isv2 format as needed
-    if (!should_dynamic_quantize(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16
+    if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16
         && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2)
         && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4)
         && is_weight_horizontal(fc_params, output_f)) {

From 0ae44841dc164c9abd7a731bbfd35b95644cd19c Mon Sep 17 00:00:00 2001
From: Roman Kazantsev
Date: Thu, 3 Oct 2024 15:44:09 +0400
Subject: [PATCH 2/9] [PT FE][GHA] Run PT FE layer tests on Ubuntu 24.04 with Python 3.12 and NumPy 2.X (#26886)

**Details:** Run PT FE layer tests on Ubuntu 24.04 with Python 3.12 and NumPy 2.X

Also, this PR contains fixes:
- work around a sporadic bug on Windows in case of parallel runs
- support PT FE and TF FE layer tests on macOS x86
- address leftovers from
code-review **Tickets:** 154003, 153800 --------- Signed-off-by: Kazantsev, Roman --- .github/workflows/job_pytorch_layer_tests.yml | 39 +++++++------------ .../workflows/job_tensorflow_layer_tests.yml | 15 ++++--- .github/workflows/linux_arm64.yml | 6 +-- .github/workflows/mac.yml | 6 +-- .github/workflows/mac_arm64.yml | 6 +-- .github/workflows/ubuntu_22.yml | 6 +-- .github/workflows/ubuntu_24.yml | 10 +++++ .github/workflows/windows_vs2019_release.yml | 6 +-- .../pytorch_tests/test_bitwise_ops.py | 6 ++- tests/requirements_pytorch | 26 +++++++------ tests/requirements_tensorflow | 3 +- 11 files changed, 68 insertions(+), 61 deletions(-) diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml index 88b41f983f7094..50942cf331ab72 100644 --- a/.github/workflows/job_pytorch_layer_tests.yml +++ b/.github/workflows/job_pytorch_layer_tests.yml @@ -7,10 +7,6 @@ on: description: 'Machine on which the tests would run' type: string required: true - shell: - description: "shell to override the default shell settings in the runner's operating system." - type: string - required: true container: description: 'JSON to be converted to the value of the "container" configuration for the job' type: string @@ -20,12 +16,15 @@ on: description: 'Components that are affected by changes in the commit defined by the Smart CI Action' type: string required: true + python-version: + description: 'Python version to setup. E.g., "3.11"' + type: string + required: true permissions: read-all env: PIP_CACHE_PATH: /mount/caches/pip/linux - PYTHON_VERSION: '3.11' jobs: PyTorch_Layer_Tests: @@ -35,7 +34,7 @@ jobs: container: ${{ fromJSON(inputs.container) }} defaults: run: - shell: ${{ inputs.shell }} + shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }} env: DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input OPENVINO_REPO: ${{ github.workspace }}/openvino @@ -55,12 +54,6 @@ jobs: name: openvino_tests path: ${{ env.INSTALL_TEST_DIR }} - - name: Download OpenVINO tokenizers extension - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 - with: - name: openvino_tokenizers_wheel - path: ${{ env.INSTALL_DIR }} - # Needed as ${{ github.workspace }} is not working correctly when using Docker - name: Setup Variables if: runner.os != 'Windows' @@ -98,10 +91,10 @@ jobs: sparse-checkout-cone-mode: false path: 'openvino' - - name: Setup Python ${{ env.PYTHON_VERSION }} + - name: Setup Python ${{ inputs.python-version }} uses: ./openvino/.github/actions/setup_python with: - version: ${{ env.PYTHON_VERSION }} + version: ${{ inputs.python-version }} pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} should-setup-pip-paths: ${{ runner.os == 'Linux' }} self-hosted-runner: ${{ runner.os == 'Linux' }} @@ -112,9 +105,6 @@ jobs: # Install the core OV wheel python3 -m pip install ${INSTALL_DIR}/tools/openvino-*.whl - # Install the core OV Tokenizers wheel - python3 -m pip install ${INSTALL_DIR}/openvino_tokenizers-*.whl - - name: Install OpenVINO Python wheels (Windows) if: runner.os == 'Windows' run: | @@ -122,10 +112,6 @@ jobs: $ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }}\tools -Filter openvino-*.whl | % { $_.FullName } python3 -m pip install "$ovCoreWheelPath" - # Find and install the core OV Tokenizers wheel - $ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }} -Filter openvino_tokenizers-*.whl | % { $_.FullName } - python3 -m pip install "$ovCoreWheelPath" - - name: Install 
Pytorch Layer tests dependencies run: | # pytorch test requirements @@ -133,22 +119,25 @@ jobs: - name: PyTorch Layer Tests if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287, 142196 - run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + # due to CVS-152795, parallel run is not possible on Windows + run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: TEST_DEVICE: CPU TEST_PRECISION: FP32 + PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}} - name: PyTorch torch.export Layer Tests - if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287 + if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287 run: | - python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: TEST_DEVICE: CPU TEST_PRECISION: FP32 PYTORCH_TRACING_MODE: EXPORT + PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}} - name: PyTorch torch.compile TORCHFX Layer Tests - if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' }} # Ticket: 126287 + if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287 run: | python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -m precommit_fx_backend --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: diff --git a/.github/workflows/job_tensorflow_layer_tests.yml b/.github/workflows/job_tensorflow_layer_tests.yml index 0801010b86bde3..e8d7b51e14c02f 100644 --- a/.github/workflows/job_tensorflow_layer_tests.yml +++ b/.github/workflows/job_tensorflow_layer_tests.yml @@ -7,10 +7,6 @@ on: description: 'Machine on which the tests would run' type: string required: true - shell: - description: "shell to override the default shell settings in the runner's operating system." - type: string - required: true container: description: 'JSON to be converted to the value of the "container" configuration for the job' type: string @@ -20,12 +16,15 @@ on: description: 'Components that are affected by changes in the commit defined by the Smart CI Action' type: string required: true + python-version: + description: 'Python version to setup. 
E.g., "3.11"' + type: string + required: true permissions: read-all env: PIP_CACHE_PATH: /mount/caches/pip/linux - PYTHON_VERSION: '3.11' jobs: TensorFlow_Layer_Tests: @@ -35,7 +34,7 @@ jobs: container: ${{ fromJSON(inputs.container) }} defaults: run: - shell: ${{ inputs.shell }} + shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }} env: DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input OPENVINO_REPO: ${{ github.workspace }}/openvino @@ -98,10 +97,10 @@ jobs: sparse-checkout-cone-mode: false path: 'openvino' - - name: Setup Python ${{ env.PYTHON_VERSION }} + - name: Setup Python ${{ inputs.python-version }} uses: ./openvino/.github/actions/setup_python with: - version: ${{ env.PYTHON_VERSION }} + version: ${{ inputs.python-version }} pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} should-setup-pip-paths: ${{ runner.os == 'Linux' }} self-hosted-runner: ${{ runner.os == 'Linux' }} diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 3506ca49846f45..e4e608f3aca6d4 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -173,19 +173,19 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-linux-16-cores-arm' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-linux-16-cores-arm' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index da3224fa483ad1..20db9de1776015 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -276,17 +276,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'macos-13' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'macos-13' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 331afc7266cd6a..a38179f71fb60c 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -275,17 +275,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'macos-13-xlarge' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'macos-13-xlarge' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' 
CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 8f461391f20a9f..2c20e5136cfc4e 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -305,19 +305,19 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-linux-4-cores-16gb' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] + needs: [ Docker, Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-linux-4-cores-16gb' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index 6409b417a0731b..295a4dd0e2c61a 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -133,6 +133,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + Pytorch_Layer_Tests: + name: Pytorch Layer Tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_pytorch_layer_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.12' + Overall_Status: name: ci/gha_overall_status_ubuntu_24 needs: [Smart_CI, Build, Debian_Packages, Samples, Python_Unit_Tests] diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index 39cf2161525513..122fcc3c1c5021 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -404,17 +404,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-win-8-cores-16gb' - shell: pwsh affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-win-8-cores-16gb' - shell: pwsh affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CXX_Unit_Tests: name: C++ unit tests diff --git a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py index 1cf458500bcc71..e55a86f279de21 100644 --- a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py +++ b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py @@ -4,6 +4,8 @@ import numpy as np import pytest import torch +from packaging import version + from pytorch_layer_test_class import PytorchLayerTest, skip_if_export @@ -69,10 +71,12 @@ def forward_not_out(self, tensor_a, out): ) @pytest.mark.parametrize("out", [False, skip_if_export(True)]) def test_bitwise_mixed_dtypes( - self, op_type, out, lhs_dtype, rhs_dtype, lhs_shape, rhs_shape, ie_device, precision, ir_version + self, op_type, 
out, lhs_dtype, rhs_dtype, lhs_shape, rhs_shape, ie_device, precision, ir_version
     ):
         if ie_device == "GPU" and (lhs_dtype != "bool" or rhs_dtype != "bool"):
             pytest.xfail(reason="bitwise ops are not supported on GPU")
+        if out and version.parse(np.__version__) >= version.parse("2.0.0"):
+            pytest.xfail(reason="CVS-154082: incorrect handling out type")
         self._test(
             *self.create_model(op_type, out),
             ie_device,
diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch
index b82e0c76409057..0d5ac61903b104 100644
--- a/tests/requirements_pytorch
+++ b/tests/requirements_pytorch
@@ -1,10 +1,14 @@
+# test ovc with NumPy 2.x on Ubuntu 24 with default Python 3.12
+# test against NumPy 1.x with older Python versions
 # optimum still requires numpy<2.0.0
-numpy==1.26.4
+numpy==1.26.4; python_version < "3.12"
+numpy==2.1.1; python_version >= "3.12"
 torch==2.4.1; platform_system != "Darwin" or platform_machine != "x86_64"
-torch==2.2.0; platform_system == "Darwin" and platform_machine == "x86_64"
+torch==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64"
 --extra-index-url https://download.pytorch.org/whl/cpu
-torchvision==0.19.1
+torchvision==0.19.1; platform_system != "Darwin" or platform_machine != "x86_64"
+torchvision==0.17.2; platform_system == "Darwin" and platform_machine == "x86_64"
 # transformers 4.45.1 is available
 # but optimum still requires <4.45.0
 transformers==4.44.2
@@ -13,22 +17,22 @@ pytest-html==4.1.1
 pytest-xdist[psutil]==3.6.1
 defusedxml==0.7.1
 
-auto-gptq==0.7.1; platform_system == "Linux" and platform_machine == "x86_64"
+auto-gptq==0.7.1; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.12"
 av==13.0.0
-basicsr==1.4.2
+basicsr==1.4.2; python_version < "3.12"
 datasets==3.0.1
 easyocr==1.7.2
-facexlib==0.3.0
-librosa==0.10.2
-optimum==1.22.0
+facexlib==0.3.0; python_version < "3.12"
+librosa==0.10.2; python_version < "3.12"
+optimum==1.22.0; python_version < "3.12"
 packaging==24.1
 pandas==2.2.3
 protobuf==5.28.2
-pyctcdecode==0.5.0
+pyctcdecode==0.5.0; python_version < "3.12"
 sacremoses==0.1.1
 sentencepiece==0.2.0
 soundfile==0.12.1
-super-image==0.1.7
+super-image==0.1.7; python_version < "3.12"
 timm==1.0.8
 torchaudio==2.4.1
 wheel==0.44.0
@@ -36,7 +40,7 @@ PyYAML==6.0.2
 kornia==0.7.3
 
 # use latest released version once it's available
-git+https://github.com/huggingface/optimum-intel.git@main
+git+https://github.com/huggingface/optimum-intel.git@main; python_version < "3.12"
 # set 'export HF_HUB_ENABLE_HF_TRANSFER=1' to benefits from hf_transfer
 hf_transfer==0.1.8
diff --git a/tests/requirements_tensorflow b/tests/requirements_tensorflow
index 9d025397ed1fbd..6042eb8a46a9c3 100644
--- a/tests/requirements_tensorflow
+++ b/tests/requirements_tensorflow
@@ -4,7 +4,8 @@ pytest==7.0.1
 pytest-xdist[psutil]==3.6.1
 pytest-html==4.1.1
 transformers==4.45.1
-tensorflow==2.17.0
+tensorflow==2.17.0; platform_system != "Darwin" or platform_machine != "x86_64"
+tensorflow==2.16.2; platform_system == "Darwin" and platform_machine == "x86_64"
 # tensorflow-text is not available for both Windows and ARM platforms
 tensorflow-text==2.17.0; platform_system == "Linux" and platform_machine == "x86_64"
 tensorflow-hub==0.16.1

From 1b892bfb00fcbccec8db96f66a86e3b1e01f6262 Mon Sep 17 00:00:00 2001
From: Pavel Durandin
Date: Thu, 3 Oct 2024 13:43:02 +0400
Subject: [PATCH 3/9] [GPU] Fix double jit constants (#26893)

### Details:
- Fix double constant definition (pattern sketched below)
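The shape of the fix, in isolation: record the decision in a flag where the constant used to be emitted, and add the jit constant once, on the branch that actually consumes it. Below is a minimal self-contained sketch of that pattern; the `Jit` type and flag names are simplified stand-ins, not the kernel-selector's actual `JitConstants` API.

```
#include <iostream>
#include <set>
#include <string>

// Toy stand-in for a jit-constants container that complains on duplicates.
struct Jit {
    std::set<std::string> defs;
    void add(const std::string& name) {
        if (!defs.insert(name).second)
            std::cerr << "double definition of " << name << "\n";
    }
};

int main() {
    const bool scale_group_aligned = true;  // assumed: scale group divisible by SIMD
    const bool dynamic_quantize = false;    // assumed: dyn-quan path not taken

    Jit jit;
    // Step 1: where the constant used to be added eagerly, only set a flag.
    const bool add_decompress_scale_post_op = scale_group_aligned;

    // Step 2: emit the constant exactly once, on the non-dyn-quan branch.
    if (!dynamic_quantize && add_decompress_scale_post_op)
        jit.add("DECOMPRESSION_SCALE_POST_OP");
    return 0;
}
```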
---
 .../fully_connected/fully_connected_kernel_bf_tiled.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
index 24641f3eb6aab0..c4115d74f54a92 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -534,6 +534,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
     size_t tile_k_ofm_packed = tile_k_ofm;
     size_t quantize_grp_size = get_dynamic_quantize_group_size(params);
 
+    bool add_decompress_scale_post_op = false;
     WeightsType weights_dt = params.weights.GetDType();
     if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) {
         tile_k_ofm_packed /= 2;
@@ -542,7 +543,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
         const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v;
         // Do not use SCALE_POST_OP for SLM kernel, since it demonstrates worse performance
         if (scale_group_size % simd == 0 && !dispatchData.use_slm)
-            jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
+            add_decompress_scale_post_op = true;
     }
     if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) {
         jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii"));
@@ -619,6 +620,8 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
         jit.AddConstant(MakeJitConstant("DQ_TYPE", "char"));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
     } else {
+        if (add_decompress_scale_post_op)
+            jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
         jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size));
     }

From 4254c13364ac212e47590184d82c6746bd36aae5 Mon Sep 17 00:00:00 2001
From: "Anastasiya(Asya) Pronina"
Date: Thu, 3 Oct 2024 12:27:03 +0200
Subject: [PATCH 4/9] NPUW: Bring back unpack and partitioning unit tests for NPUW (#26885)

This PR adds unit tests on:
1. unpack routines within NPUW
2. main online partitioning functionality (smaller unit tests on Graph, Group, Repeated, etc. will be added separately)

Brings back https://github.com/openvinotoolkit/openvino/pull/25780

Local run:
```
[----------] Global test environment tear-down
[==========] 334 tests from 6 test suites ran. (3379 ms total)
[  PASSED  ] 334 tests.
```
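One structural note on `snapshot.hpp`: the pass and query internals move to `private`, with `Group` declared as a friend, so the tests above drive partitioning only through the public `buildGraph()`/`repeat()`/`setCtx()`/`graphSize()` surface. A generic sketch of that friend-based split (names here are illustrative, not the real classes):

```
#include <cassert>

class Snapshot {
public:
    void repeat(int n) { while (n-- > 0) pass(); }  // public API used by tests
    int graphSize() const { return m_size; }
    friend class Group;  // privileged access for the partitioning internals
private:
    void pass() { if (m_size > 1) --m_size; }  // stands in for a private merge pass
    int m_size = 4;
};

class Group {
public:
    static void fuseOne(Snapshot& s) { s.pass(); }  // reaches private via friendship
};

int main() {
    Snapshot s;
    s.repeat(2);        // like the unit tests: only the public surface
    Group::fuseOne(s);  // like Group: allowed to call the private pass
    assert(s.graphSize() == 1);
    return 0;
}
```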
---------

Co-authored-by: Alexey Smirnov
Co-authored-by: Dmitry Matveev
---
 .../npuw/partitioning/online/snapshot.hpp     |  16 +-
 src/plugins/intel_npu/tests/CMakeLists.txt    |   1 +
 .../intel_npu/tests/unit/CMakeLists.txt       |  46 ++
 .../tests/unit/npuw/online_partitioning.cpp   | 692 ++++++++++++++++++
 .../intel_npu/tests/unit/npuw/unpack.cpp      | 103 +++
 .../intel_npu/tests/unit/npuw/unpack.hpp      | 628 ++++++++++++++++
 6 files changed, 1478 insertions(+), 8 deletions(-)
 create mode 100644 src/plugins/intel_npu/tests/unit/CMakeLists.txt
 create mode 100644 src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp
 create mode 100644 src/plugins/intel_npu/tests/unit/npuw/unpack.cpp
 create mode 100644 src/plugins/intel_npu/tests/unit/npuw/unpack.hpp

diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp
index 72a62781580cda..e7e5121b1240e7 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp
@@ -16,8 +16,6 @@ namespace ov {
 namespace npuw {
 namespace online {
 
-class Group;  // forward declaration
-
 namespace detail {
 // At partitioning level we exclude some "non-Ops" to not interfere with the passes.
 // We include some of them back to properly link everything at plugin level
@@ -33,6 +31,8 @@ class Snapshot : public std::enable_shared_from_this {
           m_node_to_prod_cons(std::make_shared()),
           m_node_to_gr(std::make_shared()) {}
 
+    friend class Group;  // forward declaration
+
     // Simple passes
     void singleGroup();
 
@@ -49,27 +49,27 @@ class Snapshot : public std::enable_shared_from_this {
     void repeatedBlocks();
     void earlyAvoids();
     void earlyRegroup();
-    void markInternalCompute();
-    void resetExcludedRep();
 
     // Utility
     std::shared_ptr getGraph() const;
-    size_t graphSize() const;
-    const detail::OVNodeSet& getNodeProducers(const detail::OVNodePtr& node) const;
-    const detail::OVNodeSet& getNodeConsumers(const detail::OVNodePtr& node) const;
     const detail::OVPortsMap& getPortsMap() const;
     const detail::OVNodeToGroupMapPtr& getNodeToGroupMap() const;
     const std::map>>& getMatches() const;
-    detail::GPtrSet getRepGroups(const std::shared_ptr& group) const;
 
     void repeat(detail::Pass&& pass);
     void setCtx(const PassContext& ctx);
+    size_t graphSize() const;
 
 private:
+    detail::GPtrSet getRepGroups(const std::shared_ptr& group) const;
+    const detail::OVNodeSet& getNodeProducers(const detail::OVNodePtr& node) const;
+    const detail::OVNodeSet& getNodeConsumers(const detail::OVNodePtr& node) const;
     void identifyUniques();
     void mergeUniques();
     void mergeTriangles();
     void cleanUpUniques();
     void afterUniques();
+    void markInternalCompute();
+    void resetExcludedRep();
     bool cleanUpUniquesImpl(const detail::GPtrSet& gset);
     std::shared_ptr tryGrowRepeatingGroups(const detail::GPtrSet& repeating_groups);
     std::shared_ptr tryMergeTriangles(const detail::GPtrSet& repeating_groups);
diff --git a/src/plugins/intel_npu/tests/CMakeLists.txt b/src/plugins/intel_npu/tests/CMakeLists.txt
index 4c41f008eb7f81..0f5bd7a6b093b2 100644
--- a/src/plugins/intel_npu/tests/CMakeLists.txt
+++ b/src/plugins/intel_npu/tests/CMakeLists.txt
@@ -8,3 +8,4 @@ if (MSVC)
     ov_add_compiler_flags(/wd5105)
 endif()
 add_subdirectory(functional)
+add_subdirectory(unit)
diff --git a/src/plugins/intel_npu/tests/unit/CMakeLists.txt b/src/plugins/intel_npu/tests/unit/CMakeLists.txt
new file mode 100644
index 00000000000000..861a0ff6a47076
--- /dev/null
+++
b/src/plugins/intel_npu/tests/unit/CMakeLists.txt @@ -0,0 +1,46 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +set(TARGET_NAME "ov_npu_unit_tests") + +set(MANDATORY_UNIT_TESTS_LIBS + "openvino::commonTestUtils" + "openvino::gmock" + "openvino::gtest" + "openvino::gtest_main" + "openvino::runtime" + "openvino::npu_al" + "openvino::npu_logger_utils" +) + +ov_add_test_target( + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + ADDITIONAL_SOURCE_DIRS + ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/plugin/npuw/ + DEPENDENCIES + openvino::runtime + INCLUDES + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/npuw + ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/plugin/npuw + ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/utils/include + ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/plugin/include + ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/al/include + LINK_LIBRARIES + ${MANDATORY_UNIT_TESTS_LIBS} + LABELS + NPUW +) + +if(ENABLE_AVX2) + ov_avx2_optimization_flags(avx2_flags) + target_compile_options(${TARGET_NAME} PRIVATE "${avx2_flags}") +endif() + +install(TARGETS ${TARGET_NAME} + RUNTIME DESTINATION tests + COMPONENT tests + EXCLUDE_FROM_ALL +) diff --git a/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp b/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp new file mode 100644 index 00000000000000..af1fc5de8e92c7 --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp @@ -0,0 +1,692 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "partitioning/online/compiler.hpp" +#include "partitioning/online/snapshot.hpp" +#include "partitioning/online/group.hpp" + +#include "intel_npu/al/config/config.hpp" +#include "intel_npu/al/config/npuw.hpp" + +#include "openvino/openvino.hpp" +#include "openvino/op/ops.hpp" +#include "openvino/op/util/op_types.hpp" + +bool isEqualEns(ov::npuw::Ensemble& ens1, ov::npuw::Ensemble& ens2); +bool isEqualEns(ov::npuw::Ensemble& ens1, ov::npuw::Ensemble& ens2) { + if (ens1.groups.size() != ens2.groups.size()) { + return false; + } + + for (auto& g : ens1.groups) { + std::sort(g.input_layers.begin(), g.input_layers.end()); + std::sort(g.output_layers.begin(), g.output_layers.end()); + std::sort(g.all_layers.begin(), g.all_layers.end()); + } + + for (auto& g : ens2.groups) { + std::sort(g.input_layers.begin(), g.input_layers.end()); + std::sort(g.output_layers.begin(), g.output_layers.end()); + std::sort(g.all_layers.begin(), g.all_layers.end()); + } + + std::sort(ens1.groups.begin(), ens1.groups.end(), [](const ov::npuw::Group& g1, + const ov::npuw::Group& g2){ + return g1.all_layers.front() < g2.all_layers.front(); + }); + + std::sort(ens2.groups.begin(), ens2.groups.end(), [](const ov::npuw::Group& g1, + const ov::npuw::Group& g2){ + return g1.all_layers.front() < g2.all_layers.front(); + }); + + for (size_t i = 0; i < ens1.groups.size(); ++i) { + const auto& g1 = ens1.groups.at(i); + const auto& g2 = ens2.groups.at(i); + + if (g1.avoid_list != g2.avoid_list || + g1.input_layers != g2.input_layers || + g1.output_layers != g2.output_layers || + g1.all_layers != g2.all_layers) { + return false; + } + + // Can't compare them directly since they are random, but dont't affect the structure + if ((g1.repeated_id.empty() && !g2.repeated_id.empty()) || + (!g1.repeated_id.empty() && g2.repeated_id.empty())) { + return false; + } + } + + if (ens1.repeated.size() != ens2.repeated.size()) 
{ + return false; + } + + auto get_sorted_rep = [](const std::map& rep) { + std::vector>> sorted_rep; + + std::transform(rep.begin(), rep.end(), std::back_inserter(sorted_rep), [](const auto& v) { + return v.second.matches; + }); + + for (auto& g : sorted_rep) { + std::sort(g.begin(), g.end(), + [](const auto& a, const auto& b) {return *a.begin() < *b.begin();}); + } + + std::sort(sorted_rep.begin(), sorted_rep.end(), + [](const auto& a, const auto& b) {return *a.front().begin() < *b.front().begin();}); + + return sorted_rep; + }; + + + if (get_sorted_rep(ens1.repeated) != get_sorted_rep(ens2.repeated)) { + return false; + } + + return true; +} + +class ModelGenerator { +public: + ModelGenerator() = default; + + std::shared_ptr get_model_without_repeated_blocks() { + std::shared_ptr input = std::make_shared(ov::element::i32, ov::Shape{1, 1, 40}); + m_nodes.push_back(input); + set_name(input); + + std::shared_ptr res = get_block(input); + + auto result = std::make_shared(res); + m_nodes.push_back(result); + set_name(result); + + ov::ParameterVector params = {input}; + ov::ResultVector results = {result}; + + return std::make_shared(results, params); + } + + std::shared_ptr get_model_with_repeated_blocks() { + // Generate head + std::shared_ptr input = std::make_shared(ov::element::i32, ov::Shape{1, 1, 40}); + m_nodes.push_back(input); + set_name(input); + + std::vector> head(7, nullptr); + head[0] = std::make_shared(input, input); + head[1] = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{2}); + head[2] = std::make_shared(head[0], head[1], true); + head[3] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 4, 10}); + head[4] = std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{1, 1, 40}); + head[5] = std::make_shared(head[2], head[3], false); + head[6] = std::make_shared(head[5], head[4], false); + + for (const auto& h : head) { + m_nodes.push_back(h); + set_name(h); + } + + // Generate repeated blocks + std::shared_ptr output = get_block(head[6]); + std::vector> outputs; + outputs.push_back(output); + + for (size_t i = 0; i < 9; ++i) { + output = get_block(output); + outputs.push_back(output); + } + + // Generate tail + std::vector> tail(6, nullptr); + tail[0] = std::make_shared(outputs, -1); + tail[1] = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{1, 20, 20}); + tail[2] = std::make_shared(tail[0], tail[1], false); + tail[3] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1}); + tail[4] = std::make_shared(tail[2], tail[3]); + tail[5] = std::make_shared(tail[4], tail[4]); + + for (const auto& t : tail) { + m_nodes.push_back(t); + set_name(t); + } + + // Create model + auto result = std::make_shared(tail[5]); + m_nodes.push_back(result); + set_name(result); + + ov::ParameterVector params = {input}; + ov::ResultVector results = {result}; + + return std::make_shared(results, params); + } + + std::shared_ptr get_block(const std::shared_ptr& input) { + // Parameters + // input + + // Constants + std::vector> model_c(18, nullptr); + model_c[0] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{0, 2, 1, 3}); + model_c[1] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{1}); + model_c[2] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{0}); + model_c[3] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{2}); + model_c[4] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{0}); + model_c[5] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 1, 1}); + 
model_c[6] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{1}); + model_c[7] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{0}); + model_c[8] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 1, 1}); + model_c[9] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 1, 1, 2}); + model_c[10] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 1, 1, 1}); + model_c[11] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 1, 1, 2}); + model_c[12] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1, 1}); + model_c[13] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1, 1}); + model_c[14] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1, 1}); + model_c[15] = std::make_shared(ov::element::f32, ov::Shape{40, 40}); + model_c[16] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 4, 10}); + model_c[17] = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{1, 1, 40}); + + for (const auto& c : model_c) { + m_nodes.push_back(c); + set_name(c); + } + + // Converts + std::vector> convert(3, nullptr); + convert[0] = std::make_shared(model_c[15], ov::element::f16); + convert[1] = std::make_shared(convert[0], ov::element::i32); + convert[2] = std::make_shared(model_c[12], ov::element::i32); + + for (const auto& c : convert) { + m_nodes.push_back(c); + set_name(c); + } + + // Ops + std::vector> op(16, nullptr); + op[0] = std::make_shared(input, convert[1], false, true); + op[1] = std::make_shared(op[0], model_c[16], false); + op[2] = std::make_shared(op[1], model_c[0]); + op[3] = std::make_shared(op[2]); + op[4] = std::make_shared(op[3], model_c[1], model_c[2]); + op[5] = std::make_shared(op[4], model_c[3], true); + op[6] = std::make_shared(op[5]); + op[7] = std::make_shared(model_c[5], model_c[6], op[6], model_c[7]); + op[8] = std::make_shared(op[2], + model_c[8], + op[7], + model_c[9], + std::vector{1, 1, 1, 1}, + std::vector{1, 1, 1, 1}); + op[9] = std::make_shared(op[2], + op[7], + model_c[10], + model_c[11], + std::vector{1, 1, 1, 1}, + std::vector{1, 1, 1, 1}); + op[10] = std::make_shared(op[9], convert[2]); + op[11] = std::make_shared(std::vector>{op[10], op[8]}, -1); + op[12] = std::make_shared(model_c[13], op[11]); + op[13] = std::make_shared(model_c[14], op[2]); + op[14] = std::make_shared(op[13], op[12]); + op[15] = std::make_shared(op[14], model_c[17], false); + + for (const auto& o : op) { + m_nodes.push_back(o); + set_name(o); + } + + return op[15]; + } + +private: + void set_name(const std::shared_ptr& node) { + node->set_friendly_name("node_" + std::to_string(m_name_idx++)); + } + + std::vector> m_nodes; + size_t m_name_idx; +}; + +TEST(OnlinePartitioningTest, Partitioning_IsTheSame_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto opt_desc = std::make_shared<::intel_npu::OptionsDesc>(); + auto cfg = ::intel_npu::Config(opt_desc); + ::intel_npu::registerNPUWOptions(*opt_desc); + std::map cfg_map = {{ "NPUW_ONLINE_KEEP_BLOCK_SIZE", "9" }}; + cfg.update(cfg_map); + + auto ens = ov::npuw::online::buildPartitioning(model, cfg); + + for (size_t i = 0; i < 100; ++i) { + auto ens_again = ov::npuw::online::buildPartitioning(model, cfg); + EXPECT_TRUE(isEqualEns(ens, ens_again)); + } +} + +TEST(OnlinePartitioningTest, Partitioning_IsTheSame_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto opt_desc = std::make_shared<::intel_npu::OptionsDesc>(); + auto cfg = 
::intel_npu::Config(opt_desc); + ::intel_npu::registerNPUWOptions(*opt_desc); + std::map cfg_map = {{ "NPUW_ONLINE_KEEP_BLOCK_SIZE", "9" }}; + cfg.update(cfg_map); + + auto ens = ov::npuw::online::buildPartitioning(model, cfg); + + for (size_t i = 0; i < 100; ++i) { + auto ens_again = ov::npuw::online::buildPartitioning(model, cfg); + EXPECT_TRUE(isEqualEns(ens, ens_again)); + } +} + +TEST(OnlinePartitioningTest, Partitioning_SingleGroup_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->singleGroup(); + EXPECT_EQ(snap->graphSize(), 1); +} + +TEST(OnlinePartitioningTest, Partitioning_SingleGroup_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->singleGroup(); + EXPECT_EQ(snap->graphSize(), 1); +} + +TEST(OnlinePartitioningTest, Partitioning_buildGraph_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + auto g = snap->getGraph(); + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + } + EXPECT_EQ(snap->getNodeToGroupMap()->size(), snap->graphSize()); +} + +TEST(OnlinePartitioningTest, Partitioning_buildGraph_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + auto g = snap->getGraph(); + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + } + EXPECT_EQ(snap->getNodeToGroupMap()->size(), snap->graphSize()); +} + +TEST(OnlinePartitioningTest, Partitioning_earlyAvoids_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + ov::npuw::online::PassContext ctx; + ctx.avoids = {{ov::npuw::online::PatternType::OP, "Gather", "mydevice"}, {ov::npuw::online::PatternType::OP, "MatMul", "mydevice"}}; + snap->setCtx(ctx); + snap->buildGraph(); + snap->earlyAvoids(); + auto g = snap->getGraph(); + size_t count = 0; + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + if (group->avoidedTargets().size() == 1 && *(group->avoidedTargets().begin()) == "mydevice") { + ++count; + } + } + EXPECT_EQ(count, 2); +} + +TEST(OnlinePartitioningTest, Partitioning_earlyAvoids_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + ov::npuw::online::PassContext ctx; + ctx.avoids = {{ov::npuw::online::PatternType::OP, "Gather", "mydevice"}, {ov::npuw::online::PatternType::OP, "MatMul", "mydevice"}}; + snap->setCtx(ctx); + snap->buildGraph(); + snap->earlyAvoids(); + auto g = snap->getGraph(); + size_t count = 0; + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + if (group->avoidedTargets().size() == 1 && *(group->avoidedTargets().begin()) == "mydevice") { + ++count; + } + } + EXPECT_EQ(count, 20); +} + +TEST(OnlinePartitioningTest, Partitioning_collectLHF_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + 
snap->collectLHF(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_collectLHF_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {82, 82}; + size_t iter = 0; + + snap->repeat([&]{ + snap->collectLHF(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnants_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnants(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnants_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {75, 38, 19, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnants(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnantsExtended_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnantsExtended(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnantsExtended_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnantsExtended(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseInputs_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {15, 14, 14}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseInputs(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseInputs_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {148, 138, 138}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseInputs(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_Just_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes_lhf = {10, 10}; + size_t iter_lhf = 0; + + std::vector sizes_fr = {10, 10}; + size_t iter_fr = 0; + + snap->repeat([&] { + snap->collectLHF(); + EXPECT_LT(iter_lhf, sizes_lhf.size()); + EXPECT_EQ(snap->graphSize(), sizes_lhf[iter_lhf++]); + }); + snap->repeat([&] { + snap->fuseRemnants(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), 
sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_Just_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes_lhf = {82, 82}; + size_t iter_lhf = 0; + + std::vector sizes_fr = {41, 21, 11, 10, 10}; + size_t iter_fr = 0; + + snap->repeat([&] { + snap->collectLHF(); + EXPECT_LT(iter_lhf, sizes_lhf.size()); + EXPECT_EQ(snap->graphSize(), sizes_lhf[iter_lhf++]); + }); + snap->repeat([&] { + snap->fuseRemnants(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_RepeatedBlocks_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + + std::vector sizes_fr = {10, 10}; + size_t iter_fr = 0; + + snap->earlyAvoids(); + snap->earlyRegroup(); + snap->repeatedBlocks(); + EXPECT_EQ(snap->graphSize(), 17); + + auto matches = snap->getMatches(); + EXPECT_EQ(matches.size(), 0); + + snap->repeat([&] { + snap->fuseRemnantsExtended(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_RepeatedBlocks_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + + std::vector sizes_fr = {12, 12}; + size_t iter_fr = 0; + + snap->earlyAvoids(); + snap->earlyRegroup(); + snap->repeatedBlocks(); + EXPECT_EQ(snap->graphSize(), 18); + + auto matches = snap->getMatches(); + EXPECT_EQ(matches.size(), 1); + + for (const auto& m : matches) { + EXPECT_EQ(m.second.size(), 17); + for (const auto& layers : m.second) { + EXPECT_EQ(layers.size(), 10); + } + } + + snap->repeat([&] { + snap->fuseRemnantsExtended(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_Compute_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + + std::vector sizes_fr = {10, 10}; + size_t iter_fr = 0; + + ov::npuw::online::PassContext ctx; + ctx.isolates = {{ov::npuw::online::PatternType::OP, "Transpose", "test_compute"}, {ov::npuw::online::PatternType::OP, "ScatterUpdate", "test_compute"}}; + ctx.nofolds = {"test_compute"}; + snap->setCtx(ctx); + + snap->buildGraph(); + snap->earlyAvoids(); + snap->earlyRegroup(); + snap->repeatedBlocks(); + EXPECT_EQ(snap->graphSize(), 17); + + auto matches = snap->getMatches(); + EXPECT_EQ(matches.size(), 0); + + snap->repeat([&] { + snap->fuseRemnantsExtended(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_Compute_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + + std::vector sizes_fr = {10, 10}; + size_t iter_fr = 0; + + ov::npuw::online::PassContext ctx; + ctx.isolates = {{ov::npuw::online::PatternType::OP, "Gather", "test_compute"}, + {ov::npuw::online::PatternType::OP, "ScatterUpdate", "test_compute"}, + {ov::npuw::online::PatternType::OP, "ShapeOf", "test_compute"}, + {ov::npuw::online::PatternType::OP, "Divide", "test_compute"}, + {ov::npuw::online::PatternType::OP, "Floor", 
"test_compute"}}; + ctx.nofolds = {"test_compute"}; + snap->setCtx(ctx); + + snap->buildGraph(); + snap->earlyAvoids(); + snap->earlyRegroup(); + snap->repeatedBlocks(); + EXPECT_EQ(snap->graphSize(), 29); + + // FIXME: create a config in which there will be repeated blocks + auto matches = snap->getMatches(); + EXPECT_EQ(matches.size(), 0); + + snap->repeat([&] { + snap->fuseRemnantsExtended(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} diff --git a/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp b/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp new file mode 100644 index 00000000000000..1049832f6ead7c --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef HAVE_AVX2 +#include "unpack.hpp" + +namespace { + +const auto TestCases = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::i4}), + ::testing::ValuesIn({ov::element::Type_t::i8, ov::element::Type_t::f16}), + ::testing::ValuesIn({ov::element::Type_t::undefined}), // no used in this test + ::testing::ValuesIn({ov::element::Type_t::undefined}), // no used in this test + ::testing::ValuesIn({3lu, 0lu}), + ::details::ShapesIn({Tensors{input={1, 1, 1, 32};}, + Tensors{input={1,1,1, 128};}, + Tensors{input={1,1,1, 390};}, + Tensors{input={1,1,1, 82};}}), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn({true, false}) +); + +INSTANTIATE_TEST_SUITE_P(UnpackTests, UnpackTests, + TestCases, + UnpackTests::getTestCaseName); + +const auto TestCasesScale = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::i4}), // TODO: add i8 as input for test + ::testing::ValuesIn({ov::element::Type_t::f16, ov::element::Type_t::f32}), + ::testing::ValuesIn({ov::element::Type_t::f16, ov::element::Type_t::f32}), + ::testing::ValuesIn({ov::element::Type_t::undefined}), // no used in this test + ::testing::ValuesIn({3lu, 0lu}), + ::details::ShapesIn({Tensors{input={1,32, 128}; scale = {1, 32, 1};}, + Tensors{input={32, 128}; scale = {32, 1};}, + Tensors{input={64, 160}; scale = {64, 1};}, + Tensors{input={1024, 4}; scale = {64, 1};}, + Tensors{input={1, 1, 1024, 4}; scale = {1, 1, 64, 1};}}), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn({true, false}) +); + +INSTANTIATE_TEST_SUITE_P(UnpackWithScaleTests, UnpackWithScaleTests, + TestCasesScale, + UnpackWithScaleTests::getTestCaseName); + + +const auto TestCasesScaleAndZeroPoints = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::u4}), + ::testing::ValuesIn({ov::element::Type_t::f16}), + ::testing::ValuesIn({ov::element::Type_t::f16}), + ::testing::ValuesIn({ov::element::Type_t::u4}), + ::testing::ValuesIn({3lu, 0lu}), + ::details::ShapesIn({Tensors{input={1,32, 128}; scale = {1, 32, 1};}, + Tensors{input={1,64, 160}; scale = {1, 64, 1};}, + Tensors{input={1,1024, 4}; scale = {1, 64, 1};}, + Tensors{input={1,1, 1024, 4}; scale = {1, 1, 64, 1};}, + Tensors{input={64, 1}; scale = {64, 1};}}), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn({true, false}) +); + +INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPoint, UnpackTestsWithScaleAndZeroPoint, + TestCasesScaleAndZeroPoints, + UnpackTestsWithScaleAndZeroPoint::getTestCaseName); + +const auto TestCasesScaleAndZeroPoints2 = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::u4}), + ::testing::ValuesIn({ov::element::Type_t::f16}), + ::testing::ValuesIn({ov::element::Type_t::f32}), + 
::testing::ValuesIn({ov::element::Type_t::f32}),
+    ::testing::ValuesIn({3lu, 0lu}),
+    ::details::ShapesIn({Tensors{input={32, 32, 64}; scale = {32, 1, 64};},
+                         Tensors{input={64, 64, 128}; scale = {64, 1, 128};},
+                         Tensors{input={64, 32, 32}; scale = {64, 1, 32};}}),
+    ::testing::ValuesIn({true, false}),
+    ::testing::ValuesIn({true, false})
+);
+
+INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPointTest2, UnpackTestsWithScaleAndZeroPointTest2,
+                         TestCasesScaleAndZeroPoints2,
+                         UnpackTestsWithScaleAndZeroPointTest2::getTestCaseName);
+
+const auto TestCasesScaleAndZeroPoints3 = ::testing::Combine(
+    ::testing::ValuesIn({ov::element::Type_t::u4}),
+    ::testing::ValuesIn({ov::element::Type_t::f16}),
+    ::testing::ValuesIn({ov::element::Type_t::f16}),
+    ::testing::ValuesIn({ov::element::Type_t::u4}),
+    ::testing::ValuesIn({3lu, 0lu}),
+    ::details::ShapesIn({Tensors{input={1, 32, 128}; scale = {1, 32, 1}; zerop = {1, 32, 1};},
+                         Tensors{input={16, 64, 64}; scale = {16, 64, 1}; zerop = {16, 64, 1};},
+                         Tensors{input={1, 1024, 4}; scale = {1, 64, 1}; zerop = {1, 32, 1};}}),
+    ::testing::ValuesIn({true, false}),
+    ::testing::ValuesIn({true, false})
+);
+
+INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPointTest3, UnpackTestsWithScaleAndZeroPointTest3,
+                         TestCasesScaleAndZeroPoints3,
+                         UnpackTestsWithScaleAndZeroPointTest3::getTestCaseName);
+
+}  // anonymous namespace
+
+#endif // __AVX2__
diff --git a/src/plugins/intel_npu/tests/unit/npuw/unpack.hpp b/src/plugins/intel_npu/tests/unit/npuw/unpack.hpp
new file mode 100644
index 00000000000000..da5bb4e4720f3e
--- /dev/null
+++ b/src/plugins/intel_npu/tests/unit/npuw/unpack.hpp
@@ -0,0 +1,628 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "openvino/runtime/make_tensor.hpp"
+
+#include "util.hpp"
+
+namespace {
+
+#define ASSERT_NO_THROW_WITH_MESSAGE(code) do{ \
+    try {\
+        code;\
+    }catch (const std::exception &ex ) {\
+        FAIL()<<ex.what();\
+    }\
+}while(false)
+
+namespace details {
+
+inline int8_t hi4(int8_t x) {
+    return ((x & (1 << 7)) >> 4) | ((x & (1 << 6)) >> 4) | ((x & (1 << 5)) >> 4) | ((x & (1 << 4)) >> 4);
+}
+
+inline int8_t lo4(int8_t x) {
+    return (x & (1 << 3)) | (x & (1 << 2)) | (x & (1 << 1)) | (x & (1 << 0));
+}
+
+inline uint8_t hi4(uint8_t x) {
+    return x >> 4;
+}
+
+inline uint8_t lo4(uint8_t x) {
+    return x & 0x0F;
+}
+
+inline int8_t upc(int8_t h) {
+    return h | (-((h & (1 << 3)) >> 3) & (-8));
+}
+
+typedef unsigned short ushort;
+typedef unsigned int uint;
+
+float half_to_float(const ushort x) {
+
+    __m128i halfVector = _mm_cvtsi32_si128(x);
+    __m128 floatVector = _mm_cvtph_ps(halfVector);
+    return _mm_cvtss_f32(floatVector);
+}
+
+ushort float_to_half(const float x) {
+    __m128 floatVector = _mm_set_ss(x);
+    __m128i halfVector = _mm_cvtps_ph(floatVector, _MM_FROUND_TO_NEAREST_INT);
+    return _mm_extract_epi16(halfVector, 0);
+}
+
+inline uint16_t int2hfloat(int8_t x)
+{
+    float inputFl32 = static_cast<float>(x);
+    float* inputFl32_ptr = &inputFl32;
+    unsigned int* fltInt32Ptr = reinterpret_cast<unsigned int*>(inputFl32_ptr);
+    unsigned int fltInt32 = *fltInt32Ptr;
+    unsigned short fltInt16;
+
+    fltInt16 = (fltInt32 >> 31) << 5;
+    unsigned short tmp = (fltInt32 >> 23) & 0xff;
+    tmp = (tmp - 0x70) & ((unsigned int)((int)(0x70 - tmp) >> 4) >> 27);
+    fltInt16 = (fltInt16 | tmp) << 10;
+    fltInt16 |= (fltInt32 >> 13) & 0x3ff;
+
+    return fltInt16;
+}
+
+
+void unpack(const int8_t* in, int8_t* out, int size) {
+    for (int i = 0; i < size / 2; i++) {
+        *(out++) = upc(lo4(*in));
+        *(out++) = upc(hi4(*in));
+        in++;
+    }
+}
+
+void
+
+void unpack_i4f16(const int8_t* in, int8_t* out, int size) {
+    uint16_t *hFloatOut = reinterpret_cast<uint16_t*>(out);
+
+    for (int i = 0; i < size / 2; i++) {
+        *(hFloatOut++) = int2hfloat(upc(lo4(*in)));
+        *(hFloatOut++) = int2hfloat(upc(hi4(*in)));
+        in++;
+    }
+}
+
+/*u4 order*/
+void unpack_u4f32(const int8_t* in, float* out, int size) {
+    for (int i = 0; i < size / 2; i++) {
+        *(out++) = static_cast<float>(lo4(*in));
+        *(out++) = static_cast<float>(hi4(*in));
+        in++;
+    }
+}
+
+template <typename T>
+::testing::AssertionResult fp16ArraysMatch(const T &actual,
+                                           const T &expected,
+                                           const T &i4Input,
+                                           bool int4 = true /*i4 or u4*/){
+    for (size_t i = 0; i < expected.size() / 2; ++i) {
+
+        int int8Input[] = {
+            details::lo4(i4Input[i / 2]),
+            details::hi4(i4Input[i / 2])
+        };
+
+        if (int4) {
+            int8Input[0] = details::upc(int8Input[0]);
+            int8Input[1] = details::upc(int8Input[1]);
+        };
+
+        auto fp16ref = int{*((uint16_t*)expected.data() + i)};
+        auto fp16out = int{*((uint16_t*)actual.data() + i)};
+
+#define _P(x) std::dec << std::setw(5) << (x) << '(' << std::setw(4) << std::hex << (x) << ')'
+        if (fp16ref != fp16out) {
+            return ::testing::AssertionFailure() << std::dec << std::setw(4) << i << ", i4:"
+                                                 << std::setw(2) << int8Input[i % 2]
+                                                 << " | ref " << _P(fp16ref)
+                                                 << ", test " << _P(fp16out) << "\n";
+        }
+#undef _P
+
+    }
+
+    return ::testing::AssertionSuccess();
+}
+
+}  // namespace details
+
+using ShapesInitializer = std::function<void(std::vector<int>&, std::vector<int>&, std::vector<int>&)>;
+
+
+using UnpackTestsParams = std::tuple<
+    ov::element::Type_t,  // fromPrecision
+    ov::element::Type_t,  // toPrecision
+    ov::element::Type_t,  // scalePrecision
+    ov::element::Type_t,  // zeroPointPrecision
+    unsigned long,        // nPartitions
+    ShapesInitializer,    // input_shape, scale_shape, zerop initializer
+    bool,                 // use parallel_for
+    bool                  // strict partitioning
+    >;
+
+class UnpackTestsBase {
+protected:
+    ov::element::Type fromType;
+    ov::element::Type toType;
+    ov::element::Type scaleType;
+    ov::element::Type zeropType;
+    std::shared_ptr<ov::ITensor> from, to, scale, zerop;
+
+    std::vector<int8_t> input;
+    std::vector<int8_t> output;
+    std::vector<int8_t> ref_output;
+    std::vector<uint8_t> scalesStorage;
+    std::vector<uint8_t> zeropStorage;
+    float zeropValue = 0.f;
+    ov::Shape input_shape;
+    ov::Shape scale_shape;
+    ov::Shape zerop_shape;
+
+    size_t nPartitions;
+    bool useParallelFor = false;
+    bool strictPartitions = false;
+
+    void make_zeropoints() {
+        if (zeropType == ov::element::undefined) {
+            return;
+        }
+
+        const std::vector<float> zeropValues = {15.0f, 12.0f, 0.0f, 31.0f};
+        const size_t nElements = shape_size(zerop_shape);
+
+        // Set zeropValue if there's only one element
+        if (nElements == 1) {
+            zeropValue = zeropValues.front();
+        }
+
+        // Determine the size of the storage based on the type and resize the storage vector
+        if (zeropType == ov::element::Type_t::u4) {
+            zeropStorage.resize((nElements + 1) / 2, 0);  // Each u4 zeropoint is 4 bits, so two zeropoints fit in one byte
+        } else if (zeropType == ov::element::Type_t::f32) {
+            zeropStorage.resize(nElements * sizeof(float), 0);
+        } else {
+            ASSERT_TRUE(zeropType == ov::element::u4 || zeropType == ov::element::f32);
+        }
+
+        // Fill the storage with the appropriate values
+        if (zeropType == ov::element::Type_t::u4) {
+            for (size_t i = 0; i < nElements; ++i) {
+                uint8_t zeropValueU4 = static_cast<uint8_t>(zeropValues[i % zeropValues.size()]) & 0x0F;
+                size_t byteIndex = i / 2;
+                if (i % 2 == 0) {
+                    zeropStorage[byteIndex] = zeropValueU4;
+                } else {
+                    zeropStorage[byteIndex] |= (zeropValueU4 << 4);
+                }
+            }
+        } else if (zeropType == ov::element::Type_t::f32) {
+            float* ptrWork = reinterpret_cast<float*>(zeropStorage.data());
+            for (size_t i = 0; i < nElements; ++i) {
+                ptrWork[i] = zeropValues[i % zeropValues.size()];
+            }
+        }
+
+        // Create the tensor
+        zerop = ov::make_tensor(zeropType, zerop_shape, zeropStorage.data());
+    }
+
+    void make_scales() {
+        if (scaleType == ov::element::undefined) {
+            return;
+        }
+        ASSERT_TRUE(scaleType == ov::element::f16 || scaleType == ov::element::f32);
+        size_t nElements = shape_size(scale_shape);
+
+        // creating custom scale factors
+        const size_t nScaleBytes = scaleType.bitwidth() * nElements / 8;
+
+        std::vector<float> sc(nElements);
+        float coeffTable[] = {
+            0.1f,
+            0.5f,
+            1.f,
+            2.f
+        };
+        for (size_t i = 0; i != nElements; i++) {
+            sc[i] = coeffTable[i % (sizeof (coeffTable) / sizeof(*coeffTable))];
+        }
+        scalesStorage.resize(nScaleBytes);
+
+        if (scaleType == ov::element::f16) {
+            uint16_t * ptrWork = reinterpret_cast<uint16_t*>(scalesStorage.data());
+            for (size_t i = 0; i != nElements; i++) {
+                ptrWork[i] = details::float_to_half(sc[i]);
+            }
+        }
+        if (scaleType == ov::element::f32) {
+            float* ptrWork = reinterpret_cast<float*>(scalesStorage.data());
+            for (size_t i = 0; i != nElements; i++) {
+                ptrWork[i] = sc[i];
+            }
+        }
+        scale = ov::make_tensor(scaleType, scale_shape, scalesStorage.data());
+    }
+
+    void make_input() {
+
+        size_t nElements = shape_size(input_shape);
+
+        ASSERT_EQ((fromType.bitwidth() * nElements) % 8, 0) << "Input len has to be byte boundary aligned, but was "
+                                                            << fromType.bitwidth() * nElements << " bits";
+        ASSERT_EQ((toType.bitwidth() * nElements) % 8, 0) << "Output len has to be byte boundary aligned";
+
+        const size_t nInputBytes = fromType.bitwidth() * nElements / 8;
+        const size_t nOutputBytes = toType.bitwidth() * nElements / 8;
+
+        input.resize(nInputBytes);
+        ref_output.resize(nOutputBytes);
+        output.resize(nOutputBytes);
+        std::fill(ref_output.begin(), ref_output.end(), 0);
+        std::fill(output.begin(), output.end(), 0);
+
+        std::array<int8_t, 32> input_local = {
+            0x0A, 0x0B, 0x1C, 0x1D, 0x2E, 0x2F, 0x35, 0x36,
+            0x4A, 0x4B, 0x5A, 0x5B, 0x6A, 0x6B, 0x7A, 0x7B,
+            0x0C, 0x0D, 0x1C, 0x1D, 0x2C, 0x2D, 0x3C, 0x3D,
+            0x4C, 0x4D, 0x5C, 0x5D, 0x6C, 0x6D, 0x7C, 0x7D,
+        };
+
+        for (size_t idx = 0, k = 0; k < nInputBytes; k++, idx = (idx + 1) % input_local.size()) {
+            input[k] = input_local[idx];
+        }
+
+        from = ov::make_tensor(fromType, input_shape, input.data());
+        to = ov::make_tensor(toType, input_shape, output.data());
+    }
+public:
+    void SetUp(const UnpackTestsParams & getParam) {
+        ShapesInitializer shapeInit;
+
+        std::tie(fromType, toType, scaleType, zeropType, nPartitions, shapeInit, useParallelFor, strictPartitions) = getParam;
+
+        std::vector<int> input, scale, zerop;
+        shapeInit(input, scale, zerop);
+
+        input_shape = ov::Shape{input.begin(), input.end()};
+        scale_shape = ov::Shape{scale.begin(), scale.end()};
+        if (zerop.empty()) {
+            zerop_shape = ov::Shape({1});
+        } else {
+            zerop_shape = ov::Shape{zerop.begin(), zerop.end()};
+        }
+
+        make_input();
+        make_scales();
+        make_zeropoints();
+
+        make_ref_output();
+    }
"_parallel" : "_serial") + << "_from_" << fromType + << "_to_" << toType; + if (scaleType != ov::element::Type_t::undefined) + result << "_scale_" << scaleType; + if (zeropType != ov::element::Type_t::undefined) + result << "_zerop_" << zeropType; + + return result.str(); + } + + /** + * Negative test cases has to be carefully reviewed, to still remain positive runs at some points + * @return + */ + virtual bool isNegative() const { + return false; + } + + virtual void make_ref_output() { + size_t nElements = 1; + for (size_t dim : input_shape) { + nElements *= dim; + } + if (toType == ov::element::i8) { + details::unpack(input.data(), ref_output.data(), static_cast(nElements)); + } else if (toType == ov::element::f16) { + details::unpack_i4f16(input.data(), ref_output.data(), static_cast(nElements)); + } + } +}; + +template +class UnpackTestsTmpl : + public ::testing::Test, + public T, + public ::testing::WithParamInterface { +protected: + + void SetUp() override { + T::SetUp(GetParam()); + } +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + T _bt; + _bt.SetUp(obj.param); + return _bt.ToString(); + } +}; + +using UnpackTests = UnpackTestsTmpl; +class UnpackTestsRef : public UnpackTests {}; + +TEST_P(UnpackTests, i4) { + ASSERT_NO_THROW_WITH_MESSAGE(ov::npuw::util::unpack(from, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input)); +} + +class UnpackWithScaleTestsBase : public UnpackTestsBase { +protected: + bool isNegative() const override { + if (scale_shape.size() != 3 && scale_shape.size() != 2) return true; + if (input_shape.back() % 64) return true; + if ((from->get_size() / scale->get_size()) % 64) return true; + if (toType != ov::element::f16) return true; + + return false; + } + + void make_ref_output() override { + if (isNegative()) return; + + size_t nElements = from->get_size(); + + const size_t nOutputElementsPerScale = ref_output.size() / (toType.bitwidth() / 8) / scale->get_size(); + + details::unpack_i4f16(input.data(), ref_output.data(), static_cast(nElements)); + + // lets apply per channel scale + uint16_t * pRef = reinterpret_cast(ref_output.data()); + uint16_t * pScale_f16 = reinterpret_cast(scale->data()); + float * pScale_f32 = reinterpret_cast(scale->data()); + + for (size_t i = 0; i < scale->get_size(); i++) { + for (size_t sc = 0; sc != nOutputElementsPerScale; sc++) { + float ref_scaled = details::half_to_float(pRef[0]); + if (scaleType == ov::element::f32) { + ref_scaled *= pScale_f32[0]; + } else if (scaleType == ov::element::f16) { + ref_scaled *= details::half_to_float(pScale_f16[0]); + } + *pRef = details::float_to_half(ref_scaled); + pRef++; + } + pScale_f32++; + pScale_f16++; + } + } + +}; + +using UnpackWithScaleTests = UnpackTestsTmpl; + + +TEST_P(UnpackWithScaleTests, i4_scale) { + ASSERT_NO_THROW_IF(!isNegative(), + ov::npuw::util::unpack(from, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + if (!isNegative()) { + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input)); + } +} + + +class UnpackTestsWithScaleAndZeroPointBase : public UnpackTestsBase { +protected: + bool isNegative() const override { + if (scale_shape.size() != 3 && scale_shape.size() != 2) return true; + if (input_shape.back() % 64) return true; + + return false; + } + + void make_ref_output() override { + if (isNegative()) return; + + size_t nElements = from->get_size(); + + const size_t 
+        const size_t nOutputElementsPerScale = ref_output.size() / (toType.bitwidth() / 8) / scale->get_size();
+
+        std::vector<float> floatRef(nElements);
+        details::unpack_u4f32(input.data(), floatRef.data(), static_cast<int>(nElements));
+
+
+        // let's apply per-channel scale
+        uint16_t * pRef = reinterpret_cast<uint16_t*>(ref_output.data());
+        float * pFloatRef = reinterpret_cast<float*>(floatRef.data());
+        const uint16_t * pScale_f16 = reinterpret_cast<const uint16_t*>(scale->data());
+        const float * pScale_f32 = reinterpret_cast<const float*>(scale->data());
+
+        for (size_t i = 0; i < scale->get_size(); i++) {
+            for (size_t sc = 0; sc != nOutputElementsPerScale; sc++) {
+                // applying zeropoint
+                float ref_scaled = *pFloatRef - zeropValue;
+
+                if (scaleType == ov::element::f32) {
+                    ref_scaled *= pScale_f32[0];
+                } else if (scaleType == ov::element::f16) {
+                    ref_scaled *= details::half_to_float(pScale_f16[0]);
+                }
+                *pRef = details::float_to_half(ref_scaled);
+
+                pFloatRef++;
+                pRef++;
+            }
+            pScale_f32++;
+            pScale_f16++;
+        }
+    }
+};
+
+using UnpackTestsWithScaleAndZeroPoint = UnpackTestsTmpl<UnpackTestsWithScaleAndZeroPointBase>;
+
+TEST_P(UnpackTestsWithScaleAndZeroPoint, u4) {
+    ASSERT_NO_THROW_IF(!isNegative(),
+                       ov::npuw::util::unpack(from, zerop, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions}));
+    if (!isNegative()) {
+        ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input, false));
+    }
+}
+
+class UnpackTestsWithScaleAndZeroPoint2 : public UnpackTestsWithScaleAndZeroPointBase {
+protected:
+    bool isNegative() const override {
+        if (input_shape.back() % 64 || input_shape.size() != 3) return true;
+        if (scale_shape.back() % 64 || scale_shape.size() != 3) return true;
+
+        return false;
+    }
+
+    void make_ref_output() override {
+        if (isNegative()) return;
+
+        size_t nElements = from->get_size();
+        const auto from_shape = from->get_shape();
+
+        const size_t C = from_shape[from_shape.size() - 3];
+        const size_t H = from_shape[from_shape.size() - 2];
+        const size_t W = from_shape[from_shape.size() - 1];
+
+        std::vector<float> floatRef(nElements);
+        details::unpack_u4f32(input.data(), floatRef.data(), static_cast<int>(nElements));
+
+        uint16_t * pRef = reinterpret_cast<uint16_t*>(ref_output.data());
+        float * pFloatRef = reinterpret_cast<float*>(floatRef.data());
+        const uint16_t * pScale_f16 = reinterpret_cast<const uint16_t*>(scale->data());
+        const float * pScale_f32 = reinterpret_cast<const float*>(scale->data());
+
+        for (size_t c = 0; c < C; ++c) {
+            for (size_t h = 0; h < H; ++h) {
+                for (size_t w = 0; w < W; ++w) {
+                    size_t input_index = w + W * h + W * H * c;
+                    size_t scale_index = w + W * c;
+                    float ref_scaled = pFloatRef[input_index] - zeropValue;
+                    if (scaleType == ov::element::f32) {
+                        ref_scaled *= pScale_f32[scale_index];
+                    } else if (scaleType == ov::element::f16) {
+                        ref_scaled *= details::half_to_float(pScale_f16[scale_index]);
+                    }
+                    pRef[w + W * h + c * W * H] = details::float_to_half(ref_scaled);
+                }
+            }
+        }
+    }
+};
+
+using UnpackTestsWithScaleAndZeroPointTest2 = UnpackTestsTmpl<UnpackTestsWithScaleAndZeroPoint2>;
+
+TEST_P(UnpackTestsWithScaleAndZeroPointTest2, u4) {
+    ASSERT_NO_THROW_IF(!isNegative(),
+                       ov::npuw::util::unpack(from, zerop, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions}));
+    if (!isNegative()) {
+        ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input, false));
+    }
+}
+
+class UnpackTestsWithScaleAndZeroPoint3 : public UnpackTestsWithScaleAndZeroPointBase {
+protected:
+    bool isNegative() const override {
+        if (scale_shape.size() != 3 || zerop_shape.size() != 3) return true;
+        if (input_shape[2] % 64 || input_shape.size() != 3) return true;
+
+        return false;
+    }
+
+    void make_ref_output() override {
+        if (isNegative()) return;
+
+        size_t nElements = from->get_size();
+
+        const size_t nOutputElementsPerScale = ref_output.size() / (toType.bitwidth() / 8) / scale->get_size();
+
+        std::vector<float> floatRef(nElements);
+        details::unpack_u4f32(input.data(), floatRef.data(), static_cast<int>(nElements));
+
+
+        // let's apply per-channel scale
+        uint16_t * pRef = reinterpret_cast<uint16_t*>(ref_output.data());
+        const uint8_t* pZer = static_cast<const uint8_t*>(zerop->data());
+        float * pFloatRef = reinterpret_cast<float*>(floatRef.data());
+        const uint16_t * pScale_f16 = reinterpret_cast<const uint16_t*>(scale->data());
+        const float * pScale_f32 = reinterpret_cast<const float*>(scale->data());
+
+        for (size_t i = 0; i < scale->get_size(); i++) {
+            float zeroPointValue = static_cast<float>((i % 2 == 0) ? details::lo4(pZer[i / 2]) : details::hi4(pZer[i / 2]));
+            for (size_t sc = 0; sc != nOutputElementsPerScale; sc++) {
+                // applying zeropoint
+                float ref_scaled = *pFloatRef - zeroPointValue;
+
+                if (scaleType == ov::element::f32) {
+                    ref_scaled *= pScale_f32[0];
+                } else if (scaleType == ov::element::f16) {
+                    ref_scaled *= details::half_to_float(pScale_f16[0]);
+                }
+                *pRef = details::float_to_half(ref_scaled);
+
+                pFloatRef++;
+                pRef++;
+            }
+            pScale_f32++;
+            pScale_f16++;
+        }
+    }
+};
+
+using UnpackTestsWithScaleAndZeroPointTest3 = UnpackTestsTmpl<UnpackTestsWithScaleAndZeroPoint3>;
+
+TEST_P(UnpackTestsWithScaleAndZeroPointTest3, u4) {
+    ASSERT_NO_THROW_IF(!isNegative(),
+                       ov::npuw::util::unpack(from, zerop, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions}));
+    if (!isNegative()) {
+        ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input, false));
+    }
+}
+
+#define Tensors [](std::vector<int>& input, std::vector<int>& scale, std::vector<int>& zerop)
+
+
+namespace details {
+::testing::internal::ParamGenerator<std::vector<ShapesInitializer>::value_type> ShapesIn(
+        const std::vector<ShapesInitializer>& container) {
+    return ::testing::ValuesIn(container.begin(), container.end());
+}
+
+}  // namespace details
+}  // anonymous namespace
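The reference outputs above are built from two pieces of nibble arithmetic: `lo4`/`hi4` split a packed byte into its two 4-bit values, and `upc` sign-extends an i4 nibble into a full `int8_t`; dequantization is then `(value - zero_point) * scale`. A minimal standalone sketch of that math (the `scale` and `zero_point` values here are illustrative, not taken from the tests):

```cpp
#include <cstdint>
#include <cstdio>

// Sign-extend the low 4 bits of a nibble to int8_t: 0x8..0xF map to -8..-1.
static int8_t upc(int8_t h) {
    return h | (-((h & (1 << 3)) >> 3) & (-8));
}

int main() {
    // One packed byte holds two i4 values: low nibble first, then high nibble.
    uint8_t packed = 0xF7;                                     // low = 0x7 (+7), high = 0xF (-1)
    int8_t lo = upc(packed & 0x0F);                            // +7
    int8_t hi = upc(static_cast<int8_t>(packed >> 4) & 0x0F);  // -1

    // Dequantization as in the reference outputs: (value - zero_point) * scale.
    // For u4 data the raw nibble is used directly (no sign extension).
    float scale = 0.5f, zero_point = 0.0f;
    printf("lo=%d hi=%d dq_lo=%.2f dq_hi=%.2f\n",
           lo, hi, (lo - zero_point) * scale, (hi - zero_point) * scale);
    return 0;
}
```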
From 2fc0faedfa69caf2af5b5cd27c2f3cf5ad2203bf Mon Sep 17 00:00:00 2001
From: Hubert Błaszczyk <56601011+hub-bla@users.noreply.github.com>
Date: Thu, 3 Oct 2024 14:21:01 +0200
Subject: [PATCH 5/9] [TF FE]: Support complex tensors for ExpandDims operation
 (#26892)

### Details:
 - Support complex tensors for `ExpandDims` operation + tests

### Tickets:
 - [None](https://github.com/openvinotoolkit/openvino/issues/22950)

---
 .../tensorflow_common/src/op/expand_dims.cpp  | 30 ++++++++++-
 .../tensorflow_tests/test_tf_ExpandDims.py    | 52 +++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/frontends/tensorflow_common/src/op/expand_dims.cpp b/src/frontends/tensorflow_common/src/op/expand_dims.cpp
index b3b37ad38cc302..a40e5c9b1bc6df 100644
--- a/src/frontends/tensorflow_common/src/op/expand_dims.cpp
+++ b/src/frontends/tensorflow_common/src/op/expand_dims.cpp
@@ -3,7 +3,13 @@
 //
 
 #include "common_op_table.hpp"
+#include "helper_ops/complex_type_mark.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/less.hpp"
+#include "openvino/op/select.hpp"
+#include "openvino/op/subtract.hpp"
 #include "openvino/op/unsqueeze.hpp"
+#include "utils.hpp"
 
 using namespace std;
 using namespace ov::op;
@@ -14,9 +20,31 @@ namespace tensorflow {
 namespace op {
 
 OutputVector translate_expand_dims_op(const NodeContext& node) {
-    default_op_checks(node, 2, {"ExpandDims", "EXPAND_DIMS"});
+    default_op_checks(node, 2, {"ExpandDims", "EXPAND_DIMS"}, true);
     auto input = node.get_input(0);
     auto axis = node.get_input(1);
+    auto complex_type_mark = as_type_ptr<ComplexTypeMark>(input.get_node_shared_ptr());
+
+    if (complex_type_mark) {
+        element::Type complex_part_type = complex_type_mark->get_complex_part_type();
+        input = complex_type_mark->input_value(0);
+
+        auto const_zero = create_same_type_const_scalar<int32_t>(axis, 0);
+
+        auto is_axis_neg = make_shared<v1::Less>(axis, const_zero);
+
+        auto const_one = create_same_type_const_scalar<int32_t>(axis, 1);
+        auto axis_min_one = make_shared<v1::Subtract>(axis, const_one);
+
+        auto new_axis = make_shared<v1::Select>(is_axis_neg, axis_min_one, axis);
+
+        auto unsqueeze = make_shared<v0::Unsqueeze>(input, new_axis);
+
+        set_node_name(node.get_name(), unsqueeze);
+        auto complex_result = make_shared<ComplexTypeMark>(unsqueeze, complex_part_type);
+        return {complex_result};
+    }
+
     auto unsqueeze = make_shared<v0::Unsqueeze>(input, axis);
     set_node_name(node.get_name(), unsqueeze);
     return {unsqueeze};
diff --git a/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py b/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py
index f0f9085d32ba2f..e982867c9ac08d 100644
--- a/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py
+++ b/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py
@@ -6,6 +6,7 @@
 import tensorflow as tf
 from common.tf_layer_test_class import CommonTFLayerTest
 
+rng = np.random.default_rng(62362)
 
 class TestExpandDims(CommonTFLayerTest):
     def _prepare_input(self, inputs_info):
@@ -40,3 +41,54 @@ def test_expand_dims_basic(self, params, ie_device, precision, ir_version, temp_
         self._test(*self.create_expand_dims_net(**params),
                    ie_device, precision, ir_version, temp_dir=temp_dir,
                    use_legacy_frontend=use_legacy_frontend)
+
+
+class TestExpandDimsComplex(CommonTFLayerTest):
+    def _prepare_input(self, inputs_info):
+        # generate elements so that the input tensor may contain repeating elements
+        assert 'param_real:0' in inputs_info
+        assert 'param_imag:0' in inputs_info
+
+        input_shape = inputs_info['param_real:0']
+
+        inputs_data = {}
+        inputs_data['param_real:0'] = rng.integers(-10.0, 10.0, input_shape).astype(np.float32)
+        inputs_data['param_imag:0'] = rng.integers(-10.0, 10.0, input_shape).astype(np.float32)
+
+        return inputs_data
+
+    def create_expand_dims_complex_net(self, axis_dtype, input_shape, axis):
+        tf.compat.v1.reset_default_graph()
+        with tf.compat.v1.Session() as sess:
+            param_real = tf.compat.v1.placeholder(np.float32, input_shape, 'param_real')
+            param_imag = tf.compat.v1.placeholder(np.float32, input_shape, 'param_imag')
+
+            complex = tf.raw_ops.Complex(real=param_real, imag=param_imag)
+
+            axis = tf.constant(axis, dtype=axis_dtype)
+
+            result = tf.raw_ops.ExpandDims(input=complex, axis=axis)
+
+            tf.raw_ops.Real(input=result)
+            tf.raw_ops.Imag(input=result)
+
+            tf.compat.v1.global_variables_initializer()
+            tf_net = sess.graph_def
+
+        return tf_net, None
+
+    test_basic = [
+        dict(input_shape=[], axis=0),
+        dict(input_shape=[2, 3], axis=1),
+        dict(input_shape=[2, 3, 4], axis=-1),
+        dict(input_shape=[2, 6, 5], axis=-2),
+    ]
+
+    @pytest.mark.parametrize("axis_dtype", [np.int32, np.int64])
+    @pytest.mark.parametrize("op_args", test_basic)
+    @pytest.mark.nightly
+    @pytest.mark.precommit
+    def test_expand_dims_basic_complex(self, axis_dtype, op_args, ie_device, precision, ir_version, temp_dir, use_legacy_frontend):
+        self._test(*self.create_expand_dims_complex_net(axis_dtype, **op_args),
+                   ie_device, precision, ir_version, temp_dir=temp_dir,
+                   use_legacy_frontend=use_legacy_frontend)
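The negative-axis handling in `translate_expand_dims_op` follows from how the frontend carries complex tensors: `ComplexTypeMark` wraps a real tensor with an extra trailing dimension of size 2 holding the real/imaginary pair, so an axis counted from the end must be shifted one position further left, while non-negative axes are untouched. A self-contained sketch of the shape arithmetic (plain C++, not the frontend API):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// A complex tensor of logical shape [d0, ..., dn-1] is assumed to be carried
// as a real tensor of shape [d0, ..., dn-1, 2]. A negative ExpandDims axis
// counts from the end and must skip the auxiliary pair dimension: axis - 1.
int64_t adjust_axis_for_complex(int64_t axis) {
    return axis < 0 ? axis - 1 : axis;
}

std::vector<int64_t> expand_dims(std::vector<int64_t> shape, int64_t axis) {
    size_t rank = shape.size();
    size_t pos = axis < 0 ? rank + 1 + axis : axis;  // insertion point
    shape.insert(shape.begin() + pos, 1);
    return shape;
}

int main() {
    // Logical complex shape [2, 3] is stored as real shape [2, 3, 2].
    std::vector<int64_t> packed = {2, 3, 2};
    // ExpandDims(axis = -1) should yield logical [2, 3, 1], i.e. packed [2, 3, 1, 2].
    auto out = expand_dims(packed, adjust_axis_for_complex(-1));  // axis becomes -2
    assert((out == std::vector<int64_t>{2, 3, 1, 2}));
    return 0;
}
```

Without the adjustment, axis -1 would insert the new dimension after the real/imaginary pair, producing the packed shape [2, 3, 2, 1] instead.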
From 5c1e1fba4b70832f0dd14764c5f1302412827718 Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov
Date: Thu, 3 Oct 2024 16:30:41 +0400
Subject: [PATCH 6/9] [GPU] Extract debug code from network::execute() (#26888)

### Details:
 - Debug code in the `network::execute()` method is extracted to a separate class/file to improve code readability

---
 .../include/intel_gpu/graph/network.hpp       |  10 +-
 .../intel_gpu/src/graph/debug_helper.cpp      | 526 +++++++++++++++++
 .../intel_gpu/src/graph/debug_helper.hpp      |  69 +++
 .../src/graph/include/program_dump_graph.h    |   2 +-
 src/plugins/intel_gpu/src/graph/network.cpp   | 542 +-----------------
 .../src/graph/program_dump_graph.cpp          |   2 +-
 6 files changed, 623 insertions(+), 528 deletions(-)
 create mode 100644 src/plugins/intel_gpu/src/graph/debug_helper.cpp
 create mode 100644 src/plugins/intel_gpu/src/graph/debug_helper.hpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
index 71623f32843eac..63adae28ddabf3 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -4,17 +4,15 @@
 
 #pragma once
 
-#include "openvino/runtime/threading/cpu_streams_executor.hpp"
+#include "openvino/runtime/threading/istreams_executor.hpp"
 
 #include "intel_gpu/graph/topology.hpp"
 #include "intel_gpu/graph/program.hpp"
 #include "intel_gpu/graph/serialization/binary_buffer.hpp"
-#include "intel_gpu/runtime/compounds.hpp"
 #include "intel_gpu/runtime/memory.hpp"
 #include "intel_gpu/runtime/engine.hpp"
 #include "intel_gpu/runtime/event.hpp"
 #include "intel_gpu/runtime/stream.hpp"
-#include "intel_gpu/runtime/lru_cache.hpp"
 #include "intel_gpu/runtime/shape_predictor.hpp"
 #include "intel_gpu/plugin/variable_state.hpp"
 
@@ -211,7 +209,7 @@ struct network {
     bool is_dynamic() const { return _is_dynamic; }
     size_t get_weights_cache_capacity() const { return _weights_cache_capacity; }
 
-    memory_pool& get_memory_pool() {
+    memory_pool& get_memory_pool() const {
         return *_memory_pool;
     }
 
@@ -284,7 +282,9 @@ struct network {
     void dump_memory_pool(std::string dump_path, int64_t curr_iter);
 
 #ifdef GPU_DEBUG_CONFIG
-    int64_t iteration = 0;
+    mutable int64_t iteration = 0;
+    friend class NetworkDebugHelper;
+    friend class NodeDebugHelper;
 #endif
 };
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/debug_helper.cpp b/src/plugins/intel_gpu/src/graph/debug_helper.cpp
new file mode 100644
index 00000000000000..7f7071e704683e
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/debug_helper.cpp
@@ -0,0 +1,526 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "debug_helper.hpp"
+#include "openvino/util/file_util.hpp"
+
+#ifdef GPU_DEBUG_CONFIG
+
+#include "to_string_utils.h"
+#include "loop_inst.h"
+#include "condition_inst.h"
+#include "program_dump_graph.h"
+
+#include <fstream>
+#include <iomanip>
+#include <sstream>
+
+namespace cldnn {
+
+namespace {
+
+float convert_element(int64_t i) { return static_cast<float>(i); }
+float convert_element(int32_t i) { return static_cast<float>(i); }
+
+float convert_element(float f) { return f; }
+
+float convert_element(ov::float16 h) { return static_cast<float>(h); }
+
+size_t get_x_pitch(const layout& layout) {
+    try {
+        auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0));
+        auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0));
+        auto x0 = layout.get_linear_offset(tensor_x0);
+        auto x1 = layout.get_linear_offset(tensor_x1);
+        return (x1 - x0);
+    } catch (...) {
+        // When spatial size of x == 0, x_pitch is meaningless
+        return 0;
+    }
+}
+
+template <typename T>
+void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) {
+    auto&& size = mem->get_layout().get_tensor();
+
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1);
+    tensor tmp_size(size);
+    tmp_size.batch[0] = batch_size;
+    if (tmp_size == size) {
+        file_stream << "shape: " << size.to_string() << " ";
+        file_stream << "(count: " << size.count()
+                    << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")"
+                    << (dump_raw ? " raw data" : "") << std::endl;
+    } else {
+        file_stream << "shape: " << tmp_size.to_string() << " ";
+        file_stream << "(count: " << tmp_size.count()
+                    << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format)
+                    << ", original shape: " << size.to_string() << ")"
+                    << (dump_raw ? " raw data" : "") << std::endl;
+    }
+
+    if (size.count() == 0) {
+        file_stream << "Empty buffer" << std::endl;
+        return;
+    }
+
+    mem_lock<T, mem_lock_type::read> lock(mem, stream);
+    auto mem_ptr = lock.data();
+    auto x_pitch = get_x_pitch(mem->get_layout());
+    std::stringstream buffer;
+
+    if (!dump_raw) {
+        for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) {
+            for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) {
+                for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) {
+                    for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) {
+                        for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) {
+                            for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) {
+                                cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w));
+                                size_t input_it = mem->get_layout().get_linear_offset(t);
+
+                                for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) {
+                                    buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        for (size_t i = 0; i < lock.size(); ++i) {
+            buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[i]) << std::endl;
+        }
+    }
+    file_stream << buffer.str();
+}
+
+void unpack(cldnn::data_types type, uint8_t input, int8_t &v0, int8_t &v1) {
+    if (type == cldnn::data_types::i4) {
+        char s_bit = (input & 0x08);
+        char mask = s_bit > 0 ? 0xF0 : 0x00;
+        v0 = (input & 0x0F) | mask;
+
+        input >>= 4;
+        s_bit = (input & 0x08);
+        mask = s_bit > 0 ? 0xF0 : 0x00;
+        v1 = (input & 0x0F) | mask;
+    } else if (type == cldnn::data_types::u4) {
+        v0 = input & 0x0F;
+        v1 = input >> 4;
+    } else {
+        OPENVINO_ASSERT(false, "not supported unpacking");
+    }
+}
+
+void dump_i4u4(cldnn::data_types type, memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) {
+    auto&& size = mem->get_layout().get_tensor();
+
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1);
+    tensor tmp_size(size);
+    tmp_size.batch[0] = batch_size;
+    if (tmp_size == size) {
+        file_stream << "shape: " << size.to_string() << " ";
+        file_stream << "(count: " << size.count()
+                    << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")"
+                    << (dump_raw ? " raw data" : "") << std::endl;
+    } else {
" raw data" : "") << std::endl; + } else { + file_stream << "shape: " << tmp_size.to_string() << " "; + file_stream << "(count: " << tmp_size.count() + << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) + << ", original shape: " << size.to_string() << ")" + << (dump_raw ? " raw data" : "") << std::endl; + } + + if (size.count() == 0) { + file_stream << "Empty buffer" << std::endl; + return; + } + + mem_lock lock(mem, stream); + auto mem_ptr = lock.data(); + std::stringstream buffer; + + if (dump_raw) { + for (size_t i = 0; i < lock.size(); ++i) { + int8_t v0, v1; + unpack(type, mem_ptr[i], v0, v1); + buffer << std::fixed << std::setprecision(6) << static_cast(v0) << std::endl; + buffer << std::fixed << std::setprecision(6) << static_cast(v1) << std::endl; + } + } else { + std::cout << __func__ << " supports raw dump only" << std::endl; + } + file_stream << buffer.str(); +} + +void log_memory_to_file(memory::ptr mem, layout data_layout, stream& stream, std::string layerName, bool dump_raw) { + std::cout << "Dump " << (dump_raw ? "raw " : "") << layerName << std::endl; + GPU_DEBUG_GET_INSTANCE(debug_config); + std::string filename = debug_config->get_name_for_dump(layerName); + filename = debug_config->dump_layers_path + filename + ".txt"; + std::ofstream file_stream(filename); + if (!mem) { + file_stream << "Empty" << std::endl; + return; + } + + // Reinterpret buffer to represent actual data layout + auto actual_mem = mem->get_engine()->reinterpret_buffer(*mem, data_layout); + + auto mem_dt = actual_mem->get_layout().data_type; + if (mem_dt == cldnn::data_types::f32) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::f16) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i64) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i32) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i8) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::u8) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::u8) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i4 || mem_dt == cldnn::data_types::u4) + dump_i4u4(mem_dt, actual_mem, stream, file_stream, dump_raw); + else + std::cout << "Dump for this data type is not supported: " << dt_to_str(mem_dt) << std::endl; +} + +} // namespace + +static std::string get_file_path_for_binary_dump(cldnn::layout layout, std::string name) { + std::string filename; + std::string data_type = ov::element::Type(layout.data_type).get_type_name(); + std::string format = layout.format.to_string(); + std::string tensor; + auto dims = layout.get_dims(); + for (size_t r = 0 ; r < layout.get_rank() ; r++) { + tensor += ("_" + to_string(dims[r])); + } + +#ifdef GPU_DEBUG_CONFIG + GPU_DEBUG_GET_INSTANCE(debug_config); + std::string layer_name = debug_config->get_name_for_dump(name); + filename = debug_config->dump_layers_path + layer_name + + "__" + data_type + "_" + tensor + "__" + format + ".bin"; +#endif + return filename; +} + +NodeDebugHelper::NodeDebugHelper(const primitive_inst& inst) + : m_inst(inst) + , m_stream(inst.get_network().get_stream()) + , m_network(inst.get_network()) + , m_program(inst.get_network().get_program().get()) + , m_iter(m_network.iteration) { + // Load binary dump for input layers + if (!debug_config->load_layers_raw_dump.empty()) { + const std::string 
+
+NodeDebugHelper::NodeDebugHelper(const primitive_inst& inst)
+    : m_inst(inst)
+    , m_stream(inst.get_network().get_stream())
+    , m_network(inst.get_network())
+    , m_program(inst.get_network().get_program().get())
+    , m_iter(m_network.iteration) {
+    // Load binary dump for input layers
+    if (!debug_config->load_layers_raw_dump.empty()) {
+        const std::string layer_name = m_inst.id();
+        auto files = debug_config->get_filenames_for_matched_layer_loading_binaries(layer_name);
+        if (!files.empty()) {
+            if (m_inst.is_input()) {
+                // Loading binary dumps for output tensors of input-layers : only one output exists or index(dstN) exists
+                auto dump_file = debug_config->get_matched_from_filelist(files, "_dst0__");
+                OPENVINO_ASSERT((files.size() == 1 || dump_file.length() != 0), "Unexpected binary dump for input layer");
+
+                OPENVINO_ASSERT(files.size() == m_inst.outputs_memory_count(), "Mis-match dump file count");
+
+                for (size_t i = 0; i < m_inst.outputs_memory_count(); i++) {
+                    auto dump_file = files[0];
+                    if (files.size() > 1 || m_inst.outputs_memory_count() != 1) {
+                        std::string pattern = "_dst" + std::to_string(i) + "__";
+                        dump_file = debug_config->get_matched_from_filelist(files, pattern);
+                    }
+                    OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_dst[N]__' for binary dump");
+                    GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for " << layer_name << std::endl;
+
+                    std::vector<uint8_t> bin = ov::util::load_binary(dump_file);
+                    OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file);
+
+                    auto output_mem = m_inst.output_memory_ptr(i);
+                    OPENVINO_ASSERT(output_mem->size() == bin.size(), "memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name
+                                    + "\n Expected size : " + to_string(output_mem->size()) + ", Binary : " + to_string(bin.size()));
+
+                    output_mem->copy_from(m_stream, static_cast<void*>(&bin[0]), true);
+                }
+            } else {
+                auto check_dst = debug_config->get_matched_from_filelist(files, "_dst0__");
+                OPENVINO_ASSERT(check_dst.length() == 0, "Expected to load binaries for inputs of " + layer_name);
+
+                // Loading input tensors for any layer
+                auto dump_file = debug_config->get_matched_from_filelist(files, "_src0__");
+                OPENVINO_ASSERT(dump_file.length() != 0, "Could not find expected pattern '_src[N]__' for binary dump input : " + layer_name);
+
+                for (size_t i = 0; i < m_inst.dependencies().size(); i++) {
+                    auto dump_file = files[0];
+                    if (files.size() > 1 || m_inst.dependencies().size() != 1) {
+                        std::string pattern = "_src" + std::to_string(i) + "__";
+                        dump_file = debug_config->get_matched_from_filelist(files, pattern);
+                    }
+                    if (dump_file.length() == 0) {
+                        GPU_DEBUG_COUT << " Skip loading for input(" << i << ") of " << layer_name << std::endl;
+                        continue;
+                    }
+                    OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_src[N]__' for binary dump input");
+                    GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for input(" << i << ") of " << layer_name << std::endl;
+
+                    std::vector<uint8_t> bin = ov::util::load_binary(dump_file);
+                    OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file);
+
+                    auto input_mem = m_inst.dep_memory_ptr(i);
+                    if (input_mem->size() != bin.size()) {
+                        std::cout << "WARNING: memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name
+                                  << " " << input_mem->size() << " / " << bin.size() << std::endl;
+                        bin.resize(input_mem->size());
+                    }
+
+                    input_mem->copy_from(m_stream, static_cast<void*>(&bin[0]), true);
+                }
+            }
+        }
+    }
+
+ layer_name + ":"; + for (size_t i = 0; i < m_inst.dependencies().size(); i++) { + std::string name = get_file_prefix() + layer_name + "_src" + std::to_string(i); + auto input_mem = m_inst.dep_memory_ptr(i); + if (input_mem == nullptr) { + GPU_DEBUG_COUT << " input_mem_" << i << " is nullptr. Nothing to dump." << std::endl; + continue; + } + + auto dep = m_inst.dependencies().at(i); + auto input_layout = dep.first->get_output_layout(dep.second); + GPU_DEBUG_IF(debug_config->dump_layers_binary) { + // Binary dump : raw + auto filename = get_file_path_for_binary_dump(input_layout, name); + + mem_lock lock(input_mem, m_stream); + ov::util::save_binary(filename, lock.data(), input_mem->size()); + GPU_DEBUG_COUT << " Dump layer src : " << layer_name << " to " << filename << std::endl; + debug_str_for_bin_load += (filename + ","); + } else { + log_memory_to_file(input_mem, + input_layout, + m_stream, + name, + debug_config->dump_layers_raw); + } + } + + if (debug_config->dump_layers_binary && !inst.is_input()) { + debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"'; + GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl; + } + } + } +} + + +NodeDebugHelper::~NodeDebugHelper() { + // Dump output buffers of 'inst' + if (debug_config->dump_layers_path.length() > 0) { + m_stream.finish(); + const std::string layer_name = m_inst.id(); + + GPU_DEBUG_IF(debug_config->is_target_iteration(m_iter) && + debug_config->is_layer_for_dumping(layer_name, m_inst.is_output(), m_inst.is_input())) { + std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" + + layer_name + ":"; + for (size_t i = 0; i < m_inst.outputs_memory_count(); i++) { + std::string name = get_file_prefix() + "_dst" + std::to_string(i); + auto output_mem = m_inst.output_memory_ptr(i); + if (output_mem == nullptr) { + GPU_DEBUG_COUT << " output_mem is nullptr. Nothing to dump." 
+
+                GPU_DEBUG_IF(debug_config->dump_layers_binary) {
+                    // Binary dump : raw
+                    auto output_layout = m_inst.get_output_layout(i);
+                    auto filename = get_file_path_for_binary_dump(output_layout, name);
+
+                    mem_lock<char, mem_lock_type::read> lock(output_mem, m_stream);
+                    ov::util::save_binary(filename, lock.data(), output_mem->size());
+                    GPU_DEBUG_COUT << " Dump layer dst : " << layer_name << " to " << filename << std::endl;
+                    debug_str_for_bin_load += (filename + ",");
+                } else {
+                    // Text dump
+                    log_memory_to_file(output_mem, m_inst.get_output_layout(i), m_stream, name, debug_config->dump_layers_raw);
+                }
+            }
+
+            GPU_DEBUG_IF(debug_config->dump_layers_binary && m_inst.is_input()) {
+                debug_str_for_bin_load[debug_str_for_bin_load.size() - 1] = '\"';
+                GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;
+            }
+        }
+    }
+}
+
+NetworkDebugHelper::NetworkDebugHelper(const network& net)
+    : m_network(net)
+    , m_iter(net.iteration) {
+    auto net_id = m_network.get_id();
+    GPU_DEBUG_IF(debug_config->dump_memory_pool > 0) {
+        auto& iters = debug_config->dump_memory_pool_iters;
+        if (iters.empty() || iters.find(m_iter) != iters.end()) {
+            GPU_DEBUG_COUT << "============================================================================" << std::endl;
+            GPU_DEBUG_COUT << "Start network execution (net_id : " << net_id << ", iter :" << m_iter << ")" << std::endl;
+            if (m_iter == 0 && net_id > 0) {
+                dump_memory_pool(debug_config->dump_memory_pool_path, m_iter);
+                GPU_DEBUG_COUT << "============================================================================" << std::endl;
+            }
+        }
+    } else {
+        GPU_DEBUG_TRACE << "============================================================================" << std::endl;
+        GPU_DEBUG_TRACE << "Start network execution (net_id : " << net_id << ", iter :" << m_iter << ")" << std::endl;
+    }
+
+    if (debug_config->list_layers == 1) {
+        for (auto& inst : m_network._exec_order) {
+            GPU_DEBUG_COUT << inst->id() << std::endl;
+            if (inst->get_node().is_type<loop>()) {
+                auto& loop_node = inst->get_node().as<loop>();
+                for (auto& prim : loop_node.get_body_program()->get_processing_order()) {
+                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
+                }
+            } else if (inst->get_node().is_type<condition>()) {
+                auto& cond_node = inst->get_node().as<condition>();
+                GPU_DEBUG_COUT << "* Branch_True" << std::endl;
+                for (auto& prim : cond_node.get_branch_true().inner_program->get_processing_order()) {
+                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
+                }
+                GPU_DEBUG_COUT << "* Branch_False" << std::endl;
+                for (auto& prim : cond_node.get_branch_false().inner_program->get_processing_order()) {
+                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
+                }
+            }
+        }
+
+        if (!m_network.is_internal())
+            exit(0);
+    }
+}
+
+NetworkDebugHelper::~NetworkDebugHelper() {
+    auto prog = m_network.get_program().get();
+    auto net_id = m_network.get_id();
+    // print '-data_shape' option for benchmark_app
+    if (debug_config->print_input_data_shapes == 1) {
+        std::stringstream data_shape_str;
+        auto add_string = [&data_shape_str](std::string str) {
+            data_shape_str << ((data_shape_str.rdbuf()->in_avail() == 0) ? " -data_shape " : ",") << str;
+        };
+
+        for (auto& inst : m_network._exec_order) {
+            auto name = inst->id();
+            auto pos = name.find(':');
+            auto type = name.substr(0, pos);
+            name.erase(0, pos + 1);
+            if (inst->is_input() && type == "parameter") {
+                add_string(name + inst->get_output_layout().get_partial_shape().to_string());
+            }
+        }
+
+        GPU_DEBUG_COUT << "[program:" << std::setw(2) << ((prog != nullptr) ? prog->get_id() : 0)
+                       << "|network:" << std::setw(2) << net_id << "|iter:" << std::setw(4) << m_iter << "] benchmark_app cmd: "
+                       << data_shape_str.str() << std::endl;
+    }
+
+    if (!debug_config->dump_graphs.empty() && debug_config->is_target_iteration(m_iter)) {
+        auto get_fixed_str = [](int value, int length = 2) -> std::string {
+            std::ostringstream ss;
+            ss << std::setw(length) << std::setfill('0') << std::to_string(value);
+            return ss.str();
+        };
+        std::string path = get_dir_path(m_network.get_config());
+        if (!path.empty()) {
+            std::ofstream ofs(path + "cldnn_program_exec_p" + get_fixed_str(prog->get_id()) + "_n" + get_fixed_str(net_id)
+                              + "_" + get_fixed_str(m_iter, 5) + ".graph");
+            dump_graph_init(ofs, *prog, [this](const primitive_id& id) -> std::shared_ptr<const primitive_inst> {
+                return m_network.get_primitive(id);
+            });
+        }
+    }
+
+    if (debug_config->dump_memory_pool > 0) {
+        auto& iters = debug_config->dump_memory_pool_iters;
+        if (iters.empty() || iters.find(m_iter) != iters.end()) {
+            dump_memory_pool(debug_config->dump_memory_pool_path, m_iter);
+            GPU_DEBUG_COUT << "============================================================================" << std::endl;
+        }
+    }
+
+    m_network.iteration++;
+}
+
+void NetworkDebugHelper::dump_memory_pool(std::string dump_path, int64_t curr_iter) const {
+    m_network.get_memory_pool().dump(m_network.get_id(), curr_iter, dump_path);
+    auto get_constants_mem_size = [&](allocation_type type) -> size_t {
+        size_t mem_size = 0;
+        for (auto& prim : m_network._primitives) {
+            if (prim.second->get_node().is_constant()) {
+                for (size_t i = 0; i < prim.second->outputs_memory_count(); i++) {
+                    if (prim.second->output_memory_ptr(i)->get_allocation_type() == type)
+                        mem_size += prim.second->output_memory_ptr(i)->size();
+                }
+            }
+        }
+        return mem_size;
+    };
+    auto get_variables_mem_size = [&](allocation_type type) -> size_t {
+        size_t mem_size = 0;
+        for (auto& var : m_network.get_variables()) {
+            if (var.second->get_memory() && var.second->get_memory()->get_allocation_type() == type)
+                mem_size += var.second->get_actual_mem_size();
+        }
+        return mem_size;
+    };
+    auto get_mb_size = [&](int64_t size) -> std::string {
+        if (size == 0) return "0 MB";
+        return std::to_string(static_cast<float>(size) / (1024 * 1024)) + " MB";
+    };
+    int64_t usm_host_const_mem_size = get_constants_mem_size(allocation_type::usm_host);
+    int64_t usm_device_const_mem_size = get_constants_mem_size(allocation_type::usm_device);
+    int64_t usm_host_var_mem_size = get_variables_mem_size(allocation_type::usm_host);
+    int64_t usm_device_var_mem_size = get_variables_mem_size(allocation_type::usm_device);
+    int64_t host_mem_size = m_network.get_engine().get_used_device_memory(allocation_type::usm_host);
+    int64_t device_mem_size = m_network.get_engine().get_used_device_memory(allocation_type::usm_device);
+    int64_t usm_host_mem_pool_size = m_network.get_memory_pool().get_total_mem_pool_size(allocation_type::usm_host);
+    int64_t usm_host_etc_size = host_mem_size - usm_host_mem_pool_size
+                                - usm_host_const_mem_size - usm_host_var_mem_size;
+    int64_t usm_device_mem_pool_size = m_network.get_memory_pool().get_total_mem_pool_size(allocation_type::usm_device);
+    int64_t usm_device_etc_size = device_mem_size - usm_device_mem_pool_size
+                                  - usm_device_const_mem_size - usm_device_var_mem_size;
+    GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl;
+    GPU_DEBUG_COUT << "Memory statistics for (net_id:" << m_network.get_id() << ", iter:" << curr_iter << ")" << std::endl;
GPU_DEBUG_COUT << " Total host mem size : " << get_mb_size(host_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_host_mem_pool_size) << std::endl; + GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_host_const_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_host_var_mem_size) << std::endl; + GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_host_etc_size) << std::endl; + GPU_DEBUG_COUT << " Total device mem size : " << get_mb_size(device_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_device_mem_pool_size) << std::endl; + GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_device_const_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_device_var_mem_size) << std::endl; + GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_device_etc_size) << std::endl; + GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl; +} + +} // namespace cldnn + +#endif // GPU_DEBUG_CONFIG diff --git a/src/plugins/intel_gpu/src/graph/debug_helper.hpp b/src/plugins/intel_gpu/src/graph/debug_helper.hpp new file mode 100644 index 00000000000000..c7c6bd006af1db --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/debug_helper.hpp @@ -0,0 +1,69 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/graph/network.hpp" +#include "intel_gpu/graph/program.hpp" +#include "intel_gpu/runtime/stream.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" +#include "primitive_inst.h" + +namespace cldnn { + +#ifdef GPU_DEBUG_CONFIG + +class NodeDebugHelper { +public: + NodeDebugHelper(const primitive_inst& inst); + ~NodeDebugHelper(); + +private: + std::string get_iteration_prefix() { + if (m_iter < 0) + return std::string(""); + return std::to_string(m_iter) + "_"; + } + + std::string get_file_prefix() { + auto prog_id = ((m_program != nullptr) ? m_program->get_id() : 0); + auto net_id = m_network.get_id(); + + return "program" + std::to_string(prog_id) + "_network" + std::to_string(net_id) + "_" + get_iteration_prefix() + m_inst.id(); + } + + + const primitive_inst& m_inst; + stream& m_stream; + const network& m_network; + const program* m_program; + const size_t m_iter; + + const debug_configuration* debug_config = cldnn ::debug_configuration ::get_instance(); +}; + +class NetworkDebugHelper { +public: + NetworkDebugHelper(const network& net); + ~NetworkDebugHelper(); + +private: + void dump_memory_pool(std::string dump_path, int64_t curr_iter) const; + const network& m_network; + const size_t m_iter; + + const debug_configuration* debug_config = cldnn ::debug_configuration ::get_instance(); +}; + +#define NETWORK_DEBUG(net) NetworkDebugHelper __network_debug_helper(net) +#define NODE_DEBUG(inst) NodeDebugHelper __node_debug_helper(inst) + +#else + +#define NETWORK_DEBUG(...) +#define NODE_DEBUG(...) 
diff --git a/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h b/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h
index 075422a4196b38..cf5111de6b247e 100644
--- a/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h
+++ b/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h
@@ -14,6 +14,6 @@ std::string get_dir_path(const ExecutionConfig& config);
 void dump_graph_optimized(std::ofstream&, const program&);
 void dump_graph_processing_order(std::ofstream&, const program&);
 void dump_graph_init(std::ofstream&, const program&,
-                     std::function<std::shared_ptr<primitive_inst>(const primitive_id&)> get_primitive_inst = nullptr);
+                     std::function<std::shared_ptr<const primitive_inst>(const primitive_id&)> get_primitive_inst = nullptr);
 void dump_graph_info(std::ofstream&, const program&);
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
index 92d62782828d78..8f0e97dd51ee12 100644
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -4,7 +4,6 @@
 
 #include "intel_gpu/plugin/variable_state.hpp"
 #include "intel_gpu/primitives/read_value.hpp"
-#include "openvino/util/file_util.hpp"
 
 #include "intel_gpu/primitives/data.hpp"
 #include "intel_gpu/primitives/mutable_data.hpp"
@@ -31,13 +30,10 @@
 #include "deconvolution_inst.h"
 #include "mutable_data_inst.h"
 #include "condition_inst.h"
-#include "loop_inst.h"
-#include "assign_inst.h"
 #include "read_value_inst.h"
 #include "reshape_inst.h"
 #include "kv_cache_inst.h"
 #include "program_helpers.h"
-#include "to_string_utils.h"
 #include "program_dump_graph.h"
 
 #include <algorithm>
@@ -51,8 +47,8 @@
 #include <memory>
 #include <utility>
 
+#include "debug_helper.hpp"
 #ifdef GPU_DEBUG_CONFIG
-#include <iomanip>
 #include <fstream>
 #include <sys/stat.h>
 #include <chrono>
@@ -60,7 +56,6 @@
 #endif
 
 namespace cldnn {
-
 namespace {
 
 #ifdef GPU_DEBUG_CONFIG
@@ -143,179 +138,6 @@ void dump_perf_data_raw(std::string dump_path, const std::list<std::shared_ptr<primitive_inst>>& exec_order) {
 }
 
-float convert_element(int64_t i) { return static_cast<float>(i); }
-float convert_element(int32_t i) { return static_cast<float>(i); }
-
-float convert_element(float f) { return f; }
-
-float convert_element(ov::float16 h) { return static_cast<float>(h); }
-
-size_t get_x_pitch(const layout& layout) {
-    try {
-        auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0));
-        auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0));
-        auto x0 = layout.get_linear_offset(tensor_x0);
-        auto x1 = layout.get_linear_offset(tensor_x1);
-        return (x1 - x0);
-    } catch (...) {
-        // When spatial size of x=0, x_pitch is meaningless
-        return 0;
-    }
-}
-
" raw data" : "") << std::endl; - } - - if (size.count() == 0) { - file_stream << "Empty buffer" << std::endl; - return; - } - - mem_lock lock(mem, stream); - auto mem_ptr = lock.data(); - auto x_pitch = get_x_pitch(mem->get_layout()); - std::stringstream buffer; - - if (!dump_raw) { - for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) { - for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) { - for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) { - for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) { - for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) { - for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) { - cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w)); - size_t input_it = mem->get_layout().get_linear_offset(t); - - for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) { - buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl; - } - } - } - } - } - } - } - } else { - for (size_t i = 0; i < lock.size(); ++i) { - buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[i]) << std::endl; - } - } - file_stream << buffer.str(); -} - -void unpack(cldnn::data_types type, uint8_t input, int8_t &v0, int8_t &v1) { - if (type == cldnn::data_types::i4) { - char s_bit = (input & 0x08); - char mask = s_bit > 0 ? 0xF0 : 0x00; - v0 = (input & 0x0F) | mask; - - input >>= 4; - s_bit = (input & 0x08); - mask = s_bit > 0 ? 0xF0 : 0x00; - v1 = (input & 0x0F) | mask; - } else if (type == cldnn::data_types::u4) { - v0 = input & 0x0F; - v1 = input >> 4; - } else { - OPENVINO_ASSERT(false, "not supported unpacking"); - } -} - -void dump_i4u4(cldnn::data_types type, memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) { - auto&& size = mem->get_layout().get_tensor(); - - GPU_DEBUG_GET_INSTANCE(debug_config); - auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); - tensor tmp_size(size); - tmp_size.batch[0] = batch_size; - if (tmp_size == size) { - file_stream << "shape: " << size.to_string() << " "; - file_stream << "(count: " << size.count() - << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" - << (dump_raw ? " raw data" : "") << std::endl; - } else { - file_stream << "shape: " << tmp_size.to_string() << " "; - file_stream << "(count: " << tmp_size.count() - << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) - << ", original shape: " << size.to_string() << ")" - << (dump_raw ? " raw data" : "") << std::endl; - } - - if (size.count() == 0) { - file_stream << "Empty buffer" << std::endl; - return; - } - - mem_lock lock(mem, stream); - auto mem_ptr = lock.data(); - std::stringstream buffer; - - if (dump_raw) { - for (size_t i = 0; i < lock.size(); ++i) { - int8_t v0, v1; - unpack(type, mem_ptr[i], v0, v1); - buffer << std::fixed << std::setprecision(6) << static_cast(v0) << std::endl; - buffer << std::fixed << std::setprecision(6) << static_cast(v1) << std::endl; - } - } else { - std::cout << __func__ << " supports raw dump only" << std::endl; - } - file_stream << buffer.str(); -} - -void log_memory_to_file(memory::ptr mem, layout data_layout, stream& stream, std::string layerName, bool dump_raw) { - std::cout << "Dump " << (dump_raw ? 
"raw " : "") << layerName << std::endl; - GPU_DEBUG_GET_INSTANCE(debug_config); - std::string filename = debug_config->get_name_for_dump(layerName); - filename = debug_config->dump_layers_path + filename + ".txt"; - std::ofstream file_stream(filename); - if (!mem) { - file_stream << "Empty" << std::endl; - return; - } - - // Reinterpret buffer to represent actual data layout - auto actual_mem = mem->get_engine()->reinterpret_buffer(*mem, data_layout); - - auto mem_dt = actual_mem->get_layout().data_type; - if (mem_dt == cldnn::data_types::f32) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::f16) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i64) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i32) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i8) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::u8) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::u8) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i4 || mem_dt == cldnn::data_types::u4) - dump_i4u4(mem_dt, actual_mem, stream, file_stream, dump_raw); - else - std::cout << "Dump for this data type is not supported: " << dt_to_str(mem_dt) << std::endl; -} - void wait_for_the_turn() { GPU_DEBUG_GET_INSTANCE(debug_config); bool need_to_wait; @@ -336,7 +158,6 @@ void wait_for_the_turn() { #else void dump_perf_data_raw(std::string, const std::list>&) {} -void log_memory_to_file(memory::ptr, layout, stream&, std::string, bool dump_raw) {} void wait_for_the_turn() {} #endif } // namespace @@ -346,25 +167,6 @@ static uint32_t get_unique_net_id() { return ++id_gen; } -static std::string get_file_path_for_binary_dump(cldnn::layout layout, std::string name) { - std::string filename; - std::string data_type = ov::element::Type(layout.data_type).get_type_name(); - std::string format = layout.format.to_string(); - std::string tensor; - auto dims = layout.get_dims(); - for (size_t r = 0 ; r < layout.get_rank() ; r++) { - tensor += ("_" + to_string(dims[r])); - } - -#ifdef GPU_DEBUG_CONFIG - GPU_DEBUG_GET_INSTANCE(debug_config); - std::string layer_name = debug_config->get_name_for_dump(name); - filename = debug_config->dump_layers_path + layer_name - + "__" + data_type + "_" + tensor + "__" + format + ".bin"; -#endif - return filename; -} - /* Network will always have net_id = 0 when it will be cldnn internal micronetwork (created i.e by propagate_constants opt pass). 
@@ -939,28 +741,10 @@ std::map<primitive_id, network_output> network::execute(const std::vector<event::ptr>& events) {
 
     OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "NetworkImpl::Execute");
-    int64_t curr_iter = -1;
-    GPU_DEBUG_GET_INSTANCE(debug_config);
-#ifdef GPU_DEBUG_CONFIG
-    curr_iter = iteration;
-#endif
+    NETWORK_DEBUG(*this);
 
     // Wait for previous execution completion
     reset_execution(false);
 
-    GPU_DEBUG_IF(debug_config->dump_memory_pool > 0) {
-        auto& iters = debug_config->dump_memory_pool_iters;
-        if (iters.empty() || iters.find(curr_iter) != iters.end()) {
-            GPU_DEBUG_COUT << "============================================================================" << std::endl;
-            GPU_DEBUG_COUT << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
-            if (curr_iter == 0 && get_id() > 0) {
-                dump_memory_pool(debug_config->dump_memory_pool_path, curr_iter);
-                GPU_DEBUG_COUT << "============================================================================" << std::endl;
-            }
-        }
-    } else {
-        GPU_DEBUG_TRACE << "============================================================================" << std::endl;
-        GPU_DEBUG_TRACE << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
-    }
 
     std::vector<memory::ptr> in_out_mem;
     auto is_surface_lock_check_needed = [&](const shared_mem_type& shared_mem_type) {
@@ -996,33 +780,6 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
     auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream());
 
     set_arguments();
-    GPU_DEBUG_IF(debug_config->list_layers == 1) {
-        for (auto& inst : _exec_order) {
-            GPU_DEBUG_COUT << inst->id() << std::endl;
-            if (inst->get_node().is_type<loop>()) {
-                auto& loop_node = inst->get_node().as<loop>();
-                for (auto& prim : loop_node.get_body_program()->get_processing_order()) {
-                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
-                }
-            } else if (inst->get_node().is_type<condition>()) {
-                auto& cond_node = inst->get_node().as<condition>();
-                GPU_DEBUG_COUT << "* Branch_True" << std::endl;
-                for (auto& prim : cond_node.get_branch_true().inner_program->get_processing_order()) {
-                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
-                }
-                GPU_DEBUG_COUT << "* Branch_False" << std::endl;
-                for (auto& prim : cond_node.get_branch_false().inner_program->get_processing_order()) {
-                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
-                }
-            }
-        }
-        if (!is_internal()) exit(0);
-    }
-    auto get_iteration_prefix = [](int64_t iter) {
-        if (iter < 0)
-            return std::string("");
-        return std::to_string(iter) + "_";
-    };
 
     // This extra flush command is needed for dynamic models in both cases of out_of_order / in_order operating mode
     // since it reduces `bubbles` number in pipeline and GPU's idle time by timely flushing new kernels to device
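The flush comment above describes a common pipelining pattern: submitting a kernel does not start it, so the host flushes the queue every `flush_frequency` submissions to overlap enqueueing with device execution instead of letting the GPU sit idle until one big flush at the end. A schematic sketch of the idea (the `Queue` type is a stand-in, not the cldnn stream API):

```cpp
#include <cstddef>

// Illustrative only: enqueue work and flush every `flush_frequency`
// submissions so the device starts executing while the host keeps enqueuing.
struct Queue {
    void enqueue(int /*kernel*/) { /* submit without starting execution */ }
    void flush() { /* kick off everything submitted so far */ }
};

void run_all(Queue& q, int n_kernels, std::size_t flush_frequency) {
    std::size_t executed = 0;
    for (int k = 0; k < n_kernels; ++k) {
        q.enqueue(k);
        if (flush_frequency && ++executed % flush_frequency == 0)
            q.flush();  // periodic flush keeps the pipeline fed
    }
    q.flush();  // drain the tail
}
```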
@@ -1033,233 +790,43 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
     size_t executed_prims = 0;
     for (auto& inst : _exec_order) {
-        // Load binary dump for input layers
-        GPU_DEBUG_IF(!debug_config->load_layers_raw_dump.empty()) {
-            const std::string layer_name = inst->id();
-            auto files = debug_config->get_filenames_for_matched_layer_loading_binaries(layer_name);
-            if (!files.empty()) {
-                if (inst->is_input()) {
-                    // Loading binary dumps for output tensors of input-layers : only one output exists or index(dstN) exists
-                    auto dump_file = debug_config->get_matched_from_filelist(files, "_dst0__");
-                    OPENVINO_ASSERT((files.size() == 1 || dump_file.length() != 0), "Unexpected binary dump for input layer");
-
-                    OPENVINO_ASSERT(files.size() == get_primitive(inst->id())->outputs_memory_count(), "Mis-match dump file count");
-
-                    for (size_t i = 0; i < get_primitive(inst->id())->outputs_memory_count(); i++) {
-                        auto dump_file = files[0];
-                        if (files.size() > 1 || get_primitive(inst->id())->outputs_memory_count() != 1) {
-                            std::string pattern = "_dst" + std::to_string(i) + "__";
-                            dump_file = debug_config->get_matched_from_filelist(files, pattern);
-                        }
-                        OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_dst[N]__' for binary dump");
-                        GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for " << layer_name << std::endl;
-
-                        std::vector<uint8_t> bin = ov::util::load_binary(dump_file);
-                        OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file);
-
-                        auto output_mem = get_primitive(layer_name)->output_memory_ptr(i);
-                        OPENVINO_ASSERT(output_mem->size() == bin.size(), "memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name
-                            + "\n Expected size : " + to_string(output_mem->size()) + ", Binary : " + to_string(bin.size()));
-
-                        output_mem->copy_from(get_stream(), static_cast<void*>(&bin[0]), true);
-                    }
-                } else {
-                    auto check_dst = debug_config->get_matched_from_filelist(files, "_dst0__");
-                    OPENVINO_ASSERT(check_dst.length() == 0, "Expected to load binaries for inputs of " + layer_name);
-
-                    // Loading input tensors for any layer
-                    auto dump_file = debug_config->get_matched_from_filelist(files, "_src0__");
-                    OPENVINO_ASSERT(dump_file.length() != 0, "Could not find expected pattern '_src[N]__' for binary dump input : " + layer_name);
-
-                    for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
-                        auto dump_file = files[0];
-                        if (files.size() > 1 || get_primitive(inst->id())->dependencies().size() != 1) {
-                            std::string pattern = "_src" + std::to_string(i) + "__";
-                            dump_file = debug_config->get_matched_from_filelist(files, pattern);
-                        }
-                        if (dump_file.length() == 0) {
-                            GPU_DEBUG_COUT << " Skip loading for input(" << i << ") of " << layer_name << std::endl;
-                            continue;
-                        }
-                        OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_src[N]__' for binary dump input");
-                        GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for input(" << i << ") of " << layer_name << std::endl;
-
-                        std::vector<uint8_t> bin = ov::util::load_binary(dump_file);
-                        OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file);
-
-                        auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i);
-                        if (input_mem->size() != bin.size()) {
-                            std::cout << "WARNING: memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name
-                                      << " " << input_mem->size() << " / " << bin.size() << std::endl;
-                            bin.resize(input_mem->size());
-                        }
-
-                        input_mem->copy_from(get_stream(), static_cast<void*>(&bin[0]), true);
-                    }
-                }
-            }
-        }
-
-        // Dump input buffers of 'inst'
-        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
-            const std::string layer_name = inst->id();
-
-            GPU_DEBUG_IF(debug_config->is_target_iteration(curr_iter) &&
-                         debug_config->dump_layers_dst_only == 0 && debug_config->is_layer_for_dumping(layer_name)) {
-                std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" + layer_name + ":";
-                for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) {
-                    std::string name = "program" + std::to_string((get_program() != nullptr) ? get_program()->get_id() : 0) +
-                                       "_network" + std::to_string(get_id()) +
-                                       "_" + get_iteration_prefix(curr_iter) +
-                                       layer_name + "_src" + std::to_string(i);
-                    auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i);
-                    if (input_mem == nullptr) {
-                        GPU_DEBUG_COUT << " input_mem_" << i << " is nullptr. Nothing to dump." << std::endl;
-                        continue;
-                    }
-
-                    auto dep = inst->dependencies().at(i);
-                    auto input_layout = dep.first->get_output_layout(dep.second);
-                    GPU_DEBUG_IF(debug_config->dump_layers_binary) {
-                        // Binary dump : raw
-                        auto filename = get_file_path_for_binary_dump(input_layout, name);
-
-                        mem_lock<char, mem_lock_type::read> lock(input_mem, get_stream());
-                        ov::util::save_binary(filename, lock.data(), input_mem->size());
-                        GPU_DEBUG_COUT << " Dump layer src : " << layer_name << " to " << filename << std::endl;
-                        debug_str_for_bin_load += (filename + ",");
-                    } else {
-                        log_memory_to_file(input_mem,
-                                           input_layout,
-                                           get_stream(),
-                                           name,
-                                           debug_config->dump_layers_raw);
-                    }
-                }
-
-                GPU_DEBUG_IF(debug_config->dump_layers_binary && !inst->is_input()) {
-                    debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"';
-                    GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;;
-                }
-            }
-        }
+        NODE_DEBUG(*inst);
 
         execute_primitive(inst, events);
         executed_prims++;
 
         if (needs_flushing && executed_prims % flush_frequency == 0)
             get_stream().flush();
-
-        // Dump output buffers of 'inst'
-        GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) {
-            get_stream().finish();
-            const std::string layer_name = inst->id();
-            auto prog_id = ((get_program() != nullptr) ? get_program()->get_id() : 0);
-            auto net_id = get_id();
-            GPU_DEBUG_IF(debug_config->is_target_iteration(curr_iter) &&
-                         debug_config->is_layer_for_dumping(layer_name, inst->is_output(), inst->is_input())) {
-                std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\""
-                                                         + layer_name + ":";
-                for (size_t i = 0; i < get_primitive(layer_name)->outputs_memory_count(); i++) {
-                    std::string name = "program" + std::to_string(prog_id) +
-                                       "_network" + std::to_string(net_id) +
-                                       "_" + get_iteration_prefix(curr_iter) +
-                                       layer_name + "_dst" + std::to_string(i);
-                    auto output_mem = get_primitive(layer_name)->output_memory_ptr(i);
-                    if (output_mem == nullptr) {
-                        GPU_DEBUG_COUT << " output_mem is nullptr. Nothing to dump." << std::endl;
-                        continue;
-                    }
-
-                    GPU_DEBUG_IF(debug_config->dump_layers_binary) {
-                        // Binary dump : raw
-                        auto output_layout = inst->get_output_layout(i);
-                        auto filename = get_file_path_for_binary_dump(output_layout, name);
-
-                        mem_lock<char, mem_lock_type::read> lock(output_mem, get_stream());
-                        ov::util::save_binary(filename, lock.data(), output_mem->size());
-                        GPU_DEBUG_COUT << " Dump layer dst : " << layer_name << " to " << filename << std::endl;
-                        debug_str_for_bin_load += (filename + ",");
-                    } else {
-                        // Text dump
-                        log_memory_to_file(output_mem, inst->get_output_layout(i), get_stream(), name, debug_config->dump_layers_raw);
-                    }
-                }
-
-                GPU_DEBUG_IF(debug_config->dump_layers_binary && inst->is_input()) {
-                    debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"';
-                    GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;;
-                }
-            }
-        }
     }
-
-    // print '-data_shape' option for benchmark_app
-    GPU_DEBUG_IF(debug_config->print_input_data_shapes == 1) {
-        std::stringstream data_shape_str;
-        auto add_string = [&data_shape_str](std::string str) {
-            data_shape_str << ((data_shape_str.rdbuf()->in_avail() == 0) ? " -data_shape " : ",") << str;
-        };
-
-        for (auto& inst : _exec_order) {
-            auto name = inst->id();
-            auto pos = name.find(':');
-            auto type = name.substr(0, pos);
-            name.erase(0, pos + 1);
-            if (inst->is_input() && type == "parameter") {
-                add_string(name + inst->get_output_layout().get_partial_shape().to_string());
-            }
-        }
-
-        GPU_DEBUG_COUT << "[program:" << std::setw(2) << ((get_program() != nullptr) ? get_program()->get_id() : 0)
-                       << "|network:" << std::setw(2) << get_id() << "|iter:" << std::setw(4) << curr_iter << "] benchmark_app cmd: "
-                       << data_shape_str.str() << std::endl;
-    }
-
-    GPU_DEBUG_IF(!debug_config->dump_graphs.empty() && debug_config->is_target_iteration(curr_iter)) {
-        auto get_fixed_str = [](int value, int length = 2) -> std::string {
-            std::ostringstream ss;
-            ss << std::setw(length) << std::setfill('0') << std::to_string(value);
-            return ss.str();
-        };
-        std::string path = get_dir_path(get_config());
-        if (!path.empty()) {
-            std::ofstream ofs(path + "cldnn_program_exec_p" + get_fixed_str(get_program()->get_id()) + "_n" + get_fixed_str(get_id())
-                              + "_" + get_fixed_str(curr_iter, 5) + ".graph");
-            dump_graph_init(ofs, *get_program(), [&](const primitive_id& id) -> std::shared_ptr<primitive_inst> {
-                return get_primitive(id);
-            });
-        }
-    }
 
     // Store events only in case of OOO queue or enabled Profiling
     auto store_events = is_out_of_order_queue || _enable_profiling;
     if (store_events) {
         if (_program != nullptr) {
-        for (auto& inst : _program->get_processing_order()) {
-            // Special handling for mutable data. The event should be the same as the user or dependency with highest
-            // processing_num as the mutable_data can be updated when is both user or dependency.
-            if (inst->is_type<mutable_data>()) {
-                decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0;
-                for (auto& user : inst->get_users()) {
-                    auto user_proc_num = _program->get_processing_order().get_processing_number(user);
-                    if (user_proc_num > proc_num) {
-                        _events[inst->id()] = _events[user->id()];
-                        proc_num = user_proc_num;
+            for (auto& inst : _program->get_processing_order()) {
+                // Special handling for mutable data. The event should be the same as the user or dependency with highest
+                // processing_num as the mutable_data can be updated when is both user or dependency.
+                if (inst->is_type<mutable_data>()) {
+                    decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0;
+                    for (auto& user : inst->get_users()) {
+                        auto user_proc_num = _program->get_processing_order().get_processing_number(user);
+                        if (user_proc_num > proc_num) {
+                            _events[inst->id()] = _events[user->id()];
+                            proc_num = user_proc_num;
+                        }
                     }
-                }
-                if (!inst->get_dependencies().empty()) {
-                    for (auto& dep : inst->get_dependencies()) {
-                        auto dep_proc_num = _program->get_processing_order().get_processing_number(dep.first);
-                        if (dep_proc_num > proc_num) {
-                            _events[inst->id()] = _events[dep.first->id()];
-                            proc_num = dep_proc_num;
+                    if (!inst->get_dependencies().empty()) {
+                        for (auto& dep : inst->get_dependencies()) {
+                            auto dep_proc_num = _program->get_processing_order().get_processing_number(dep.first);
+                            if (dep_proc_num > proc_num) {
+                                _events[inst->id()] = _events[dep.first->id()];
+                                proc_num = dep_proc_num;
+                            }
                         }
                     }
                 }
             }
         }
-        }
 
     for (auto& dout : _data_outputs) {  // data primitives are not executed so if they are marked as output we need to add
                                         // them valid events manually
@@ -1278,73 +845,6 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
     // Deallocate events from the previos iteration
     _old_events.clear();
-
-    GPU_DEBUG_IF(debug_config->dump_memory_pool > 0) {
-        auto& iters = debug_config->dump_memory_pool_iters;
-        if (iters.empty() || iters.find(curr_iter) != iters.end()) {
-            dump_memory_pool(debug_config->dump_memory_pool_path, curr_iter);
-            GPU_DEBUG_COUT << "============================================================================" << std::endl;
-        }
-    }
-
-#ifdef GPU_DEBUG_CONFIG
-    iteration++;
-#endif
-}
-
-void network::dump_memory_pool(std::string dump_path, int64_t curr_iter) {
-#ifdef GPU_DEBUG_CONFIG
-    get_memory_pool().dump(get_id(), curr_iter, dump_path);
-    auto get_constants_mem_size = [&](allocation_type type) -> size_t {
-        size_t mem_size = 0;
-        for (auto& prim : _primitives) {
-            if (prim.second->get_node().is_constant()) {
-                for (size_t i = 0; i < prim.second->outputs_memory_count(); i++) {
-                    if (prim.second->output_memory_ptr(i)->get_allocation_type() == type)
-                        mem_size += prim.second->output_memory_ptr(i)->size();
-                }
-            }
-        }
-        return mem_size;
-    };
-    auto get_variables_mem_size = [&](allocation_type type) -> size_t {
-        size_t mem_size = 0;
-        for (auto& var : get_variables()) {
-            if (var.second->get_memory() && var.second->get_memory()->get_allocation_type() == type)
-                mem_size += var.second->get_actual_mem_size();
-        }
-        return mem_size;
-    };
-    auto get_mb_size = [&](int64_t size) -> std::string {
-        if (size == 0) return "0 MB";
-        return std::to_string(static_cast<float>(size) / (1024 * 1024)) + " MB";
-    };
-    int64_t usm_host_const_mem_size = get_constants_mem_size(allocation_type::usm_host);
-    int64_t usm_device_const_mem_size = get_constants_mem_size(allocation_type::usm_device);
-    int64_t usm_host_var_mem_size = get_variables_mem_size(allocation_type::usm_host);
-    int64_t usm_device_var_mem_size = get_variables_mem_size(allocation_type::usm_device);
-    int64_t host_mem_size = get_engine().get_used_device_memory(allocation_type::usm_host);
-    int64_t device_mem_size = get_engine().get_used_device_memory(allocation_type::usm_device);
-    int64_t usm_host_mem_pool_size = get_memory_pool().get_total_mem_pool_size(allocation_type::usm_host);
-    int64_t usm_host_etc_size = host_mem_size - usm_host_mem_pool_size
-                                    - usm_host_const_mem_size - usm_host_var_mem_size;
-    int64_t usm_device_mem_pool_size = get_memory_pool().get_total_mem_pool_size(allocation_type::usm_device);
-    int64_t usm_device_etc_size = device_mem_size - usm_device_mem_pool_size
-                                      - usm_device_const_mem_size - usm_device_var_mem_size;
-    GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl;
-    GPU_DEBUG_COUT << "Memory statistics for (net_id:" << get_id() << ", iter:" << curr_iter << ")" << std::endl;
-    GPU_DEBUG_COUT << " Total host mem size : " << get_mb_size(host_mem_size) << std::endl;
-    GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_host_mem_pool_size) << std::endl;
-    GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_host_const_mem_size) << std::endl;
-    GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_host_var_mem_size) << std::endl;
-    GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_host_etc_size) << std::endl;
-    GPU_DEBUG_COUT << " Total device mem size : " << get_mb_size(device_mem_size) << std::endl;
-    GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_device_mem_pool_size) << std::endl;
-    GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_device_const_mem_size) << std::endl;
-    GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_device_var_mem_size) << std::endl;
-    GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_device_etc_size) << std::endl;
-    GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl;
-#endif
 }
 
 std::vector<primitive_id> network::get_input_ids() const {
diff --git a/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp b/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp
index bff45cd81f9900..4a2f43b28d9360 100644
--- a/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp
+++ b/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp
@@ -170,7 +170,7 @@ std::string get_dir_path(const ExecutionConfig& config) {
 
 void dump_graph_init(std::ofstream& graph,
                      const program& program,
-                     std::function(const primitive_id&)> get_primitive_inst) {
+                     std::function(const primitive_id&)> get_primitive_inst) {
     const std::string invalid_layout_msg = "(invalid layout)";
     const auto dump_mem_info = [&invalid_layout_msg, &get_primitive_inst](const program_node* ptr) {

From 108bb731395edb98c3d55bea53d438db247d6aee Mon Sep 17 00:00:00 2001
From: Pavel Durandin
Date: Thu, 3 Oct 2024 16:57:56 +0400
Subject: [PATCH 7/9] [GPU] Fix unit tests ph4 (#26879)

### Details:
- Fix oob array

---
 src/plugins/intel_gpu/src/graph/program_node.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp
index 3c9ad0f7317a27..21ba4e656fae0d 100644
--- a/src/plugins/intel_gpu/src/graph/program_node.cpp
+++ b/src/plugins/intel_gpu/src/graph/program_node.cpp
@@ -611,9 +611,9 @@ bool program_node::is_padded_spatial(size_t idx) const {
     auto& layout = get_output_layout(idx);
     const auto& lower_size = layout.data_padding._lower_size;
     const auto& upper_size = layout.data_padding._upper_size;
-    return std::any_of(std::begin(lower_size) + 2, std::begin(lower_size) + layout.get_spatial_rank() - 1,
+    return std::any_of(std::begin(lower_size) + 2, std::begin(lower_size) + 2 + layout.get_spatial_rank(),
                        [](const tensor::value_type& el) { return el != 0; }) ||
-           std::any_of(std::begin(upper_size) + 2, std::begin(upper_size) + layout.get_spatial_rank() - 1,
+           std::any_of(std::begin(upper_size) + 2, std::begin(upper_size) + 2 + layout.get_spatial_rank(),
                        [](const tensor::value_type& el) { return el != 0; });
 }
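The corrected bounds above encode the ordering of the padding arrays: index 0 is batch, index 1 is feature, and the spatial dimensions occupy [begin + 2, begin + 2 + spatial_rank). The old end iterator, begin + spatial_rank - 1, pointed before or inside that slice instead of one past it, so low-rank layouts were scanned over an empty or inverted range. A standalone sketch of the corrected check (the array size and test values are illustrative assumptions):

```cpp
#include <algorithm>
#include <array>
#include <cassert>
#include <cstddef>

// The spatial slice of a [batch, feature, spatial...] padding array is
// [begin + 2, begin + 2 + spatial_rank); any nonzero entry there means
// the layout carries spatial padding.
bool has_spatial_padding(const std::array<int, 6>& pad, std::size_t spatial_rank) {
    return std::any_of(pad.begin() + 2, pad.begin() + 2 + spatial_rank,
                       [](int el) { return el != 0; });
}

int main() {
    // batch = 0, feature = 0, spatial y padded by 1, x unpadded (rank 2)
    assert(has_spatial_padding({0, 0, 1, 0, 0, 0}, 2));
    assert(!has_spatial_padding({0, 0, 0, 0, 0, 0}, 2));
}
```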
From 0ae368396a5ca2c5925465b4a55442623c8e33ad Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Thu, 3 Oct 2024 17:01:19 +0400
Subject: [PATCH 8/9] [GPU] Adjust usm_host memory usage for InferRequest tensors (#26896)

### Details:
- Adjust usm_host memory usage for InferRequest tensors

---
 src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
index 346b4471779593..88d69dcd3e47b3 100644
--- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -34,9 +34,12 @@ namespace {
 inline bool can_use_usm_host(const cldnn::engine& engine) {
     auto can_use_usm = engine.use_unified_shared_memory();
-    if (engine.get_device_info().gfx_ver.major == 12 && engine.get_device_info().gfx_ver.minor == 60) {
-        // WA: Disable USM host memory for infer request`s tensors for PVC as
-        // it has performance issues in case of host <-> device data transfers inside kernels
+    const auto& device_info = engine.get_device_info();
+    if ((device_info.gfx_ver.major == 12 && device_info.gfx_ver.minor == 60) ||
+        (device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu)) {
+        // WA: Disable USM host memory for infer request`s tensors for PVC and subsequent dGPUs, as kernel access
+        // to system memory is slower than using an explicit memcpy (Host <-> Device) call with the copy engine
+        // Driver tickets with additional details: 6155, 10054
         GPU_DEBUG_TRACE << "Do not use usm_host for performance issue" << std::endl;
         can_use_usm = false;
     }

From 890f2e12c98fa53163439170eaa304e8bd45337b Mon Sep 17 00:00:00 2001
From: Pavel Durandin
Date: Thu, 3 Oct 2024 17:48:26 +0400
Subject: [PATCH 9/9] [GPU] Fix empty oneDnn users (#26900)

### Details:
- Fix empty oneDnn users

---
 src/plugins/intel_gpu/src/graph/layout_optimizer.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index ae64846a0c9b5e..57f2fb41c7cc06 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -1089,11 +1089,13 @@ format layout_optimizer::get_expected_format(quantize_node const& node) {
     auto use_onednn_impls = _optimization_attributes.use_onednn_impls;
 
     if (use_onednn_impls) {
-        auto& user = node.get_users().front();
-        if (user != nullptr && user->get_preferred_input_fmt(user->get_dependency_index(node)) != format::any) {
-            expected = user->get_preferred_input_fmt(user->get_dependency_index(node));
-        } else {
-            expected = format::any;
+        expected = format::any;
+        auto& users = node.get_users();
+        if (users.size() != 0) {
+            auto& user = users.front();
+            if (user != nullptr && user->get_preferred_input_fmt(user->get_dependency_index(node)) != format::any) {
+                expected = user->get_preferred_input_fmt(user->get_dependency_index(node));
+            }
         }
     } else if (only_gemm_users(node)) {
         // TODO: Gemm is not supporting fsv layouts
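The fix above follows the usual guard-the-front pattern: establish a safe default first, then refine it only when the user list is non-empty. A minimal standalone sketch of that pattern with simplified stand-in types (not the actual cldnn node or format classes):

```cpp
#include <list>
#include <string>

// Simplified stand-in: a user either has a concrete preferred format or "any".
struct User {
    std::string preferred_fmt = "any";
};

std::string expected_format(const std::list<User*>& users) {
    std::string expected = "any";   // safe default, as in the fix above
    if (!users.empty()) {           // guard front() on a possibly empty list
        const User* user = users.front();
        if (user != nullptr && user->preferred_fmt != "any")
            expected = user->preferred_fmt;
    }
    return expected;
}
```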