diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 5b43caf43c43ee..2144fa4c730540 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -876,21 +876,47 @@ void prepare_buffer_fusing::run(program& p) { padding::DynamicDimsMask info_dynamic_pad; info_dynamic_pad[concat_axis] = 1; kv_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad; + GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout before " << node.get_output_layout(false, 0) << "\n"; node.set_output_layout(kv_out_layout); node.can_share_buffer(false); + GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout after " << node.get_output_layout(false, 0) << "\n"; - auto update_dep = [&info_dynamic_pad](program_node* dep) { - auto prev_layout = dep->get_output_layout(); + auto update_dep = [](program_node* dep, padding::DynamicDimsMask& info_dynamic_pad, size_t idx) { + auto prev_layout = dep->get_output_layout(true, idx); prev_layout.data_padding._dynamic_dims_mask = info_dynamic_pad; - dep->set_output_layout(prev_layout); + dep->set_output_layout(prev_layout, true, idx); dep->can_share_buffer(false); }; if (rv_prim) { - update_dep(rv_prim); + update_dep(rv_prim, info_dynamic_pad, 0); } if (gather_prim) { - update_dep(gather_prim); + update_dep(gather_prim, info_dynamic_pad, 0); + } + + GPU_DEBUG_TRACE_DETAIL << "valid first? " << node.is_valid_output_layout(0) << "\n"; + GPU_DEBUG_TRACE_DETAIL << "first output: " << node.get_output_layout(false, 0) << "\n"; + + if (node.get_primitive()->compressed) { + const auto scales_output_idx = 2; + auto scales_out_layout = node.get_output_layout(false, scales_output_idx); + + const size_t scales_zp_concat_axis = 3; + padding::DynamicDimsMask info_dynamic_pad_scales; + info_dynamic_pad_scales[scales_zp_concat_axis] = 1; + GPU_DEBUG_TRACE_DETAIL << "Set this pad: " << info_dynamic_pad_scales << "\n"; + scales_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad_scales; + GPU_DEBUG_TRACE_DETAIL << "Pad after: " << info_dynamic_pad_scales << " " << scales_out_layout.data_padding._dynamic_dims_mask << " " << scales_out_layout.data_padding.is_dynamic() << "\n"; + GPU_DEBUG_TRACE_DETAIL << scales_out_layout.to_string() << "\n"; + GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output layout before " << node.get_output_layout(false, scales_output_idx) << "\n"; + node.set_output_layout(scales_out_layout, true, scales_output_idx); + GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output layout after " << node.get_output_layout(false, scales_output_idx) << " " << node.get_output_layout(false, scales_output_idx).data_padding._dynamic_dims_mask << "\n"; + + update_dep(rv_prim, info_dynamic_pad_scales, 1); + + GPU_DEBUG_TRACE_DETAIL << "valid 3rd output? 
" << node.is_valid_output_layout(scales_output_idx) << "\n"; + GPU_DEBUG_TRACE_DETAIL << node.id() << " 2nd output layout " << node.get_output_layout(false, scales_output_idx) << " " << info_dynamic_pad << "\n"; } } }); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index 1b4421c2689fc2..5b59f75f6aff5d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -76,7 +76,7 @@ struct kv_cache_impl : multi_stage_primitive { cldnn::memory::ptr beam_table_prev = nullptr; cldnn::memory::ptr beam_table_new = nullptr; - cldnn::memory::ptr compression_scale = nullptr; + // cldnn::memory::ptr compression_scale = nullptr; void load(BinaryInputBuffer& ib) override { parent::load(ib); @@ -111,7 +111,7 @@ struct kv_cache_impl : multi_stage_primitive { } else if (stage == scale_concat_stage) { // FIXME: indirectness and compression are orthogonal feature. args.inputs = { instance.input_memory_ptr(3) }; // [past, new, beam_table, past_scale, new_scale] - args.outputs = { compression_scale }; + args.outputs = { instance.output_memory_ptr(2) }; } else if (stage == dq_concat_stage) { args.inputs = { instance.input_memory_ptr(1) }; // [past, new, beam_table, past_scale, new_scale] args.outputs = { instance.output_memory_ptr(0), instance.output_memory_ptr(2) }; @@ -175,8 +175,9 @@ struct kv_cache_impl : multi_stage_primitive { // However, allow execution of the first token for the case if KV-cache can't be optimized (if optimization is disabled, or // variables memory was reallocated and we have to copy past KV-cache to new memory) _kernels_data[concat_stage].kernels[1].skip_execution = true; - if (_kernels_data[concat_stage].kernels[0].skip_execution) + if (!_kernels_data[concat_stage].kernels[0].skip_execution) { GPU_DEBUG_TRACE_DETAIL << "Run copy of data!\n"; + } } execute_stage(events, instance, res_events, concat_stage); @@ -217,39 +218,43 @@ struct kv_cache_impl : multi_stage_primitive { } if (desc->compressed) { - const auto scale_alloc_type = engine.get_preferred_memory_allocation_type(false); + // const auto scale_alloc_type = engine.get_preferred_memory_allocation_type(false); auto comp_scale_state = dynamic_cast(variable).get_compression_scale_state(); - auto comp_scale_layout = instance.get_impl_params()->output_layouts[2]; - auto comp_scale_shape = comp_scale_layout.get_shape(); + // auto comp_scale_layout = instance.get_impl_params()->output_layouts[2]; + // auto comp_scale_shape = comp_scale_layout.get_shape(); bool skip_first_kernel = true; - const auto preallocation_size = instance.get_prealloc_iter_num(); + // const auto preallocation_size = instance.get_prealloc_iter_num(); // const auto preallocation_size = 4; - if (compression_scale) { - GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale->get_layout().to_short_string() << ", req size" << ov::shape_size(comp_scale_shape) << ", has " << compression_scale->count() << "\n"; - } else { - GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale << ", req size" << ov::shape_size(comp_scale_shape) << "\n"; - } - - if (!compression_scale || compression_scale->count() < ov::shape_size(comp_scale_shape)) { - const auto concat_axis = 2; - auto alloc_shape = comp_scale_shape; - alloc_shape[concat_axis] += preallocation_size; - const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format}; - GPU_DEBUG_TRACE_DETAIL << "Realloc 
compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl; - compression_scale = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false); - - skip_first_kernel = comp_scale_state->get_layout().count() == 0; - - if (comp_scale_state->get_layout().count() > 64) { - GPU_DEBUG_TRACE_DETAIL << "Reallocation of scales buffer. Prev " << comp_scale_state->get_layout().to_short_string() << " new: " << comp_scale_alloc_layout.to_short_string() << "(prealloc=" << preallocation_size << ")\n"; - } - } - - instance.set_output_memory(compression_scale, false, 2); - GPU_DEBUG_TRACE_DETAIL << "Override Variable memory\n"; - comp_scale_state->set_memory(compression_scale, instance.get_impl_params()->output_layouts[2]); + // if (compression_scale) { + // GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale->get_layout().to_short_string() << ", req size" << ov::shape_size(comp_scale_shape) << ", has " << compression_scale->count() << "\n"; + // } else { + // GPU_DEBUG_TRACE_DETAIL << "Has compression, mem=" << compression_scale << ", req size" << ov::shape_size(comp_scale_shape) << "\n"; + // } + + // if (!compression_scale || compression_scale->count() < ov::shape_size(comp_scale_shape)) { + // const auto concat_axis = 3; + // auto alloc_shape = comp_scale_shape; + // alloc_shape[concat_axis] += preallocation_size; + // const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format}; + // GPU_DEBUG_TRACE_DETAIL << "Realloc compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl; + // compression_scale = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false); + + // skip_first_kernel = comp_scale_state->get_layout().count() == 0; + + // if (comp_scale_state->get_layout().count() > 64) { + // GPU_DEBUG_TRACE_DETAIL << "Reallocation of scales buffer. 
Prev " << comp_scale_state->get_layout().to_short_string() << " new: " << comp_scale_alloc_layout.to_short_string() << "(prealloc=" << preallocation_size << ")\n"; + // } + // } + + // instance.set_output_memory(compression_scale, false, 2); + // auto scales_layout = instance.get_impl_params()->output_layouts[2]; + // size_t scale_concat_axis = 3; + // scales_layout.data_padding._upper_size[scale_concat_axis] = + // GPU_DEBUG_TRACE_DETAIL << "Override Variable memory with layoyut " << instance.get_impl_params()->output_layouts[2] << "\n"; + + // comp_scale_state->set_memory(compression_scale, instance.get_impl_params()->output_layouts[2]); if (!skip_first_kernel) { GPU_DEBUG_TRACE_DETAIL << "Run copy of scales!\n"; @@ -260,8 +265,6 @@ struct kv_cache_impl : multi_stage_primitive { execute_stage(events, instance, res_events, scale_concat_stage); } - - auto dq_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic()); (_kernels_data[dq_concat_stage].update_dispatch_data_func)(dq_params, _kernels_data[dq_concat_stage]); execute_stage(events, instance, res_events, dq_concat_stage); @@ -454,7 +457,7 @@ struct kv_cache_impl : multi_stage_primitive { const auto& primitive = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_shape_agnostic); - const auto concat_axis = 2; + const auto concat_axis = 3; params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank()); auto inputs_count = 1; diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index 029755c4733fe4..323d630732b5c4 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -232,7 +232,7 @@ struct program_node { } void merge_output_padding(padding const& padd, size_t idx = 0) { - set_output_padding(padding::max(padd, output_layouts[idx].data_padding)); + set_output_padding(padding::max(padd, output_layouts[idx].data_padding), idx); } // only calculated output layout (for external usage), does not modify/use cached output layout nor invalidate users diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 325447a5b71d31..dbaf2c55e40eb3 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -48,6 +48,9 @@ std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& node std::vector output_shapes = desc->compressed ? shape_infer(&op, input_shapes, desc->group_sizes, desc->scales_output_order) : shape_infer(&op, input_shapes); + if (desc->num_outputs == 3) + GPU_DEBUG_TRACE_DETAIL << desc->id << " scales output calculated shape: " << output_shapes[2] << "\n"; + static const std::map ports_map = {{0, 0}, {1, 2}}; std::vector out_layouts; @@ -95,7 +98,7 @@ int32_t kv_cache_inst::get_prealloc_iter_num() { // iteration. 
// - Therfore, to avoid this situation where the allocation and copying occurs simutaneously for all the kv_cache_insts, // we assigned different prealloc-size for each kv cache so that we could prevent a memory peak - return 128 + kv_cache_id % 64; + return 10; } void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) { diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 09480176df8afd..239e7f3953c547 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -490,6 +490,12 @@ void primitive_inst::update_shape() { auto& variable = get_network().get_variable(desc->variable_id); // Custom output layout update as update_output_layout handles paddings incorrectly for optimized out read_value + kv_cache pattern _impl_params->output_layouts[0] = variable.get_layout(); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Update ReadValue output\n"; + auto& multi_tensors_variable = downcast(variable); + _impl_params->output_layouts[1] = multi_tensors_variable.get_compression_scale_state()->get_layout(); + } } if (get_node().is_type()) { @@ -572,6 +578,13 @@ event::ptr primitive_inst::realloc_if_needed() { << ", variable layout " << variable.get_layout().to_short_string() << ")" << std::endl; _outputs[0] = variable.get_memory(); + + auto prim = _node->as().get_primitive(); + if (prim->compressed) { + const auto& multi_tensor_var = downcast(variable); + _outputs[2] = multi_tensor_var.get_compression_scale_state()->get_memory(); + } + // To record shape predictor for (size_t j = 0; j < _impl_params->output_layouts.size(); ++j) sp.predict_preallocation_shape(id(), _impl_params->output_layouts[j], true, j); @@ -719,13 +732,19 @@ event::ptr primitive_inst::realloc_if_needed() { for (size_t i = 0; i < updated_layouts.size(); ++i) { bool reclaim = 0; size_t required_buffer_size = 0; - if (_node->is_type() && i == 0) { + if (_node->is_type() && (i == 0 || i == 2)) { // Relax reclaiming condition for kv cache const auto& desc = _node->as().get_primitive(); auto prealloc_shape = updated_layouts[i].get_shape(); const auto shape_rank = prealloc_shape.size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + auto seq_axis = 0; + if (i == 0) { + // seq_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank); + seq_axis = static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + } else if (i == 2) { + seq_axis = 3; + } + prealloc_shape[seq_axis] += tmp_prealloc_count; required_buffer_size = std::accumulate(prealloc_shape.begin(), prealloc_shape.end(), size_t(1), std::multiplies()); } else { @@ -758,11 +777,17 @@ event::ptr primitive_inst::realloc_if_needed() { // continue; std::pair prealloc_info; - if (_node->is_type() && i == 0) { + if (_node->is_type() && (i == 0 || i == 2)) { const auto& desc = _node->as().get_primitive(); auto shape_rank = updated_layouts[i].get_shape().size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + auto seq_axis = 0; + if (i == 0) { + // seq_axis = static_cast(desc->concat_axis >= 0 ? 
desc->concat_axis : shape_rank + desc->concat_axis); + seq_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank); + } else if (i == 2) { + seq_axis = 3; + } + prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], false, i, tmp_prealloc_count, seq_axis); } else { prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], can_reuse_buffer, i, tmp_prealloc_count); @@ -778,20 +803,20 @@ event::ptr primitive_inst::realloc_if_needed() { GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer[" << i << "] - " << actual_layouts[i].get_linear_size() << "/" << _max_output_layout_count[i] << std::endl; - if (_node->is_type() && (i == 0)) { + if (_node->is_type() && (i == 0 || i == 2)) { // kv_cache has already assigned memory. // No need to reinterpret output memory but need to update padding const auto& desc = _node->as().get_primitive(); auto& present_layout = _impl_params->output_layouts[i]; const auto present_layout_rank = present_layout.get_partial_shape().size(); - const auto sequence_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank); + const auto sequence_axis = i == 0 ? kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank) : 3; GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << present_layout.to_short_string() << " " << _max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(present_layout, _max_output_layout_count[i], sequence_axis, - "present_layout"); + i == 0 ? "present_layout" : "present_scales_layout"); kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); - GPU_DEBUG_TRACE_DETAIL << _impl_params->output_layouts[i].to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << i << ". 
" << _impl_params->output_layouts[i].to_string() << std::endl; set_shape_change(); } else { _outputs[i] = _network.get_engine().reinterpret_buffer(*_outputs[i], actual_layouts[i]); @@ -853,7 +878,26 @@ event::ptr primitive_inst::realloc_if_needed() { sequence_axis, "present_layout"); if (max_pad > 0) { + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + const auto sequence_axis = 3; + GPU_DEBUG_TRACE_DETAIL << id() << " is kv_cache => set the variable with newly allocated output memory" + << std::endl; + + kv_cache_inst::update_pad(present_scales_layout, max_pad, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "Updated scales pad (" << max_pad << " " << sequence_axis << "): " << present_scales_layout.to_string() << "\n"; + if (!axis_is_outer_most) { + _impl_params->output_layouts[2] = present_scales_layout; + } + + const auto& multi_tensor_var = downcast(variable); + multi_tensor_var.get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + } + kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "Updated data pad (" << max_pad << " " << sequence_axis << "): " << present_layout.to_string() << "\n"; if (!axis_is_outer_most) { GPU_DEBUG_TRACE_DETAIL << id() << ": Update impl with new output padding" << std::endl; set_shape_change(); @@ -873,12 +917,28 @@ event::ptr primitive_inst::realloc_if_needed() { << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_memory(_outputs[0], present_layout); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + const auto& multi_tensor_var = downcast(variable); + multi_tensor_var.get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + } } } else { GPU_DEBUG_TRACE_DETAIL << id() << ": Update variable " << variable.get_name() << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_layout(present_layout); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + const auto& multi_tensor_var = downcast(variable); + multi_tensor_var.get_compression_scale_state()->set_layout(present_scales_layout); + } } } @@ -1241,6 +1301,13 @@ void primitive_inst::do_runtime_in_place_kv_cache() { return; } const auto& desc = _node->as().get_primitive(); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Original layouts\n"; + GPU_DEBUG_TRACE_DETAIL << _impl_params->input_layouts[0] << "\n"; + GPU_DEBUG_TRACE_DETAIL << _impl_params->input_layouts[3] << "\n"; + } + auto& past_layout = _impl_params->input_layouts[0]; auto& new_layout = _impl_params->input_layouts[1]; auto& present_layout = _impl_params->output_layouts[0]; @@ -1268,11 +1335,33 @@ void primitive_inst::do_runtime_in_place_kv_cache() { GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_layout's pad : " << present_layout.to_string() << std::endl; auto& variable = get_network().get_variable(desc->variable_info.variable_id); variable.set_layout(present_layout); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[1]!\n"; + auto& present_scales_layout = _impl_params->output_layouts[2]; + const auto 
sequence_axis = 3; + kv_cache_inst::update_pad(present_scales_layout, max_pad - new_seq_len, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_scale_layout's pad : " << present_scales_layout.to_string() << std::endl; + + const auto& multi_tensor_var = downcast(variable); + multi_tensor_var.get_compression_scale_state()->set_layout(present_scales_layout); + } + GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << "Updated variable with present_layout" << variable.get_layout().to_string() << " is_set = " << variable.is_set() << std::endl; if (past_layout.data_padding._upper_size[sequence_axis] > 0 && variable.is_set()) { kv_cache_inst::update_pad(past_layout, max_pad, sequence_axis); _impl_params->_can_be_optimized = true; + + GPU_DEBUG_TRACE_DETAIL << "Updated data layout (" << max_pad << " " << sequence_axis << "): " << _impl_params->input_layouts[0] << "\n"; + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto& past_scale_layout = _impl_params->input_layouts[3]; + const auto sequence_axis = 3; + kv_cache_inst::update_pad(past_scale_layout, max_pad, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "Updated scales layout (" << max_pad << " " << sequence_axis << "): " << _impl_params->input_layouts[3] << "\n"; + } GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << " Updated past layout's pad : " << past_layout.to_string() << std::endl; } } diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 0b7b0ca4ca2b1b..047e04f88d7e8a 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -435,10 +435,15 @@ layout program_node::get_non_padded_output_layout(bool invalidate_users_if_chang } bool program_node::set_output_layout(layout& new_layout, bool invalidate_users_if_changed, size_t idx) { + // GPU_DEBUG_TRACE_DETAIL << "TEST: " << padding::max(new_layout.data_padding, output_layouts[idx].data_padding)._dynamic_dims_mask << "\n"; + merge_output_padding(new_layout.data_padding, idx); + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[1] " << new_layout.to_string() << "\n"; + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[2] " << output_layouts[idx].data_padding._dynamic_dims_mask << "\n"; OPENVINO_ASSERT(idx < output_layouts.size(), id(), " has invalid index : index is ", std::to_string(idx), " but output_layouts length is ", std::to_string(output_layouts.size())); new_layout.data_padding = output_layouts[idx].data_padding; + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[3] " << new_layout.to_string() << "\n"; bool changed = (new_layout != output_layouts[idx]); if (changed && invalidate_users_if_changed) // output_layout has changed! 
invalidate users invalidate_users(); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl index c5dc3609133192..656699162aeba0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl @@ -62,7 +62,7 @@ KERNEL(dynamic_quantize_gpu_opt_generic)( DECLARE_GROUPED_DIMS_INDEXES(grouped_indexes); // the innermost dimension is always handled in the loop inside the kernel - const uint x = 0; + uint x = 0; half max_value = 0.0001h; half val[INNERMOST_DIM_VALUE / SUBGROUP_SIZE]; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 8e3bf05ee11f18..eddda57006bf9f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -258,11 +258,11 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, b1_idx, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len, b1_idx / BROADCAST_GROUP_SIZE); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len); #else // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, 0, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + seq_len); #endif KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; #endif @@ -584,10 +584,10 @@ KERNEL(sdpa_opt)( #ifdef COMPRESSED_PER_HEAD // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1] // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; #endif @@ -655,10 +655,10 @@ KERNEL(sdpa_opt)( #ifdef COMPRESSED_PER_HEAD // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1] // const uint 
value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len, b1_idx / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + seq_len); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; #endif @@ -928,12 +928,18 @@ KERNEL(sdpa_opt)( #endif uint query_local_offset = head_size_idx * TARGET_SEQ_LEN_BLOCK_SIZE; +#if HAS_SCALE_INPUT + const OUTPUT_TYPE scale_val = *scale; +#else + const OUTPUT_TYPE scale_val = TO_OUTPUT_TYPE(STATIC_SCALE_VALUE); +#endif + if (cur_target_seq_len_size != TARGET_SEQ_LEN_BLOCK_SIZE) { if (sgid * SUBGROUP_SIZE < HEAD_SIZE) { for (uint seq_idx = 0; seq_idx < cur_target_seq_len_size; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -944,7 +950,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -954,7 +960,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -965,7 +971,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -973,7 +979,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -1053,10 +1059,10 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, seq_len + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid); #else // const uint key_scale_comp_offset = 
GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, seq_len + sglid, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, seq_len + sglid); #endif KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; // printf("[0]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); @@ -1102,10 +1108,10 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, seq_len + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid); #else // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, seq_len + sglid, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, seq_len + sglid); #endif KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; // printf("[1]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); @@ -1185,12 +1191,7 @@ KERNEL(sdpa_opt)( { unroll_for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { -#if HAS_SCALE_INPUT - const OUTPUT_TYPE scale_val = *scale; -#else - const OUTPUT_TYPE scale_val = TO_OUTPUT_TYPE(STATIC_SCALE_VALUE); -#endif - qk_acc[i] *= scale_val; + #ifdef HAS_ALIBI const int alibi_val = (1 - SOURCE_SEQ_LEN) + seq_len + i; @@ -1320,10 +1321,10 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len) + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len) + sglid); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + (seq_len) + sglid); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; // printf("[0]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); @@ -1386,10 +1387,10 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = 
GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; // printf("[1]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); @@ -1461,10 +1462,10 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, b1_idx / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len_leftovers_start + sglid); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; // printf("[2]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp index 906ad3dcb23d5e..6d157fe901efcb 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp @@ -13,13 +13,13 @@ namespace kernel_selector { sdpa_kernel_selector::sdpa_kernel_selector() { - int USE_REF = 0; - if (const auto env_var = std::getenv("USE_REF")) { + int USE_REF_SDPA = 0; + if (const auto env_var = std::getenv("USE_REF_SDPA")) { std::istringstream ss(env_var); - ss >> USE_REF; + ss >> USE_REF_SDPA; } - if (!USE_REF) { + if (!USE_REF_SDPA) { Attach(); Attach(); #ifdef ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp index d7c2c0170aa3a8..1f591d9571dc2a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -126,8 +126,8 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { std::vector scales_output_order(rank, 1); scales_output_order[0] = transposed_order[0]; scales_output_order[1] = transposed_order[3]; - scales_output_order[2] = transposed_order[2]; - 
scales_output_order[3] = transposed_order[1]; + scales_output_order[2] = transposed_order[1]; + scales_output_order[3] = transposed_order[2]; return scales_output_order; }; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp index fa7e803c1fcc74..da160e26aa6749 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp @@ -174,7 +174,9 @@ std::vector shape_infer(const KVCache* op, ov::op::internal::DynamicQuantize op; auto new_token_data_quantized_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, {input_shapes[1]}, group_sizes, scales_output_order); - const auto scales_concat_axis = 2; + std::cout << "KV input: " << input_shapes[0] << " " << input_shapes[1] << " " << input_shapes[2] << " " << input_shapes[3] << "\n"; + std::cout << "DQ output results for KV: " << new_token_data_quantized_shapes[0] << " " << new_token_data_quantized_shapes[1] << "\n"; + const auto scales_concat_axis = 3; ov::PartialShape compression_scale_shape = input_shapes[3]; compression_scale_shape[scales_concat_axis] += new_token_data_quantized_shapes[1][scales_concat_axis]; out_shapes[2] = compression_scale_shape;
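
Note: the sketch below illustrates, in plain C++, the bookkeeping this patch moves to axis 3 for the compression-scales tensor: past and new scales are concatenated along the sequence axis, and spare upper padding along that same axis lets new tokens be appended in place. It is a simplified illustration only, under assumed names: Shape, concat_scales_shape, and compute_upper_pad are hypothetical helpers, the example extents are made up, and none of this is the plugin's actual ov::PartialShape / layout-padding API.

// Minimal sketch of the scales-shape bookkeeping for a compressed KV-cache,
// assuming a rank-4 scales shape whose sequence dimension is axis 3 (was axis 2).
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

using Shape = std::vector<int64_t>;
constexpr size_t scales_concat_axis = 3;  // sequence axis of the scales tensor after this change

// Shape of the concatenated scales output: the past scales grow by the new tokens'
// scales along the concat (sequence) axis; all other dimensions stay the same.
Shape concat_scales_shape(const Shape& past_scales, const Shape& new_scales) {
    assert(past_scales.size() == 4 && new_scales.size() == 4);
    Shape out = past_scales;
    out[scales_concat_axis] += new_scales[scales_concat_axis];
    return out;
}

// Upper padding that allows in-place appends: the gap between the preallocated
// capacity and the currently used extent along the concat axis.
int64_t compute_upper_pad(int64_t allocated_seq_capacity, int64_t used_seq_len) {
    return std::max<int64_t>(allocated_seq_capacity - used_seq_len, 0);
}

int main() {
    Shape past_scales{1, 1, 1, 128};  // 128 cached positions (non per-head case for simplicity)
    Shape new_scales{1, 1, 1, 1};     // one newly generated token

    Shape present_scales = concat_scales_shape(past_scales, new_scales);
    std::cout << "present scales seq extent: " << present_scales[scales_concat_axis] << "\n";  // 129

    // With a preallocated capacity of, say, 139 positions, 10 more tokens fit
    // before the scales buffer has to be reallocated.
    std::cout << "upper pad: " << compute_upper_pad(139, present_scales[scales_concat_axis]) << "\n";  // 10
    return 0;
}

In the patch itself, the same add-along-axis-3 update appears in the KVCache shape_infer (compression_scale_shape[scales_concat_axis] += ...), while the padding counterpart is handled through kv_cache_inst::get_max_pad / update_pad with sequence_axis = 3 for the scales output.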