From 7a0aa3e6d2529707c43df336f10b5c35631d4b55 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 17 Oct 2024 18:48:59 +0400 Subject: [PATCH] WIP: [GPU] Some fixes --- .../include/ov_ops/dynamic_quantize.hpp | 54 +- .../src/ov_ops/dynamic_quantize.cpp | 36 +- .../intel_gpu/graph/kernel_impl_params.hpp | 2 +- .../include/intel_gpu/op/dynamic_quantize.hpp | 57 ++ .../include/intel_gpu/op/indirect_sdpa.hpp | 12 + .../include/intel_gpu/op/kv_cache.hpp | 35 +- .../include/intel_gpu/op/read_value.hpp | 7 + .../include/intel_gpu/op/read_values.hpp | 41 ++ .../intel_gpu/include/intel_gpu/op/sdpa.hpp | 22 + .../plugin/multi_tensor_variable_state.hpp | 27 +- .../intel_gpu/plugin/primitives_list.hpp | 2 + .../intel_gpu/primitives/dynamic_quantize.hpp | 40 +- .../include/intel_gpu/primitives/kv_cache.hpp | 24 +- .../intel_gpu/primitives/read_value.hpp | 27 +- .../scaled_dot_product_attention.hpp | 31 +- .../intel_gpu/runtime/debug_configuration.hpp | 1 + .../intel_gpu/src/graph/dynamic_quantize.cpp | 37 +- .../graph_optimizer/build_implementations.cpp | 3 + .../graph_optimizer/prepare_buffer_fusing.cpp | 59 ++- .../src/graph/impls/cpu/read_value.cpp | 26 +- .../src/graph/impls/ocl/dynamic_quantize.cpp | 10 + .../impls/ocl/kernel_selector_helper.cpp | 6 +- .../graph/impls/ocl/kernel_selector_helper.h | 2 +- .../src/graph/impls/ocl/kv_cache.cpp | 238 ++++++++- .../ocl/scaled_dot_product_attention.cpp | 82 ++- .../src/graph/include/dynamic_quantize_inst.h | 5 +- .../src/graph/include/kv_cache_inst.h | 3 + .../src/graph/include/program_node.h | 2 +- .../src/graph/include/read_value_inst.h | 11 +- src/plugins/intel_gpu/src/graph/kv_cache.cpp | 44 +- .../intel_gpu/src/graph/primitive_inst.cpp | 249 +++++++-- .../intel_gpu/src/graph/program_node.cpp | 8 +- .../intel_gpu/src/graph/read_value.cpp | 39 +- .../graph/scaled_dot_product_attention.cpp | 8 + .../cl_kernels/dynamic_quantize_gpu_opt.cl | 2 +- .../dynamic_quantize_gpu_opt_generic.cl | 130 +++++ .../cl_kernels/dynamic_quantize_gpu_ref.cl | 101 +++- .../kernel_selector/cl_kernels/sdpa_opt.cl | 485 ++++++++++++++++-- .../kernel_selector/cl_kernels/sdpa_ref.cl | 51 ++ .../dynamic_quantize_kernel_opt_generic.cpp | 327 ++++++++++++ .../dynamic_quantize_kernel_opt_generic.h | 30 ++ .../dynamic_quantize_kernel_ref.cpp | 57 +- .../dynamic_quantize_kernel_ref.h | 7 + .../dynamic_quantize_kernel_selector.cpp | 12 +- .../kernels/sdpa/sdpa_kernel_base.cpp | 23 + .../kernels/sdpa/sdpa_kernel_base.h | 7 + .../kernels/sdpa/sdpa_kernel_micro.cpp | 3 + .../kernels/sdpa/sdpa_kernel_opt.cpp | 40 +- .../kernels/sdpa/sdpa_kernel_ref.cpp | 26 +- .../kernels/sdpa/sdpa_kernel_selector.cpp | 20 +- .../plugin/multi_tensor_variable_state.cpp | 61 +++ .../src/plugin/ops/dynamic_quantize.cpp | 36 +- .../intel_gpu/src/plugin/ops/kv_cache.cpp | 19 +- .../ops/scaled_dot_product_attention.cpp | 9 +- .../intel_gpu/src/plugin/ops/variable.cpp | 47 +- .../src/plugin/sync_infer_request.cpp | 35 +- .../dynamic_quantize_fully_connected.cpp | 9 +- .../transformations/kv_cache_compression.cpp | 349 +++++++++++++ .../transformations/kv_cache_compression.hpp | 43 ++ .../transformations/op/dynamic_quantize.cpp | 86 ++++ .../transformations/op/indirect_sdpa.cpp | 57 +- .../plugin/transformations/op/kv_cache.cpp | 114 +++- .../plugin/transformations/op/read_value.cpp | 83 ++- .../src/plugin/transformations/op/sdpa.cpp | 44 +- .../src/plugin/transformations_pipeline.cpp | 3 + .../src/runtime/debug_configuration.cpp | 6 +- 66 files changed, 3330 insertions(+), 242 deletions(-) create mode 
100644 src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp create mode 100644 src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index 69c148305fb94f..643a726ffbcee6 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -11,32 +11,66 @@ namespace ov { namespace op { namespace internal { +struct QuantizationConfig { + enum class QuantizationMode { + Symmetric, + Asymmetric + }; + + QuantizationMode mode = QuantizationMode::Symmetric; + element::Type quantization_dt = element::undefined; + element::Type scale_dt = element::undefined; + element::Type zp_dt = element::undefined; + std::vector group_sizes = {}; + + bool operator==(const QuantizationConfig& rhs) const { + return mode == rhs.mode && + quantization_dt == rhs.quantization_dt && + scale_dt == rhs.scale_dt && + zp_dt == rhs.zp_dt && + group_sizes == rhs.group_sizes; + } + + bool is_asymmetric_quantization() const { + return mode == QuantizationMode::Asymmetric; + } +}; + /// \brief Operator performing Dynamic Quantize class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { public: - OPENVINO_OP("DynamicQuantize", "gpu_opset"); - + OPENVINO_OP("DynamicQuantize", "ie_internal_opset"); DynamicQuantize() = default; /// \brief Constructs an DynamicQuantize operation. 
/// /// \param data Input tensor with data - /// \param group_sizes Group sizes for dynamic quantization - /// \param dt_scale Data type for scale output - DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale); + /// \param config Dynamic quantization configuration + DynamicQuantize(const Output& data, const QuantizationConfig& config); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + const std::vector& get_group_sizes() const { - return m_group_sizes; + return m_config.group_sizes; + }; + + QuantizationConfig::QuantizationMode get_quantization_mode() const { + return m_config.mode; }; + + QuantizationConfig get_quantization_config() const { + return m_config; + }; + static std::vector shape_infer(const DynamicQuantize* op, const std::vector& input_shapes, - const std::vector& group_sizes); + const QuantizationConfig& config); + +protected: + DynamicQuantize(const Output& data, const QuantizationConfig& config, size_t outputs_number); -private: - std::vector m_group_sizes; - element::Type m_dt_scale; + QuantizationConfig m_config; }; } // namespace internal diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index 74c0498e9a4425..3bf75a0dbb7895 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -13,35 +13,42 @@ namespace ov { namespace op { namespace internal { -DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale) +DynamicQuantize::DynamicQuantize(const Output& data, const QuantizationConfig& config, size_t outputs_number) : Op({data}), - m_group_sizes(std::move(group_sizes)), - m_dt_scale(dt_scale) { - OPENVINO_ASSERT(data.get_partial_shape().rank() == m_group_sizes.size(), + m_config(config) { + OPENVINO_ASSERT(data.get_partial_shape().rank() == m_config.group_sizes.size(), "FC input rank should be same as the rank of group_size ", data.get_tensor_ptr()->get_partial_shape().rank(), " / ", - m_group_sizes.size()); - set_output_size(2); + m_config.group_sizes.size()); + set_output_size(outputs_number); +} + +DynamicQuantize::DynamicQuantize(const Output& data, const QuantizationConfig& config) + : DynamicQuantize(data, config, config.mode == QuantizationConfig::QuantizationMode::Symmetric ? 
2 : 3) { validate_and_infer_types(); } void DynamicQuantize::validate_and_infer_types() { std::vector input_shapes = {get_input_partial_shape(0)}; - auto out_shapes = shape_infer(this, input_shapes, m_group_sizes); - set_output_type(0, element::i8, out_shapes[0]); - set_output_type(1, m_dt_scale, out_shapes[1]); + auto out_shapes = shape_infer(this, input_shapes, m_config); + set_output_type(0, m_config.quantization_dt, out_shapes[0]); + set_output_type(1, m_config.scale_dt, out_shapes[1]); + + if (m_config.is_asymmetric_quantization()) + set_output_type(2, m_config.zp_dt, out_shapes[2]); } std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_group_sizes, m_dt_scale); + return std::make_shared(new_args.at(0), m_config); } std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, const std::vector& input_shapes, - const std::vector& group_sizes) { + const QuantizationConfig& config) { + const auto& group_sizes = config.group_sizes; std::vector out_shapes; out_shapes.push_back(input_shapes[0]); @@ -52,7 +59,7 @@ std::vector DynamicQuantize::shape_infer(const DynamicQuantize " / ", group_sizes.size()); for (size_t i = 0; i < scale_shape.size(); i++) { - if (scale_shape[i].is_dynamic()) + if (scale_shape[i].is_dynamic() || scale_shape[i] == 0) continue; if (group_sizes[i] == UINT64_MAX) @@ -63,6 +70,11 @@ std::vector DynamicQuantize::shape_infer(const DynamicQuantize } } out_shapes.push_back(scale_shape); + + // Add zero points shape + if (config.is_asymmetric_quantization()) + out_shapes.push_back(scale_shape); + return out_shapes; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp index 3e8887fbb2f7ee..72623f6d120955 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp @@ -53,7 +53,7 @@ struct kernel_impl_params final { optional_layout weights_zero_points_layout = optional_layout(); optional_layout activations_zero_points_layout = optional_layout(); optional_layout compensation_layout = optional_layout(); - optional_layout state_layout = optional_layout(); + std::vector state_layouts; std::map memory_deps = {}; size_t primary_input_idx = 0; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp new file mode 100644 index 00000000000000..007231daf4de15 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" +#include "ov_ops/dynamic_quantize.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +class DynamicQuantize : public ov::op::internal::DynamicQuantize { +public: + OPENVINO_OP("DynamicQuantize", "gpu_opset"); + + using QuantizationConfig = ov::op::internal::QuantizationConfig; + + DynamicQuantize() = default; + /// \brief Constructs an DynamicQuantize operation. 
+ /// + /// \param data Input tensor with data + /// \param config Dynamic quantization configuration + /// \param scales_zp_output_order Non-default order of scales + /// \param combine_scales_and_zp Save scales and zero points into a single buffer as (scale, zp) pairs + DynamicQuantize(const Output& data, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order = {}, + const bool combine_scales_and_zp = false); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + const std::vector& get_scales_zp_output_order() const { + return m_scales_zp_output_order; + }; + + bool get_combine_scales_and_zp() const { + return m_combine_scales_and_zp; + }; + + static std::vector shape_infer(const DynamicQuantize* op, + const std::vector& input_shapes, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false); + +private: + bool m_combine_scales_and_zp = false; + std::vector m_scales_zp_output_order; +}; + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp index b4d34a3975af6b..4ce90a685690e5 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp @@ -29,6 +29,18 @@ class IndirectSDPA : public ov::intel_gpu::op::SDPA { const std::vector& order_out, const ov::element::Type output_type = ov::element::undefined); + IndirectSDPA(const OutputVector& data_inputs, + const ov::Output& beam_table, + const bool is_causal, + const int64_t indirect_axis, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool combine_scales_and_zp, + const ov::element::Type output_type = ov::element::undefined); + bool visit_attributes(ov::AttributeVisitor &visitor) override; void validate_and_infer_types() override; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp index 402ff6e46c1607..e6dc5cd7c5ae94 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp @@ -7,6 +7,7 @@ #include "openvino/op/op.hpp" #include "openvino/op/util/variable.hpp" #include "openvino/op/util/variable_extension.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" namespace ov { namespace intel_gpu { @@ -18,20 +19,31 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { public: OPENVINO_OP("KVCache", "gpu_opset"); + using QuantizationConfig = ov::op::internal::QuantizationConfig; + KVCache() = default; KVCache(const Output& past, const Output& new_token_data, - const Output& beam_idx, const std::shared_ptr& past_values, int64_t concat_axis, - int64_t gather_axis, const ov::element::Type output_type = ov::element::undefined); KVCache(const Output& past, const Output& new_token_data, + const Output& beam_idx, + const std::shared_ptr& past_values, + int64_t concat_axis, + int64_t gather_axis, + const ov::element::Type output_type = ov::element::undefined); + + KVCache(const OutputVector& inputs, const std::shared_ptr& past_values, int64_t concat_axis, + int64_t gather_axis, + bool combine_scales_and_zp, + const QuantizationConfig& config, + const std::vector&
scales_zp_output_order, const ov::element::Type output_type = ov::element::undefined); bool visit_attributes(ov::AttributeVisitor& visitor) override; @@ -53,14 +65,31 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { bool get_indirect() const { return m_indirect; } + bool get_kv_compressed() const { return m_compressed; } + bool get_combine_scales_and_zp() const { return m_combine_scales_and_zp; } + QuantizationConfig get_quantization_config() const { return m_quantization_config; } + std::vector get_scales_zp_output_order() const { return m_scales_zp_output_order; } + private: int64_t m_concat_axis = 0; int64_t m_gather_axis = 0; bool m_indirect = false; + + bool m_compressed = false; + bool m_combine_scales_and_zp = false; + QuantizationConfig m_quantization_config = {}; + std::vector m_scales_zp_output_order = {}; + ov::element::Type m_output_type; }; -std::vector shape_infer(const KVCache* op, std::vector input_shapes); +std::vector shape_infer(const KVCache* op, const std::vector& input_shapes); + +std::vector shape_infer(const KVCache* op, + const std::vector& input_shapes, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_output_order = {}, + bool combine_scales_and_zp = false); } // namespace op } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp index a9c47f3fa39fa6..419c18118229ff 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp @@ -26,6 +26,7 @@ class ReadValue : public ov::op::Op, public ov::op::util::VariableExtension { bool visit_attributes(ov::AttributeVisitor& visitor) override; void validate_and_infer_types() override; + void validate_and_infer_types(size_t output_idx, const ov::op::util::VariableInfo& variable_info); std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; @@ -33,6 +34,12 @@ class ReadValue : public ov::op::Op, public ov::op::util::VariableExtension { OPENVINO_ASSERT(m_variable, "Variable is not initialized. Variable_id is unavailable"); return m_variable->get_info().variable_id; } + +protected: + ReadValue(const std::vector>& variable_initializers, const std::shared_ptr& variable) + : Op(variable_initializers) { + m_variable = variable; + } }; } // namespace op diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp new file mode 100644 index 00000000000000..90e1f885254346 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/op/read_value.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +/// \brief This operation handles the OpenVINO GPU Plugin's custom variable representation (which can store multiple states in a single variable) at the graph level. 
+class ReadValues : public ReadValue { +public: + OPENVINO_OP("ReadValues", "gpu_opset"); + + ReadValues() = default; + + ReadValues(const std::shared_ptr& variable, + const std::vector& internal_states_infos); + + ReadValues(const OutputVector& variable_initializers, + const std::shared_ptr& variable, + const std::vector& internal_states_infos); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::vector get_all_internal_states_info() const; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + +private: + std::vector m_internal_states_infos; +}; + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp index 9f5d4dad16efd7..741644208cfe76 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp @@ -8,6 +8,7 @@ #include "openvino/core/partial_shape.hpp" #include "openvino/op/op.hpp" #include "openvino/op/scaled_dot_product_attention.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" namespace ov { namespace intel_gpu { @@ -17,6 +18,8 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { public: OPENVINO_OP("SDPA", "gpu_opset"); + using QuantizationConfig = ov::op::internal::QuantizationConfig; + SDPA() = default; SDPA(const OutputVector& inputs, @@ -27,6 +30,16 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { const std::vector& order_out, const ov::element::Type output_type = ov::element::undefined); + SDPA(const OutputVector& inputs, + const bool is_causal, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool m_combine_scales_and_zp, + const ov::element::Type output_type = ov::element::undefined); + bool visit_attributes(ov::AttributeVisitor &visitor) override; void validate_and_infer_types() override; @@ -41,6 +54,11 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { std::vector get_output_transpose_order() const { return m_order_out; } ov::element::Type get_output_type() const { return m_output_type; } + bool get_kv_compressed() const { return m_compressed; } + bool get_combine_scales_and_zp() const { return m_combine_scales_and_zp; } + QuantizationConfig get_quantization_config() const { return m_quantization_config; } + size_t get_compression_inputs_num() const; + static std::vector default_order(size_t rank) { std::vector order(rank); std::iota(order.begin(), order.end(), 0); @@ -54,6 +72,10 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { std::vector m_order_v; std::vector m_order_out; ov::element::Type m_output_type; + + bool m_compressed = false; + bool m_combine_scales_and_zp = false; + QuantizationConfig m_quantization_config = {}; }; std::vector shape_infer(const SDPA* op, diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp index 0cad36f62e47b9..d1777d6478eef1 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp @@ -41,10 +41,35 @@ class VariableStateIndirectKVCache : public MultiTensorState { VariableState::Ptr get_beam_table_state() const; ov::PartialShape 
get_beam_table_shape(const ov::PartialShape& kv_cache_shape); -private: +protected: size_t m_beam_axis = 0; size_t m_concat_axis = 0; }; +// This is multi-tensor state for Indirect KV-Cache + Gemm pattern +// Internally it stores KV Cache state + Beam Table state + compression scales state + (optional compression zero points) +class VariableStateIndirectKVCacheCompressed : public VariableStateIndirectKVCache { +public: + VariableStateIndirectKVCacheCompressed(const VariableStateInfo& info, + std::shared_ptr context, + std::shared_ptr shape_predictor, + size_t beam_idx, + size_t concat_idx, + bool has_zp_state); + using Ptr = std::shared_ptr; + + void set_state(const ov::SoPtr& state) override; + ov::SoPtr get_state() const override; + + VariableState::Ptr get_compression_scale_state() const; + void set_compression_scale_layout(const cldnn::layout& new_layout); + + VariableState::Ptr get_compression_zp_state() const; + void set_compression_zp_layout(const cldnn::layout& new_layout); + bool has_zp_state() const; + +private: + bool m_has_zp_state = false; +}; } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 1dbd769444b1aa..947a010efe0377 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -284,6 +284,7 @@ REGISTER_FACTORY(internal, RMS); REGISTER_FACTORY(internal, GatherCompressed); REGISTER_FACTORY(internal, KVCache); REGISTER_FACTORY(internal, ReadValue); +REGISTER_FACTORY(internal, ReadValues); REGISTER_FACTORY(internal, Gemm); REGISTER_FACTORY(internal, SwiGLU); REGISTER_FACTORY(internal, IndirectGemm); @@ -293,4 +294,5 @@ REGISTER_FACTORY(internal, SDPA); REGISTER_FACTORY(internal, IndirectSDPA); REGISTER_FACTORY(internal, RoPE); REGISTER_FACTORY(internal, DynamicQuantize); +REGISTER_FACTORY(internal, DynamicQuantizeExtended); REGISTER_FACTORY(internal, PagedAttentionExtension); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index d93e2f86eed144..e0a920c5c4cf7d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -5,6 +5,8 @@ #pragma once #include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + namespace cldnn { /// @brief Dynamic Quantize primitive @@ -12,26 +14,41 @@ namespace cldnn { struct dynamic_quantize : public primitive_base { CLDNN_DECLARE_PRIMITIVE(dynamic_quantize); - dynamic_quantize() : primitive_base("", {}), group_size(0) {} + using QuantizationConfig = ov::op::internal::QuantizationConfig; + + dynamic_quantize() : primitive_base("", {}) {} /// @brief Constructs dynamic_quantize primitive /// @param id This primitive id /// @param input Input primitive id - /// @param group_size Quantization group size + /// @param group_sizes Quantization group size /// @param data_type Output data type of quantized /// @param output_size Output data size of the primitive dynamic_quantize(const primitive_id& id, const input_info& input, - const uint64_t group_size, - const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) - : primitive_base(id, {input}, 2, data_types), - group_size(group_size) {} + const QuantizationConfig& config, + const bool 
combine_scales_and_zp = false, + const std::vector& scales_zp_output_order = {}) + : primitive_base(id, {input}) + , combine_scales_and_zp(combine_scales_and_zp) + , quantization_config(config) + , scales_zp_output_order(scales_zp_output_order) {} - uint64_t group_size = 0; + bool combine_scales_and_zp = false; + QuantizationConfig quantization_config; + std::vector scales_zp_output_order = {}; size_t hash() const override { size_t seed = primitive::hash(); - seed = hash_combine(seed, group_size); + // TODO: add more parameters + seed = hash_range(seed, scales_zp_output_order.begin(), scales_zp_output_order.end()); + seed = hash_range(seed, quantization_config.group_sizes.begin(), quantization_config.group_sizes.end()); + seed = hash_combine(seed, combine_scales_and_zp); + seed = hash_combine(seed, quantization_config.mode); + // seed = hash_combine(seed, quantization_config.quantization_dt); + // seed = hash_combine(seed, quantization_config.scale_dt); + // seed = hash_combine(seed, quantization_config.zp_dt); + return seed; } @@ -40,18 +57,19 @@ struct dynamic_quantize : public primitive_base { return false; auto rhs_casted = downcast(rhs); + // TODO: add more parameters - return group_size == rhs_casted.group_size; + return scales_zp_output_order == rhs_casted.scales_zp_output_order || + quantization_config == rhs_casted.quantization_config; } void save(BinaryOutputBuffer& ob) const override { primitive_base::save(ob); - ob << group_size; + // TODO: add more parameters } void load(BinaryInputBuffer& ib) override { primitive_base::load(ib); - ib >> group_size; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp index f87041dcff66d6..2086c21033f400 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp @@ -3,10 +3,14 @@ // #pragma once + +#include "primitive.hpp" + #include "openvino/core/partial_shape.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/util/variable.hpp" -#include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + #include namespace cldnn { @@ -14,6 +18,8 @@ namespace cldnn { struct kv_cache : public primitive_base { CLDNN_DECLARE_PRIMITIVE(kv_cache) + using QuantizationConfig = ov::op::internal::QuantizationConfig; + kv_cache() : primitive_base("", {}) {} kv_cache(const primitive_id& id, @@ -33,11 +39,18 @@ struct kv_cache : public primitive_base { int64_t gather_axis = 0; bool indirect = false; + bool compressed = false; + bool combine_scales_and_zp = false; + QuantizationConfig quantization_config; + std::vector scales_zp_output_order = {}; + size_t hash() const override { size_t seed = primitive::hash(); seed = hash_combine(seed, concat_axis); seed = hash_combine(seed, gather_axis); seed = hash_combine(seed, indirect); + seed = hash_combine(seed, compressed); + // TODO: add here return seed; } @@ -50,7 +63,10 @@ struct kv_cache : public primitive_base { return variable_info == rhs_casted.variable_info && concat_axis == rhs_casted.concat_axis && gather_axis == rhs_casted.gather_axis && - indirect == rhs_casted.indirect; + indirect == rhs_casted.indirect && + compressed == rhs_casted.compressed && + quantization_config == rhs_casted.quantization_config; + // TODO: add here } void save(BinaryOutputBuffer& ob) const override { @@ -62,6 +78,8 @@ struct kv_cache : public primitive_base { ob << concat_axis; ob << gather_axis; ob << 
indirect; + ob << compressed; + // TODO: add here } void load(BinaryInputBuffer& ib) override { @@ -76,6 +94,8 @@ struct kv_cache : public primitive_base { ib >> concat_axis; ib >> gather_axis; ib >> indirect; + ib >> compressed; + // TODO: add here } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp index 7d9e919f56cf13..26465692ef6352 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp @@ -22,19 +22,23 @@ struct read_value : public primitive_base { /// @param id This primitive id /// @param inputs Input parameters ids /// @param variable_id Variable id - /// @param output_layout Memory layout + /// @param output_layouts Memory layouts read_value(const primitive_id& id, const std::vector& inputs, const std::string& variable_id, - const layout& output_layout, + const std::vector& output_layouts, const ov::element::Type& user_specified_type = ov::element::undefined) - : primitive_base(id, inputs, 1, {optional_data_type{output_layout.data_type}}), + : primitive_base(id, inputs, output_layouts.size()), variable_id{variable_id}, - output_layout{output_layout}, - user_specified_type(user_specified_type) {} + output_layouts{output_layouts}, + user_specified_type(user_specified_type) { + for (size_t output_idx = 0; output_idx < output_layouts.size(); output_idx++) { + output_data_types[output_idx] = optional_data_type(output_layouts[output_idx].data_type); + } + } std::string variable_id; - layout output_layout; + std::vector output_layouts; ov::element::Type user_specified_type; bool operator==(const primitive& rhs) const override { @@ -51,7 +55,9 @@ struct read_value : public primitive_base { primitive_base::save(ob); ov::element::Type_t data_type = user_specified_type; ob << variable_id; - ob << output_layout; + ob << output_layouts.size(); + for (const auto& layout : output_layouts) + ob << layout; ob << make_data(&data_type, sizeof(ov::element::Type_t)); } @@ -59,7 +65,12 @@ struct read_value : public primitive_base { primitive_base::load(ib); ov::element::Type_t data_type = ov::element::Type_t::undefined; ib >> variable_id; - ib >> output_layout; + size_t output_layouts_size; + ib >> output_layouts_size; + output_layouts.resize(output_layouts_size); + for (size_t i = 0; i < output_layouts_size; i++) { + ib >> output_layouts[i]; + } ib >> make_data(&data_type, sizeof(ov::element::Type_t)); user_specified_type = data_type; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp index d66012bfac8889..5b77c7e6c8551d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp @@ -5,16 +5,20 @@ #pragma once #include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + namespace cldnn { struct scaled_dot_product_attention : public primitive_base { CLDNN_DECLARE_PRIMITIVE(scaled_dot_product_attention) + using QuantizationConfig = ov::op::internal::QuantizationConfig; + scaled_dot_product_attention() : primitive_base("", {}) {} /// @brief Constructs scaled_dot_product_attention primitive. /// @param id This primitive id. - /// @param inputs Input data primitives id (query, keys, values, [attention_mask], [scale]). 
+ /// @param inputs Input data primitives id (query, keys, values, [attention_mask], [scale], [keys scales], [keys zp], [values scales], [values zp]). /// @param is_causal If true, assumes causal attention masking. In this case attention_mask input is ignored. scaled_dot_product_attention(const primitive_id& id, const std::vector inputs, @@ -23,18 +27,30 @@ struct scaled_dot_product_attention : public primitive_base& input_q_transpose_order = {}, const std::vector& input_k_transpose_order = {}, const std::vector& input_v_transpose_order = {}, - const std::vector& output_transpose_order = {}) + const std::vector& output_transpose_order = {}, + bool is_kv_compressed = false, + bool combine_scales_and_zp = false, + const QuantizationConfig& quantization_config = {}) : primitive_base(id, inputs) , is_causal(is_causal) , indirect_axis(indirect_axis) + , is_kv_compressed(is_kv_compressed) + , combine_scales_and_zp(combine_scales_and_zp) + , quantization_config(quantization_config) , input_q_transpose_order(input_q_transpose_order) , input_k_transpose_order(input_k_transpose_order) , input_v_transpose_order(input_v_transpose_order) , output_transpose_order(output_transpose_order) { auto data_inputs_num = inputs.size(); - if (indirect_axis != -1) + if (indirect_axis != -1) { data_inputs_num--; + } + if (is_kv_compressed) { + data_inputs_num -= 2; // scales + if (quantization_config.is_asymmetric_quantization() && !combine_scales_and_zp) + data_inputs_num -= 2; // zp + } has_attn_mask_input = data_inputs_num > 3; has_scale_input = data_inputs_num > 4; } @@ -44,6 +60,10 @@ struct scaled_dot_product_attention : public primitive_base input_q_transpose_order; std::vector input_k_transpose_order; std::vector input_v_transpose_order; @@ -52,6 +72,7 @@ struct scaled_dot_product_attention : public primitive_base(rhs); return is_causal == rhs_casted.is_causal && + is_kv_compressed == rhs_casted.is_kv_compressed && has_attn_mask_input == rhs_casted.has_attn_mask_input && has_scale_input == rhs_casted.has_scale_input && indirect_axis == rhs_casted.indirect_axis && @@ -81,6 +103,7 @@ struct scaled_dot_product_attention : public primitive_base::save(ob); ob << is_causal; + ob << is_kv_compressed; ob << has_attn_mask_input; ob << has_scale_input; ob << indirect_axis; @@ -88,11 +111,13 @@ struct scaled_dot_product_attention : public primitive_base::load(ib); ib >> is_causal; + ib >> is_kv_compressed; ib >> has_attn_mask_input; ib >> has_scale_input; ib >> indirect_axis; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index fbc8ae84c36a29..7ac2cbbbef543b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -145,6 +145,7 @@ class debug_configuration { std::vector dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size int disable_horizontal_fc_fusion; // Disable fc horizontal fusion + int enable_kv_cache_compression; // Enable KV cache compression std::set dump_iteration; // Dump n-th execution of network. 
std::vector load_layers_raw_dump; // List of layers to load dumped raw binary and filenames static const debug_configuration *get_instance(); diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 5c945f4c2d389c..0f0ffbac9cea4e 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ov_ops/dynamic_quantize.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" #include "dynamic_quantize_inst.h" #include "primitive_type_base.h" @@ -22,29 +22,42 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no } template -std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size) { - ov::op::internal::DynamicQuantize op; +std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, + const dynamic_quantize::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp) { + ov::intel_gpu::op::DynamicQuantize op; auto output_format = act_layout.format; std::vector input_shapes = { act_layout.get(), }; - std::vector shape_group_size(act_layout.get().size(), 1); - shape_group_size.back() = group_size; + auto output_shapes = ov::intel_gpu::op::DynamicQuantize::shape_infer(&op, input_shapes, config, scales_zp_output_order, combine_scales_and_zp); + GPU_DEBUG_TRACE_DETAIL << "shape infer dynamic" << output_shapes[0] << " " << output_shapes[1] << "\n"; - auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, shape_group_size); + std::vector output_layouts = { layout(output_shapes[0], config.quantization_dt, output_format), + layout(output_shapes[1], config.scale_dt, output_format) }; - return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; + const bool is_asymmetric = config.is_asymmetric_quantization(); + if (is_asymmetric && !combine_scales_and_zp) { + output_layouts.emplace_back(layout(output_shapes[2], config.zp_dt, output_format)); + } + + return output_layouts; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size); +template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, + const dynamic_quantize::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); const auto& input_layout = impl_param.get_input_layout(); - return __calc_output_layouts(input_layout, UINT64_MAX /* TODO: handle group_size here */); + + return __calc_output_layouts(input_layout, desc->quantization_config, desc->scales_zp_output_order, desc->combine_scales_and_zp); } template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, @@ -56,6 +69,12 @@ std::string dynamic_quantize_inst::to_string(dynamic_quantize_node const& node) std::stringstream primitive_description; + json_composite dynamic_quantize_info; + // TOOD: Update info + // dynamic_quantize_info.add("activation dt", desc->get_output_data_type(0).value_or(data_types::undefined)); + // dynamic_quantize_info.add("scale dt", 
desc->get_output_data_type(1).value_or(data_types::undefined)); + + node_info->add("dynamic_quantize info", dynamic_quantize_info); node_info->dump(primitive_description); return primitive_description.str(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp index 4c1b1008434144..84f1b26507f19e 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp @@ -6,6 +6,7 @@ #include "program_helpers.h" #include "intel_gpu/runtime/itt.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" using namespace cldnn; @@ -19,6 +20,7 @@ void build_implementations::run(program& p) { for (auto& n : p.get_processing_order()) { if (auto impl = n->get_selected_impl()) { auto params = n->get_kernel_impl_params(); + GPU_DEBUG_TRACE << "add_kernels_source: " << params->desc->id << std::endl; cache.add_kernels_source(*params, impl->get_kernels_source()); } } @@ -26,6 +28,7 @@ void build_implementations::run(program& p) { for (auto& n : p.get_processing_order()) { if (auto impl = n->get_selected_impl()) { auto params = n->get_kernel_impl_params(); + GPU_DEBUG_TRACE << "init_kernels: " << params->desc->id << std::endl; impl->init_kernels(cache, *params); impl->reset_kernels_source(); } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 9f2895a008cfa8..64a3512afe546b 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -420,10 +420,13 @@ bool crop_in_place_optimization::can_crop_be_optimized_simple_data_format(const } static bool can_read_value_be_optimize(const read_value_node& node) { - if (node.get_users().size() == 1) + GPU_DEBUG_TRACE_DETAIL << "Check " << node.id() << " users:" << node.get_users().size() << "\n"; + std::unordered_set unique_users(node.get_users().begin(), node.get_users().end()); + + if (unique_users.size() == 1) return true; - const auto non_shape_of_users_count = std::count_if(node.get_users().begin(), node.get_users().end(), [](const program_node* user) { + const auto non_shape_of_users_count = std::count_if(unique_users.begin(), unique_users.end(), [](const program_node* user) { return !user->is_type(); }); if (non_shape_of_users_count <= 1) @@ -847,8 +850,24 @@ void prepare_buffer_fusing::run(program& p) { if (!rv_prim) return; - if (kv_out_layout.data_type != rv_prim->get_output_layout().data_type) + int DISABLE_KV_OPT = 0; + static bool warned = false; + if (const auto env_var = std::getenv("DISABLE_KV_OPT")) { + std::istringstream ss(env_var); + ss >> DISABLE_KV_OPT; + + if (!warned) { + std::cout << "DISABLE_KV_OPT=" << DISABLE_KV_OPT << " for kv_cache\n"; + warned = true; + } + } + + if (kv_out_layout.data_type != rv_prim->get_output_layout().data_type || DISABLE_KV_OPT) { + GPU_DEBUG_TRACE_DETAIL << node.id() << " can't be optimized due to data type mismatch (or DISABLE_KV_OPT): " << kv_out_layout.to_short_string() << " vs " << rv_prim->get_output_layout().to_short_string() << "\n"; return; + } else { + GPU_DEBUG_TRACE_DETAIL << node.id() << " can be optimized, data types match: " << kv_out_layout.to_short_string() << " vs " << rv_prim->get_output_layout().to_short_string() << "\n"; + } auto concat_axis = node.get_primitive()->concat_axis; @@ -857,21 +876,43
@@ void prepare_buffer_fusing::run(program& p) { padding::DynamicDimsMask info_dynamic_pad; info_dynamic_pad[concat_axis] = 1; kv_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad; + GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout before before " << node.get_output_layout(false, 0) << "\n"; node.set_output_layout(kv_out_layout); node.can_share_buffer(false); + GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout after " << node.get_output_layout(false, 0) << "\n"; - auto update_dep = [&info_dynamic_pad](program_node* dep) { - auto prev_layout = dep->get_output_layout(); + auto update_dep = [](program_node* dep, padding::DynamicDimsMask& info_dynamic_pad, size_t idx) { + auto prev_layout = dep->get_output_layout(true, idx); prev_layout.data_padding._dynamic_dims_mask = info_dynamic_pad; - dep->set_output_layout(prev_layout); + dep->set_output_layout(prev_layout, true, idx); dep->can_share_buffer(false); }; if (rv_prim) { - update_dep(rv_prim); + update_dep(rv_prim, info_dynamic_pad, 0); } if (gather_prim) { - update_dep(gather_prim); + update_dep(gather_prim, info_dynamic_pad, 0); + } + + const auto& desc = node.get_primitive(); + if (desc->compressed) { + auto update_scale_zp = [&](size_t kv_cache_output_idx, size_t read_value_output_idx) { + auto scales_out_layout = node.get_output_layout(false, kv_cache_output_idx); + + const size_t scales_zp_concat_axis = 2; + padding::DynamicDimsMask info_dynamic_pad_scales; + info_dynamic_pad_scales[scales_zp_concat_axis] = 1; + scales_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad_scales; + node.set_output_layout(scales_out_layout, true, kv_cache_output_idx); + + update_dep(rv_prim, info_dynamic_pad_scales, read_value_output_idx); + }; + + update_scale_zp(2, 1); + if (desc->quantization_config.is_asymmetric_quantization() && !desc->combine_scales_and_zp) { + update_scale_zp(3, 2); + } } } }); @@ -905,7 +946,7 @@ void prepare_buffer_fusing::run(program& p) { // TODO: Allow optimizations for the case above too. Looks like it can be achieved by more careful // topological sort (i.e. 
if we ensure that all read_value users are completed before assign is run) node.can_be_optimized(can_read_value_be_optimize(node)); - GPU_DEBUG_TRACE_DETAIL << "[prepare_buffer_fusing] : " << node.id() << " can be optimized" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "[prepare_buffer_fusing] : " << node.id() << " can be optimized = " << node.can_be_optimized() << std::endl; }); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp index 6c16618ac816d0..6d70be9c046cf8 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp @@ -6,6 +6,8 @@ #include "impls/registry/implementation_map.hpp" #include "register.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" + namespace cldnn { namespace cpu { @@ -64,7 +66,29 @@ struct read_value_impl : public typed_primitive_impl { } if (!instance.can_be_optimized()) { - return instance.output_memory(0).copy_from(stream, *variable.get_memory(), false); + GPU_DEBUG_TRACE_DETAIL << "Copying variable's memory to new read_value's buffer output\n"; + std::vector res_events; + res_events.push_back(instance.output_memory(0).copy_from(stream, *variable.get_memory(), false)); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + res_events.push_back(instance.output_memory(1).copy_from(stream, *scales_state->get_memory(), false)); + + if (compressed_cache_variable->has_zp_state()) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + res_events.push_back(instance.output_memory(2).copy_from(stream, *zp_state->get_memory(), false)); + } + } + + // auto desc = instance.get_impl_params()->typed_desc(); + // if (desc->compressed) { + // auto multi_tensor_variable = downcast(variable); + // auto scales_variable = multi_tensor_variable.get_compression_scale_state(); + + // res_events.push_back(instance.output_memory(1).copy_from(stream, *scales_variable->get_memory(), false)); + // } + + return stream.aggregate_events(res_events, res_events.size() > 1); } return instance.get_network().get_stream().create_user_event(true); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp index 91f141ae062723..3af726f8d43823 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -37,6 +37,16 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { auto params = get_default_params(impl_param, is_shape_agnostic); params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); + if (impl_param.output_layouts.size() > 2) { + params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(2))); + } + + const auto& desc = impl_param.typed_desc(); + params.group_sizes = desc->quantization_config.group_sizes; + params.scales_output_order = desc->scales_zp_output_order; + params.use_asymmetric_quantization = desc->quantization_config.is_asymmetric_quantization(); + params.combine_scales_and_zp = desc->combine_scales_and_zp; + return params; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 762118ec5b1af2..19fff28a0e611e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -828,7 +828,7 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) { } } -kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset) { +kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset, const data_types dt) { const auto& pad = l.data_padding; const auto& vals_original = l.get_partial_shape(); @@ -868,7 +868,9 @@ kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor v pitch *= (reserved_in_mem_count + lp + up); } - return kernel_selector::data_tensor(vec, to_data_type(l.data_type), ks_layout); + data_types new_dt = (dt == data_types::undefined) ? l.data_type : dt; + + return kernel_selector::data_tensor(vec, to_data_type(new_dt), ks_layout); } kernel_selector::weights_tensor convert_weights_tensor(const layout& l, bool is_grouped) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h index 3ddb5bf8793c29..6c36e9002640c3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h @@ -101,7 +101,7 @@ kernel_selector::data_layout to_data_layout(format f); cldnn::format from_data_layout(kernel_selector::data_layout l); kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped); cldnn::format::type from_weights_layout(kernel_selector::weights_layout l); -kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset = tensor {}); +kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset = tensor {}, const data_types dt = data_types::undefined); kernel_selector::weights_tensor convert_weights_tensor(const layout& l, bool is_grouped = false); layout from_weights_tensor(const kernel_selector::weights_tensor& t); kernel_selector::activation_function get_kernel_selector_activation_param(activation_func activation_func); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index e4e4adfbb15452..7498215b1682ba 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -9,10 +9,13 @@ #include "multi_stage_primitive.hpp" #include "kv_cache_inst.h" +#include "dynamic_quantize_inst.h" #include "concatenation/concatenation_kernel_selector.h" #include "concatenation/concatenation_kernel_base.h" #include "beam_table_update/beam_table_update_kernel_selector.hpp" #include "beam_table_update/beam_table_update_kernel_ref.hpp" +#include "dynamic_quantize/dynamic_quantize_kernel_selector.h" +#include "dynamic_quantize/dynamic_quantize_kernel_opt_generic.h" #include "openvino/core/dimension.hpp" namespace cldnn { @@ -57,6 +60,9 @@ struct kv_cache_impl : multi_stage_primitive { using bt_kernel_selector_t = kernel_selector::beam_table_update_kernel_selector; using bt_kernel_params_t = kernel_selector::beam_table_update_params; + using dq_kernel_selector_t = kernel_selector::dynamic_quantize_kernel_selector; + using dq_kernel_params_t = kernel_selector::dynamic_quantize_params; + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::kv_cache_impl) std::unique_ptr clone() const override { @@ -65,9 +71,12 @@ struct kv_cache_impl : multi_stage_primitive { const size_t concat_stage = 0; const size_t beam_table_stage = 1; + 
const size_t scale_concat_stage = 2; + const size_t dq_concat_stage = 3; cldnn::memory::ptr beam_table_prev = nullptr; cldnn::memory::ptr beam_table_new = nullptr; + // cldnn::memory::ptr compression_scale = nullptr; void load(BinaryInputBuffer& ib) override { parent::load(ib); @@ -75,11 +84,17 @@ struct kv_cache_impl : multi_stage_primitive { auto& kernel_selector = kernel_selector_t::Instance(); auto kernel_impl = kernel_selector.GetImplementation(_kernels_data[concat_stage].kernelName); kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[concat_stage]); - if (_kernels_data.size() == 2) { + if (_kernels_data.size() >= 2) { auto& bt_kernel_selector = bt_kernel_selector_t::Instance(); auto bt_kernel_impl = bt_kernel_selector.GetImplementation(_kernels_data[beam_table_stage].kernelName); bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[beam_table_stage]); } + // FIXME: indirectness and compression are orthogonal feature. + if (_kernels_data.size() == 3) { + auto& scale_kernel_selector = kernel_selector_t::Instance(); + auto scale_kernel_impl = scale_kernel_selector.GetImplementation(_kernels_data[scale_concat_stage].kernelName); + scale_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[scale_concat_stage]); + } } } void set_arguments_impl(kv_cache_inst& instance) override {} @@ -93,6 +108,16 @@ struct kv_cache_impl : multi_stage_primitive { } else if (stage == beam_table_stage) { args.inputs = { beam_table_prev, instance.input_memory_ptr(2) }; args.outputs = { beam_table_new }; + } else if (stage == scale_concat_stage) { + // FIXME: indirectness and compression are orthogonal feature. + args.inputs = { instance.input_memory_ptr(3) }; // [past, new, beam_table, past_scale, new_scale] + args.outputs = { instance.output_memory_ptr(2) }; + } else if (stage == dq_concat_stage) { + args.inputs = { instance.input_memory_ptr(1) }; // [past, new, beam_table, past_scale, new_scale] + args.outputs = { instance.output_memory_ptr(0) }; + for (size_t i = 2; i < instance.outputs_memory_count(); i++) { + args.outputs.push_back(instance.output_memory_ptr(i)); + } } return args; @@ -139,6 +164,7 @@ struct kv_cache_impl : multi_stage_primitive { } event::ptr execute_impl(const std::vector& events, kv_cache_inst& instance) override { + GPU_DEBUG_TRACE_DETAIL << "Execute kv-cache: " << instance.get_impl_params()->_can_be_optimized << " " << instance.get_impl_params()->get_input_layout(3).to_short_string() << "\n"; const bool can_be_optimized = instance.get_impl_params()->_can_be_optimized; auto& stream = instance.get_network().get_stream(); auto& engine = instance.get_network().get_engine(); @@ -146,6 +172,17 @@ struct kv_cache_impl : multi_stage_primitive { auto& variable = instance.get_network().get_variable(desc->variable_info.variable_id); std::vector res_events; + if (desc->compressed) { + // In case of KV-cache with compression enabled, skip second concat's kernel as new token data append will + // be handled by dynamic quantization kernel + // However, allow execution of the first token for the case if KV-cache can't be optimized (if optimization is disabled, or + // variables memory was reallocated and we have to copy past KV-cache to new memory) + _kernels_data[concat_stage].kernels[1].skip_execution = true; + if (!_kernels_data[concat_stage].kernels[0].skip_execution) { + GPU_DEBUG_TRACE_DETAIL << "Run copy of data!\n"; + } + } + execute_stage(events, instance, res_events, concat_stage); const auto& impl_param = *instance.get_impl_params(); @@ -183,14 +220,30 @@ struct kv_cache_impl : 
multi_stage_primitive { beam_table_state->set(); } + if (desc->compressed) { + + execute_stage(events, instance, res_events, scale_concat_stage); + + auto dq_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic()); + (_kernels_data[dq_concat_stage].update_dispatch_data_func)(dq_params, _kernels_data[dq_concat_stage]); + execute_stage(events, instance, res_events, dq_concat_stage); + + auto compressed_cache_variable = dynamic_cast(&variable); + compressed_cache_variable->get_compression_scale_state()->set(); + + if (desc->quantization_config.is_asymmetric_quantization() && !desc->combine_scales_and_zp) { + compressed_cache_variable->get_compression_zp_state()->set(); + } + } + variable.set(); if (can_be_optimized) { GPU_DEBUG_TRACE_DETAIL << desc->id << " : Output is same as variable memory! Skip copying " << std::endl; // When primitive is optimized, concat kernel writes directly to variable memory return stream.aggregate_events(res_events, res_events.size() > 1); } else { - // Othwerise, we need to copy result from out buffer to state memory - GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable meomry" << std::endl; + // Otherwise, we need to copy result from out buffer to state memory + GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable memory" << std::endl; stream.enqueue_barrier(); auto out = instance.get_network().get_engine().reinterpret_buffer(instance.output_memory(0), variable.get_memory()->get_layout()); @@ -213,6 +266,21 @@ struct kv_cache_impl : multi_stage_primitive { return layout{beam_table_shape, impl_param.output_layouts[1].data_type, format::get_default_format(beam_table_shape.size())}; } + static layout get_compression_scale_layout(const kernel_impl_params& impl_param) { + // FIXME: it is implemented in multiple places + GPU_DEBUG_GET_INSTANCE(debug_config); + const auto& primitive = impl_param.typed_desc(); + auto kv_layout = impl_param.get_input_layout(0); + auto kv_shape = kv_layout.get_partial_shape(); + auto comp_scale_shape = ov::PartialShape(std::vector(kv_shape.size(), 1)); + comp_scale_shape[0] = kv_shape[0]; + comp_scale_shape[1] = kv_shape[1]; + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression + comp_scale_shape[2] = kv_shape[2]; + } + return layout{comp_scale_shape, impl_param.output_layouts[2].data_type, format::get_default_format(comp_scale_shape.size())}; + } + static kernel_params_t get_concat_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { const auto& primitive = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_shape_agnostic); @@ -221,7 +289,9 @@ struct kv_cache_impl : multi_stage_primitive { const auto inputs_count = 2; params.inputs.resize(inputs_count); for (size_t i = 0; i < inputs_count; ++i) { - params.inputs[i] = convert_data_tensor(impl_param.input_layouts[i]); + auto tmp = impl_param.input_layouts[i]; + tmp.data_type = data_types::i8; + params.inputs[i] = convert_data_tensor(tmp); } params.axis = convert_axis(axis, impl_param.get_output_layout().get_rank()); @@ -229,6 +299,21 @@ struct kv_cache_impl : multi_stage_primitive { const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, beam_table_past] const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present] + + GPU_DEBUG_TRACE_DETAIL << "Concat output start offset: " << in_offsets_map.size() << " " << out_offsets_map.size() << "\n"; + + + // for 
(const auto& in_offset : in_offsets_map) { + // if (impl_param.input_layouts.size() > in_offset.first) + // std::cout << in_offset.first << ". " << impl_param.input_layouts[in_offset.first].to_short_string() << ", input, offset=" << in_offset.second << "\n"; + // else + // std::cout << in_offset.first << ". NOPE " << ", input, offset=" << in_offset.second << "\n"; + // } + + // for (const auto& out_offset : out_offsets_map) { + // std::cout << out_offset.first << ". " << impl_param.output_layouts[out_offset.first].to_short_string() << ", output, offset=" << out_offset.second << "\n"; + // } + std::map in_tensor_to_offset_map = { {0, in_offsets_map.at(0)}, {1, in_offsets_map.at(1)}, @@ -237,6 +322,8 @@ struct kv_cache_impl : multi_stage_primitive { {0, out_offsets_map.at(0)}, }; + GPU_DEBUG_TRACE_DETAIL << "Concat output start offset: " << primitive->id << " " << out_offsets_map.at(0) << " layout: " << impl_param.output_layouts[0].to_string() << "\n"; + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); return params; @@ -264,10 +351,12 @@ struct kv_cache_impl : multi_stage_primitive { params.is_state_set = is_state_set; params.indirect_axis = indirect_axis; - const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, beam_table_past]] - const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present] + const bool compressed = impl_param.typed_desc()->compressed; + const auto beam_table_past_idx = compressed ? 4 : 3; + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, compression_scale_past, beam_table_past]] + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present, compression_scale_present] std::map in_tensor_to_offset_map = { - {0, in_offsets_map.at(3)}, // beam_table_past + {0, in_offsets_map.at(beam_table_past_idx)}, // beam_table_past {1, in_offsets_map.at(2)}, // beam_idx }; std::map out_tensor_to_offset_map = { @@ -279,21 +368,148 @@ struct kv_cache_impl : multi_stage_primitive { return params; } + static dq_kernel_params_t get_dq_update_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param, is_shape_agnostic); + + params.append_axis = primitive->concat_axis; + params.group_sizes = primitive->quantization_config.group_sizes; + params.scales_output_order = primitive->scales_zp_output_order; + params.use_asymmetric_quantization = primitive->quantization_config.is_asymmetric_quantization(); + params.combine_scales_and_zp = primitive->combine_scales_and_zp; + + if (!is_shape_agnostic) { + const auto& past_kv_cache_shape = impl_param.input_layouts[0].get_partial_shape(); + params.axis_offset = past_kv_cache_shape[primitive->concat_axis].get_length(); + } else { + params.axis_offset = 0; + } + + auto inputs_count = 1; + auto outputs_count = 2; + params.inputs.resize(inputs_count); + params.outputs.resize(outputs_count); + + auto current_token_layout = impl_param.input_layouts[1]; + auto present_layout = impl_param.output_layouts[0]; + auto present_scales_layout = impl_param.output_layouts[2]; + params.inputs[0] = convert_data_tensor(current_token_layout); + params.outputs[0] = convert_data_tensor(present_layout); + params.outputs[1] = convert_data_tensor(present_scales_layout); + + const bool has_zp_output_buffer = 
primitive->quantization_config.is_asymmetric_quantization() && !primitive->combine_scales_and_zp; + if (has_zp_output_buffer) { + auto present_zp_layout = impl_param.output_layouts[3]; + params.outputs.resize(outputs_count + 1); + params.outputs[2] = convert_data_tensor(present_zp_layout); + } + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + + // FIXME: need to handle the index properly when indirect is off + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(1)}, // kv_new_token + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(0)}, // kv_present + {1, out_offsets_map.at(2)}, // compression_scale_present + }; + + GPU_DEBUG_TRACE_DETAIL << "DQ shapes: " << current_token_layout.to_short_string() << " " << present_layout.to_short_string() << " " << present_scales_layout.to_short_string() << "\n"; + GPU_DEBUG_TRACE_DETAIL << "DQ: Dynamic shape in0 " << in_offsets_map.at(1) << "\n"; + GPU_DEBUG_TRACE_DETAIL << "DQ: Dynamic shape out " << out_offsets_map.at(0) << "\n"; + GPU_DEBUG_TRACE_DETAIL << "DQ: Dynamic shape out " << out_offsets_map.at(2) << "\n"; + + if (has_zp_output_buffer) { + out_tensor_to_offset_map.emplace(2, out_offsets_map.at(3)); + GPU_DEBUG_TRACE_DETAIL << "DQ: Dynamic shape out " << out_offsets_map.at(3) << "\n"; + } + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + return params; + } + + + static kernel_params_t get_compression_scale_update_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param, is_shape_agnostic); + + const auto concat_axis = 2; + params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank()); + + auto inputs_count = 1; + auto comp_scale_past_layout = impl_param.input_layouts[3]; + // auto comp_scale_new_layout = impl_param.input_layouts[4]; // <-- this should be replaced with inner layout + + auto comp_scale_present_layout = impl_param.output_layouts[2]; + + GPU_DEBUG_TRACE_DETAIL << "Past scale: " << comp_scale_past_layout.to_short_string() << "\n"; + // GPU_DEBUG_TRACE_DETAIL << "New scale: " << comp_scale_new_layout.to_short_string() << "\n"; + GPU_DEBUG_TRACE_DETAIL << "Present scale: " << comp_scale_present_layout.to_short_string() << "\n"; + + params.inputs.resize(inputs_count); + params.inputs[0] = convert_data_tensor(comp_scale_past_layout); + // params.inputs[1] = convert_data_tensor(comp_scale_new_layout); + params.outputs[0] = convert_data_tensor(comp_scale_present_layout); + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + + // FIXME: need to handle the index properly when indirect is off + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(3)}, // compression_scale_past + // {1, in_offsets_map.at(4)}, // compression_scale_new + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(2)}, // compression_scale_present + }; + + GPU_DEBUG_TRACE_DETAIL << "Dynamic shape in0 " << in_offsets_map.at(3) << "\n"; + // GPU_DEBUG_TRACE_DETAIL << "Dynamic shape in1 " << in_offsets_map.at(4) << "\n"; + GPU_DEBUG_TRACE_DETAIL << "Dynamic shape offset " << out_offsets_map.at(2) << "\n"; + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + return params; + } + +
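+    // Note: a brief summary of the kernel stages used when KV-cache compression is enabled (kernel/port
+    // indices assume the indirect case wired above; see the FIXME about handling indices when indirect is off):
+    //   concat_stage       - concatenation of past and new KV data; with compression enabled its second kernel
+    //                        (new-token append) is skipped in execute_impl() and dq_concat_stage writes the new data instead.
+    //   beam_table_stage   - beam table update for the indirect KV-cache.
+    //   scale_concat_stage - copies past compression scales (input 3) into the present scales buffer (output 2).
+    //   dq_concat_stage    - quantizes the new token data (input 1) and appends it to the present KV buffer (output 0),
+    //                        writing the corresponding scales (and zero points, if any) to outputs 2+.
+    // The quantization applied by the dynamic quantization kernels is roughly:
+    //   symmetric : scale = 127 / max(|x|),            q = convert_char(x * scale)
+    //   asymmetric: scale = 255 / (max(x) - min(x)),   zp = -min(x) * scale - 127,   q = convert_char(x * scale + zp)
+    // with 1/scale stored in the scales buffer, so dequantization is x ~= (q - zp) * stored_scale (zp == 0 for symmetric).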
static std::unique_ptr create(const typed_program_node& arg, const kernel_impl_params& impl_param) { std::vector kernels_data; + // if (arg.id().find("kvcache:__module.model.transformer.h.0.attn/aten::cat/Concat_4") != std::string::npos) + // std::cout << "mingyuki: create " << arg.id() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select concat\n"; + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select concat\n"; auto concat_kernel_params = get_concat_kernel_params(impl_param, impl_param.is_dynamic()); auto& concat_kernel_selector = kernel_selector_t::Instance(); kernels_data.push_back(concat_kernel_selector.get_best_kernel(concat_kernel_params)); const bool indirect = impl_param.typed_desc()->indirect; + const bool compressed = impl_param.typed_desc()->compressed; + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select beam table\n"; + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select beam table\n"; if (indirect) { auto bt_update_kernel_params = get_bt_update_kernel_params(impl_param, false); auto& bt_update_kernel_selector = bt_kernel_selector_t::Instance(); kernels_data.push_back(bt_update_kernel_selector.get_best_kernel(bt_update_kernel_params)); } + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select DQ\n"; + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select DQ\n"; + if (compressed) { + auto comp_scale_update_kernel_params = get_compression_scale_update_kernel_params(impl_param, impl_param.is_dynamic()); + auto& comp_scale_update_kernel_selector = kernel_selector_t::Instance(); + kernels_data.push_back(comp_scale_update_kernel_selector.get_best_kernel(comp_scale_update_kernel_params)); + + // kernels_data.push_back(kernel_selector::kernel_data()); + + auto dq_kernel_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic()); + auto& dq_kernel_selector = dq_kernel_selector_t::Instance(); + kernels_data.push_back(dq_kernel_selector.get_best_kernel(dq_kernel_params)); + } return cldnn::make_unique(kernels_data); } void update_dispatch_data(const kernel_impl_params& impl_param) override { + GPU_DEBUG_TRACE_DETAIL << "update_dispatch_data kv-cache: " << impl_param._can_be_optimized << " " << impl_param.get_input_layout(3).to_short_string() << "\n"; // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future if (_kernels_data[concat_stage].params == nullptr) { _kernels_data[concat_stage].params = std::make_shared(get_concat_kernel_params(impl_param, true)); @@ -307,13 +523,19 @@ struct kv_cache_impl : multi_stage_primitive { (_kernels_data[concat_stage].update_dispatch_data_func)(params, _kernels_data[concat_stage]); _kernels_data[concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(0).count() == 0; + + if (impl_param.typed_desc()->compressed) { + auto comp_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, impl_param.is_dynamic()); + (_kernels_data[scale_concat_stage].update_dispatch_data_func)(comp_scale_kernel_params, _kernels_data[scale_concat_stage]); + _kernels_data[scale_concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(3).count() == 0; + } } }; namespace detail { attach_kv_cache_impl::attach_kv_cache_impl() { - auto types = { data_types::f16, data_types::f32 }; + auto types = { data_types::i8, data_types::f16, data_types::f32 }; auto formats = { format::bfyx }; implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp 
b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index b33871110ec6b4..998ca892dc4558 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -83,14 +83,19 @@ struct scaled_dot_product_attention_impl : multi_stage_primitivebuffer_ptr() << "\n"; args.inputs.push_back(instance.input_memory_ptr(i)); } + GPU_DEBUG_TRACE_DETAIL << "instance.has_fused_primitives(): " << instance.has_fused_primitives() << "\n"; if (instance.has_fused_primitives()) { size_t count = instance.get_fused_mem_count(); for (size_t i = 0; i < count; i++) { @@ -98,7 +103,9 @@ struct scaled_dot_product_attention_impl : multi_stage_primitivebuffer_ptr() << "\n"; args.outputs.push_back(instance.output_memory_ptr(i)); } @@ -133,6 +140,20 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive& events, scaled_dot_product_attention_inst& instance) override { + GPU_DEBUG_TRACE_DETAIL << "SDPA inputs \n"; + for (size_t i = 0; i < instance.inputs_memory_count(); i++) + GPU_DEBUG_TRACE_DETAIL << "SDPA input: " << instance.input_memory_ptr(i)->buffer_ptr() << "\n"; if (need_indirect_load(instance)) return execute_stage(events, instance, indirect_sdpa); else @@ -216,6 +240,12 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_causal; + config.is_kv_compressed = desc->is_kv_compressed; + config.use_asymmetric_quantization = desc->quantization_config.is_asymmetric_quantization(); + config.combine_scales_and_zp = desc->combine_scales_and_zp; + + GPU_DEBUG_TRACE << "Set is_kv_compressed to " << config.is_kv_compressed << "\n"; + GPU_DEBUG_TRACE << "Set is_asym_compressed to " << config.use_asymmetric_quantization << "\n"; return config; } @@ -229,6 +259,15 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_kv_compressed) { + data_inputs_num -= 2; // key and value compression scales are handled separately + + has_zp_input_buffers = desc->quantization_config.is_asymmetric_quantization() && !desc->combine_scales_and_zp; + if (has_zp_input_buffers) + data_inputs_num -= 2; // key and value compression zp are handled separately + } + params.inputs.resize(data_inputs_num); for (size_t i = 0; i < data_inputs_num; i++) { params.inputs[i] = convert_data_tensor(impl_param.get_input_layout(i)); @@ -246,15 +285,43 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveindirect_axis; } - params.set_dynamic_shape_offsets(); + if (desc->is_kv_compressed) { + params.key_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num)); + params.value_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 1)); - // Need to adjust sdpa kernel offset to consider beam table input - if (has_indirect_inputs(impl_param)) { - auto out_offset = params.outputs[0].get_dynamic_shape_offset(); - if (indirect) - params.beam_table.SetDynamicShapeOffset(out_offset); + GPU_DEBUG_TRACE_DETAIL << "Add Scales params\n"; + if (has_zp_input_buffers) { + GPU_DEBUG_TRACE_DETAIL << "Add zp params\n"; + params.key_cache_comp_zp = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 2)); + params.value_cache_comp_zp = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 3)); + } + } - params.outputs[0].SetDynamicShapeOffset(out_offset + kernel_selector::DataTensor::max_rank()); + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + std::map in_tensor_to_offset_map; + for (size_t i = 0; 
i < data_inputs_num; i++) { + in_tensor_to_offset_map[i] = in_offsets_map.at(i); + } + + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(0)}, + }; + + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + if (desc->is_kv_compressed) { + params.key_cache_comp_scale.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num)); + params.value_cache_comp_scale.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 1)); + + if (has_zp_input_buffers) { + params.key_cache_comp_zp.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 2)); + params.value_cache_comp_zp.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 3)); + } + } + + if (indirect && has_indirect_inputs(impl_param)) { + params.beam_table.SetDynamicShapeOffset(get_beam_table_id(desc)); } return params; @@ -300,6 +367,7 @@ attach_scaled_dot_product_attention_impl::attach_scaled_dot_product_attention_im auto types = { data_types::f32, data_types::f16, + data_types::i8, }; auto formats = { diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index 49dd62c6332549..bb8ebd093fa696 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,10 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(const layout &act_layout, uint64_t group_size); + static std::vector __calc_output_layouts(const layout &act_layout, + const dynamic_quantize::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h index f3aa4de5ec34e1..b8f05ac3272353 100644 --- a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h @@ -26,6 +26,7 @@ struct typed_program_node : public typed_program_node_base { std::vector get_shape_info_input_layouts() const override { std::vector res; + GPU_DEBUG_TRACE_DETAIL << "get_shape_info_input_layouts, get_dependencies.size()=" << get_dependencies().size() << "\n"; for (size_t i = 0; i < get_dependencies().size(); i++) { const auto& d = get_dependency_with_port(i); res.push_back(d.first->get_output_layout(false, d.second)); @@ -35,6 +36,8 @@ struct typed_program_node : public typed_program_node_base { res.push_back(layout(ov::PartialShape::dynamic(4), data_types::i32, format::bfyx)); } + GPU_DEBUG_TRACE_DETAIL << "Total shape info input layouts: " << res.size() << "\n"; + return res; } }; diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index 029755c4733fe4..323d630732b5c4 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -232,7 +232,7 @@ struct program_node { } void merge_output_padding(padding const& padd, size_t idx = 0) { - set_output_padding(padding::max(padd, output_layouts[idx].data_padding)); + set_output_padding(padding::max(padd, output_layouts[idx].data_padding), 
idx); } // only calculated output layout (for external usage), does not modify/use cached output layout nor invalidate users diff --git a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h index 74f9ffff581b87..0a33b41e940565 100644 --- a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h @@ -33,9 +33,16 @@ class typed_primitive_inst : public typed_primitive_inst_base static std::vector calc_output_layouts(read_value_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); - const auto& default_layout = desc->output_layout; + std::vector output_layouts; - return { impl_param.state_layout.value_or(default_layout) }; + for (size_t i = 0; i < desc->num_outputs; i++) { + if (impl_param.state_layouts.size() > i) + GPU_DEBUG_TRACE_DETAIL << "Use state " << impl_param.state_layouts[i].to_short_string() << "\n"; + const auto& default_layout = desc->output_layouts[i]; + output_layouts.push_back(impl_param.state_layouts.size() > i ? impl_param.state_layouts[i] : default_layout); + } + + return output_layouts; } static layout calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param); diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 95cdd587cdf175..1733743af996ce 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -10,6 +10,7 @@ #include "primitive_type_base.h" #include #include +#include "to_string_utils.h" namespace cldnn { GPU_DEFINE_PRIMITIVE_TYPE_ID(kv_cache) @@ -25,7 +26,7 @@ layout kv_cache_inst::calc_output_layout(const kv_cache_node& node, kernel_impl_ } template -std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& /*node*/, kernel_impl_params const& impl_param) { +std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& node, kernel_impl_params const& impl_param) { auto desc = impl_param.typed_desc(); ov::intel_gpu::op::KVCache op; @@ -35,16 +36,30 @@ std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& /*no std::vector input_shapes = {impl_param.get_input_layout(0).get(), impl_param.get_input_layout(1).get()}; - if (desc->num_outputs > 1) + if (desc->indirect) { input_shapes.push_back(impl_param.get_input_layout(2).get()); + } + + if (desc->compressed) { + input_shapes.push_back(impl_param.get_input_layout(3).get()); + + if (desc->quantization_config.is_asymmetric_quantization() && !desc->combine_scales_and_zp) { + input_shapes.push_back(impl_param.get_input_layout(4).get()); + } + } + + GPU_DEBUG_TRACE_DETAIL << "kv_cache combine_scales_and_zp=" << desc->combine_scales_and_zp << "\n"; + std::vector output_shapes = desc->compressed ? 
shape_infer(&op, input_shapes, desc->quantization_config, desc->scales_zp_output_order, desc->combine_scales_and_zp) + : shape_infer(&op, input_shapes); - std::vector output_shapes = shape_infer(&op, input_shapes); + if (desc->num_outputs == 3) + GPU_DEBUG_TRACE_DETAIL << desc->id << " scales output calculated shape: " << output_shapes[2] << "\n"; static const std::map ports_map = {{0, 0}, {1, 2}}; std::vector out_layouts; for (size_t i = 0; i < desc->num_outputs; i++) { - auto out_type = desc->output_data_types[i].value_or(impl_param.get_input_layout(ports_map.at(i)).data_type); + auto out_type = desc->output_data_types[i].value(); out_layouts.emplace_back(output_shapes[i], out_type, impl_param.get_output_layout(i).format); } @@ -63,6 +78,7 @@ std::string kv_cache_inst::to_string(const kv_cache_node& node) { kv_cache_info.add("concat axis", node.get_primitive()->concat_axis); kv_cache_info.add("gather axis", node.get_primitive()->gather_axis); kv_cache_info.add("indirect", node.get_primitive()->indirect); + kv_cache_info.add("compressed", node.get_primitive()->compressed); node_info->add("kv_cache info", kv_cache_info); std::stringstream primitive_description; node_info->dump(primitive_description); @@ -92,6 +108,7 @@ void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) { size_t i = 0; // [kv_state, kv_new_token, [beam_idx, bt_past]] + // FIXME: do we need to handle compression scale value? for (i = 0; i < _node->get_dependencies().size(); i++) { const auto& node_in_lay = _node->get_input_layout(i); const auto& runtime_in_lay = params.input_layouts[i]; @@ -113,10 +130,27 @@ void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) { bt_layout.set_partial_shape(bt_shape); } - GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i << "]" << std::endl; + GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i++ << "]" << std::endl; fill_shape_info_data(bt_layout, bt_state->get_initial_layout(), shape_info_ptr, offset); } + // if (params.typed_desc()->compressed) { + // auto& var = dynamic_cast(get_network().get_variable(variable_id())); + // const auto& scale_state = var.get_compression_scale_state(); + // auto scale_layout = scale_state->get_layout(); + // if (scale_layout.is_dynamic()) { + // auto bt_shape = scale_layout.get_partial_shape(); + // for (auto& d : bt_shape) { + // if (d.is_dynamic()) + // d = 0; + // } + // scale_layout.set_partial_shape(bt_shape); + // } + + // GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i++ << "]" << std::endl; + // fill_shape_info_data(scale_layout, scale_state->get_initial_layout(), shape_info_ptr, offset); + // } + for (size_t i = 0; i < _node->get_output_layouts().size(); i++) { GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for output[" << i << "]" << std::endl; const auto& node_out_lay = _node->get_output_layout(i); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index d69a2767c018f6..272172ff6c7659 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -37,6 +37,7 @@ #include "graph_optimizer/prepare_buffer_fusing.h" #include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" #include "intel_gpu/graph/network.hpp" #include "intel_gpu/graph/serialization/set_serializer.hpp" #include "intel_gpu/runtime/engine.hpp" @@ -292,30 +293,91 @@ void 
primitive_inst::update_shape() { auto prim = get_node().as().get_primitive(); const auto& variable_id = prim->variable_id; auto& variable = get_network().get_variable(variable_id); - // Initial variable shape is taken from variable itself - auto new_layout = variable.get_layout(); - // If variable is not set and we have an initializer - use it's shape as shape of variable - if (!variable.is_set() && _impl_params->input_layouts.size() == 1) { - new_layout = _impl_params->get_input_layout(0); - } + auto update_state_layout = [&](ov::intel_gpu::VariableStateBase& variable, layout new_layout, size_t layout_idx) { + // If variable is not set and we have an initializer - use it's shape as shape of variable + if (!variable.is_set() && _impl_params->input_layouts.size() >= layout_idx) { + new_layout = _impl_params->get_input_layout(layout_idx); + } - // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 - if (new_layout.is_dynamic()) { - auto pshape = new_layout.get_partial_shape(); - for (auto& d : pshape) { - if (d.is_dynamic()) { - d = 0; + // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 + if (new_layout.is_dynamic()) { + auto pshape = new_layout.get_partial_shape(); + for (auto& d : pshape) { + if (d.is_dynamic()) { + d = 0; + } } + new_layout.set_partial_shape(pshape); } - new_layout.set_partial_shape(pshape); - } - variable.set_layout(new_layout); + variable.set_layout(new_layout); - if (!_impl_params->state_layout.has_value() || _impl_params->state_layout.value() != new_layout) { - _impl_params->state_layout = new_layout; - input_shape_changed = true; + if (_impl_params->state_layouts[layout_idx] != new_layout) { + _impl_params->state_layouts[layout_idx] = new_layout; + GPU_DEBUG_TRACE_DETAIL << "Update " << layout_idx << " layout: " << new_layout.to_short_string() << "\n"; + input_shape_changed = true; + } + }; + + if (_impl_params->state_layouts.empty()) + _impl_params->state_layouts.resize(1); + + // Initial variable shape is taken from variable itself + auto new_layout = variable.get_layout(); + update_state_layout(variable, new_layout, 0); + + // // If variable is not set and we have an initializer - use it's shape as shape of variable + // if (!variable.is_set() && _impl_params->input_layouts.size() >= 1) { + // new_layout = _impl_params->get_input_layout(0); + // } + + // // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 + // auto replace_dynamic_dims = [](layout& layout) { + // if (layout.is_dynamic()) { + // auto pshape = layout.get_partial_shape(); + // for (auto& d : pshape) { + // if (d.is_dynamic()) { + // d = 0; + // } + // } + // layout.set_partial_shape(pshape); + // } + // }; + // replace_dynamic_dims(new_layout); + + // GPU_DEBUG_TRACE_DETAIL << id() << " set new layout " << new_layout.to_short_string() << "\n"; + // variable.set_layout(new_layout); + + // if (_impl_params->state_layouts.empty()) { + // _impl_params->state_layouts.resize(1); + // } + + // if (_impl_params->state_layouts[0] != new_layout) { + // _impl_params->state_layouts[0] = new_layout; + // input_shape_changed = true; + // } + + if (prim->num_outputs > 1) { + // _impl_params->state_layouts.resize(2); + GPU_DEBUG_TRACE_DETAIL << "This readvalue check\n"; + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + 
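+                // For a compressed KV-cache the variable is a multi-tensor state: state_layouts[0] tracks the KV data,
+                // state_layouts[1] the compression scales, and state_layouts[2] the zero points (the latter exists only
+                // for asymmetric quantization when scales and zero points are not combined into one buffer).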
_impl_params->state_layouts.resize(compressed_cache_variable->has_zp_state() ? 3 : 2); + GPU_DEBUG_TRACE_DETAIL << "Has multi tensor variable" << _impl_params->state_layouts.size() << "!\n"; + + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + auto new_scales_layout = compressed_cache_variable->get_compression_scale_state()->get_layout(); + update_state_layout(*scales_state, new_scales_layout, 1); + + if (compressed_cache_variable->has_zp_state()) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + auto new_zp_layout = compressed_cache_variable->get_compression_zp_state()->get_layout(); + update_state_layout(*zp_state, new_zp_layout, 2); + + GPU_DEBUG_TRACE_DETAIL << "Set new zp layout " << new_zp_layout.to_short_string() << "\n"; + GPU_DEBUG_TRACE_DETAIL << "states[2] " << _impl_params->state_layouts[2].to_short_string() << "\n"; + } + } } } @@ -460,6 +522,17 @@ void primitive_inst::update_shape() { auto& variable = get_network().get_variable(desc->variable_id); // Custom output layout update as update_output_layout handles paddings incorrectly for optimized out read_value + kv_cache pattern _impl_params->output_layouts[0] = variable.get_layout(); + GPU_DEBUG_TRACE_DETAIL << "Update ReadValue output0 " << _impl_params->output_layouts[0].to_short_string() << "\n"; + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _impl_params->output_layouts[1] = compressed_cache_variable->get_compression_scale_state()->get_layout(); + GPU_DEBUG_TRACE_DETAIL << "Update ReadValue output1 " << _impl_params->output_layouts[1].to_short_string() << "\n"; + + if (compressed_cache_variable->has_zp_state()) { + GPU_DEBUG_TRACE_DETAIL << "Update ReadValue output2 " << _impl_params->output_layouts[1].to_short_string() << "\n"; + _impl_params->output_layouts[2] = compressed_cache_variable->get_compression_zp_state()->get_layout(); + } + } } if (get_node().is_type()) { @@ -542,6 +615,15 @@ event::ptr primitive_inst::realloc_if_needed() { << ", variable layout " << variable.get_layout().to_short_string() << ")" << std::endl; _outputs[0] = variable.get_memory(); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _outputs[2] = compressed_cache_variable->get_compression_scale_state()->get_memory(); + + if (compressed_cache_variable->has_zp_state()) { + _outputs[3] = compressed_cache_variable->get_compression_zp_state()->get_memory(); + } + } + // To record shape predictor for (size_t j = 0; j < _impl_params->output_layouts.size(); ++j) sp.predict_preallocation_shape(id(), _impl_params->output_layouts[j], true, j); @@ -644,7 +726,12 @@ event::ptr primitive_inst::realloc_if_needed() { // dynamic quantization is only applied to activation of FC if (get_node().is_type()) { - auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], 0); + const auto& desc = get_node().as().get_primitive(); + auto dyn_quan_scale_layout = + dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], + desc->quantization_config, + desc->scales_zp_output_order, + desc->combine_scales_and_zp); GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " << dyn_quan_scale_layout[1].to_short_string() << std::endl; updated_params.output_layouts[1] = dyn_quan_scale_layout[1]; @@ -688,13 +775,19 @@ event::ptr primitive_inst::realloc_if_needed() { for (size_t i = 0; i < updated_layouts.size(); ++i) { bool reclaim = 0; size_t required_buffer_size = 0; - if (_node->is_type() && i == 0) 
{ + if (_node->is_type() && (i == 0 || i == 2 || i == 3)) { // Relax reclaiming condition for kv cache const auto& desc = _node->as().get_primitive(); auto prealloc_shape = updated_layouts[i].get_shape(); const auto shape_rank = prealloc_shape.size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + auto seq_axis = 0; + if (i == 0) { + // seq_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank); + seq_axis = static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + } else if (i == 2 || i == 3) { + seq_axis = 2; + } + prealloc_shape[seq_axis] += tmp_prealloc_count; required_buffer_size = std::accumulate(prealloc_shape.begin(), prealloc_shape.end(), size_t(1), std::multiplies()); } else { @@ -720,12 +813,19 @@ event::ptr primitive_inst::realloc_if_needed() { for (size_t i = 0; i < actual_layouts.size(); ++i) { bool can_reuse_buffer = (_outputs[i] && updated_layouts[i].get_linear_size() <= _max_output_layout_count[i]); + std::pair prealloc_info; - if (_node->is_type() && i == 0) { + if (_node->is_type() && (i == 0 || i == 2 || i == 3)) { const auto& desc = _node->as().get_primitive(); auto shape_rank = updated_layouts[i].get_shape().size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + auto seq_axis = 0; + if (i == 0) { + // seq_axis = static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + seq_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank); + } else if (i == 2 || i == 3) { + seq_axis = 2; + } + prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], false, i, tmp_prealloc_count, seq_axis); } else { prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], can_reuse_buffer, i, tmp_prealloc_count); @@ -741,19 +841,20 @@ event::ptr primitive_inst::realloc_if_needed() { GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer[" << i << "] - " << actual_layouts[i].get_linear_size() << "/" << _max_output_layout_count[i] << std::endl; - if (_node->is_type() && (i == 0)) { + if (_node->is_type() && (i == 0 || i == 2 || i == 3)) { // kv_cache has already assigned memory. // No need to reinterpret output memory but need to update padding const auto& desc = _node->as().get_primitive(); auto& present_layout = _impl_params->output_layouts[i]; const auto present_layout_rank = present_layout.get_partial_shape().size(); - const auto sequence_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank); + const auto sequence_axis = i == 0 ? kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank) : 2; + GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << present_layout.to_short_string() << " " << _max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(present_layout, _max_output_layout_count[i], sequence_axis, - "present_layout"); + i == 0 ? "present_layout" : "present_scales_layout"); kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); - GPU_DEBUG_TRACE_DETAIL << _impl_params->output_layouts[i].to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << i << ". 
" << _impl_params->output_layouts[i].to_string() << std::endl; set_shape_change(); } else { _outputs[i] = _network.get_engine().reinterpret_buffer(*_outputs[i], actual_layouts[i]); @@ -809,12 +910,35 @@ event::ptr primitive_inst::realloc_if_needed() { } if (present_layout.data_padding._dynamic_dims_mask[sequence_axis] == 1) { // Apply padding of variable to make it be optimized in the next iteration + GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << present_layout.to_short_string() << " " << _max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(present_layout, _max_output_layout_count[0], sequence_axis, "present_layout"); if (max_pad > 0) { + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + const auto sequence_axis = 2; + GPU_DEBUG_TRACE_DETAIL << id() << " is kv_cache => set the variable with newly allocated output memory" + << std::endl; + + kv_cache_inst::update_pad(present_scales_layout, max_pad, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "Updated scales pad (" << max_pad << " " << sequence_axis << "): " << present_scales_layout.to_string() << "\n"; + if (!axis_is_outer_most) { + _impl_params->output_layouts[2] = present_scales_layout; + } + + compressed_cache_variable->get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_memory(_outputs[3], present_zp_layout); + } + } + kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "Updated data pad (" << max_pad << " " << sequence_axis << "): " << present_layout.to_string() << "\n"; if (!axis_is_outer_most) { GPU_DEBUG_TRACE_DETAIL << id() << ": Update impl with new output padding" << std::endl; set_shape_change(); @@ -834,12 +958,34 @@ event::ptr primitive_inst::realloc_if_needed() { << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_memory(_outputs[0], present_layout); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + compressed_cache_variable->get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_memory(_outputs[3], present_zp_layout); + } + } } } else { GPU_DEBUG_TRACE_DETAIL << id() << ": Update variable " << variable.get_name() << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_layout(present_layout); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + compressed_cache_variable->get_compression_scale_state()->set_layout(present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_layout(present_zp_layout); + } + } } } @@ -1202,6 +1348,13 @@ void 
primitive_inst::do_runtime_in_place_kv_cache() { return; } const auto& desc = _node->as().get_primitive(); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Original layouts\n"; + GPU_DEBUG_TRACE_DETAIL << _impl_params->input_layouts[0] << "\n"; + GPU_DEBUG_TRACE_DETAIL << _impl_params->input_layouts[3] << "\n"; + } + auto& past_layout = _impl_params->input_layouts[0]; auto& new_layout = _impl_params->input_layouts[1]; auto& present_layout = _impl_params->output_layouts[0]; @@ -1220,6 +1373,7 @@ void primitive_inst::do_runtime_in_place_kv_cache() { GPU_DEBUG_TRACE_DETAIL << "[do runtime kv_cache opt] " << id() << " initial present_layout : " << present_layout.to_string() << std::endl; GPU_DEBUG_TRACE_DETAIL << "[do runtime kv_cache opt] " << id() << " initial past_layout : " << past_layout.to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << past_layout.to_short_string() << " " << _deps[0].first->_max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(past_layout, _deps[0].first->_max_output_layout_count[0], sequence_axis, "past_layout"); const auto new_seq_len = static_cast(new_layout.get_shape()[sequence_axis]); // In chatbot scenario, when chat history must be stored in kvcache, new_seq_len may not be 1 even if max_pad is greater than 0 @@ -1228,11 +1382,45 @@ void primitive_inst::do_runtime_in_place_kv_cache() { GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_layout's pad : " << present_layout.to_string() << std::endl; auto& variable = get_network().get_variable(desc->variable_info.variable_id); variable.set_layout(present_layout); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[1]!\n"; + auto compressed_cache_variable = dynamic_cast(&variable); + auto& present_scales_layout = _impl_params->output_layouts[2]; + const auto sequence_axis = 2; + kv_cache_inst::update_pad(present_scales_layout, max_pad - new_seq_len, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_scale_layout's pad : " << present_scales_layout.to_string() << std::endl; + + compressed_cache_variable->get_compression_scale_state()->set_layout(present_scales_layout); + if (_impl_params->input_layouts.size() > 4) { + auto& present_zp_layout = _impl_params->output_layouts[3]; + kv_cache_inst::update_pad(present_zp_layout, max_pad - new_seq_len, sequence_axis); + + compressed_cache_variable->get_compression_zp_state()->set_layout(present_scales_layout); + } + } + GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << "Updated variable with present_layout" << variable.get_layout().to_string() << " is_set = " << variable.is_set() << std::endl; if (past_layout.data_padding._upper_size[sequence_axis] > 0 && variable.is_set()) { kv_cache_inst::update_pad(past_layout, max_pad, sequence_axis); _impl_params->_can_be_optimized = true; + + GPU_DEBUG_TRACE_DETAIL << "Updated data layout (" << max_pad << " " << sequence_axis << "): " << _impl_params->input_layouts[0] << "\n"; + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto& past_scale_layout = _impl_params->input_layouts[3]; + const auto sequence_axis = 2; + kv_cache_inst::update_pad(past_scale_layout, max_pad, sequence_axis); + + if (_impl_params->input_layouts.size() >= 4) { + auto& past_zp_layout = _impl_params->input_layouts[4]; + const auto sequence_axis = 2; + kv_cache_inst::update_pad(past_zp_layout, max_pad, sequence_axis); + } + 
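+            // The compressed past scale/zp layouts use sequence axis 2, so they receive the same upper padding as the
+            // KV data above; this keeps the in-place optimization consistent for the compressed buffers and lets the
+            // dynamic quantization kernel append new tokens into the preallocated space.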
GPU_DEBUG_TRACE_DETAIL << "Updated scales layout (" << max_pad << " " << sequence_axis << "): " << _impl_params->input_layouts[3] << "\n"; + } GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << " Updated past layout's pad : " << past_layout.to_string() << std::endl; } } @@ -1883,6 +2071,9 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool _dynamic_impl = _impl->clone(); } } + if (_node) { + GPU_DEBUG_TRACE_DETAIL << _node->type()->to_string(*_node) << "\n"; + } _impl_params->strm = _network.get_stream_ptr(); for (size_t i = 0; i < get_node().get_output_layouts().size(); ++i) { if (_outputs.size() > i) { diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 3c21800c66d938..047e04f88d7e8a 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -90,7 +90,8 @@ void program_node::replace_dependency(size_t idx, std::pair const program_node::get_input_layouts() const { std::vector layouts; for (size_t i = 0; i < dependencies.size(); i++) { - layouts.push_back(get_input_layout(i)); + auto input_layout = get_input_layout(i); + layouts.push_back(input_layout); } return layouts; } @@ -434,10 +435,15 @@ layout program_node::get_non_padded_output_layout(bool invalidate_users_if_chang } bool program_node::set_output_layout(layout& new_layout, bool invalidate_users_if_changed, size_t idx) { + // GPU_DEBUG_TRACE_DETAIL << "TEST: " << padding::max(new_layout.data_padding, output_layouts[idx].data_padding)._dynamic_dims_mask << "\n"; + merge_output_padding(new_layout.data_padding, idx); + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[1] " << new_layout.to_string() << "\n"; + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[2] " << output_layouts[idx].data_padding._dynamic_dims_mask << "\n"; OPENVINO_ASSERT(idx < output_layouts.size(), id(), " has invalid index : index is ", std::to_string(idx), " but output_layouts length is ", std::to_string(output_layouts.size())); new_layout.data_padding = output_layouts[idx].data_padding; + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[3] " << new_layout.to_string() << "\n"; bool changed = (new_layout != output_layouts[idx]); if (changed && invalidate_users_if_changed) // output_layout has changed! 
invalidate users invalidate_users(); diff --git a/src/plugins/intel_gpu/src/graph/read_value.cpp b/src/plugins/intel_gpu/src/graph/read_value.cpp index bf6e730e8a808b..21518d60604b65 100644 --- a/src/plugins/intel_gpu/src/graph/read_value.cpp +++ b/src/plugins/intel_gpu/src/graph/read_value.cpp @@ -2,8 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "read_value_inst.h" #include "primitive_type_base.h" + +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" + #include #include @@ -16,7 +19,7 @@ read_value_inst::typed_primitive_inst(network& network, const read_value_node& n } layout read_value_inst::calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param) { - return impl_param.typed_desc()->output_layout; + return impl_param.typed_desc()->output_layouts[0]; } std::string read_value_inst::to_string(const read_value_node& node) { @@ -45,5 +48,37 @@ void read_value_inst::update_output_memory() { GPU_DEBUG_TRACE_DETAIL << " - layout " << variable.get_layout().to_string() << std::endl; GPU_DEBUG_TRACE_DETAIL << " - actual_size " << variable.get_actual_mem_size() << " bytes" << std::endl; set_output_memory(variable.get_memory(), false, 0); + + auto desc = _impl_params->typed_desc(); + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + set_output_memory(scales_state->get_memory(), false, 1); + + GPU_DEBUG_TRACE_DETAIL << id() << " Update output memory with variable " << scales_state->get_name() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - ptr : " << scales_state->get_memory()->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - layout " << scales_state->get_layout().to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - actual_size " << scales_state->get_actual_mem_size() << " bytes" << std::endl; + + if (compressed_cache_variable->has_zp_state()) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + set_output_memory(zp_state->get_memory(), false, 2); + + GPU_DEBUG_TRACE_DETAIL << id() << " Update output memory with variable " << zp_state->get_name() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - ptr : " << zp_state->get_memory()->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - layout " << zp_state->get_layout().to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - actual_size " << zp_state->get_actual_mem_size() << " bytes" << std::endl; + } + } + // if (desc->compressed) { + // auto multi_tensor_variable = downcast(variable); + // auto scales_variable = multi_tensor_variable.get_compression_scale_state(); + + // GPU_DEBUG_TRACE_DETAIL << id() << " Update output memory with variable " << scales_variable->get_name() << std::endl; + // GPU_DEBUG_TRACE_DETAIL << " - ptr : " << scales_variable->get_memory()->buffer_ptr() << std::endl; + // GPU_DEBUG_TRACE_DETAIL << " - layout " << scales_variable->get_layout().to_string() << std::endl; + // GPU_DEBUG_TRACE_DETAIL << " - actual_size " << scales_variable->get_actual_mem_size() << " bytes" << std::endl; + // set_output_memory(scales_variable->get_memory(), false, 1); + // } + GPU_DEBUG_TRACE_DETAIL << id() << variable_id() << std::endl; } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp index e8e213ad97011a..14aea98822a393 100644 --- a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp +++ 
b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp @@ -87,6 +87,14 @@ std::string scaled_dot_product_attention_inst::to_string(scaled_dot_product_atte json_composite scaled_dot_product_attention_info; scaled_dot_product_attention_info.add("input id", input.id()); scaled_dot_product_attention_info.add("is_causal", desc->is_causal); + scaled_dot_product_attention_info.add("is_kv_compressed", desc->is_kv_compressed); + scaled_dot_product_attention_info.add("combine_scales_and_zp", desc->combine_scales_and_zp); + scaled_dot_product_attention_info.add("group_size", desc->quantization_config.group_sizes); + scaled_dot_product_attention_info.add("is_asymmetric_quantization", desc->quantization_config.is_asymmetric_quantization()); + scaled_dot_product_attention_info.add("quantization_dt", desc->quantization_config.quantization_dt); + scaled_dot_product_attention_info.add("scale_dt", desc->quantization_config.scale_dt); + scaled_dot_product_attention_info.add("zp_dt", desc->quantization_config.zp_dt); + scaled_dot_product_attention_info.add("indirect_axis", desc->indirect_axis); scaled_dot_product_attention_info.add("has_attn_mask_input", desc->has_attn_mask_input); scaled_dot_product_attention_info.add("has_scale_input", desc->has_scale_input); scaled_dot_product_attention_info.add("input_q_transpose_order", desc->input_q_transpose_order); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl index 628bc69f3886df..073061789c597e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl @@ -16,7 +16,7 @@ #define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x) REQD_SUB_GROUP_SIZE(SIMD) -KERNEL(dynamic_quantize_gpu_opt)( +KERNEL(dynamic_quantize_gpu_opt_generic)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl new file mode 100644 index 00000000000000..9d4965be101d69 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl @@ -0,0 +1,130 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/common.cl" +#include "include/batch_headers/sub_group_block_read.cl" +#include "include/batch_headers/sub_group_block_write.cl" +#include "include/batch_headers/sub_group_shuffle.cl" + + +#if OUTPUT_DIMS != 4 +#error "dynamic_quantize_gpu_opt.cl: Unsupported output dimension" +#endif + +#define VLOAD_N CAT(vload, VEC_SIZE) +#define VSTORE_N CAT(vstore, VEC_SIZE) +#define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE) +#define AS_TYPE_N_(type, n, x) as_##type##n(x) +#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) +#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x) + + +inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { + return OUTPUT1_GET_INDEX(b, f, y, x); +} + +inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { +#ifdef SCALES_OUTPUT_ORDER + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR 
SCALES_OUTPUT_ORDER); +#else + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#endif +} + +#define SUBGROUP_SIZE 16 +#define INNERMOST_DIM_VALUE INPUT0_SIZE_X +#define INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, 1, ptr, offset) +#define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, 1, ptr, offset, val) + +__attribute__((reqd_work_group_size(SUBGROUP_SIZE, SUBGROUPS_NUMBER, 1))) +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(dynamic_quantize_gpu_opt_generic)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global OUTPUT1_TYPE* output_scale +#if ASYMMETRIC_QUANTIZATION && !GROUP_SCALES_WITH_ZP + , __global OUTPUT2_TYPE* output_zp +#endif +#ifdef APPEND_MODE + , const uint axis_offset +#endif + ) +{ + const uint sglid = get_sub_group_local_id(); + const uint grouped_indexes = get_global_id(1); + const uint batch_indexes = get_global_id(2); + + DECLARE_BATCHED_DIMS_INDEXES(batch_indexes); + DECLARE_GROUPED_DIMS_INDEXES(grouped_indexes); + + // the innermost dimension is always handled in the loop inside the kernel + const uint x = 0; + + half max_value = INPUT0_VAL_MIN; + half min_value = INPUT0_VAL_MAX; + + half val[INNERMOST_DIM_VALUE / SUBGROUP_SIZE]; + + const uint input_offset = INPUT0_GET_INDEX(b, f, y, x); + unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) { + val[i] = INPUT_BLOCK_READ(input, input_offset + i * SUBGROUP_SIZE); +#if ASYMMETRIC_QUANTIZATION + max_value = fmax(max_value, val[i]); + min_value = fmin(min_value, val[i]); +#else + max_value = fmax(max_value, fabs(val[i])); +#endif + } + +#if ASYMMETRIC_QUANTIZATION + min_value = work_group_reduce_min(min_value); + max_value = work_group_reduce_max(max_value); + OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value)); + OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_value * scale) - CHAR_MAX; +#else + max_value = work_group_reduce_max(max_value); + OUTPUT1_TYPE scale = 127.0h / max_value; +#endif + +#ifdef APPEND_MODE + APPEND_AXIS_NAME += axis_offset; +#endif + + const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x); + unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) { +#if ASYMMETRIC_QUANTIZATION + OUTPUT_TYPE res = convert_char(val[i] * scale + zp); +#else + OUTPUT_TYPE res = convert_char(val[i] * scale); +#endif + OUTPUT_BLOCK_WRITE(output, output_offset + i * SUBGROUP_SIZE, res); + } + + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); + + if (grouped_indexes == 0 && sglid == 0) { +#ifdef APPEND_MODE +#if GROUP_SCALES_WITH_ZP + // half result0 = (convert_half(convert_char(val[0] * scale + zp)) - zp) * (1.0h / scale); + // half result1 = (convert_half(convert_char(val[1] * scale + zp)) - zp) * (1.0h / scale); + // half result2 = (convert_half(convert_char(val[2] * scale + zp)) - zp) * (1.0h / scale); + // half result3 = (convert_half(convert_char(val[3] * scale + zp)) - zp) * (1.0h / scale); + // printf("Save scale_idx=%d, axis_offset=%d; scale=%f; zp=%f, min=%f, max=%f; orig=(%f %f %f %f), compressed=(%d %d %d %d), decompressed=(%f %f)\n", scale_idx, axis_offset, scale, zp, min_value, max_value, + // val[0], val[1], val[2], val[3], + // convert_char(val[0] * scale + zp), convert_char(val[1] * scale + zp), convert_char(val[2] * scale + zp), convert_char(val[3] * scale + zp), + // result0, + // result1); +#endif +#endif + output_scale[scale_idx] = 1.0h / scale; +#if ASYMMETRIC_QUANTIZATION && GROUP_SCALES_WITH_ZP 
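+    // When scales and zero points share one buffer (GROUP_SCALES_WITH_ZP), the zero point is stored right after
+    // its scale. The value stored above is 1/scale, so the consumer dequantizes as x ~= (q - zp) * stored_scale.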
+ output_scale[scale_idx + 1] = zp; +#elif ASYMMETRIC_QUANTIZATION + output_zp[scale_idx] = zp; +#endif + } +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl index 436276a67e48c0..d771cb7110be8f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl @@ -8,6 +8,59 @@ #error "dynamic_quantize_gpu_ref.cl: Unsupported output dimension" #endif +/* +TODO: check this coniguration: +GPU_Debug: primitive_inst.cpp:1921:primitive_inst: +{ + dynamic_quantize info : + { + scale dt : f16, + activation dt : i8, + group size : 1,18446744073709551615,1,18446744073709551615, + } + implementation : dynamic_quantize_gpu_ref, + cl dump_ info : + { + kernel_entry : dynamic_quantize_gpu_ref_17256100832148678061_0_0__sa, + batch_hash : 8176231137263359740, + } + ptr : node_213002368, + id : dynamicquantize:__module.model.layers.9.self_attn/aten::cat/Concat_3_init_dyn_quan, + optimized : false, + type : dynamic_quantize, + valid output layout : true, + output layouts : + { + 1 : f16:bfyx:?x1x0x1:nopad, + 0 : i8:bfyx:?x32x0x128:nopad, + } + dependant_shape_of_nodes_ids : , + fused primitives : + { + } + constant : false, + in_shape_of_subgraph : 0, + in data flow : true, + output : false, + preferred impl : any, + dependencies : 219236192(0), + users : 207254784,207254784, +} + + */ + +inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { + return OUTPUT1_GET_INDEX(b, f, y, x); +} + +inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { +#ifdef SCALES_OUTPUT_ORDER + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR SCALES_OUTPUT_ORDER); +#else + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#endif +} + KERNEL(dynamic_quantize_gpu_ref)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, @@ -15,14 +68,27 @@ KERNEL(dynamic_quantize_gpu_ref)( __global OUTPUT1_TYPE* output_scale) { const uint bf = (uint)get_global_id(0); - const uint b = (uint)get_global_id(0) / INPUT0_FEATURE_NUM; - const uint f = (uint)get_global_id(0) % INPUT0_FEATURE_NUM; + const uint b = bf / INPUT0_FEATURE_NUM; + const uint f = bf % INPUT0_FEATURE_NUM; const uint y = (uint)get_global_id(1); - const uint scale_idx = OUTPUT1_GET_INDEX(b, f, y, 0); + const uint x = (uint)get_global_id(2); +#ifdef SCALES_OUTPUT_ORDER + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#else + const uint scale_idx = OUTPUT1_GET_INDEX_SAFE(b, f, y, x); +#endif half max_val = 0.0001h; - for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { - const uint offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); + for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { + for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 
1 : INPUT0_SIZE_Y); y_off++) { +#if GROUP_SIZE_DIM3 == 1 + const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + half val = input[offset]; + half abs_val = fabs(val); + max_val = fmax(max_val, abs_val); +#else + const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + offset + x * 8)); @@ -34,21 +100,40 @@ KERNEL(dynamic_quantize_gpu_ref)( x *= 8; for (; x < INPUT0_SIZE_X; x++) max_val = fmax(max_val, fabs(input[offset + x])); +#endif + } + } } half scale = 127.0h / max_val; - for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { - const uint in_offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); - const uint out_offset = OUTPUT_GET_INDEX(b, f, y + y_off, 0); + for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { + for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 1 : INPUT0_SIZE_Y); y_off++) { +#if GROUP_SIZE_DIM3 == 1 + const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + + half val = input[in_offset]; + val *= scale; + output[out_offset] = convert_char(val); +#else + const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); + const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + in_offset + x * 8)); val *= scale; + // TODO: why it's _rtz instead of _rte? vstore8(convert_char8(val), 0, output + out_offset + x * 8); + // vstore8(convert_char8_rte(val), 0, output + out_offset + x * 8); } x *= 8; for (; x < INPUT0_SIZE_X; x++) output[out_offset + x] = convert_char(input[in_offset + x] * scale); + // output[out_offset + x] = convert_char_rte(input[in_offset + x] * scale); +#endif + } + } } output_scale[scale_idx] = 1.0h / scale; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 948bd3c0f1a305..f9aa12aa257eb0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -17,6 +17,8 @@ // max_logits [batch, heads_num, q_len, partition_idx] // tmp_out [batch, heads_num, q_len, partition_idx, head_size] +ulong __attribute__((overloadable)) intel_get_cycle_counter( void ); + inline uint FUNC(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { #if INPUT0_SIMPLE return GET_DATA_INDEX_6D(INPUT0, b, f, w, z, y, x); @@ -124,6 +126,7 @@ inline uint FUNC(get_bt_index_value)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uin /* This version is used for 2nd token */ REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +__attribute__((reqd_work_group_size(1, 1, HEAD_SIZE * SG_SCALE_FACTOR))) KERNEL(sdpa_opt)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* query_input, @@ -136,6 +139,10 @@ KERNEL(sdpa_opt)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -149,7 +156,14 @@ KERNEL(sdpa_opt)( const uint b1_idx = batch_idx % NUM_HEADS; 
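    /* KV-cache compression note (an illustrative summary of the offset computations used below,
       not a normative layout description): with IS_KV_COMPRESSED the key/value inputs hold INT8
       data and key_scale/val_scale hold per-token f16 dequantization parameters, indexed as
       (batch, kv-head or 0, token, 0):
         - COMPRESSED_PER_HEAD: one entry per (batch, kv head, token); the query head index is
           divided by BROADCAST_GROUP_SIZE so grouped query heads share their KV head's scale;
         - otherwise: one entry per (batch, token);
         - with USE_ASYMMETRIC_QUANTIZATION and combined storage, the zero point is stored right
           after the scale (offset + 1).
       Dequantization is x ~= (q - zp) * scale. */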
/* HEADS_NUM dim */ const uint target_seq_idx = get_global_id(1); const uint lid = get_local_id(2); + + #if SG_SCALE_FACTOR == 2 + const uint head_size_idx = lid % HEAD_SIZE; + #elif SG_SCALE_FACTOR == 1 const uint head_size_idx = lid; + #else + #error "Unsupported scale factor" + #endif const uint sgid = get_sub_group_id(); const uint sglid = get_sub_group_local_id(); @@ -199,13 +213,19 @@ KERNEL(sdpa_opt)( uint query_offset = INPUT0_GET_INDEX(b0_idx, b1_idx, target_seq_idx, (sgid * SUBGROUP_SIZE)); const uint query_pitch = QUERY_STEP_LOCAL; #endif - for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { - #define QUERY_BLOCK_SIZE 1 + #if SG_SCALE_FACTOR == 2 + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { + #else + { + #endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + #define QUERY_BLOCK_SIZE 1 - INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); - query_local[query_local_offset] = val * scale_val; - query_local_offset += QUERY_STEP_LOCAL; - query_offset += query_pitch; + INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); + query_local[query_local_offset] = val * scale_val; + query_local_offset += QUERY_STEP_LOCAL; + query_offset += query_pitch; + } } #undef QUERY_BLOCK_SIZE #undef QUERY_STEP @@ -216,28 +236,69 @@ KERNEL(sdpa_opt)( // Main Gemm1 calculation loop // Each SG performs element-wise multiplications of Q[HEAD_SIZE]xK[HEAD_SIZE] values // HEAD_SIZE / SUBGROUPS_PER_WG times in the loop and saves the result to the qk_local SLM buffer - for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE / SUBGROUP_SIZE)) { + for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE / SUBGROUP_SIZE) * SG_SCALE_FACTOR) { #ifdef INPUT1_DIMS_ORDER #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_key)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0)]; #else const uint b_idx = b0_idx; #endif - const uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); + uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); #else - const uint key_offset = INPUT1_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, 0); + uint key_offset = INPUT1_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, 0); #endif + // if (start_partition_idx < 8096) + // key_offset = 0; INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; +#if IS_KV_COMPRESSED +#ifndef INPUT1_DIMS_ORDER + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, b1_idx, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); + +#else + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, 0, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + seq_len, 0); +#endif + KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE key_comp_zp = key_scale[key_scale_comp_offset + 1]; +#endif +#endif + + // ulong timer_start = intel_get_cycle_counter(); uint head_idx_index = 0; #define KEY_BLOCK_SIZE 8 for (; head_idx_index 
+ (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + +#if IS_KV_COMPRESSED + #define KEY_BLOCK_READ_TEST(ptr, offset) BLOCK_READN(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define KEY_BLOCK_NEW MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + + + #define KEY_BLOCK_READ_TEST1(ptr, offset) KEY_BLOCK_READ(ptr, offset) + #define KEY_BLOCK_NEW1 KEY_BLOCK + // KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + // KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; + KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale; +#else + KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#endif +#else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -258,9 +319,20 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) +#if IS_KV_COMPRESSED + KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale; +#else + KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#endif +#else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -281,9 +353,20 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) +#if IS_KV_COMPRESSED + KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = 
(TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale;
+#else
+        KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale;
+#endif
+#else
         KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index);
+#endif
 
         uint query_offset = head_idx_index + sglid;
         unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) {
@@ -304,9 +387,20 @@ KERNEL(sdpa_opt)(
     for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) {
         #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset);
         #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE)
+        #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE)
+        #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val)
         #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE)
 
+#if IS_KV_COMPRESSED
+        KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index);
+#if USE_ASYMMETRIC_QUANTIZATION
+        KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale;
+#else
+        KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale;
+#endif
+#else
         KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index);
+#endif
 
         uint query_offset = head_idx_index + sglid;
         unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) {
@@ -320,6 +414,12 @@
         }
     }
 
+    // ulong timer_end = intel_get_cycle_counter();
+    // ulong time = timer_end - timer_start;
+    // if (batch_idx == 0 && sglid == 0) {
+    //     printf("part=%d, sgid=%d, seq_len=%d: %d\n", partition_idx, sgid, seq_len, time);
+    // }
+
     // Sum up all accumulators accross single SG and save result to SLM
     unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) {
         acc[seq_idx] = sub_group_reduce_add(acc[seq_idx]);
@@ -335,7 +435,7 @@ KERNEL(sdpa_opt)(
         const uint seq_idx_end = 1;
         for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) {
             // Iterate over all values QK values in SLM and apply scale and attention mask
-            for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE)) {
+            for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE * SG_SCALE_FACTOR)) {
                 // Read value from SLM and apply scale
                 qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len];
 
@@ -378,6 +478,10 @@
         for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) {
             qk_max[seq_idx] = SOFTMAX_ACCUMULATOR_VAL_MIN;
 
+            #if SUBGROUPS_PER_WG > SUBGROUP_SIZE
+                #error "Number of subgroups per work group must not exceed the subgroup size"
+            #endif
+
             if (sglid < SUBGROUPS_PER_WG)
                 qk_max[seq_idx] = qk_max_vals[seq_idx * SUBGROUPS_PER_WG + sglid];
 
@@ -388,7 +492,7 @@
         SOFTMAX_ACCUMULATOR_TYPE exp_sum[TARGET_SEQ_LEN_BLOCK_SIZE] = {SOFTMAX_ACCUMULATOR_VAL_ZERO};
         const uint qk_num_per_wi = CEIL_DIV(partition_seq_len, SUBGROUPS_PER_WG * SUBGROUP_SIZE);
         for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) {
-            const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + head_size_idx;
+            const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + lid;
             if (local_data_idx < partition_seq_len) {
                 for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) {
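                    /* The surrounding two passes over qk_local implement the usual numerically
                     * stable softmax, sketched here for reference:
                     *   m   = max_j qk_j      (per-subgroup maxima reduced through SLM above)
                     *   e_j = exp(qk_j - m)
                     *   sum = sum_j e_j       (per-subgroup sums reduced through SLM below)
                     *   p_j = e_j / sum
                     * When num_of_partitions > 1, m and sum are also stored to max_logits /
                     * exp_sums so the finalization kernel can merge the partitions. */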
SOFTMAX_ACCUMULATOR_TYPE qk_new = native_exp(TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) - qk_max[seq_idx]);
@@ -411,6 +515,10 @@ KERNEL(sdpa_opt)(
         unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) {
             exp_sum[seq_idx] = SOFTMAX_ACCUMULATOR_VAL_ZERO;
 
+            #if SUBGROUPS_PER_WG > SUBGROUP_SIZE
+                #error "Number of subgroups per work group must not exceed the subgroup size"
+            #endif
+
             if (sglid < SUBGROUPS_PER_WG)
                 exp_sum[seq_idx] = qk_sum_vals[seq_idx * SUBGROUPS_PER_WG + sglid];
 
@@ -420,7 +528,7 @@
             // const SOFTMAX_ACCUMULATOR_TYPE inv_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ONE / exp_sum[seq_idx];
             for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) {
-                const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + sgid * SUBGROUP_SIZE + sglid;
+                const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + lid;
                 if (local_data_idx < partition_seq_len) {
                     for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) {
                         SOFTMAX_ACCUMULATOR_TYPE qk_new = TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) / exp_sum[seq_idx];
@@ -434,7 +542,7 @@
     {
         // If the number of partitions is greater than 1, save exm_sums and max_logits to the temporary buffers
        // Use single WI in the WG, since all the WIs have the same value
-        if (num_of_partitions > 1 && head_size_idx == 0) {
+        if (num_of_partitions > 1 && lid == 0) {
             for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) {
                 const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) +
                                              b1_idx * (TARGET_SEQ_LEN * num_of_partitions) +
@@ -463,8 +571,18 @@
 #endif
 #endif
 
-        for (uint seq_len = 0; seq_len < partition_seq_len / SUBGROUP_SIZE; seq_len++) {
+#if SG_SCALE_FACTOR > 1
+        // SUBGROUPS_PER_WG (HEAD_SIZE * SG_SCALE_FACTOR / SUBGROUP_SIZE)
+        const uint seq_len_start = (sgid / (HEAD_SIZE / SUBGROUP_SIZE)) * (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR / SUBGROUP_SIZE);
+        const uint seq_len_end = min(seq_len_start + (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR / SUBGROUP_SIZE), partition_seq_len / SUBGROUP_SIZE);
+#else
+        const uint seq_len_start = 0;
+        const uint seq_len_end = partition_seq_len / SUBGROUP_SIZE;
+#endif
+
+        for (uint seq_len = seq_len_start; seq_len < seq_len_end; seq_len++) {
 #ifdef BEAM_TABLE_TYPE
+            // TODO: Handle beam search
             uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE)];
             uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE);
 #else
@@ -473,6 +591,27 @@
 #else
             uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx);
 #endif
+#endif
+
+            // if (start_partition_idx < 8096)
+            //     value_offset = 0;
+
+#if IS_KV_COMPRESSED
+#ifndef BEAM_TABLE_TYPE
+            const uint b_idx = b0_idx;
+#endif
+#ifdef COMPRESSED_PER_HEAD
+            // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1]
+            // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0);
+            const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx +
(seq_len * SUBGROUP_SIZE) + sglid, 0); +#else + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif #endif OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; @@ -481,10 +620,29 @@ KERNEL(sdpa_opt)( } unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { +#if IS_KV_COMPRESSED + + #define VALUE_BLOCK_READ_TEST(ptr, offset) BLOCK_READN(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define VALUE_BLOCK_NEW MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + + #define VALUE_BLOCK_READ_TEST1(ptr, offset) VALUE_BLOCK_READ(ptr, offset) + #define VALUE_BLOCK_NEW1 INPUT2_TYPE +#ifdef BEAM_TABLE_TYPE + INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); +#else + INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, value_offset); +#endif +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_val_compressed - sub_group_broadcast(value_comp_zp, i)) * sub_group_broadcast(value_comp_scale, i); +#else + VALUE_COMPRESSION_SCALE_TYPE value_val = value_val_compressed * sub_group_broadcast(value_comp_scale, i); +#endif +#else #ifdef BEAM_TABLE_TYPE INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); #else INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); +#endif #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); @@ -496,33 +654,86 @@ KERNEL(sdpa_opt)( } } - const uint seq_len_leftovers_start = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; - for (uint seq_len = seq_len_leftovers_start; seq_len < partition_seq_len; seq_len++) { + +#if SG_SCALE_FACTOR > 1 + if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) { +#endif + + for (uint seq_len = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; seq_len < partition_seq_len; seq_len++) { #ifdef INPUT2_DIMS_ORDER #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx)]; #else const uint b_idx = b0_idx; #endif - const uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx); + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx); #else const uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, head_size_idx); #endif + // if (start_partition_idx < 8096) + // value_offset = 0; + +#if IS_KV_COMPRESSED +#ifndef INPUT2_DIMS_ORDER + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1] + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len, 0, 0, b1_idx, 0); + const uint value_scale_comp_offset = 
GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); +#else + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len, 0, 0, 0, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + seq_len, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif +#endif + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; } +#if IS_KV_COMPRESSED + INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, value_offset); +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_val_compressed - value_comp_zp) * value_comp_scale; +#else + VALUE_COMPRESSION_SCALE_TYPE value_val = value_val_compressed * value_comp_scale; +#endif +#else INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); +#endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc[seq_idx] = mad(qk_val[seq_idx], value_val, acc[seq_idx]); } } +#if SG_SCALE_FACTOR > 1 + } // if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) +#endif + +#if SG_SCALE_FACTOR > 1 + if ((partition_seq_len > (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR)) || (partition_seq_len % SUBGROUP_SIZE != 0)) { + if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) { + query_local[head_size_idx] = acc[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { + acc[0] += query_local[head_size_idx]; + } + } +#endif + // If the number of partitions is greater than 1, save results to the temporary buffer; // otherwise, save results directly to the main output. 
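    /* SG_SCALE_FACTOR recap (describes the reduction just above): when SG_SCALE_FACTOR == 2 the
     * work group runs HEAD_SIZE * 2 / SUBGROUP_SIZE subgroups, the two halves accumulate Gemm2
     * partial results over different halves of the sequence partition, the upper half parks its
     * partial sums in SLM (query_local is reused as scratch), and the lower half adds them in
     * before writing the final output below. */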
+#if SG_SCALE_FACTOR > 1 + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { +#endif if (num_of_partitions > 1) { const uint seq_idx_end = 1; for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { @@ -542,6 +753,9 @@ KERNEL(sdpa_opt)( output[output_offset] = acc[seq_idx]; } } +#if SG_SCALE_FACTOR > 1 + } // if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { +#endif } // Gemm2 calculation end } @@ -675,6 +889,10 @@ KERNEL(sdpa_opt)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -743,12 +961,18 @@ KERNEL(sdpa_opt)( #endif uint query_local_offset = head_size_idx * TARGET_SEQ_LEN_BLOCK_SIZE; +#if HAS_SCALE_INPUT + const OUTPUT_TYPE scale_val = *scale; +#else + const OUTPUT_TYPE scale_val = TO_OUTPUT_TYPE(STATIC_SCALE_VALUE); +#endif + if (cur_target_seq_len_size != TARGET_SEQ_LEN_BLOCK_SIZE) { if (sgid * SUBGROUP_SIZE < HEAD_SIZE) { for (uint seq_idx = 0; seq_idx < cur_target_seq_len_size; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -759,7 +983,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -769,7 +993,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -780,7 +1004,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -788,7 +1012,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -862,10 +1086,28 @@ KERNEL(sdpa_opt)( PA_BUFFERS); if (seq_len_calc_size >= SUBGROUP_SIZE) { +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); +#else + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, seq_len + sglid, 0); +#endif + KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; +#if 
USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE key_comp_zp = key_scale[key_scale_comp_offset + 1]; +#endif + // printf("[0]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); +#endif + __attribute__((opencl_unroll_hint(1))) for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); - #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) QUERY_VEC queries_vec; uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; @@ -875,10 +1117,23 @@ KERNEL(sdpa_opt)( } unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { +#if IS_KV_COMPRESSED +#ifdef BEAM_TABLE_TYPE + INPUT1_TYPE key_vals_compressed = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); +#else + INPUT1_TYPE key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); +#endif +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE key_vals = (TO_KEY_COMPRESSION_SCALE_TYPE(key_vals_compressed) - sub_group_broadcast(key_comp_zp, key_row_idx)) * sub_group_broadcast(key_comp_scale, key_row_idx); +#else + KEY_COMPRESSION_SCALE_TYPE key_vals = TO_KEY_COMPRESSION_SCALE_TYPE(key_vals_compressed) * sub_group_broadcast(key_comp_scale, key_row_idx); +#endif +#else #ifdef BEAM_TABLE_TYPE INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); #else INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); +#endif #endif unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { @@ -887,12 +1142,34 @@ KERNEL(sdpa_opt)( } } } else if (seq_len_calc_size > 0) { +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx,b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); +#else + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, seq_len + sglid, 0); +#endif + KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE key_comp_zp = key_scale[key_scale_comp_offset + 1]; +#endif + // printf("[1]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); +#endif __attribute__((opencl_unroll_hint(1))) for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { - #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); - #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset) + #define QUERY_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#if IS_KV_COMPRESSED + #define KEY_VEC_TYPE MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#else + #define KEY_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#endif - QUERY_VEC queries_vec; + QUERY_VEC_TYPE 
queries_vec; uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; unroll_for (uint q_row_idx = 0; q_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; q_row_idx++) { queries_vec[q_row_idx] = slm_query[query_local_offset]; @@ -900,18 +1177,49 @@ KERNEL(sdpa_opt)( } #ifndef LOAD_KEY_LEFTOVERS_IN_CALC_LOOP - QUERY_VEC key_vec = 0; + KEY_VEC_TYPE key_vec = 0; unroll_for (uint key_row_idx = 0; key_row_idx < seq_len_calc_size; key_row_idx++) { + #if IS_KV_COMPRESSED + #ifdef BEAM_TABLE_TYPE + key_vec[key_row_idx] = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index)); + #else + key_vec[key_row_idx] = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); + #endif +#if USE_ASYMMETRIC_QUANTIZATION + key_vec[key_row_idx] = (key_vec[key_row_idx] - sub_group_broadcast(key_comp_zp, key_row_idx)) * sub_group_broadcast(key_comp_scale, key_row_idx); +#else + key_vec[key_row_idx] *= sub_group_broadcast(key_comp_scale, key_row_idx); +#endif + #else #ifdef BEAM_TABLE_TYPE key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); #else key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); + #endif #endif } #endif unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { #ifdef LOAD_KEY_LEFTOVERS_IN_CALC_LOOP + #if IS_KV_COMPRESSED + #ifdef BEAM_TABLE_TYPE + KEY_COMPRESSION_SCALE_TYPE key_vals = 0; + if (key_row_idx < seq_len_calc_size) + key_vals = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index)); + #else + KEY_COMPRESSION_SCALE_TYPE key_vals = 0; + if (key_row_idx < seq_len_calc_size) { + key_vals = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); + // printf("_%d %d %d. Loads key [%d] = %f\n", get_global_id(0), get_global_id(1), get_global_id(2), key_offset + key_row_idx * key_pitch + head_idx_index, key_vals); + } + #endif +#if USE_ASYMMETRIC_QUANTIZATION + key_vals = (key_vals - sub_group_broadcast(key_comp_zp, key_row_idx)) * sub_group_broadcast(key_comp_scale, key_row_idx); +#else + key_vals *= sub_group_broadcast(key_comp_scale, key_row_idx); +#endif + #else #ifdef BEAM_TABLE_TYPE INPUT1_TYPE key_vals = 0; if (key_row_idx < seq_len_calc_size) @@ -921,6 +1229,7 @@ KERNEL(sdpa_opt)( if (key_row_idx < seq_len_calc_size) key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); #endif + #endif #else #define key_vals key_vec[key_row_idx] #endif @@ -933,12 +1242,7 @@ KERNEL(sdpa_opt)( { unroll_for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { -#if HAS_SCALE_INPUT - const OUTPUT_TYPE scale_val = *scale; -#else - const OUTPUT_TYPE scale_val = TO_OUTPUT_TYPE(STATIC_SCALE_VALUE); -#endif - qk_acc[i] *= scale_val; + #ifdef HAS_ALIBI const int alibi_val = (1 - SOURCE_SEQ_LEN) + seq_len + i; @@ -947,6 +1251,8 @@ KERNEL(sdpa_opt)( qk_acc[i] = INPUT0_MIN_FUNC(INPUT0_MAX_FUNC(qk_acc[i], INPUT0_VAL_MIN), INPUT0_VAL_MAX); + // printf("%d %d %d. 
qk_acc=%v16f\n", get_global_id(0), get_global_id(1), get_global_id(2), qk_acc); + qk_max = SOFTMAX_ACCUMULATOR_MAX_FUNC(qk_max, TO_SOFTMAX_ACCUMULATOR_TYPE(qk_acc[i])); } } @@ -1006,6 +1312,16 @@ KERNEL(sdpa_opt)( barrier(CLK_LOCAL_MEM_FENCE); } + // if (get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0) { + // for (int i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { + // printf("row %d: ", i); + // for (int j = 0; j < 16; j++) { + // printf("%f ", slm_qk_vals[SEQ_LEN_PARTITION_SIZE * i + j]); + // } + // printf("\n"); + // } + // } + { // QK*V calculation MAKE_VECTOR_TYPE(OUTPUT_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) acc_output_res = OUTPUT_VAL_ZERO; @@ -1050,15 +1366,45 @@ KERNEL(sdpa_opt)( qk_val[seq_idx] = slm_qk_vals[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len + sglid]; } +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, b1_idx, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len) + sglid, 0); +#else + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, 0, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + (seq_len) + sglid, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif + // printf("[0]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); +#endif + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { +#define UNCOMPPRESED_VALUE_TYPE half +#define TO_UNCOMPPRESED_VALUE_TYPE(val) convert_half(val) #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i))); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, value_offset)); +#endif + +#if IS_KV_COMPRESSED +#if USE_ASYMMETRIC_QUANTIZATION + value_val = (value_val - sub_group_broadcast(value_comp_zp, i)) * sub_group_broadcast(value_comp_scale, i); +#else + value_val *= sub_group_broadcast(value_comp_scale, i); +#endif #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); } +#undef UNCOMPPRESED_VALUE_TYPE +#undef TO_UNCOMPPRESED_VALUE_TYPE #ifndef BEAM_TABLE_TYPE value_offset += value_pitch; @@ -1093,20 +1439,51 @@ KERNEL(sdpa_opt)( #endif #endif +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); +#else + // const uint 
value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif + // printf("[1]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); +#endif + MAKE_VECTOR_TYPE(OUTPUT_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) qk_val; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { qk_val[seq_idx] = slm_qk_vals[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len * SUBGROUP_SIZE + sglid]; } unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { +#define UNCOMPPRESED_VALUE_TYPE half +#define TO_UNCOMPPRESED_VALUE_TYPE(val) convert_half(val) #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i))); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, value_offset)); #endif + +#if IS_KV_COMPRESSED +#if USE_ASYMMETRIC_QUANTIZATION + value_val = (value_val - sub_group_broadcast(value_comp_zp, i)) * sub_group_broadcast(value_comp_scale, i); +#else + value_val *= sub_group_broadcast(value_comp_scale, i); +#endif +#endif + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); } +#undef UNCOMPPRESED_VALUE_TYPE +#undef TO_UNCOMPPRESED_VALUE_TYPE #ifndef BEAM_TABLE_TYPE value_offset += value_pitch; @@ -1144,17 +1521,47 @@ KERNEL(sdpa_opt)( #endif #endif +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, b1_idx, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len_leftovers_start + sglid, 0); +#else + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, 0, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + seq_len_leftovers_start + sglid, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif + // printf("[2]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); +#endif + for (uint seq_len_idx = 0; seq_len_idx < partition_seq_len - seq_len_leftovers_start; seq_len_idx++) { +#define UNCOMPPRESED_VALUE_TYPE half +#define TO_UNCOMPPRESED_VALUE_TYPE(val) convert_half(val) #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, seq_len_idx)); + 
UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, seq_len_idx))); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, value_offset)); +#endif + +#if IS_KV_COMPRESSED +#if USE_ASYMMETRIC_QUANTIZATION + value_val = (value_val - sub_group_broadcast(value_comp_zp, seq_len_idx)) * sub_group_broadcast(value_comp_scale, seq_len_idx); +#else + value_val *= sub_group_broadcast(value_comp_scale, seq_len_idx); +#endif #endif for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], seq_len_idx), value_val, acc_output_res[seq_idx]); } +#undef UNCOMPPRESED_VALUE_TYPE +#undef TO_UNCOMPPRESED_VALUE_TYPE #ifndef BEAM_TABLE_TYPE value_offset += value_pitch; #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl index 83e3c7c7e9fef1..8efec101151dd2 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl @@ -111,7 +111,9 @@ inline uint FUNC(get_bt_index_value)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uin } #endif + #define APPLY_SCALE_TO_QUERY 1 +#define HAS_KV_CACHE_ZP_INPUT USE_ASYMMETRIC_QUANTIZATION && !COMBINE_SCALES_AND_ZP KERNEL(sdpa_ref)( OPTIONAL_SHAPE_INFO_ARG @@ -125,6 +127,14 @@ KERNEL(sdpa_ref)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if USE_ASYMMETRIC_QUANTIZATION + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#if HAS_KV_CACHE_ZP_INPUT + const __global KEY_COMPRESSION_ZP_TYPE* key_zp, + const __global VALUE_COMPRESSION_ZP_TYPE* val_zp, +#endif +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -162,7 +172,28 @@ KERNEL(sdpa_ref)( #else INPUT0_TYPE q_val = query_input[query_offset]; #endif +#if IS_KV_COMPRESSED + INPUT1_TYPE k_val_comp = key_input[key_offset]; + half k_val = (half)k_val_comp; +#ifdef COMPRESSED_PER_HEAD + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1 / BROADCAST_GROUP_SIZE, s, 0); +#else + // const uint key_scale_comp_offset = s; + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, s, 0); +#endif +#if USE_ASYMMETRIC_QUANTIZATION +#if HAS_KV_CACHE_ZP_INPUT + k_val = (k_val - key_zp[key_scale_comp_offset]) * key_scale[key_scale_comp_offset]; +#else + k_val = (k_val - key_scale[key_scale_comp_offset + 1]) * key_scale[key_scale_comp_offset]; +#endif + +#else + k_val *= key_scale[key_scale_comp_offset]; +#endif +#else INPUT1_TYPE k_val = key_input[key_offset]; +#endif acc += q_val * k_val; } @@ -236,7 +267,27 @@ KERNEL(sdpa_ref)( #endif uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1, 0, 0, s, head_size_idx); +#if IS_KV_COMPRESSED + INPUT2_TYPE __value = value_input[value_offset]; + half value = (half)__value; + #ifdef COMPRESSED_PER_HEAD + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1 / BROADCAST_GROUP_SIZE, s, 0); + #else + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, s, 0); + #endif +#if USE_ASYMMETRIC_QUANTIZATION +#if HAS_KV_CACHE_ZP_INPUT + value = (value - val_zp[value_scale_comp_offset]) * 
val_scale[value_scale_comp_offset];
+#else
+            value = (value - val_scale[value_scale_comp_offset + 1]) * val_scale[value_scale_comp_offset];
+#endif
+#else
+            value *= val_scale[value_scale_comp_offset];
+#endif
+            acc += tmp_buf[tmp_buf_offset] * value;
+#else
             acc += tmp_buf[tmp_buf_offset] * value_input[value_offset];
+#endif
         }
 
         uint output_offset = OUTPUT_GET_INDEX(b0, b1, target_seq_idx, head_size_idx);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp
new file mode 100644
index 00000000000000..6e422f3174cf73
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp
@@ -0,0 +1,327 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "dynamic_quantize_kernel_opt_generic.h"
+#include "kernel_selector_utils.h"
+#include <string>
+
+
+static constexpr size_t subgroup_size = 16;
+
+namespace kernel_selector {
+static Tensor::NDims get_normalized_dims(const DataTensor& tensor) {
+    auto dims = tensor.GetDims();
+    std::reverse(dims.begin(), dims.end());
+
+    return dims;
+}
+
+static size_t get_elements_number_per_batch(const dynamic_quantize_params& params) {
+    const auto& group_sizes = params.group_sizes;
+    const auto& input_dims = get_normalized_dims(params.inputs[0]);
+
+    auto total_elements_number = 1;
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] != UINT64_MAX) {
+            GPU_DEBUG_TRACE_DETAIL << "Multiply " << input_dims[i].v << "\n";
+            total_elements_number *= input_dims[i].v;
+        }
+    }
+
+    return total_elements_number;
+}
+
+static size_t get_elements_number_per_group(const dynamic_quantize_params& params) {
+    const auto& group_sizes = params.group_sizes;
+    const auto& input_dims = get_normalized_dims(params.inputs[0]);
+
+    auto total_elements_number = 1;
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] == UINT64_MAX) {
+            GPU_DEBUG_TRACE_DETAIL << "-> Multiply " << input_dims[i].v << "\n";
+            total_elements_number *= input_dims[i].v;
+        } else {
+            GPU_DEBUG_TRACE_DETAIL << "=> Multiply " << group_sizes[i] << "\n";
+            total_elements_number *= group_sizes[i];
+        }
+    }
+
+    return total_elements_number;
+}
+
+static std::string generate_dims_indexes_calculation(std::vector<std::pair<std::string, std::string>> dims,
+                                                     std::pair<std::string, std::string> append_axis_info = {}) {
+    std::reverse(dims.begin(), dims.end());  // reorder dims in order from innermost to outermost dimensions
+
+    auto generate_calc_function = [&](std::string data_type, std::string index_var, size_t dim_idx) {
+        std::string index_calc_str;
+        index_calc_str += "" + data_type + " " + dims[dim_idx].first + " = ";
+        index_calc_str += "((" + index_var + " / ";
+        index_calc_str += "(1";
+        for (size_t i = 0; i < dim_idx; i++) {
+            index_calc_str += " * " + dims[i].second;
+        }
+        index_calc_str += ")) % " + dims[dim_idx].second + ")";
+
+        if (append_axis_info.first == dims[dim_idx].first) {
+            index_calc_str += " + " + append_axis_info.second;
+        }
+
+        index_calc_str += ";";
+        return index_calc_str;
+    };
+
+    std::stringstream indexes_calc_str;
+    for (size_t i = 0; i < dims.size(); i++) {
+        indexes_calc_str << generate_calc_function("uint", "data_idx", i);
+    }
+
+    return indexes_calc_str.str();
+}
+
+// static size_t get_innermost_group_size(const dynamic_quantize_params& params) {
+//     const auto& group_sizes = params.group_sizes;
+//     const auto& input_dims = get_normalized_dims(params.inputs[0]);
+
+//     for (size_t i = group_sizes.size(); i > 0; i--) {
+//         if (group_sizes[i - 1] == UINT64_MAX) {
+//             return input_dims[i - 1].v;
+//         } else if (group_sizes[i - 1] != 1) {
+//             return group_sizes[i - 1];
+//         }
+//     }
+
+//     return 1;
+// }
+
+// static size_t get_match_vector_size(const dynamic_quantize_params& params) {
+//     // const auto input_dt = BytesPerElement(params.inputs[0].GetDType());
+//     auto block_sizes = { 8, 4, 2 };
+
+//     for (auto block_size : block_sizes) {
+//         if (((params.inputs[0].X().v * params.inputs[0].Y().v) / simd) % block_size == 0) {
+//             return block_size;
+//         }
+//     }
+
+//     return 1;
+// }
+
+static size_t get_per_iter_elements_number(const dynamic_quantize_params& params) {
+    const auto maxWorkGroupSize = params.engineInfo.maxWorkGroupSize;
+    const auto total_grouped_elements = get_elements_number_per_group(params);
+
+    if (total_grouped_elements % maxWorkGroupSize == 0)
+        return maxWorkGroupSize;
+
+    if (total_grouped_elements < maxWorkGroupSize)
+        return total_grouped_elements;
+
+    return 0;
+}
+
+ParamsKey DynamicQuantizeKernelOptGeneric::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableDifferentTypes();
+    k.EnableAllInputLayout();
+    k.EnableAllOutputLayout();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBatching();
+    k.EnableDynamicShapesSupport();
+    return k;
+}
+
+JitConstants DynamicQuantizeKernelOptGeneric::GetJitConstants(const dynamic_quantize_params& params) const {
+    JitConstants jit = MakeBaseParamsJitConstants(params);
+
+    const std::vector<std::pair<std::string, std::string>> default_dims = {{"b", "INPUT0_BATCH_NUM"},
+                                                                           {"f", "INPUT0_FEATURE_NUM"},
+                                                                           {"y", "INPUT0_SIZE_Y"},
+                                                                           {"x", "INPUT0_SIZE_X"}};
+
+    const auto& group_sizes = params.group_sizes;
+    std::vector<std::pair<std::string, std::string>> batch_dims, grouped_dims;
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] == 1)
+            batch_dims.push_back(default_dims[i]);
+        else
+            grouped_dims.push_back(default_dims[i]);
+    }
+
+    const auto& input_dims = get_normalized_dims(params.inputs[0]);
+    const auto total_grouped_elements = get_elements_number_per_group(params);
+    const auto per_iter_elements_number = get_per_iter_elements_number(params);
+    const auto total_subgroups_number = total_grouped_elements / input_dims.back().v;
+
+    // drop the last dimension, since it is processed inside the kernel
+    grouped_dims.pop_back();
+
+    const bool append_mode = params.append_axis != -1;
+    std::pair<std::string, std::string> append_axis_info = {};
+    if (append_mode) {
+        // TODO: confirm whether append_axis_info is needed here; the kernel currently applies axis_offset via APPEND_AXIS_NAME instead
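        // Illustrative note: in append mode the kernel also receives a runtime axis_offset scalar
        // and performs "APPEND_AXIS_NAME += axis_offset" on the .cl side, so newly quantized
        // tokens are written past the data already stored in the KV cache. For reference, with
        // batch dims {b, f} the generated DECLARE_BATCHED_DIMS_INDEXES(data_idx) macro expands
        // to roughly:
        //   uint f = ((data_idx / (1)) % INPUT0_FEATURE_NUM);
        //   uint b = ((data_idx / (1 * INPUT0_FEATURE_NUM)) % INPUT0_BATCH_NUM);
        // (append_axis_info would additionally append "+ axis_offset" to the append axis, but it
        // is currently left empty, see the commented-out line below).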
+        // append_axis_info = { default_dims[params.append_axis].first, "axis_offset" };  // axis and input scalar name
+
+        jit.AddConstant(MakeJitConstant("APPEND_MODE", append_mode));
+        jit.AddConstant(MakeJitConstant("APPEND_AXIS_NAME", default_dims[params.append_axis].first));
+    }
+
+    jit.AddConstant(MakeJitConstant("DECLARE_BATCHED_DIMS_INDEXES(data_idx)", generate_dims_indexes_calculation(batch_dims, append_axis_info)));
+    jit.AddConstant(MakeJitConstant("DECLARE_GROUPED_DIMS_INDEXES(data_idx)", generate_dims_indexes_calculation(grouped_dims)));
+    jit.AddConstant(MakeJitConstant("SUBGROUPS_NUMBER", total_subgroups_number));
+
+    const auto iterations_number = total_grouped_elements / per_iter_elements_number;
+
+    jit.AddConstant(MakeJitConstant("ITERATIONS_NUMBER", iterations_number));
+    jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization));
+    jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp));
+
+    bool rearrange_scales_order = false;
+    const auto& scales_output_order = params.scales_output_order;
+    if (!scales_output_order.empty()) {
+        for (size_t i = 0; i < scales_output_order.size(); i++) {
+            if (i != scales_output_order[i]) {
+                rearrange_scales_order = true;
+                break;
+            }
+        }
+    }
+
+    if (rearrange_scales_order) {
+        const std::array<char, 4> default_dim_order = {'b', 'f', 'y', 'x'};
+
+        std::stringstream ss;
+        for (size_t i = 0; i < scales_output_order.size(); i++) {
+            ss << default_dim_order[scales_output_order[i]];
+
+            if (i + 1 != scales_output_order.size())
+                ss << ", ";
+        }
+
+        jit.AddConstant(MakeJitConstant("SCALES_OUTPUT_ORDER", ss.str()));
+        GPU_DEBUG_TRACE_DETAIL << "SCALES_OUTPUT_ORDER: " << ss.str() << "\n";
+    }
+
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i]));
+    }
+
+    return jit;
+}
+
+CommonDispatchData DynamicQuantizeKernelOptGeneric::SetDefault(const dynamic_quantize_params& params) const {
+    CommonDispatchData dispatchData;
+
+    const auto& input_dims = get_normalized_dims(params.inputs[0]);
+    const auto total_batched_elements = get_elements_number_per_batch(params);
+    const auto total_grouped_elements = get_elements_number_per_group(params);
+    const auto total_subgroups_number = total_grouped_elements / input_dims.back().v;
+    // const auto per_iter_elements_number = get_per_iter_elements_number(params);
+
+    // TODO: add check that input_dims.back().v / SUBGROUP_SIZE is enough to allocate private array inside kernel
+
+    dispatchData.gws = {subgroup_size, total_subgroups_number, total_batched_elements};
+    dispatchData.lws = {subgroup_size, total_subgroups_number, 1};
+
+    return dispatchData;
+}
+
+void DynamicQuantizeKernelOptGeneric::GetUpdateDispatchDataFunc(KernelData& kd) const {
+    kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
+        const auto& prim_params = static_cast<const dynamic_quantize_params&>(params);
+        auto dispatchData = SetDefault(prim_params);
+        OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
+        kd.kernels[0].params.workGroups.global = dispatchData.gws;
+        kd.kernels[0].params.workGroups.local = dispatchData.lws;
+        kd.kernels[0].skip_execution = false;
+
+        if (prim_params.append_axis != -1) {
+            kd.kernels[0].params.scalars.clear();
+
+            ScalarDescriptor axis_offset;
+            axis_offset.t = ScalarDescriptor::Types::UINT32;
+            axis_offset.v.u32 = static_cast<uint32_t>(prim_params.axis_offset);
+            kd.kernels[0].params.scalars.push_back(axis_offset);
+        }
+
+        GPU_DEBUG_TRACE_DETAIL << "Update Dispatch data DynamicQuantizeKernelOptGeneric gws : " << dispatchData.gws[0] << ", "
+                               << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl;
+    };
+}
+
+KernelsData DynamicQuantizeKernelOptGeneric::GetKernelsData(const Params& params) const {
+    assert(params.GetType() == KernelType::DYNAMIC_QUANTIZE);
+
+    if (!Validate(params))
+        return {};
+
+    const dynamic_quantize_params& prim_params = static_cast<const dynamic_quantize_params&>(params);
+    auto dispatchData = SetDefault(prim_params);
+
+    KernelData kd = KernelData::Default<dynamic_quantize_params>(params);
+
+    auto cldnn_jit = GetJitConstants(prim_params);
+    auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params);
+    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+    GetUpdateDispatchDataFunc(kd);
+
+    auto& kernel = kd.kernels[0];
+    FillCLKernelData(kernel,
+                     dispatchData,
+                     params.engineInfo,
+                     kernelName,
+                     jit,
+                     entry_point,
+                     EXE_MODE_DEFAULT,
+                     false,
+                     false,
+                     1,
+                     GetFusedPrimitiveInputsCount(params),
+                     static_cast<int>(prim_params.outputs.size()),
+                     prim_params.is_shape_agnostic);
+
+    if (prim_params.append_axis != -1)
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0});
+
+    return {kd};
+}
+
+KernelsPriority DynamicQuantizeKernelOptGeneric::GetKernelsPriority(const Params& /*params*/) const {
+    return FORCE_PRIORITY_1;
+}
+
+bool DynamicQuantizeKernelOptGeneric::Validate(const Params& params) const {
+    if (!KernelBaseOpenCL::Validate(params))
+        return false;
+
+    const auto& dq_params = static_cast<const dynamic_quantize_params&>(params);
+
+    const auto& group_sizes = dq_params.group_sizes;
+    const auto& input_dims = get_normalized_dims(dq_params.inputs[0]);
+    const size_t non_compressed_dims_number = std::count(group_sizes.begin(), group_sizes.end(), 1);
+
+    if (non_compressed_dims_number == group_sizes.size())
+        return false;
+
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] != 1 && input_dims[i].is_dynamic) {
+            return false;
+        }
+    }
+
+    // last dimension should be static, reduced by group_sizes configuration and divisible by 16
+    if (group_sizes.back() == 1 || input_dims.back().is_dynamic || input_dims.back().v % subgroup_size != 0)
+        return false;
+
+    if (dq_params.inputs[0].GetPaddedVal() != 0 || dq_params.outputs[0].GetPaddedVal() != 0)
+        return false;
+
+    return true;
+}
+}  // namespace kernel_selector
+
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h
new file mode 100644
index 00000000000000..f5148ccc1d9a1f
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h
@@ -0,0 +1,30 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "kernel_base_opencl.h"
+#include "dynamic_quantize_kernel_ref.h"
+
+namespace kernel_selector {
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// dynamic_quantize_params
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class DynamicQuantizeKernelOptGeneric : public KernelBaseOpenCL {
+public:
+    DynamicQuantizeKernelOptGeneric() : KernelBaseOpenCL("dynamic_quantize_gpu_opt_generic") {}
+    virtual ~DynamicQuantizeKernelOptGeneric() {}
+
+    virtual JitConstants GetJitConstants(const dynamic_quantize_params& params) const;
+ virtual CommonDispatchData SetDefault(const dynamic_quantize_params& params) const; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + Datatype GetAccumulatorType(const dynamic_quantize_params& params) const; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params&) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index 3b214848e2f8ad..87a89e7c043b10 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -26,6 +26,47 @@ JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_pa jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + auto print_arr = [&](const std::vector& vec, size_t max_len, std::string name) { + std::stringstream ss; + for (size_t i = 0; i < std::min(max_len, vec.size()); i++) { + ss << vec[i] << ", "; + } + GPU_DEBUG_TRACE_DETAIL << "Array " << name << " (len=" << vec.size() << ") content: " << ss.str() << "\n"; + }; + + bool rearrange_scales = false; + const auto& scales_output_order = params.scales_output_order; + if (!scales_output_order.empty()) { + for (size_t i = 0; i < scales_output_order.size(); i++) { + if (i != scales_output_order[i]) { + rearrange_scales = true; + break; + } + } + } + + if (rearrange_scales) { + const std::array default_dim_order = {'b', 'f', 'y', 'x'}; + + std::stringstream ss; + for (size_t i = 0; i < scales_output_order.size(); i++) { + ss << default_dim_order[scales_output_order[i]]; + + if (i + 1 != scales_output_order.size()) + ss << ", "; + } + + jit.AddConstant(MakeJitConstant("SCALES_OUTPUT_ORDER", ss.str())); + GPU_DEBUG_TRACE_DETAIL << "SCALES_OUTPUT_ORDER: " << ss.str() << "\n"; + } + + print_arr(params.group_sizes, params.group_sizes.size(), "group_sizes"); + + const auto& group_sizes = params.group_sizes; + for (size_t i = 0; i < group_sizes.size(); i++) { + jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i])); + } + return jit; } @@ -34,7 +75,17 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p CommonDispatchData dispatchData; OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); - dispatchData.gws = {params.outputs[0].Batch().v * params.outputs[0].Feature().v, 1, 1}; + + const auto& group_sizes = params.group_sizes; + auto batch_size = group_sizes[0] == 1 ? params.outputs[0].Batch().v : 1; + auto feature_size = group_sizes[1] == 1 ? params.outputs[0].Feature().v : 1; + auto y_size = group_sizes[2] == 1 ? params.outputs[0].Y().v : 1; + auto x_size = group_sizes[3] == 1 ? 
params.outputs[0].X().v : 1; + + dispatchData.gws = {batch_size * feature_size, y_size, x_size}; + // GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression + // dispatchData.gws[1] = params.outputs[0].Y().v; + // } dispatchData.lws = {1, 1, 1}; return dispatchData; @@ -94,6 +145,10 @@ bool DynamicQuantizeKernelRef::Validate(const Params& params) const { if (!KernelBaseOpenCL::Validate(params)) return false; + const auto& prim_params = static_cast(params); + if (prim_params.group_sizes.size() != 4) + return false; + return true; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h index ce52ed9fb19714..bdfd9f61291d21 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h @@ -12,6 +12,13 @@ namespace kernel_selector { /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct dynamic_quantize_params : public base_params { dynamic_quantize_params() : base_params(KernelType::DYNAMIC_QUANTIZE) {} + + int64_t append_axis = -1; + int64_t axis_offset = -1; + std::vector group_sizes; + std::vector scales_output_order; + bool use_asymmetric_quantization = false; + bool combine_scales_and_zp = false; }; class DynamicQuantizeKernelRef : public KernelBaseOpenCL { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp index 6ca9fbd2f5bd76..dd5ff4acaf06b1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp @@ -5,11 +5,21 @@ #include "dynamic_quantize_kernel_selector.h" #include "dynamic_quantize_kernel_ref.h" #include "dynamic_quantize_kernel_opt.h" +#include "dynamic_quantize_kernel_opt_generic.h" namespace kernel_selector { dynamic_quantize_kernel_selector::dynamic_quantize_kernel_selector() { Attach(); - Attach(); + int USE_REF_DQ = 0; + if (const auto env_var = std::getenv("USE_REF_DQ")) { + std::istringstream ss(env_var); + ss >> USE_REF_DQ; + } + + if (!USE_REF_DQ) { + Attach(); + } + // Attach(); } KernelsData dynamic_quantize_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp index 7556debd29df00..1b5deec382e460 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp @@ -4,6 +4,7 @@ #include "sdpa_kernel_base.h" #include "kernel_selector_utils.h" +#include "intel_gpu/runtime/debug_configuration.hpp" namespace kernel_selector { @@ -66,6 +67,7 @@ static std::string GetBroadcastInputStr(const size_t input_rank, const int64_t a } JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { + GPU_DEBUG_GET_INSTANCE(debug_config); auto jit = MakeBaseParamsJitConstants(params); if (params.conf.broadcast_axis != -1) { @@ 
-73,6 +75,8 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { jit.AddConstant(MakeJitConstant("DO_BROADCAST_KEY_VALUE", GetBroadcastInputStr(params.inputs[0].GetDims().size(), params.conf.broadcast_axis, params.conf.group_size))); + } else { + jit.AddConstant(MakeJitConstant("BROADCAST_GROUP_SIZE", 1)); } jit.AddConstant(MakeJitConstant("IS_CAUSAL", params.conf.is_causal)); @@ -81,6 +85,24 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", params.inputs.size() > 4)); } + jit.AddConstant(MakeJitConstant("IS_KV_COMPRESSED", params.conf.is_kv_compressed)); + jit.AddConstant(MakeJitConstant("USE_ASYMMETRIC_QUANTIZATION", params.conf.use_asymmetric_quantization)); + jit.AddConstant(MakeJitConstant("COMBINE_SCALES_AND_ZP", params.conf.combine_scales_and_zp)); + + if (params.conf.is_kv_compressed) { + jit.AddConstant(MakeJitConstant("KEY_COMPRESSION_SCALE", params.key_cache_comp_scale)); + jit.AddConstant(MakeJitConstant("VALUE_COMPRESSION_SCALE", params.value_cache_comp_scale)); + + if (params.conf.use_asymmetric_quantization && !params.conf.combine_scales_and_zp) { + jit.AddConstant(MakeJitConstant("KEY_COMPRESSION_ZP", params.key_cache_comp_zp)); + jit.AddConstant(MakeJitConstant("VALUE_COMPRESSION_ZP", params.value_cache_comp_zp)); + } + } + + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // FIXME: it should be placed in params + jit.AddConstant(MakeJitConstant("COMPRESSED_PER_HEAD", 1)); + } + auto is_default_order = [](const std::vector& order) { for (size_t i = 0; i < order.size(); i++) if (order[i] != static_cast(i)) @@ -140,6 +162,7 @@ bool SDPAKernelBase::Validate(const Params& p) const { if (params.outputs[0].Dimentions() != 4) return false; + // FIXME: i8 input is supported only when kv cache is compressed return true; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index 6ea8d85527d19d..ded25bc8c8ed9c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -88,6 +88,9 @@ struct sdpa_configuration { bool is_causal = false; bool has_alibi_input = false; + bool is_kv_compressed = false; + bool use_asymmetric_quantization = false; + bool combine_scales_and_zp = false; // Paged Attention configuration bool is_paged_attention = false; @@ -110,6 +113,10 @@ struct sdpa_params : public base_params { int64_t indirect_axis = -1; DataTensor beam_table; + DataTensor key_cache_comp_scale; + DataTensor key_cache_comp_zp; + DataTensor value_cache_comp_scale; + DataTensor value_cache_comp_zp; sdpa_configuration conf; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index 04ebf2f2165973..d615b97d75e0a5 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -342,6 +342,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const { if (params.conf.head_size > 256) return false; + if (params.conf.is_kv_compressed) + return false; + return true; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp index 2f0174d0a45912..d3c1b82002308c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp @@ -4,6 +4,7 @@ #include "sdpa_kernel_opt.h" #include "kernel_selector_utils.h" +#include "common_types.h" #include #include @@ -26,6 +27,18 @@ static size_t get_sg_number_scale_factor(const sdpa_params& sdpa_params, size_t if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize) { return optimal_scale_factor; } + } else if (kernel_type == KernelsTypes::SINGLE_TOKEN) { + int USE_SCALE_FACTOR = 0; + if (const auto env_var = std::getenv("USE_SCALE_FACTOR")) { + std::istringstream ss(env_var); + ss >> USE_SCALE_FACTOR; + } + if (USE_SCALE_FACTOR) { + const size_t optimal_scale_factor = 2; + if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize) { + return optimal_scale_factor; + } + } } return 1; @@ -126,6 +139,7 @@ static std::string GetKernelName(std::string base_name, KernelsTypes type, const ParamsKey SDPAKernelOpt::GetSupportedKey() const { ParamsKey k; + k.EnableInputDataType(Datatype::INT8); k.EnableInputDataType(Datatype::F16); k.EnableInputDataType(Datatype::F32); k.EnableInputDataType(Datatype::INT32); @@ -154,6 +168,9 @@ bool SDPAKernelOpt::Validate(const Params& p) const { if (params.conf.head_size < 1 || params.conf.head_size % subgroup_size != 0) return false; + if (params.conf.use_asymmetric_quantization && !params.conf.combine_scales_and_zp) + return false; + return true; } @@ -231,10 +248,11 @@ CommonDispatchData SDPAKernelOpt::SetDefault(const sdpa_params& params, size_t k const size_t target_seq_len_block_size = kernel_idx == 1 ? 
get_target_seq_len_block_size() : 1; if (kernel_idx == KernelsTypes::SINGLE_TOKEN) { + const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx); dispatch_data.gws = { batch_size * heads_num, CeilDiv(target_seq_len, target_seq_len_block_size), - head_size * num_of_partitions }; - dispatch_data.lws = { 1, 1, head_size }; + head_size * num_of_partitions * sg_num_scale }; + dispatch_data.lws = { 1, 1, head_size * sg_num_scale }; } else if (kernel_idx == KernelsTypes::MULTI_TOKENS) { const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx); dispatch_data.gws = { batch_size * heads_num, @@ -307,8 +325,20 @@ KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const { static_cast(prim_params.outputs.size()), prim_params.is_shape_agnostic); - if (prim_params.indirect_axis != -1 && kernel_idx != KernelsTypes::FINALIZATION) - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(prim_params.inputs.size())}); + auto beam_table_idx = prim_params.inputs.size(); + if (prim_params.conf.is_kv_compressed && kernel_idx != KernelsTypes::FINALIZATION) { + auto key_cache_compression_scale_idx = static_cast(prim_params.inputs.size()); + auto value_cache_compression_scale_idx = static_cast(prim_params.inputs.size() + 1); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx}); + + beam_table_idx += 2; + } + + if (prim_params.indirect_axis != -1 && kernel_idx != KernelsTypes::FINALIZATION) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(beam_table_idx)}); + } kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); @@ -391,6 +421,6 @@ void SDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { } KernelsPriority SDPAKernelOpt::GetKernelsPriority(const Params& params) const { - return params.engineInfo.supports_immad ? 
FORCE_PRIORITY_2 : FORCE_PRIORITY_1; + return FORCE_PRIORITY_1; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp index 579c4bc06c17e2..d6fcad69b2beb8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp @@ -12,10 +12,12 @@ namespace kernel_selector { ParamsKey SDPAKernelRef::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::INT8); k.EnableInputDataType(Datatype::F32); // beam table input k.EnableInputDataType(Datatype::INT32); + // FIXME: support for compressed KV cache k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); @@ -74,8 +76,28 @@ KernelsData SDPAKernelRef::GetKernelsData(const Params& params) const { "", false, false, static_cast(prim_params.inputs.size()), GetFusedPrimitiveInputsCount(params), 1, prim_params.is_shape_agnostic); - if (prim_params.indirect_axis != -1) - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(prim_params.inputs.size())}); + auto beam_table_idx = prim_params.inputs.size(); + if (prim_params.conf.is_kv_compressed) { + auto key_cache_compression_scale_idx = static_cast(prim_params.inputs.size()); + auto value_cache_compression_scale_idx = static_cast(prim_params.inputs.size() + 1); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx}); + GPU_DEBUG_TRACE_DETAIL << "sdpa_ref add scales args\n"; + + if (prim_params.conf.use_asymmetric_quantization && !prim_params.conf.combine_scales_and_zp) { + GPU_DEBUG_TRACE_DETAIL << "sdpa_ref add zero points args\n"; + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx + 2}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx + 2}); + beam_table_idx += 2; + } + + beam_table_idx += 2; + } + + if (prim_params.indirect_axis != -1) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(beam_table_idx)}); + } kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp index e65fd7fd10976b..6d157fe901efcb 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp @@ -13,11 +13,21 @@ namespace kernel_selector { sdpa_kernel_selector::sdpa_kernel_selector() { - Attach(); - Attach(); -#ifdef ENABLE_ONEDNN_FOR_GPU - Attach(); -#endif + int USE_REF_SDPA = 0; + if (const auto env_var = std::getenv("USE_REF_SDPA")) { + std::istringstream ss(env_var); + ss >> USE_REF_SDPA; + } + + if (!USE_REF_SDPA) { + Attach(); + Attach(); + #ifdef ENABLE_ONEDNN_FOR_GPU + // Attach(); + #endif + } else { + Attach(); + } } KernelsData sdpa_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp index 
7574b664b6b4b7..7ff2b14f79b569 100644 --- a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp @@ -152,5 +152,66 @@ VariableState::Ptr VariableStateIndirectKVCache::get_beam_table_state() const { return m_hidden_states[1]; } +VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed( + const VariableStateInfo& info, + std::shared_ptr context, + std::shared_ptr shape_predictor, + size_t beam_idx, + size_t concat_idx, + bool has_zp_state = false) + : VariableStateIndirectKVCache(info, context, shape_predictor, beam_idx, concat_idx), + m_has_zp_state(has_zp_state) { + const auto compression_scale_shape = ov::PartialShape::dynamic(info.m_layout.get_partial_shape().size()); + + // TODO: set proper data type + cldnn::layout compression_scale_layout(compression_scale_shape, + ov::element::f16, + cldnn::format::bfyx); + VariableStateInfo compression_scale_state_info(info.m_id + "/comp_scale", compression_scale_layout); + m_hidden_states.push_back(std::make_shared(compression_scale_state_info, context, shape_predictor)); + + if (has_zp_state) { + cldnn::layout compression_zp_layout(compression_scale_shape, + ov::element::f16, + cldnn::format::bfyx); + VariableStateInfo compression_scale_state_info(info.m_id + "/comp_zp", compression_zp_layout); + m_hidden_states.push_back(std::make_shared(compression_scale_state_info, context, shape_predictor)); + } + + OPENVINO_ASSERT((!m_has_zp_state && m_hidden_states.size() == 3) || (m_has_zp_state && m_hidden_states.size() == 4), + "[GPU] VariableStateIndirectKVCacheCompressed expects 3 or 4 internal states to be initialized, " + "actual number is ", m_hidden_states.size()); +} + +VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_scale_state() const { + return m_hidden_states[2]; +} + +void VariableStateIndirectKVCacheCompressed::set_compression_scale_layout(const cldnn::layout& new_layout) { + m_hidden_states[2]->set_layout(new_layout); +} + +VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_zp_state() const { + OPENVINO_ASSERT(m_has_zp_state); + return m_hidden_states[3]; +} + +void VariableStateIndirectKVCacheCompressed::set_compression_zp_layout(const cldnn::layout& new_layout) { + OPENVINO_ASSERT(m_has_zp_state); + m_hidden_states[3]->set_layout(new_layout); +} + +bool VariableStateIndirectKVCacheCompressed::has_zp_state() const { + return m_has_zp_state; +} + +void VariableStateIndirectKVCacheCompressed::set_state(const ov::SoPtr& state) { + OPENVINO_THROW("[GPU] set_state API is supported only when KV-cache compression is disabled"); +} + +ov::SoPtr VariableStateIndirectKVCacheCompressed::get_state() const { + OPENVINO_THROW("[GPU] get_state API is supported only when KV-cache compression is disabled"); +} + } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index 0373251e45c051..0fd0f61fafddaa 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -3,10 +3,20 @@ // #include "ov_ops/dynamic_quantize.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/primitives/dynamic_quantize.hpp" + +namespace ov { +namespace op { +namespace internal { +using DynamicQuantizeExtended = 
ov::intel_gpu::op::DynamicQuantize; +} // namespace internal +} // namespace op +} // namespace ov + namespace ov { namespace intel_gpu { @@ -15,19 +25,33 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_quantization_config()); + + p.add_primitive(*op, prim); +} + +static void CreateDynamicQuantizeExtendedOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {1}); + auto inputs = p.GetInputInfo(op); + std::string primitive_name = layer_type_name_ID(op); + auto group_sizes = op->get_group_sizes(); - for (size_t i = 0; i < group_sizes.size() - 1; i++) - OPENVINO_ASSERT(group_sizes[i] == 1, "Not supported group size at ", i, ": ", group_sizes[i]); - OPENVINO_ASSERT(group_sizes.back() == UINT64_MAX, "Not supported group size: ", group_sizes.back()); auto prim = cldnn::dynamic_quantize(primitive_name, - inputs[0], - op->get_group_sizes().back(), - get_output_data_types(op)); + inputs[0], + op->get_quantization_config(), + op->get_combine_scales_and_zp(), + op->get_scales_zp_output_order()); + + prim.num_outputs = op->get_output_size(); + p.add_primitive(*op, prim); } REGISTER_FACTORY_IMPL(internal, DynamicQuantize); +REGISTER_FACTORY_IMPL(internal, DynamicQuantizeExtended); } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp index c2ee336e48bf06..bfa4df330c2fc8 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp @@ -22,19 +22,26 @@ namespace intel_gpu { namespace { void CreateKVCacheOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {2, 3}); + validate_inputs_count(op, {2, 3, 4, 5}); auto inputs = p.GetInputInfo(op); int64_t rank = op->get_input_partial_shape(0).size(); auto prim = cldnn::kv_cache(layer_type_name_ID(op), - inputs, - op->get_variable()->get_info(), - ov::util::normalize(op->get_concat_axis(), rank), - ov::util::normalize(op->get_gather_axis(), rank), - op->get_indirect()); + inputs, + op->get_variable()->get_info(), + ov::util::normalize(op->get_concat_axis(), rank), + ov::util::normalize(op->get_gather_axis(), rank), + op->get_indirect()); prim.num_outputs = op->get_output_size(); prim.output_data_types = get_output_data_types(op); + if (op->get_kv_compressed()) { + prim.compressed = true; + prim.combine_scales_and_zp = op->get_combine_scales_and_zp(); + prim.quantization_config = op->get_quantization_config(); + prim.scales_zp_output_order = op->get_scales_zp_output_order(); + } + p.add_primitive(*op, prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp index 0ba780183b186d..a2ef2ad50bfcce 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp @@ -62,11 +62,13 @@ static void CreateSDPAOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {4, 5, 6}); auto inputs = p.GetInputInfo(op); auto layerName = layer_type_name_ID(op); bool is_causal = op->get_causal(); + const auto compression_inputs = op->get_compression_inputs_num(); + validate_inputs_count(op, {4 + compression_inputs, 5 + compression_inputs, 6 + compression_inputs}); + int64_t indirect_axis = op->get_indirect_axis(); auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, inputs, @@ -75,7 +77,10 @@ static void 
CreateIndirectSDPAOp(ProgramBuilder& p, const std::shared_ptrget_input0_transpose_order(), op->get_input1_transpose_order(), op->get_input2_transpose_order(), - op->get_output_transpose_order()); + op->get_output_transpose_order(), + op->get_kv_compressed(), + op->get_combine_scales_and_zp(), + op->get_quantization_config()); p.add_primitive(*op, sdpa_prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp index 9d7d6854009316..05c4ad64946272 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // + #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "openvino/core/type/element_type.hpp" @@ -9,6 +10,7 @@ #include "openvino/op/read_value.hpp" #include "transformations/rt_info/original_precision_attribute.hpp" #include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" #include "intel_gpu/primitives/assign.hpp" #include "intel_gpu/primitives/read_value.hpp" @@ -16,6 +18,7 @@ namespace ov { namespace op { namespace internal { using ReadValue = ov::intel_gpu::op::ReadValue; +using ReadValues = ov::intel_gpu::op::ReadValues; } // namespace internal } // namespace op } // namespace ov @@ -39,12 +42,33 @@ void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr +// void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr &op, +// const std::string &variable_id) { +// const auto output_pshape = op->get_output_partial_shape(0); +// const auto output_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(0)); +// const auto output_format = cldnn::format::get_default_format(output_pshape.size()); + +// const auto variable_layout = cldnn::layout{ output_pshape, output_dtype, output_format }; + +// auto inputs = p.GetInputInfo(op); +// auto user_specified_type = get_original_precision(op); +// const auto prim = T_PRIMITIVE{layer_type_name_ID(op), +// inputs, +// variable_id, +// variable_layout, +// user_specified_type, +// true}; + +// p.add_primitive(*op, prim); +// } + void CreateReadValueOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {0, 1}); CreateVariableAccessPrimitive(p, op, op->get_variable_id()); @@ -70,6 +94,26 @@ void CreateAssignOp(ProgramBuilder& p, const std::shared_ptr CreateVariableAccessPrimitive(p, op, op->get_variable_id()); } +void CreateReadValuesOp(ProgramBuilder& p, const std::shared_ptr& op) { + std::vector variable_layouts; + for (size_t i = 0; i < op->get_output_size(); i++) { + const auto output_pshape = op->get_output_partial_shape(i); + const auto output_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(i)); + const auto output_format = cldnn::format::get_default_format(output_pshape.size()); + variable_layouts.emplace_back(output_pshape, output_dtype, output_format); + } + + auto inputs = p.GetInputInfo(op); + auto user_specified_type = get_original_precision(op); + auto prim = cldnn::read_value{layer_type_name_ID(op), + inputs, + op->get_variable_id(), + variable_layouts, + user_specified_type}; + + p.add_primitive(*op, prim); +} + } // namespace REGISTER_FACTORY_IMPL(v3, Assign); @@ -77,6 +121,7 @@ REGISTER_FACTORY_IMPL(v6, Assign); REGISTER_FACTORY_IMPL(v3, ReadValue); REGISTER_FACTORY_IMPL(v6, ReadValue); REGISTER_FACTORY_IMPL(internal, ReadValue); +REGISTER_FACTORY_IMPL(internal, ReadValues); } // namespace 
intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 346b4471779593..df14d9cfefb7a0 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -643,16 +643,24 @@ void SyncInferRequest::allocate_states() { bool indirect_kv_cache = false; int64_t beam_axis = 0; int64_t concat_axis = 0; + bool compressed = false; + bool has_zp_state = false; auto kv_cache_shape = vi.second.m_layout.get_partial_shape(); for (auto& p : state_prims) { if (auto kv_cache_prim = dynamic_cast(p)) { indirect_kv_cache = kv_cache_prim->indirect; beam_axis = ov::util::normalize(kv_cache_prim->gather_axis, kv_cache_shape.size()); concat_axis = ov::util::normalize(kv_cache_prim->concat_axis, kv_cache_shape.size()); + compressed = kv_cache_prim->compressed; + if (compressed && !kv_cache_prim->combine_scales_and_zp) { + has_zp_state = kv_cache_prim->quantization_config.is_asymmetric_quantization(); + } } } - if (indirect_kv_cache) { + if (compressed) { + m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor, beam_axis, concat_axis, has_zp_state)); + } else if (indirect_kv_cache) { m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor, beam_axis, concat_axis)); } else { m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor)); @@ -719,6 +727,18 @@ std::vector SyncInferRequest::prepare_input(const std::string auto user_tensor = user_tensor_wrapper.ptr; auto element_type = user_tensor->get_element_type(); + // auto print_arr = [&](int64_t* vec, size_t max_len, std::string name) { + // std::stringstream ss; + // for (size_t i = 0; i < max_len; i++) { + // ss << vec[i] << ", "; + // } + // std::cout << "Array " << name << " (len=" << max_len << ") content: " << ss.str() << "\n"; + // }; + + // if (internal_name == "parameter:input_ids") { + // print_arr(user_tensor->data(), user_tensor->get_size(), "parameter:input_ids"); + // } + auto remote_tensor_impl_ptr = std::dynamic_pointer_cast(user_tensor); auto iremote_tensor_ptr = std::dynamic_pointer_cast(user_tensor); auto usm_host_ptr = std::dynamic_pointer_cast(user_tensor); @@ -737,6 +757,19 @@ std::vector SyncInferRequest::prepare_input(const std::string auto& engine = m_graph->get_engine(); auto& stream = network->get_stream(); + // auto print_arr = [&](int64_t* vec, size_t max_len, std::string name) { + // std::stringstream ss; + // for (size_t i = 0; i < max_len; i++) { + // ss << vec[i] << ", "; + // } + // std::cout << "Array " << name << " (len=" << max_len << ") content: " << ss.str() << "\n"; + // }; + + + // if (internal_name == "parameter:input_ids") { + // print_arr(user_tensor->data(), user_tensor->get_size(), "parameter:input_ids"); + // } + auto need_lockable_mem = network->does_node_need_lockable_output(internal_name); OPENVINO_ASSERT(pshape.compatible(ov::PartialShape(user_tensor->get_shape())) || is_batched_input(port), diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index eb16213bcb936c..28e0e7b74c71ae 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -61,7 +61,14 @@ 
DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size auto rank = m_fc->get_input_partial_shape(0).size(); std::vector shape_group_size(rank, 1); shape_group_size.back() = group_size; - auto dyn_quan = std::make_shared(m_data, shape_group_size, element::f16); + + ov::op::internal::QuantizationConfig config; + config.quantization_dt = element::i8; + config.mode = ov::op::internal::QuantizationConfig::QuantizationMode::Symmetric; + config.scale_dt = element::f16; + config.group_sizes = shape_group_size; + + auto dyn_quan = std::make_shared(m_data, config); auto optional_w_zp = m_fc->get_input_size() > 4 ? m_fc->get_input_node_shared_ptr(4) : std::make_shared(); auto output_type = m_fc->get_output_type(); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp new file mode 100644 index 00000000000000..d38e3c5a510c2b --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -0,0 +1,349 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "kv_cache_compression.hpp" + +#include "intel_gpu/op/kv_cache.hpp" +#include "intel_gpu/op/indirect_sdpa.hpp" +#include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" +#include "intel_gpu/plugin/common_utils.hpp" + +#include "openvino/core/node_vector.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/sink.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/op/label.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/visualize_tree.hpp" +#include "transformations/utils/utils.hpp" + +#include + +namespace ov { +namespace intel_gpu { + +namespace { +std::vector get_variable_infos(const ov::op::util::VariableInfo& data_variable_info, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false) { + std::vector infos; + + // add initial data variable info + infos.push_back(data_variable_info); + + // infer DQ shapes + ov::intel_gpu::op::DynamicQuantize dq; + auto dq_shapes = + ov::intel_gpu::op::DynamicQuantize::shape_infer(&dq, {data_variable_info.data_shape}, config, scales_zp_output_order, combine_scales_and_zp); + + const auto variable_id = data_variable_info.variable_id; + const auto scale_shape = dq_shapes[1]; + const auto scale_dt = config.scale_dt; + + // add scales variable info + infos.push_back(ov::op::util::VariableInfo{scale_shape, scale_dt, variable_id}); + + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) { + // add zero points variable info + const auto zp_dt = config.zp_dt; + infos.push_back(ov::op::util::VariableInfo{scale_shape, zp_dt, variable_id}); + } + + return infos; +} + +std::shared_ptr + update_past_read_value(std::shared_ptr past_rv_node, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false) { + auto variable = past_rv_node->get_variable(); + 
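+    // The original ReadValue over the f16 KV state is replaced below with a multi-output ReadValues op:
+    // output 0 keeps the (now quantized) KV data, output 1 exposes the per-group scales and, for
+    // asymmetric quantization with separate outputs, output 2 exposes the zero points. Rough sketch,
+    // with shapes intentionally schematic (grouped dimensions collapse to 1 in the scale/zp outputs):
+    //
+    //   before: ReadValue(var)   -> data (f16)
+    //   after:  ReadValues(var)  -> data (config.quantization_dt)
+    //                               scales (config.scale_dt)
+    //                              [zero points (config.zp_dt), only if asymmetric and not combined with scales]
+    //
+    // If the ReadValue has an initializer subgraph, the initializer is quantized with a DynamicQuantize
+    // node so that the initial state already matches the compressed layout.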
variable->update_data_type(config.quantization_dt); + + auto variable_infos = get_variable_infos(past_rv_node->get_variable()->get_info(), config, scales_zp_output_order, combine_scales_and_zp); + auto new_past_rv_node = std::make_shared(); + + if (past_rv_node->get_input_size() == 0) { + new_past_rv_node = std::make_shared(past_rv_node->get_variable(), variable_infos); + } else { + auto initializer_dyn_quantization = std::make_shared(past_rv_node->get_input_node_shared_ptr(0), + config, + scales_zp_output_order, + combine_scales_and_zp); + initializer_dyn_quantization->set_friendly_name(past_rv_node->get_input_node_shared_ptr(0)->get_friendly_name() + "_dyn_quan"); + + OutputVector initializer_outputs = { initializer_dyn_quantization->output(0), initializer_dyn_quantization->output(1) }; + + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) + initializer_outputs.push_back( initializer_dyn_quantization->output(2) ); + + new_past_rv_node = std::make_shared(initializer_outputs, past_rv_node->get_variable(), variable_infos); + } + + ov::copy_runtime_info(past_rv_node, new_past_rv_node); + past_rv_node->output(0).replace(new_past_rv_node->output(0)); + + return new_past_rv_node; +} + +std::shared_ptr + update_kv_cache(std::shared_ptr past_rv_node, + std::shared_ptr kv_cache_node, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false) { + OutputVector kv_cache_inputs = { past_rv_node->output(0), + kv_cache_node->get_input_node_shared_ptr(1), + kv_cache_node->get_input_node_shared_ptr(2), + past_rv_node->output(1) }; + + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) + kv_cache_inputs.push_back(past_rv_node->output(2)); + + auto new_kv_cache = std::make_shared(kv_cache_inputs, + kv_cache_node->get_variable(), + kv_cache_node->get_concat_axis(), + kv_cache_node->get_gather_axis(), + combine_scales_and_zp, + config, + scales_zp_output_order); + + new_kv_cache->set_friendly_name(kv_cache_node->get_friendly_name()); + ov::copy_runtime_info(kv_cache_node, new_kv_cache); + + return new_kv_cache; +} +} + +class KVCacheCompressionMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("KVCacheCompressionMatcher", "0"); + KVCacheCompressionMatcher(); +}; + +KVCacheCompressionMatcher::KVCacheCompressionMatcher() { + using namespace ov::pass::pattern; + + bool first = true; + + int KV_CACHE_COMP = 0; + if (const auto env_var = std::getenv("KV_CACHE_COMP")) { + std::istringstream ss(env_var); + ss >> KV_CACHE_COMP; + } + + if (KV_CACHE_COMP == 0) { + if (first) { + printf("NO_KV_CACHE_COMP\n"); + } + first = false; + return; + } else { + if (first) + printf("YES_KV_CACHE_COMP\n"); + + first = false; + } + + int USE_ZP = 0; + if (const auto env_var = std::getenv("USE_ZP")) { + std::istringstream ss(env_var); + ss >> USE_ZP; + } + + std::cout << "Set USE_ZP = " << USE_ZP << "\n"; + + auto quantization_mode = ov::op::internal::QuantizationConfig::QuantizationMode::Symmetric; + if (USE_ZP) + quantization_mode = ov::op::internal::QuantizationConfig::QuantizationMode::Asymmetric; + + + bool combine_scales_and_zp = quantization_mode == ov::op::internal::QuantizationConfig::QuantizationMode::Asymmetric; + + int ZP_INPUT = 0; + if (const auto env_var = std::getenv("ZP_INPUT")) { + std::istringstream ss(env_var); + ss >> ZP_INPUT; + } + + if (ZP_INPUT && combine_scales_and_zp) { + std::cout << "Use independent ZP INPUT\n"; + combine_scales_and_zp = false; + } + + auto query = 
any_input(); + + auto key_past = wrap_type(); + auto key_new_token = any_input(); + auto key_beam_idx = any_input(); + auto key_cache = wrap_type({key_past, key_new_token, key_beam_idx}); + + auto value_past = wrap_type(); + auto value_new_token = any_input(); + auto value_beam_idx = any_input(); + auto value_cache = wrap_type({value_past, value_new_token, value_beam_idx}); + + auto input_attn_mask = any_input(); + auto input_scale = any_input(); + auto input_beam_table = any_input(); + + auto sdpa_without_attn_mask_m = wrap_type({ query, key_cache, value_cache, input_beam_table }); + auto sdpa_with_attn_mask_m = wrap_type({ query, key_cache, value_cache, input_attn_mask, input_beam_table }); + auto sdpa_with_attn_mask_and_scale_m = + wrap_type({ query, key_cache, value_cache, input_attn_mask, input_scale, input_beam_table }); + + auto sdpa = std::make_shared(OutputVector{sdpa_without_attn_mask_m, sdpa_with_attn_mask_m, sdpa_with_attn_mask_and_scale_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + if (transformation_callback(m.get_match_root())) { + return false; + } + + const auto& pattern_map = m.get_pattern_value_map(); + + auto query_node = pattern_map.at(query).get_node_shared_ptr(); + + auto key_new_token_node = pattern_map.at(key_new_token).get_node_shared_ptr(); + auto key_cache_node = std::dynamic_pointer_cast(pattern_map.at(key_cache).get_node_shared_ptr()); + auto value_cache_node = std::dynamic_pointer_cast(pattern_map.at(value_cache).get_node_shared_ptr()); + auto sdpa_node = std::dynamic_pointer_cast(m.get_match_root()); + + auto key_past_rv_node = std::dynamic_pointer_cast(pattern_map.at(key_past).get_node_shared_ptr()); + auto value_past_rv_node = std::dynamic_pointer_cast(pattern_map.at(value_past).get_node_shared_ptr()); + + auto data_rank = key_cache_node->get_input_partial_shape(0).size(); + auto get_shape_group_sizes = [&](const std::vector& transposed_order) { + std::vector group_sizes(data_rank, 1); + std::vector order = transposed_order; + if (transposed_order.size() != data_rank) { + order.resize(data_rank); + std::iota(order.begin(), order.end(), 0); + } + + group_sizes[order[data_rank - 1]] = UINT64_MAX; + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression != 1) { // per-token compression + group_sizes[order[1]] = UINT64_MAX; + } + + return group_sizes; + }; + + // Reorder scales in static order: [batch, num_heads, seq_len, head_size] + auto get_scales_output_order = [&](const std::vector& transposed_order) { + std::vector scales_zp_output_order(data_rank); + scales_zp_output_order[0] = transposed_order[0]; + scales_zp_output_order[1] = transposed_order[1]; + scales_zp_output_order[2] = transposed_order[2]; + scales_zp_output_order[3] = transposed_order[3]; + + return scales_zp_output_order; + }; + + auto group_sizes = get_shape_group_sizes(sdpa_node->get_input1_transpose_order()); + auto scales_zp_output_order = get_scales_output_order(sdpa_node->get_input1_transpose_order()); + + ov::op::internal::QuantizationConfig config; + config.mode = quantization_mode; + config.group_sizes = group_sizes; + config.quantization_dt = element::i8; + config.scale_dt = query_node->get_output_element_type(0); + + if (config.is_asymmetric_quantization()) + config.zp_dt = query_node->get_output_element_type(0); + + auto print_arr = [&](const std::vector& vec, size_t max_len) { + std::stringstream ss; + for (size_t i = 0; i < std::min(max_len, vec.size()); i++) { + ss << vec[i] << 
", "; + } + + return ss.str(); + }; + + std::cout << "pattern matched! " << sdpa_node->get_friendly_name() << "; " + << "groups: " << print_arr(group_sizes, group_sizes.size()) << "; " + << "scales_order: " << print_arr(scales_zp_output_order, scales_zp_output_order.size()) << std::endl; + + key_past_rv_node = update_past_read_value(key_past_rv_node, config, scales_zp_output_order, combine_scales_and_zp); + value_past_rv_node = update_past_read_value(value_past_rv_node, config, scales_zp_output_order, combine_scales_and_zp); + + auto new_key_cache = update_kv_cache(key_past_rv_node, key_cache_node, config, scales_zp_output_order, combine_scales_and_zp); + auto new_value_cache = update_kv_cache(value_past_rv_node, value_cache_node, config, scales_zp_output_order, combine_scales_and_zp); + + OutputVector sdpa_inputs; + // Add Query, Key, Value, attention_mask, scale inputs + for (size_t i = 0; i < sdpa_node->get_input_size() - 1; i++) + sdpa_inputs.push_back(sdpa_node->get_input_node_shared_ptr(i)); + + // Replace Key and Value inputs with compressed ones + sdpa_inputs[1] = new_key_cache->output(0); + sdpa_inputs[2] = new_value_cache->output(0); + + // Add Key and Value compression scales + sdpa_inputs.push_back(new_key_cache->output(2)); + sdpa_inputs.push_back(new_value_cache->output(2)); + + // Add Key and Value compression zero points + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) { + sdpa_inputs.push_back(new_key_cache->output(3)); + sdpa_inputs.push_back(new_value_cache->output(3)); + } + + auto input0_transpose_order = sdpa_node->get_input0_transpose_order(); + auto input1_transpose_order = sdpa_node->get_input1_transpose_order(); + auto input2_transpose_order = sdpa_node->get_input2_transpose_order(); + auto output_transpose_order = sdpa_node->get_output_transpose_order(); + + auto new_sdpa = std::make_shared(sdpa_inputs, + new_key_cache->output(1), + sdpa_node->get_causal(), + sdpa_node->get_indirect_axis(), + input0_transpose_order, + input1_transpose_order, + input2_transpose_order, + output_transpose_order, + config, + combine_scales_and_zp, + sdpa_node->get_output_type()); + + new_key_cache->set_friendly_name(key_cache_node->get_friendly_name()); + ov::copy_runtime_info(key_cache_node, new_key_cache); + + new_value_cache->set_friendly_name(value_cache_node->get_friendly_name()); + ov::copy_runtime_info(value_cache_node, new_value_cache); + + new_sdpa->set_friendly_name(sdpa_node->get_friendly_name()); + ov::copy_runtime_info(sdpa_node, new_sdpa); + + ov::replace_node(sdpa_node, new_sdpa); + return true; + }; + + auto m = std::make_shared(sdpa, "KVCacheCompressionMatcher"); + this->register_matcher(m, callback); + +} + +bool KVCacheCompression::run_on_model(const std::shared_ptr& m) { + return pass::GraphRewrite::run_on_model(m); +} + +KVCacheCompression::KVCacheCompression() { + add_matcher(); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp new file mode 100644 index 00000000000000..d6f18299bced76 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + + +/// Add dynamic quantization node and fuse it with KV cache operation +/// +/// ┌───────────┐ 
┌─────────────┐ ┌───────────┐ ┌─────────────┐ +/// │ New Key │ │ New Value │ │ New Key │ │ New Value │ +/// └──────┬────┘ └──────┬──────┘ └──────┬────┘ └──────┬──────┘ +/// │ │ │ │ +/// │ f16 │ f16 │ f16 │ f16 +/// │ │ ==> │ │ +/// ┌─────────┐ ┌────────┴─────────┐ ┌────────┴───────────┐ ┌─────────┐ ┌────────┴─────────┐ ┌────────┴───────────┐ +/// │ Query │ │ KV cache │ │ KV cache │ │ Query │ │ KV cache + DQ │ │ KV cache + DQ │ +/// | | | (Key) | (Value) | | | | (Key) | | (Value) | +/// └───┬─────┘ └────────┬─────────┘ └────────┬───────────┘ └────┬────┘ └────────┬─────────┘ └────────┬───────────┘ +/// │ │ │ │ │ │ +/// │ f16 │ f16 │ f16 │ f16 i8:data │ f16:scale i8:data │ f16:scale +/// │ │ │ │ │ │ +/// │ │ │ │ │ │ +/// │ ┌────┴───┐ │ │ ┌────┴───┐ │ +/// └─────────────┤ SDPA ├─────────────────┘ └─────────────┤ SDPA ├────────────────────┘ +/// └────────┘ └────────┘ + +class KVCacheCompression : public ov::pass::GraphRewrite { +public: + OPENVINO_RTTI("KVCacheCompression", "0"); + KVCacheCompression(); + + bool run_on_model(const std::shared_ptr& m) override; +}; + + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp new file mode 100644 index 00000000000000..2024e13517e6aa --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/op/dynamic_quantize.hpp" + +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/validation_util.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +DynamicQuantize::DynamicQuantize(const Output& data, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp) + : ov::op::internal::DynamicQuantize(data, config, combine_scales_and_zp || config.mode == QuantizationConfig::QuantizationMode::Symmetric ? 
2 : 3), + m_combine_scales_and_zp(combine_scales_and_zp), + m_scales_zp_output_order(scales_zp_output_order) { + if (m_scales_zp_output_order.empty()) { + m_scales_zp_output_order.resize(data.get_partial_shape().size()); + std::iota(m_scales_zp_output_order.begin(), m_scales_zp_output_order.end(), 0); + } + + OPENVINO_ASSERT(data.get_partial_shape().size() == m_scales_zp_output_order.size()); + validate_and_infer_types(); +} + +void DynamicQuantize::validate_and_infer_types() { + std::vector input_shapes = {get_input_partial_shape(0)}; + + auto out_shapes = shape_infer(this, input_shapes, m_config, m_scales_zp_output_order, m_combine_scales_and_zp); + set_output_type(0, m_config.quantization_dt, out_shapes[0]); + set_output_type(1, m_config.scale_dt, out_shapes[1]); + + if (m_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) + set_output_type(2, m_config.zp_dt, out_shapes[2]); +} + +std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_config, m_scales_zp_output_order, m_combine_scales_and_zp); +} + +std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, + const std::vector& input_shapes, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp) { + std::vector out_shapes = ov::op::internal::DynamicQuantize::shape_infer(op, input_shapes, config); + const auto is_asymmetric = config.is_asymmetric_quantization(); + if (is_asymmetric && combine_scales_and_zp) { + out_shapes.pop_back(); // drop zero_points shape + } + + auto transpose_shape = [](const ov::PartialShape& shape, const std::vector& scales_zp_output_order) { + auto transposed_shape = shape; + for (size_t i = 0; i < scales_zp_output_order.size(); i++) { + OPENVINO_ASSERT(scales_zp_output_order[i] < transposed_shape.size()); + transposed_shape[i] = shape[scales_zp_output_order[i]]; + } + + return transposed_shape; + }; + + // transpose scales and zero points + for (size_t i = 1; i < out_shapes.size(); i++) { + out_shapes[i] = transpose_shape(out_shapes[i], scales_zp_output_order); + } + + if (is_asymmetric && combine_scales_and_zp) { + // TODO: currently scales and zero points are supposed to be combined over the last dimension only + const auto combine_axis = out_shapes[1].size() - 1; + OPENVINO_ASSERT(config.group_sizes[scales_zp_output_order[combine_axis]] != 1); + // OPENVINO_ASSERT(out_shapes[1][combine_axis].is_dynamic() || out_shapes[1][combine_axis] == 1); + + out_shapes[1][combine_axis] *= 2; // (scale, zero_point) pairs + } + + return out_shapes; +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp index 681c88119efd95..a900c99eb6a4af 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp @@ -25,30 +25,65 @@ IndirectSDPA::IndirectSDPA(const OutputVector& data_inputs, validate_and_infer_types(); } +IndirectSDPA::IndirectSDPA(const OutputVector& data_inputs, + const ov::Output& beam_table, + const bool is_causal, + const int64_t indirect_axis, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool combine_scales_and_zp, 
+                           const ov::element::Type output_type)
+    : ov::intel_gpu::op::SDPA(data_inputs, is_causal, order_q, order_k, order_v, order_out, quantization_config, combine_scales_and_zp, output_type)
+    , m_indirect_axis(indirect_axis) {
+    auto beam_table_idx = data_inputs.size();
+    set_argument(beam_table_idx, beam_table);
+    validate_and_infer_types();
+}
+
 std::shared_ptr<ov::Node> IndirectSDPA::clone_with_new_inputs(const ov::OutputVector& new_args) const {
     check_new_args_count(this, new_args);
     // Exclude beam_table input
     OutputVector data_inputs(new_args.begin(), new_args.end() - 1);
-    return std::make_shared<IndirectSDPA>(data_inputs,
-                                          new_args.back(),
-                                          m_is_causal,
-                                          m_indirect_axis,
-                                          m_order_q,
-                                          m_order_k,
-                                          m_order_v,
-                                          m_order_out,
-                                          m_output_type);
+    if (m_compressed) {
+        return std::make_shared<IndirectSDPA>(data_inputs,
+                                              new_args.back(),
+                                              m_is_causal,
+                                              m_indirect_axis,
+                                              m_order_q,
+                                              m_order_k,
+                                              m_order_v,
+                                              m_order_out,
+                                              m_quantization_config,
+                                              m_combine_scales_and_zp,
+                                              m_output_type);
+    } else {
+        return std::make_shared<IndirectSDPA>(data_inputs,
+                                              new_args.back(),
+                                              m_is_causal,
+                                              m_indirect_axis,
+                                              m_order_q,
+                                              m_order_k,
+                                              m_order_v,
+                                              m_order_out,
+                                              m_output_type);
+    }
 }
 
 void IndirectSDPA::validate_and_infer_types() {
     const auto input_size = get_input_size();
+
+    const auto compression_inputs = get_compression_inputs_num();
     NODE_VALIDATION_CHECK(this,
-                          input_size == 4 || input_size == 5 || input_size == 6,
+                          input_size >= 4 + compression_inputs && input_size <= 6 + compression_inputs,
                           "Number of inputs is incorrect. Current value is: ",
                           input_size,
-                          ", expected 4, 5 or 6.");
+                          ", expected 4, 5 or 6 data inputs and ", compression_inputs, " KV-cache compression related inputs");
+
     std::vector<ov::PartialShape> input_shapes;
     for (size_t i = 0; i < input_size - 1; i++) {
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp
index a598e556a8f05d..89bacbc3ce1430 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp
@@ -24,6 +24,7 @@ KVCache::KVCache(const Output<Node>& past,
     , m_concat_axis(concat_axis)
     , m_gather_axis(gather_axis)
     , m_indirect(true)
+    , m_compressed(false)
     , m_output_type(output_type) {
     m_variable = past_variable;
     if (m_indirect)
@@ -40,28 +41,86 @@ KVCache::KVCache(const Output<Node>& past,
     , m_concat_axis(concat_axis)
     , m_gather_axis(0)
     , m_indirect(false)
+    , m_compressed(false)
     , m_output_type(output_type) {
     m_variable = past_variable;
     validate_and_infer_types();
 }
 
+KVCache::KVCache(const OutputVector& inputs,
+                 const std::shared_ptr<ov::op::util::Variable>& past_variable,
+                 int64_t concat_axis,
+                 int64_t gather_axis,
+                 bool combine_scales_and_zp,
+                 const QuantizationConfig& config,
+                 const std::vector<uint64_t>& scales_zp_output_order,
+                 const ov::element::Type output_type)
+    : Op(inputs)
+    , m_concat_axis(concat_axis)
+    , m_gather_axis(gather_axis)
+    , m_indirect(true)
+    , m_compressed(true)
+    , m_combine_scales_and_zp(combine_scales_and_zp)
+    , m_quantization_config(config)
+    , m_scales_zp_output_order(scales_zp_output_order)
+    , m_output_type(output_type) {
+    OPENVINO_ASSERT(m_quantization_config.quantization_dt == ov::element::i8,
+                    "[GPU] Only I8 data type is currently supported for KV-cache compression");
+
+    m_variable = past_variable;
+    size_t output_size = 3;
+    if (config.is_asymmetric_quantization() && !combine_scales_and_zp)
+        output_size++; // add zp output
+
+    set_output_size(output_size);
+    validate_and_infer_types();
+}
+
 bool
KVCache::visit_attributes(ov::AttributeVisitor& visitor) { visitor.on_attribute("concat_axis", m_concat_axis); visitor.on_attribute("gather_axis", m_gather_axis); visitor.on_attribute("indirect", m_indirect); visitor.on_attribute("output_type", m_output_type); + visitor.on_attribute("compressed", m_compressed); return true; } void KVCache::validate_and_infer_types() { - auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; + auto output_type = m_output_type; + if (m_compressed) { + output_type = m_quantization_config.quantization_dt; + } else if (m_output_type == ov::element::undefined) { + output_type = get_input_element_type(0); + } + std::vector input_shapes = {m_variable->get_info().data_shape, get_input_partial_shape(1)}; - if (get_output_size() == 2) + if (m_indirect) { input_shapes.push_back(get_input_partial_shape(2)); - auto shapes = shape_infer(this, input_shapes); - set_output_type(0, output_type, shapes[0]); + } + + if (m_compressed) { + input_shapes.push_back(get_input_partial_shape(3)); + + if (m_quantization_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) + input_shapes.push_back(get_input_partial_shape(4)); + } + + auto shapes = m_compressed ? shape_infer(this, input_shapes, m_quantization_config, m_scales_zp_output_order, m_combine_scales_and_zp) + : shape_infer(this, input_shapes); + + size_t out_ports = 0; + set_output_type(out_ports++, output_type, shapes[0]); + if (m_indirect) { - set_output_type(1, get_input_element_type(2), shapes[1]); + set_output_type(out_ports++, get_input_element_type(2), shapes[1]); + } + + if (m_compressed) { + set_output_type(out_ports++, m_quantization_config.scale_dt, shapes[2]); + + if (m_quantization_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) { + set_output_type(out_ports++, m_quantization_config.scale_dt, shapes[3]); + } } } @@ -74,7 +133,7 @@ std::shared_ptr KVCache::clone_with_new_inputs(const ov::OutputVector& new m_concat_axis, m_output_type); - } else { + } else if (new_args.size() == 3) { return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), @@ -82,16 +141,55 @@ std::shared_ptr KVCache::clone_with_new_inputs(const ov::OutputVector& new m_concat_axis, m_gather_axis, m_output_type); + } else { + return std::make_shared(new_args, + m_variable, + m_concat_axis, + m_gather_axis, + m_combine_scales_and_zp, + m_quantization_config, + m_scales_zp_output_order, + m_output_type); + } +} + +std::vector shape_infer(const KVCache* op, + const std::vector& input_shapes, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + bool combine_scales_and_zp) { + std::vector out_shapes = shape_infer(op, input_shapes); + + if (op->get_output_size() >= 3) { + ov::intel_gpu::op::DynamicQuantize dq_op; + auto quantized_data_shapes = ov::intel_gpu::op::DynamicQuantize::shape_infer(&dq_op, {input_shapes[1]}, config, scales_zp_output_order, combine_scales_and_zp); + + const auto scales_concat_axis = 2; + ov::PartialShape compression_scale_shape = input_shapes[3]; + compression_scale_shape[scales_concat_axis] += quantized_data_shapes[1][scales_concat_axis]; + out_shapes[2] = compression_scale_shape; + + // add zp output + if (quantized_data_shapes.size() == 3) { + ov::PartialShape compression_zp_shape = input_shapes[4]; + compression_zp_shape[scales_concat_axis] += quantized_data_shapes[2][scales_concat_axis]; + out_shapes[3] = compression_zp_shape; + } } + + return out_shapes; } -std::vector shape_infer(const KVCache* op, std::vector input_shapes) { +std::vector shape_infer(const KVCache* op, const std::vector& input_shapes) { std::vector out_shapes; out_shapes.resize(op->get_output_size()); const auto& gather_axis = op->get_gather_axis(); const auto& concat_axis = ov::util::normalize(op->get_concat_axis(), input_shapes[0].size()); - if (op->get_output_size() == 2) { + if (op->get_output_size() >= 2) { out_shapes[0] = input_shapes[0]; out_shapes[0][gather_axis] = input_shapes[2][0]; out_shapes[0][concat_axis] += input_shapes[1][concat_axis]; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp index 5438a6e2e695b5..f5af8e9d21a6bc 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp @@ -3,6 +3,7 @@ // #include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" #include "openvino/core/partial_shape.hpp" namespace ov { @@ -28,16 +29,14 @@ bool ReadValue::visit_attributes(ov::AttributeVisitor& visitor) { return true; } -void ReadValue::validate_and_infer_types() { - OPENVINO_ASSERT(m_variable, "Variable is not initialized."); - const auto& variable_info = m_variable->get_info(); +void ReadValue::validate_and_infer_types(size_t output_idx, const ov::op::util::VariableInfo& variable_info) { const auto& variable_type = variable_info.data_type; const auto& variable_shape = variable_info.data_shape; // If no inputs provided, it means this ReadValue doesn't have initial subgraph. This is valid. - if (get_input_size() > 0) { - const auto& initial_type = get_input_element_type(0); - const auto& initial_shape = get_input_partial_shape(0); + if (get_input_size() > output_idx) { + const auto& initial_type = get_input_element_type(output_idx); + const auto& initial_shape = get_input_partial_shape(output_idx); + // Variable's shape/type determine a permissible range of values for shape/type inferred from initial_subgraph.
// If initial_subgraph is set, then we need to check that shape/type inferred from initial_subgraph @@ -64,19 +63,25 @@ void ReadValue::validate_and_infer_types() { // dynamic rank/type can be derived from the IRs generated via the prev versions of OV, // but dynamic rank/type are not supported in plugins, // so we are trying to fix them here using the rank/type of ReadValue 1st input, if it exists - if (get_input_size() > 0 && variable_info.data_shape.rank().is_dynamic() && - variable_info.data_type.is_dynamic()) { - set_output_type(0, initial_type, initial_shape); + if (variable_info.data_shape.rank().is_dynamic() && variable_info.data_type.is_dynamic()) { + set_output_type(output_idx, initial_type, initial_shape); return; } } - set_output_type(0, variable_type, variable_shape); + set_output_type(output_idx, variable_type, variable_shape); +} + +void ReadValue::validate_and_infer_types() { + OPENVINO_ASSERT(m_variable, "Variable is not initialized."); + const auto& variable_info = m_variable->get_info(); + + validate_and_infer_types(0, variable_info); } std::shared_ptr ReadValue::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - switch (new_args.size()) { + switch (new_args.size()) { case 0: return std::make_shared(m_variable); case 1: @@ -89,6 +94,62 @@ std::shared_ptr ReadValue::clone_with_new_inputs(const ov::OutputVector& n } } +ReadValues::ReadValues(const std::shared_ptr& variable, + const std::vector& internal_states_infos) + : ReadValue(variable) + , m_internal_states_infos(internal_states_infos) { + OPENVINO_ASSERT(!internal_states_infos.empty()); + set_output_size(internal_states_infos.size()); + validate_and_infer_types(); +} + +ReadValues::ReadValues(const std::vector>& variable_initializers, + const std::shared_ptr& variable, + const std::vector& internal_states_infos) + : ReadValue(variable_initializers, variable) + , m_internal_states_infos(internal_states_infos) { + OPENVINO_ASSERT(!internal_states_infos.empty()); + set_output_size(internal_states_infos.size()); + validate_and_infer_types(); +} + +bool ReadValues::visit_attributes(ov::AttributeVisitor& visitor) { + visitor.on_attribute("variable_id", m_variable); + + auto variable_info = m_variable->get_info(); + visitor.on_attribute("variable_type", variable_info.data_type); + visitor.on_attribute("variable_shape", variable_info.data_shape); + m_variable->update(variable_info); + return true; +} + +void ReadValues::validate_and_infer_types() { + OPENVINO_ASSERT(m_variable, "Variable is not initialized."); + + for (size_t i = 0; i < get_output_size(); i++) { + ReadValue::validate_and_infer_types(i, m_internal_states_infos[i]); + } +} + +std::shared_ptr ReadValues::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + OPENVINO_ASSERT(new_args.empty() || new_args.size() == m_internal_states_infos.size(), + "Unable to clone ReadValues op (name=", this->get_friendly_name(), "). ", + "Incorrect number of inputs. Expected: 0 or ", m_internal_states_infos.size() ,". 
", + "Actual: ", new_args.size(), "."); + + if (new_args.size() > 0) { + return std::make_shared(new_args, m_variable, m_internal_states_infos); + } else { + return std::make_shared(m_variable, m_internal_states_infos); + } +} + +std::vector ReadValues::get_all_internal_states_info() const { + return m_internal_states_infos; +} + } // namespace op } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp index 57d2899e2b2e77..65930d5feb6d0a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp @@ -26,7 +26,31 @@ SDPA::SDPA(const OutputVector& inputs, , m_order_k(order_k) , m_order_v(order_v) , m_order_out(order_out) - , m_output_type(output_type) { + , m_output_type(output_type) + , m_compressed(false) { + set_arguments(inputs); + set_causal(is_causal); + validate_and_infer_types(); +} + +SDPA::SDPA(const OutputVector& inputs, + const bool is_causal, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool combine_scales_and_zp, + const ov::element::Type output_type) + : m_is_causal(is_causal) + , m_order_q(order_q) + , m_order_k(order_k) + , m_order_v(order_v) + , m_order_out(order_out) + , m_output_type(output_type) + , m_compressed(true) + , m_combine_scales_and_zp(combine_scales_and_zp) + , m_quantization_config(quantization_config) { set_arguments(inputs); set_causal(is_causal); validate_and_infer_types(); @@ -46,11 +70,13 @@ std::shared_ptr SDPA::clone_with_new_inputs(const ov::OutputVector& ne void SDPA::validate_and_infer_types() { const auto input_size = get_input_size(); + + const auto compression_inputs = get_compression_inputs_num(); NODE_VALIDATION_CHECK(this, - input_size == 3 || input_size == 4 || input_size == 5, + input_size >= 3 + compression_inputs && input_size <= 5 + compression_inputs, "Number of inputs is incorrect. 
Current value is: ", input_size, - ", expected 3, 4 or 5."); + ", expected 3, 4 or 5 data inputs and ", compression_inputs, " KV-cache compression related inputs"); std::vector input_shapes; for (size_t i = 0; i < input_size; i++) { @@ -77,6 +103,18 @@ bool SDPA::visit_attributes(ov::AttributeVisitor &visitor) { return true; } +size_t SDPA::get_compression_inputs_num() const { + size_t compression_inputs = 0; + if (m_compressed) { + compression_inputs += 2; // 2 * scales + + if (m_quantization_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) + compression_inputs += 2; // 2 * zp + } + + return compression_inputs; +} + std::vector shape_infer(const SDPA* op, std::vector input_shapes, const std::vector& order_q, diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 40c7ab48c486cb..d095e3ec317d11 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -68,6 +68,7 @@ #include "plugin/transformations/swiglu_fusion.hpp" #include "plugin/transformations/transpose_fusion.hpp" #include "plugin/transformations/indirect_kv_cache.hpp" +#include "plugin/transformations/kv_cache_compression.hpp" #include "plugin/transformations/convert_convolution.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp" @@ -852,6 +853,8 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); + manager.register_pass(); + manager.register_pass(); // This pass should be done after asymmetric quantization matching as it can move zp subtraction upper in the graph diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index dcbabff548cc5d..804a3877ffc837 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -198,6 +198,8 @@ static void print_help_messages() { " Currently, other layers except input-layer('parameter' type) are loading binaries for only input." " Different input or output tensors are seperated by ','. Different layers are separated by space. For example, " " \"[input_layer_name1]:[binary_dumped_file1],[binary_dump_file2] [input_layer_name2]:[binary_dump_1],[binary_dump_2]\""); + message_list.emplace_back("OV_GPU_EnableKVCacheCompression", + "Enable KV cache compression. 
1 is for per-head compression, 2 is for per-token compression"); auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(), [](std::pair& a, std::pair& b){ @@ -254,7 +256,8 @@ debug_configuration::debug_configuration() , disable_primitive_fusing(0) , disable_fake_alignment(0) , dynamic_quantize_group_size(0) - , disable_horizontal_fc_fusion(0) { + , disable_horizontal_fc_fusion(0) + , enable_kv_cache_compression(0) { #ifdef GPU_DEBUG_CONFIG get_gpu_debug_env_var("Help", help); get_common_debug_env_var("Verbose", verbose); @@ -315,6 +318,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("LoadDumpRawBinary", load_dump_raw_bin_str); std::string dynamic_quantize_layers_without_onednn_str; get_gpu_debug_env_var("DynamicQuantizeLayersWithoutOnednn", dynamic_quantize_layers_without_onednn_str); + get_gpu_debug_env_var("EnableKVCacheCompression", enable_kv_cache_compression); if (help > 0) { print_help_messages();
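---

For reference, below is a minimal standalone C++ sketch of the per-token symmetric I8 quantization that the compressed KV-cache path above assumes (the KVCache op accepts only i8 as quantization_dt, scales are concatenated along the sequence axis, and a separate zero-point output exists only for the asymmetric, non-combined case). The function and variable names are illustrative only and do not correspond to anything in this patch; the real kernels operate on device buffers, not std::vector.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Quantize one token's values to int8 and return the per-token scale that a
// compressed KV-cache would append along its scales output (illustrative only).
static float quantize_token_i8(const std::vector<float>& token, std::vector<int8_t>& out) {
    float max_abs = 0.f;
    for (float v : token)
        max_abs = std::max(max_abs, std::fabs(v));
    // Symmetric mode: zero point is 0, the scale maps [-max_abs, max_abs] onto [-127, 127].
    const float scale = max_abs > 0.f ? max_abs / 127.f : 1.f;
    out.resize(token.size());
    for (size_t i = 0; i < token.size(); ++i)
        out[i] = static_cast<int8_t>(std::lround(std::clamp(token[i] / scale, -127.f, 127.f)));
    return scale;
}

int main() {
    const std::vector<float> token = {0.25f, -1.5f, 3.0f, -0.75f};
    std::vector<int8_t> quantized;
    const float scale = quantize_token_i8(token, quantized);
    std::cout << "scale = " << scale << ", values =";
    for (int8_t v : quantized)
        std::cout << ' ' << static_cast<int>(v);
    std::cout << "\n"; // dequantize as quantized[i] * scale
    return 0;
}

Dequantization on the SDPA side is then quantized[i] * scale per token; an asymmetric variant would additionally subtract a per-token zero point, which matches the optional zp output wired through KVCache and the extra compression inputs counted in SDPA::get_compression_inputs_num().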