diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp
index 69c148305fb94f..b43ea641365643 100644
--- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp
+++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp
@@ -11,32 +11,60 @@ namespace ov {
 namespace op {
 namespace internal {
 
+struct QuantizationConfig {
+    enum class QuantizationMode { Symmetric, Asymmetric };
+
+    QuantizationMode mode = QuantizationMode::Symmetric;
+    element::Type quantization_dt = element::undefined;
+    element::Type scale_dt = element::undefined;
+    element::Type zp_dt = element::undefined;
+    std::vector<uint64_t> group_sizes = {};
+
+    bool operator==(const QuantizationConfig& rhs) const {
+        return mode == rhs.mode && quantization_dt == rhs.quantization_dt && scale_dt == rhs.scale_dt &&
+               zp_dt == rhs.zp_dt && group_sizes == rhs.group_sizes;
+    }
+
+    bool is_asymmetric_quantization() const {
+        return mode == QuantizationMode::Asymmetric;
+    }
+};
+
 /// \brief Operator performing Dynamic Quantize
 class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
 public:
-    OPENVINO_OP("DynamicQuantize", "gpu_opset");
-
+    OPENVINO_OP("DynamicQuantize", "ie_internal_opset");
     DynamicQuantize() = default;
     /// \brief Constructs an DynamicQuantize operation.
     ///
     /// \param data Input tensor with data
-    /// \param group_sizes Group sizes for dynamic quantization
-    /// \param dt_scale Data type for scale output
-    DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale);
+    /// \param config Dynamic quantization configuration
+    DynamicQuantize(const Output<Node>& data, const QuantizationConfig& config);
 
     void validate_and_infer_types() override;
 
     std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;
+
     const std::vector<uint64_t>& get_group_sizes() const {
-        return m_group_sizes;
-    };
+        return m_config.group_sizes;
+    }
+
+    QuantizationConfig::QuantizationMode get_quantization_mode() const {
+        return m_config.mode;
+    }
+
+    QuantizationConfig get_quantization_config() const {
+        return m_config;
+    }
+
     static std::vector<ov::PartialShape> shape_infer(const DynamicQuantize* op,
                                                      const std::vector<ov::PartialShape>& input_shapes,
-                                                     const std::vector<uint64_t>& group_sizes);
+                                                     const QuantizationConfig& config);
+
+protected:
+    DynamicQuantize(const Output<Node>& data, const QuantizationConfig& config, size_t outputs_number);
 
-private:
-    std::vector<uint64_t> m_group_sizes;
-    element::Type m_dt_scale;
+    QuantizationConfig m_config;
 };
 
 }  // namespace internal
diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp
index 74c0498e9a4425..3bf75a0dbb7895 100644
--- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp
+++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp
@@ -13,35 +13,42 @@ namespace ov {
 namespace op {
 namespace internal {
 
-DynamicQuantize::DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale)
+DynamicQuantize::DynamicQuantize(const Output<Node>& data, const QuantizationConfig& config, size_t outputs_number)
     : Op({data}),
-      m_group_sizes(std::move(group_sizes)),
-      m_dt_scale(dt_scale) {
-    OPENVINO_ASSERT(data.get_partial_shape().rank() == m_group_sizes.size(),
+      m_config(config) {
+    OPENVINO_ASSERT(data.get_partial_shape().rank() == m_config.group_sizes.size(),
                     "FC input rank should be same as the rank of group_size ",
                     data.get_tensor_ptr()->get_partial_shape().rank(),
                     " / ",
-                    m_group_sizes.size());
-    set_output_size(2);
+                    m_config.group_sizes.size());
+    set_output_size(outputs_number);
+}
+
+DynamicQuantize::DynamicQuantize(const Output<Node>& data, const QuantizationConfig& config)
+    : DynamicQuantize(data, config, config.mode == QuantizationConfig::QuantizationMode::Symmetric ? 2 : 3) {
     validate_and_infer_types();
 }
 
 void DynamicQuantize::validate_and_infer_types() {
     std::vector<ov::PartialShape> input_shapes = {get_input_partial_shape(0)};
 
-    auto out_shapes = shape_infer(this, input_shapes, m_group_sizes);
-    set_output_type(0, element::i8, out_shapes[0]);
-    set_output_type(1, m_dt_scale, out_shapes[1]);
+    auto out_shapes = shape_infer(this, input_shapes, m_config);
+    set_output_type(0, m_config.quantization_dt, out_shapes[0]);
+    set_output_type(1, m_config.scale_dt, out_shapes[1]);
+
+    if (m_config.is_asymmetric_quantization())
+        set_output_type(2, m_config.zp_dt, out_shapes[2]);
 }
 
 std::shared_ptr<Node> DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const {
     check_new_args_count(this, new_args);
-    return std::make_shared<DynamicQuantize>(new_args.at(0), m_group_sizes, m_dt_scale);
+    return std::make_shared<DynamicQuantize>(new_args.at(0), m_config);
 }
 
 std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize* op,
                                                            const std::vector<ov::PartialShape>& input_shapes,
-                                                           const std::vector<uint64_t>& group_sizes) {
+                                                           const QuantizationConfig& config) {
+    const auto& group_sizes = config.group_sizes;
     std::vector<ov::PartialShape> out_shapes;
     out_shapes.push_back(input_shapes[0]);
 
@@ -52,7 +59,7 @@ std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize
                     " / ",
                     group_sizes.size());
     for (size_t i = 0; i < scale_shape.size(); i++) {
-        if (scale_shape[i].is_dynamic())
+        if (scale_shape[i].is_dynamic() || scale_shape[i] == 0)
             continue;
 
         if (group_sizes[i] == UINT64_MAX)
@@ -63,6 +70,11 @@ std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize
         }
     }
     out_shapes.push_back(scale_shape);
+
+    // Add zero points shape
+    if (config.is_asymmetric_quantization())
+        out_shapes.push_back(scale_shape);
+
     return out_shapes;
 }
 
diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp
index 3e8887fbb2f7ee..72623f6d120955 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp
@@ -53,7 +53,7 @@ struct kernel_impl_params final {
     optional_layout weights_zero_points_layout = optional_layout();
     optional_layout activations_zero_points_layout = optional_layout();
     optional_layout compensation_layout = optional_layout();
-    optional_layout state_layout = optional_layout();
+    std::vector<layout> state_layouts;
 
     std::map<size_t, memory::ptr> memory_deps = {};
     size_t primary_input_idx = 0;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp
new file mode 100644
index 00000000000000..7b38e313a370f0
--- /dev/null
+++ b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp
@@ -0,0 +1,57 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/op/op.hpp"
+#include "ov_ops/dynamic_quantize.hpp"
+
+namespace ov {
+namespace intel_gpu {
+namespace op {
+
+class DynamicQuantize : public ov::op::internal::DynamicQuantize {
+public:
+    OPENVINO_OP("DynamicQuantize", "gpu_opset");
+
+    using QuantizationConfig = ov::op::internal::QuantizationConfig;
+
+    DynamicQuantize() = default;
+    /// \brief Constructs a DynamicQuantize operation.
+ /// + /// \param data Input tensor with data + /// \param config Dynamic quantization configuration + /// \param scales_zp_output_order Specifies on default order of scales and zero points + /// \param combine_scales_and_zp If true, combines scales and zero points into a single buffer, pairing each scale with its corresponding zero point + DynamicQuantize(const Output& data, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order = {}, + const bool combine_scales_and_zp = false); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + const std::vector& get_scales_zp_output_order() const { + return m_scales_zp_output_order; + } + + bool get_combine_scales_and_zp() const { + return m_combine_scales_and_zp; + } + + static std::vector shape_infer(const DynamicQuantize* op, + const std::vector& input_shapes, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false); + +private: + bool m_combine_scales_and_zp = false; + std::vector m_scales_zp_output_order; +}; + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp index b4d34a3975af6b..4ce90a685690e5 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp @@ -29,6 +29,18 @@ class IndirectSDPA : public ov::intel_gpu::op::SDPA { const std::vector& order_out, const ov::element::Type output_type = ov::element::undefined); + IndirectSDPA(const OutputVector& data_inputs, + const ov::Output& beam_table, + const bool is_causal, + const int64_t indirect_axis, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool combine_scales_and_zp, + const ov::element::Type output_type = ov::element::undefined); + bool visit_attributes(ov::AttributeVisitor &visitor) override; void validate_and_infer_types() override; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp index 402ff6e46c1607..e6dc5cd7c5ae94 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp @@ -7,6 +7,7 @@ #include "openvino/op/op.hpp" #include "openvino/op/util/variable.hpp" #include "openvino/op/util/variable_extension.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" namespace ov { namespace intel_gpu { @@ -18,20 +19,31 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { public: OPENVINO_OP("KVCache", "gpu_opset"); + using QuantizationConfig = ov::op::internal::QuantizationConfig; + KVCache() = default; KVCache(const Output& past, const Output& new_token_data, - const Output& beam_idx, const std::shared_ptr& past_values, int64_t concat_axis, - int64_t gather_axis, const ov::element::Type output_type = ov::element::undefined); KVCache(const Output& past, const Output& new_token_data, + const Output& beam_idx, + const std::shared_ptr& past_values, + int64_t concat_axis, + int64_t gather_axis, + const ov::element::Type output_type = ov::element::undefined); + + KVCache(const OutputVector& inputs, const std::shared_ptr& past_values, int64_t concat_axis, + int64_t gather_axis, + bool 
combine_scales_and_zp, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, const ov::element::Type output_type = ov::element::undefined); bool visit_attributes(ov::AttributeVisitor& visitor) override; @@ -53,14 +65,31 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { bool get_indirect() const { return m_indirect; } + bool get_kv_compressed() const { return m_compressed; } + bool get_combine_scales_and_zp() const { return m_combine_scales_and_zp; } + QuantizationConfig get_quantization_config() const { return m_quantization_config; } + std::vector get_scales_zp_output_order() const { return m_scales_zp_output_order; } + private: int64_t m_concat_axis = 0; int64_t m_gather_axis = 0; bool m_indirect = false; + + bool m_compressed = false; + bool m_combine_scales_and_zp = false; + QuantizationConfig m_quantization_config = {}; + std::vector m_scales_zp_output_order = {}; + ov::element::Type m_output_type; }; -std::vector shape_infer(const KVCache* op, std::vector input_shapes); +std::vector shape_infer(const KVCache* op, const std::vector& input_shapes); + +std::vector shape_infer(const KVCache* op, + const std::vector& input_shapes, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_output_order = {}, + bool combine_scales_and_zp = false); } // namespace op } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp index a9c47f3fa39fa6..419c18118229ff 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp @@ -26,6 +26,7 @@ class ReadValue : public ov::op::Op, public ov::op::util::VariableExtension { bool visit_attributes(ov::AttributeVisitor& visitor) override; void validate_and_infer_types() override; + void validate_and_infer_types(size_t output_idx, const ov::op::util::VariableInfo& variable_info); std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; @@ -33,6 +34,12 @@ class ReadValue : public ov::op::Op, public ov::op::util::VariableExtension { OPENVINO_ASSERT(m_variable, "Variable is not initialized. Variable_id is unavailable"); return m_variable->get_info().variable_id; } + +protected: + ReadValue(const std::vector>& variable_initializers, const std::shared_ptr& variable) + : Op(variable_initializers) { + m_variable = variable; + } }; } // namespace op diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp new file mode 100644 index 00000000000000..ecbda9364d7b3b --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/op/read_value.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +/// \brief This operation handles the OpenVINO GPU Plugin's custom variable +// representation (which can store multiple states in a single variable) at the graph level. 
+class ReadValues : public ReadValue { +public: + OPENVINO_OP("ReadValues", "gpu_opset"); + + ReadValues() = default; + + ReadValues(const std::shared_ptr& variable, + const std::vector& internal_states_infos); + + ReadValues(const OutputVector& variable_initializers, + const std::shared_ptr& variable, + const std::vector& internal_states_infos); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::vector get_all_internal_states_info() const; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + +private: + std::vector m_internal_states_infos; +}; + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp index 9f5d4dad16efd7..741644208cfe76 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp @@ -8,6 +8,7 @@ #include "openvino/core/partial_shape.hpp" #include "openvino/op/op.hpp" #include "openvino/op/scaled_dot_product_attention.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" namespace ov { namespace intel_gpu { @@ -17,6 +18,8 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { public: OPENVINO_OP("SDPA", "gpu_opset"); + using QuantizationConfig = ov::op::internal::QuantizationConfig; + SDPA() = default; SDPA(const OutputVector& inputs, @@ -27,6 +30,16 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { const std::vector& order_out, const ov::element::Type output_type = ov::element::undefined); + SDPA(const OutputVector& inputs, + const bool is_causal, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool m_combine_scales_and_zp, + const ov::element::Type output_type = ov::element::undefined); + bool visit_attributes(ov::AttributeVisitor &visitor) override; void validate_and_infer_types() override; @@ -41,6 +54,11 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { std::vector get_output_transpose_order() const { return m_order_out; } ov::element::Type get_output_type() const { return m_output_type; } + bool get_kv_compressed() const { return m_compressed; } + bool get_combine_scales_and_zp() const { return m_combine_scales_and_zp; } + QuantizationConfig get_quantization_config() const { return m_quantization_config; } + size_t get_compression_inputs_num() const; + static std::vector default_order(size_t rank) { std::vector order(rank); std::iota(order.begin(), order.end(), 0); @@ -54,6 +72,10 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { std::vector m_order_v; std::vector m_order_out; ov::element::Type m_output_type; + + bool m_compressed = false; + bool m_combine_scales_and_zp = false; + QuantizationConfig m_quantization_config = {}; }; std::vector shape_infer(const SDPA* op, diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp index 0cad36f62e47b9..26d15d733102ad 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp @@ -46,5 +46,31 @@ class VariableStateIndirectKVCache : public MultiTensorState { size_t m_concat_axis = 0; }; +// This is multi-tensor state for 
Indirect KV-Cache + Gemm pattern +// Internally it stores KV Cache state + Beam Table state + compression scales state + (optional compression zero points) +class VariableStateIndirectKVCacheCompressed : public VariableStateIndirectKVCache { +public: + VariableStateIndirectKVCacheCompressed(const VariableStateInfo& info, + std::shared_ptr context, + std::shared_ptr shape_predictor, + const std::vector& output_layouts, + size_t beam_idx, + size_t concat_idx, + bool has_zp_state); + using Ptr = std::shared_ptr; + + void set_state(const ov::SoPtr& state) override; + ov::SoPtr get_state() const override; + + VariableState::Ptr get_compression_scale_state() const; + void set_compression_scale_layout(const cldnn::layout& new_layout); + + VariableState::Ptr get_compression_zp_state() const; + void set_compression_zp_layout(const cldnn::layout& new_layout); + bool has_zp_state() const; + +private: + bool m_has_zp_state = false; +}; } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 1dbd769444b1aa..947a010efe0377 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -284,6 +284,7 @@ REGISTER_FACTORY(internal, RMS); REGISTER_FACTORY(internal, GatherCompressed); REGISTER_FACTORY(internal, KVCache); REGISTER_FACTORY(internal, ReadValue); +REGISTER_FACTORY(internal, ReadValues); REGISTER_FACTORY(internal, Gemm); REGISTER_FACTORY(internal, SwiGLU); REGISTER_FACTORY(internal, IndirectGemm); @@ -293,4 +294,5 @@ REGISTER_FACTORY(internal, SDPA); REGISTER_FACTORY(internal, IndirectSDPA); REGISTER_FACTORY(internal, RoPE); REGISTER_FACTORY(internal, DynamicQuantize); +REGISTER_FACTORY(internal, DynamicQuantizeExtended); REGISTER_FACTORY(internal, PagedAttentionExtension); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index d93e2f86eed144..446206820b33db 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -5,6 +5,8 @@ #pragma once #include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + namespace cldnn { /// @brief Dynamic Quantize primitive @@ -12,26 +14,40 @@ namespace cldnn { struct dynamic_quantize : public primitive_base { CLDNN_DECLARE_PRIMITIVE(dynamic_quantize); - dynamic_quantize() : primitive_base("", {}), group_size(0) {} + using QuantizationConfig = ov::op::internal::QuantizationConfig; + + dynamic_quantize() : primitive_base("", {}) {} /// @brief Constructs dynamic_quantize primitive /// @param id This primitive id /// @param input Input primitive id - /// @param group_size Quantization group size + /// @param group_sizes Quantization group size /// @param data_type Output data type of quantized /// @param output_size Output data size of the primitive dynamic_quantize(const primitive_id& id, const input_info& input, - const uint64_t group_size, - const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) - : primitive_base(id, {input}, 2, data_types), - group_size(group_size) {} + const QuantizationConfig& config, + const bool combine_scales_and_zp = false, + const std::vector& scales_zp_output_order = {}) + : primitive_base(id, {input}) + , 
combine_scales_and_zp(combine_scales_and_zp) + , quantization_config(config) + , scales_zp_output_order(scales_zp_output_order) {} - uint64_t group_size = 0; + bool combine_scales_and_zp = false; + QuantizationConfig quantization_config; + std::vector scales_zp_output_order = {}; size_t hash() const override { size_t seed = primitive::hash(); - seed = hash_combine(seed, group_size); + seed = hash_range(seed, scales_zp_output_order.begin(), scales_zp_output_order.end()); + seed = hash_range(seed, quantization_config.group_sizes.begin(), quantization_config.group_sizes.end()); + seed = hash_combine(seed, quantization_config.mode); + seed = hash_combine(seed, quantization_config.quantization_dt.hash()); + seed = hash_combine(seed, quantization_config.scale_dt.hash()); + seed = hash_combine(seed, quantization_config.zp_dt.hash()); + seed = hash_combine(seed, combine_scales_and_zp); + return seed; } @@ -41,17 +57,33 @@ struct dynamic_quantize : public primitive_base { auto rhs_casted = downcast(rhs); - return group_size == rhs_casted.group_size; + return scales_zp_output_order == rhs_casted.scales_zp_output_order || + combine_scales_and_zp == rhs_casted.combine_scales_and_zp || + quantization_config == rhs_casted.quantization_config; } void save(BinaryOutputBuffer& ob) const override { primitive_base::save(ob); - ob << group_size; + + ob << combine_scales_and_zp; + ob << scales_zp_output_order; + ob << quantization_config.group_sizes; + ob << make_data(&quantization_config.mode, sizeof(quantization_config.mode)); + ob << make_data(&quantization_config.quantization_dt, sizeof(quantization_config.quantization_dt)); + ob << make_data(&quantization_config.scale_dt, sizeof(quantization_config.scale_dt)); + ob << make_data(&quantization_config.zp_dt, sizeof(quantization_config.zp_dt)); } void load(BinaryInputBuffer& ib) override { primitive_base::load(ib); - ib >> group_size; + + ib >> combine_scales_and_zp; + ib >> scales_zp_output_order; + ib >> quantization_config.group_sizes; + ib >> make_data(&quantization_config.mode, sizeof(quantization_config.mode)); + ib >> make_data(&quantization_config.quantization_dt, sizeof(quantization_config.quantization_dt)); + ib >> make_data(&quantization_config.scale_dt, sizeof(quantization_config.scale_dt)); + ib >> make_data(&quantization_config.zp_dt, sizeof(quantization_config.zp_dt)); } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp index f87041dcff66d6..dca8954979d06d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp @@ -3,10 +3,14 @@ // #pragma once + +#include "primitive.hpp" + #include "openvino/core/partial_shape.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/util/variable.hpp" -#include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + #include namespace cldnn { @@ -14,6 +18,8 @@ namespace cldnn { struct kv_cache : public primitive_base { CLDNN_DECLARE_PRIMITIVE(kv_cache) + using QuantizationConfig = ov::op::internal::QuantizationConfig; + kv_cache() : primitive_base("", {}) {} kv_cache(const primitive_id& id, @@ -33,11 +39,25 @@ struct kv_cache : public primitive_base { int64_t gather_axis = 0; bool indirect = false; + bool compressed = false; + bool combine_scales_and_zp = false; + QuantizationConfig quantization_config; + std::vector scales_zp_output_order = {}; + size_t hash() const override { size_t 
seed = primitive::hash(); seed = hash_combine(seed, concat_axis); seed = hash_combine(seed, gather_axis); seed = hash_combine(seed, indirect); + seed = hash_combine(seed, compressed); + seed = hash_combine(seed, combine_scales_and_zp); + seed = hash_range(seed, scales_zp_output_order.begin(), scales_zp_output_order.end()); + seed = hash_range(seed, quantization_config.group_sizes.begin(), quantization_config.group_sizes.end()); + seed = hash_combine(seed, quantization_config.mode); + seed = hash_combine(seed, quantization_config.quantization_dt.hash()); + seed = hash_combine(seed, quantization_config.scale_dt.hash()); + seed = hash_combine(seed, quantization_config.zp_dt.hash()); + return seed; } @@ -50,7 +70,11 @@ struct kv_cache : public primitive_base { return variable_info == rhs_casted.variable_info && concat_axis == rhs_casted.concat_axis && gather_axis == rhs_casted.gather_axis && - indirect == rhs_casted.indirect; + indirect == rhs_casted.indirect && + compressed == rhs_casted.compressed && + scales_zp_output_order == rhs_casted.scales_zp_output_order && + combine_scales_and_zp == rhs_casted.combine_scales_and_zp && + quantization_config == rhs_casted.quantization_config; } void save(BinaryOutputBuffer& ob) const override { @@ -62,6 +86,14 @@ struct kv_cache : public primitive_base { ob << concat_axis; ob << gather_axis; ob << indirect; + ob << compressed; + ob << combine_scales_and_zp; + ob << scales_zp_output_order; + ob << quantization_config.group_sizes; + ob << make_data(&quantization_config.mode, sizeof(quantization_config.mode)); + ob << make_data(&quantization_config.quantization_dt, sizeof(quantization_config.quantization_dt)); + ob << make_data(&quantization_config.scale_dt, sizeof(quantization_config.scale_dt)); + ob << make_data(&quantization_config.zp_dt, sizeof(quantization_config.zp_dt)); } void load(BinaryInputBuffer& ib) override { @@ -76,6 +108,30 @@ struct kv_cache : public primitive_base { ib >> concat_axis; ib >> gather_axis; ib >> indirect; + ib >> compressed; + ib >> combine_scales_and_zp; + ib >> scales_zp_output_order; + ib >> quantization_config.group_sizes; + ib >> make_data(&quantization_config.mode, sizeof(quantization_config.mode)); + ib >> make_data(&quantization_config.quantization_dt, sizeof(quantization_config.quantization_dt)); + ib >> make_data(&quantization_config.scale_dt, sizeof(quantization_config.scale_dt)); + ib >> make_data(&quantization_config.zp_dt, sizeof(quantization_config.zp_dt)); + } + + size_t get_compression_scales_inputs_num() const { + if (compressed) { + return 1; + } else { + return 0; + } + } + + size_t get_compression_zp_inputs_num() const { + if (compressed && quantization_config.is_asymmetric_quantization() && !combine_scales_and_zp) { + return 1; + } else { + return 0; + } } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp index 7d9e919f56cf13..26465692ef6352 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp @@ -22,19 +22,23 @@ struct read_value : public primitive_base { /// @param id This primitive id /// @param inputs Input parameters ids /// @param variable_id Variable id - /// @param output_layout Memory layout + /// @param output_layouts Memory layouts read_value(const primitive_id& id, const std::vector& inputs, const std::string& variable_id, - const layout& output_layout, + const std::vector& 
output_layouts, const ov::element::Type& user_specified_type = ov::element::undefined) - : primitive_base(id, inputs, 1, {optional_data_type{output_layout.data_type}}), + : primitive_base(id, inputs, output_layouts.size()), variable_id{variable_id}, - output_layout{output_layout}, - user_specified_type(user_specified_type) {} + output_layouts{output_layouts}, + user_specified_type(user_specified_type) { + for (size_t output_idx = 0; output_idx < output_layouts.size(); output_idx++) { + output_data_types[output_idx] = optional_data_type(output_layouts[output_idx].data_type); + } + } std::string variable_id; - layout output_layout; + std::vector output_layouts; ov::element::Type user_specified_type; bool operator==(const primitive& rhs) const override { @@ -51,7 +55,9 @@ struct read_value : public primitive_base { primitive_base::save(ob); ov::element::Type_t data_type = user_specified_type; ob << variable_id; - ob << output_layout; + ob << output_layouts.size(); + for (const auto& layout : output_layouts) + ob << layout; ob << make_data(&data_type, sizeof(ov::element::Type_t)); } @@ -59,7 +65,12 @@ struct read_value : public primitive_base { primitive_base::load(ib); ov::element::Type_t data_type = ov::element::Type_t::undefined; ib >> variable_id; - ib >> output_layout; + size_t output_layouts_size; + ib >> output_layouts_size; + output_layouts.resize(output_layouts_size); + for (size_t i = 0; i < output_layouts_size; i++) { + ib >> output_layouts[i]; + } ib >> make_data(&data_type, sizeof(ov::element::Type_t)); user_specified_type = data_type; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp index d66012bfac8889..16b53d026d1c7d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp @@ -5,16 +5,20 @@ #pragma once #include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + namespace cldnn { struct scaled_dot_product_attention : public primitive_base { CLDNN_DECLARE_PRIMITIVE(scaled_dot_product_attention) + using QuantizationConfig = ov::op::internal::QuantizationConfig; + scaled_dot_product_attention() : primitive_base("", {}) {} /// @brief Constructs scaled_dot_product_attention primitive. /// @param id This primitive id. - /// @param inputs Input data primitives id (query, keys, values, [attention_mask], [scale]). + /// @param inputs Input data primitives id (query, keys, values, [attention_mask], [scale], [keys scales], [keys zp], [values scales], [values zp]). /// @param is_causal If true, assumes causal attention masking. In this case attention_mask input is ignored. 
scaled_dot_product_attention(const primitive_id& id, const std::vector inputs, @@ -23,18 +27,30 @@ struct scaled_dot_product_attention : public primitive_base& input_q_transpose_order = {}, const std::vector& input_k_transpose_order = {}, const std::vector& input_v_transpose_order = {}, - const std::vector& output_transpose_order = {}) + const std::vector& output_transpose_order = {}, + bool is_kv_compressed = false, + bool combine_scales_and_zp = false, + const QuantizationConfig& quantization_config = {}) : primitive_base(id, inputs) , is_causal(is_causal) , indirect_axis(indirect_axis) + , is_kv_compressed(is_kv_compressed) + , combine_scales_and_zp(combine_scales_and_zp) + , quantization_config(quantization_config) , input_q_transpose_order(input_q_transpose_order) , input_k_transpose_order(input_k_transpose_order) , input_v_transpose_order(input_v_transpose_order) , output_transpose_order(output_transpose_order) { auto data_inputs_num = inputs.size(); - if (indirect_axis != -1) + if (indirect_axis != -1) { data_inputs_num--; + } + if (is_kv_compressed) { + data_inputs_num -= 2; // scales + if (quantization_config.is_asymmetric_quantization() && !combine_scales_and_zp) + data_inputs_num -= 2; // zp + } has_attn_mask_input = data_inputs_num > 3; has_scale_input = data_inputs_num > 4; } @@ -44,6 +60,10 @@ struct scaled_dot_product_attention : public primitive_base input_q_transpose_order; std::vector input_k_transpose_order; std::vector input_v_transpose_order; @@ -59,6 +79,13 @@ struct scaled_dot_product_attention : public primitive_base::load(ib); ib >> is_causal; + ib >> is_kv_compressed; ib >> has_attn_mask_input; ib >> has_scale_input; ib >> indirect_axis; @@ -100,6 +138,12 @@ struct scaled_dot_product_attention : public primitive_base> input_k_transpose_order; ib >> input_v_transpose_order; ib >> output_transpose_order; + ib >> combine_scales_and_zp; + ib >> quantization_config.group_sizes; + ib >> make_data(&quantization_config.mode, sizeof(quantization_config.mode)); + ib >> make_data(&quantization_config.quantization_dt, sizeof(quantization_config.quantization_dt)); + ib >> make_data(&quantization_config.scale_dt, sizeof(quantization_config.scale_dt)); + ib >> make_data(&quantization_config.zp_dt, sizeof(quantization_config.zp_dt)); } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index c65aa3e5894cb8..465ed898ecb7ec 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -143,6 +143,7 @@ class debug_configuration { int disable_primitive_fusing; // Disable primitive fusing int disable_fake_alignment; // Disable fake alignment std::vector dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization + int use_kv_cache_compression; // Enable KV-cache compression int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size int disable_horizontal_fc_fusion; // Disable fc horizontal fusion std::set dump_iteration; // Dump n-th execution of network. 
diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 5c945f4c2d389c..060f3c18afd6ca 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ov_ops/dynamic_quantize.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" #include "dynamic_quantize_inst.h" #include "primitive_type_base.h" @@ -22,29 +22,40 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no } template -std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size) { - ov::op::internal::DynamicQuantize op; +std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, + const dynamic_quantize::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp) { + ov::intel_gpu::op::DynamicQuantize op; auto output_format = act_layout.format; std::vector input_shapes = { act_layout.get(), }; - std::vector shape_group_size(act_layout.get().size(), 1); - shape_group_size.back() = group_size; + auto output_shapes = ov::intel_gpu::op::DynamicQuantize::shape_infer(&op, input_shapes, config, scales_zp_output_order, combine_scales_and_zp); - auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, shape_group_size); + std::vector output_layouts = { layout(output_shapes[0], config.quantization_dt, output_format), + layout(output_shapes[1], config.scale_dt, output_format) }; - return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) { + output_layouts.emplace_back(layout(output_shapes[2], config.zp_dt, output_format)); + } + + return output_layouts; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size); +template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, + const dynamic_quantize::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); const auto& input_layout = impl_param.get_input_layout(); - return __calc_output_layouts(input_layout, UINT64_MAX /* TODO: handle group_size here */); + + return __calc_output_layouts(input_layout, desc->quantization_config, desc->scales_zp_output_order, desc->combine_scales_and_zp); } template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, @@ -56,6 +67,14 @@ std::string dynamic_quantize_inst::to_string(dynamic_quantize_node const& node) std::stringstream primitive_description; + json_composite dynamic_quantize_info; + dynamic_quantize_info.add("combine_scales_and_zp", desc->combine_scales_and_zp); + dynamic_quantize_info.add("scales_zp_output_order", desc->scales_zp_output_order); + dynamic_quantize_info.add("quantization_dt", desc->quantization_config.quantization_dt); + dynamic_quantize_info.add("scale_dt", desc->quantization_config.scale_dt); + dynamic_quantize_info.add("zp_dt", desc->quantization_config.zp_dt); + dynamic_quantize_info.add("is_asymmetric_quantization", desc->quantization_config.is_asymmetric_quantization()); + 
node_info->add("dynamic_quantize info", dynamic_quantize_info); node_info->dump(primitive_description); return primitive_description.str(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 7bdbc53ad54d16..e92eefa5b01ec9 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -423,10 +423,11 @@ bool crop_in_place_optimization::can_crop_be_optimized_simple_data_format(const } static bool can_read_value_be_optimize(const read_value_node& node) { - if (node.get_users().size() == 1) + std::unordered_set unique_users(node.get_users().begin(), node.get_users().end()); + if (unique_users.size() == 1) return true; - const auto non_shape_of_users_count = std::count_if(node.get_users().begin(), node.get_users().end(), [](const program_node* user) { + const auto non_shape_of_users_count = std::count_if(unique_users.begin(), unique_users.end(), [](const program_node* user) { return !user->is_type(); }); if (non_shape_of_users_count <= 1) @@ -877,18 +878,39 @@ void prepare_buffer_fusing::run(program& p) { node.set_output_layout(kv_out_layout); node.can_share_buffer(false); - auto update_dep = [&info_dynamic_pad](program_node* dep) { - auto prev_layout = dep->get_output_layout(); + auto update_dep = [](program_node* dep, padding::DynamicDimsMask& info_dynamic_pad, size_t idx) { + auto prev_layout = dep->get_output_layout(true, idx); prev_layout.data_padding._dynamic_dims_mask = info_dynamic_pad; - dep->set_output_layout(prev_layout); + dep->set_output_layout(prev_layout, true, idx); dep->can_share_buffer(false); }; + auto update_scale_zp = [&](size_t kv_cache_output_idx, size_t read_value_output_idx) { + auto scales_out_layout = node.get_output_layout(false, kv_cache_output_idx); + + const size_t scales_zp_concat_axis = 2; + padding::DynamicDimsMask info_dynamic_pad_scales; + info_dynamic_pad_scales[scales_zp_concat_axis] = 1; + scales_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad_scales; + node.set_output_layout(scales_out_layout, true, kv_cache_output_idx); + + update_dep(rv_prim, info_dynamic_pad_scales, read_value_output_idx); + }; + if (rv_prim) { - update_dep(rv_prim); + update_dep(rv_prim, info_dynamic_pad, 0); } if (gather_prim) { - update_dep(gather_prim); + update_dep(gather_prim, info_dynamic_pad, 0); + } + + const auto& desc = node.get_primitive(); + if (desc->compressed) { + update_scale_zp(2, 1); + + if (desc->quantization_config.is_asymmetric_quantization() && !desc->combine_scales_and_zp) { + update_scale_zp(3, 2); + } } } }); @@ -922,7 +944,7 @@ void prepare_buffer_fusing::run(program& p) { // TODO: Allow optimizations for the case above too. Looks like it can be achieved by more careful // topological sort (i.e. 
if we ensure that all read_value users are completed before assign is run) node.can_be_optimized(can_read_value_be_optimize(node)); - GPU_DEBUG_TRACE_DETAIL << "[prepare_buffer_fusing] : " << node.id() << " can be optimized" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "[prepare_buffer_fusing] : " << node.id() << " can be optimized = " << node.can_be_optimized() << std::endl; }); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp index 5692b6037a09e0..da7e95c4ab74a5 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp @@ -6,6 +6,8 @@ #include "impls/registry/implementation_map.hpp" #include "register.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" + namespace cldnn { namespace cpu { @@ -64,7 +66,21 @@ struct read_value_impl : public typed_primitive_impl { } if (!instance.can_be_optimized()) { - return instance.output_memory(0).copy_from(stream, *variable.get_memory(), false); + GPU_DEBUG_TRACE_DETAIL << "Copy variable's memory to new read_value's output buffer\n"; + std::vector res_events; + res_events.push_back(instance.output_memory(0).copy_from(stream, *variable.get_memory(), false)); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + res_events.push_back(instance.output_memory(1).copy_from(stream, *scales_state->get_memory(), false)); + + if (compressed_cache_variable->has_zp_state()) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + res_events.push_back(instance.output_memory(1).copy_from(stream, *zp_state->get_memory(), false)); + } + } + + return stream.aggregate_events(res_events, res_events.size() > 1); } return instance.get_network().get_stream().create_user_event(true); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp index c3d436eb9c9b8d..d33b4b6dade34c 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -34,7 +34,6 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { } static kernel_params_t get_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { - /// TODO: handle group_size here auto params = get_default_params(impl_param, is_shape_agnostic); params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); @@ -45,6 +44,16 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { auto& fc_node = user_node->as(); params.fc_ifm_size = fc_node.weights().get_output_layout().feature(); } + + if (impl_param.output_layouts.size() > 2) + params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(2))); + + const auto& desc = impl_param.typed_desc(); + params.group_sizes = desc->quantization_config.group_sizes; + params.scales_output_order = desc->scales_zp_output_order; + params.use_asymmetric_quantization = desc->quantization_config.is_asymmetric_quantization(); + params.combine_scales_and_zp = desc->combine_scales_and_zp; + return params; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index e4e4adfbb15452..2ce0b8a5e46b2d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp 
@@ -9,12 +9,17 @@ #include "multi_stage_primitive.hpp" #include "kv_cache_inst.h" +#include "dynamic_quantize_inst.h" #include "concatenation/concatenation_kernel_selector.h" #include "concatenation/concatenation_kernel_base.h" #include "beam_table_update/beam_table_update_kernel_selector.hpp" #include "beam_table_update/beam_table_update_kernel_ref.hpp" +#include "dynamic_quantize/dynamic_quantize_kernel_selector.h" +#include "dynamic_quantize/dynamic_quantize_kernel_kv_cache.h" #include "openvino/core/dimension.hpp" +#include + namespace cldnn { namespace ocl { @@ -57,6 +62,9 @@ struct kv_cache_impl : multi_stage_primitive { using bt_kernel_selector_t = kernel_selector::beam_table_update_kernel_selector; using bt_kernel_params_t = kernel_selector::beam_table_update_params; + using dq_kernel_selector_t = kernel_selector::dynamic_quantize_kernel_selector; + using dq_kernel_params_t = kernel_selector::dynamic_quantize_params; + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::kv_cache_impl) std::unique_ptr clone() const override { @@ -65,6 +73,9 @@ struct kv_cache_impl : multi_stage_primitive { const size_t concat_stage = 0; const size_t beam_table_stage = 1; + const size_t dq_stage = 2; + const size_t scale_concat_stage = 3; + const size_t zp_concat_stage = 4; cldnn::memory::ptr beam_table_prev = nullptr; cldnn::memory::ptr beam_table_new = nullptr; @@ -75,16 +86,30 @@ struct kv_cache_impl : multi_stage_primitive { auto& kernel_selector = kernel_selector_t::Instance(); auto kernel_impl = kernel_selector.GetImplementation(_kernels_data[concat_stage].kernelName); kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[concat_stage]); - if (_kernels_data.size() == 2) { + if (_kernels_data.size() >= 2) { auto& bt_kernel_selector = bt_kernel_selector_t::Instance(); auto bt_kernel_impl = bt_kernel_selector.GetImplementation(_kernels_data[beam_table_stage].kernelName); bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[beam_table_stage]); } + + if (_kernels_data.size() >= 3) { + auto& dq_kernel_selector = dq_kernel_selector_t::Instance(); + auto dq_kernel_impl = dq_kernel_selector.GetImplementation(_kernels_data[dq_stage].kernelName); + dq_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[dq_stage]); + } + + if (_kernels_data.size() >= 4) { + auto& scale_zp_concat_kernel_selector = kernel_selector_t::Instance(); + auto scale_zp_concat_kernel_impl = scale_zp_concat_kernel_selector.GetImplementation(_kernels_data[scale_concat_stage].kernelName); + scale_zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[scale_concat_stage]); + } } } void set_arguments_impl(kv_cache_inst& instance) override {} kernel_arguments_data get_arguments(const kv_cache_inst& instance, size_t stage) const override { + // input buffers order: [past, new, (beam_table), (past_scale), (past_zp)] + // output buffers order: [current, (beam_table), (current_scale), (current_zp)] kernel_arguments_data args; args.shape_info = instance.shape_info_memory_ptr(); if (stage == concat_stage) { @@ -93,12 +118,27 @@ struct kv_cache_impl : multi_stage_primitive { } else if (stage == beam_table_stage) { args.inputs = { beam_table_prev, instance.input_memory_ptr(2) }; args.outputs = { beam_table_new }; + } else if (stage == dq_stage) { + args.inputs = { instance.input_memory_ptr(1) }; + args.outputs = { instance.output_memory_ptr(0) }; + for (size_t i = 2; i < instance.outputs_memory_count(); i++) { + args.outputs.push_back(instance.output_memory_ptr(i)); + } + } else if (stage == scale_concat_stage) { + args.inputs = { 
instance.input_memory_ptr(3) }; + args.outputs = { instance.output_memory_ptr(2) }; + } else if (stage == zp_concat_stage) { + args.inputs = { instance.input_memory_ptr(4) }; + args.outputs = { instance.output_memory_ptr(3) }; } - return args; } - void execute_stage(const std::vector& events, kv_cache_inst& instance, std::vector& all_events, size_t stage) { + void execute_stage(const std::vector& events, + kv_cache_inst& instance, + std::vector& all_events, + size_t stage, + size_t arguments_set = SIZE_MAX) { stream& stream = instance.get_network().get_stream(); std::vector tmp_events(events); size_t kernel_offset = 0; @@ -114,7 +154,8 @@ struct kv_cache_impl : multi_stage_primitive { bool needs_completion_event = instance.needs_completion_event(); auto& params = _kernels_data[stage].kernels[kd_idx].params; - auto args = get_arguments(instance, stage); + auto custom_arguments = arguments_set == SIZE_MAX ? stage : arguments_set; + auto args = get_arguments(instance, custom_arguments); args.scalars = ¶ms.scalars; for (const auto& m : instance.get_intermediates_memories()) { @@ -183,18 +224,58 @@ struct kv_cache_impl : multi_stage_primitive { beam_table_state->set(); } + if (desc->compressed) { + // Copy scales to the new buffer if needed + execute_stage(events, instance, res_events, scale_concat_stage, scale_concat_stage); + + if (desc->get_compression_zp_inputs_num() > 0) { + // Copy zero points to the new buffer if needed + execute_stage(events, instance, res_events, scale_concat_stage, zp_concat_stage); + } + + // Perform dynamic quantization of new token data and append result to the KV-cache + auto dq_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic()); + (_kernels_data[dq_stage].update_dispatch_data_func)(dq_params, _kernels_data[dq_stage]); + execute_stage(events, instance, res_events, dq_stage); + + auto compressed_cache_variable = dynamic_cast(&variable); + compressed_cache_variable->get_compression_scale_state()->set(); + + if (desc->get_compression_zp_inputs_num() > 0) { + compressed_cache_variable->get_compression_zp_state()->set(); + } + } + variable.set(); if (can_be_optimized) { GPU_DEBUG_TRACE_DETAIL << desc->id << " : Output is same as variable memory! 
Skip copying " << std::endl; // When primitive is optimized, concat kernel writes directly to variable memory return stream.aggregate_events(res_events, res_events.size() > 1); } else { - // Othwerise, we need to copy result from out buffer to state memory - GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable meomry" << std::endl; + // Otherwise, we need to copy result from out buffer to state memory + GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable memory" << std::endl; stream.enqueue_barrier(); + + std::vector res_events; auto out = instance.get_network().get_engine().reinterpret_buffer(instance.output_memory(0), variable.get_memory()->get_layout()); - return variable.get_memory()->copy_from(stream, *out, false); + res_events.push_back(variable.get_memory()->copy_from(stream, *out, false)); + + if (desc->compressed) { + auto compressed_cache_variable = dynamic_cast(&variable); + + auto scale_state = compressed_cache_variable->get_compression_scale_state(); + auto out_scale_mem = instance.get_network().get_engine().reinterpret_buffer(instance.output_memory(2), scale_state->get_memory()->get_layout()); + res_events.push_back(scale_state->get_memory()->copy_from(stream, *out_scale_mem, false)); + + if (desc->get_compression_zp_inputs_num() > 0) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + auto out_zp_mem = instance.get_network().get_engine().reinterpret_buffer(instance.output_memory(3), zp_state->get_memory()->get_layout()); + res_events.push_back(zp_state->get_memory()->copy_from(stream, *out_zp_mem, false)); + } + } + + return stream.aggregate_events(res_events, res_events.size() > 1); } } @@ -264,10 +345,14 @@ struct kv_cache_impl : multi_stage_primitive { params.is_state_set = is_state_set; params.indirect_axis = indirect_axis; - const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, beam_table_past]] - const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present] + const auto& desc = impl_param.typed_desc(); + const auto compression_inputs = desc->get_compression_scales_inputs_num() + desc->get_compression_zp_inputs_num(); + const auto beam_table_past_idx = 3 + compression_inputs; + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, [scale_past], [zp_past], beam_table_past]] + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present, compression_scale_present] std::map in_tensor_to_offset_map = { - {0, in_offsets_map.at(3)}, // beam_table_past + {0, in_offsets_map.at(beam_table_past_idx)}, // beam_table_past {1, in_offsets_map.at(2)}, // beam_idx }; std::map out_tensor_to_offset_map = { @@ -279,17 +364,108 @@ struct kv_cache_impl : multi_stage_primitive { return params; } + static dq_kernel_params_t get_dq_update_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param, is_shape_agnostic); + + params.append_axis = primitive->concat_axis; + params.group_sizes = primitive->quantization_config.group_sizes; + params.scales_output_order = primitive->scales_zp_output_order; + params.use_asymmetric_quantization = primitive->quantization_config.is_asymmetric_quantization(); + params.combine_scales_and_zp = primitive->combine_scales_and_zp; + + const auto& past_kv_cache_shape = 
impl_param.input_layouts[0].get_partial_shape(); + params.axis_offset = past_kv_cache_shape[primitive->concat_axis].is_static() ? past_kv_cache_shape[primitive->concat_axis].get_length() : 0; + + auto inputs_count = 1; + auto outputs_count = 2; + params.inputs.resize(inputs_count); + params.outputs.resize(outputs_count); + + auto current_token_layout = impl_param.input_layouts[1]; + auto present_layout = impl_param.output_layouts[0]; + auto present_scales_layout = impl_param.output_layouts[2]; + params.inputs[0] = convert_data_tensor(current_token_layout); + params.outputs[0] = convert_data_tensor(present_layout); + params.outputs[1] = convert_data_tensor(present_scales_layout); + + const bool has_zp_output_buffer = primitive->get_compression_zp_inputs_num() > 0; + if (has_zp_output_buffer) { + auto present_zp_layout = impl_param.output_layouts[3]; + params.outputs.resize(outputs_count + 1); + params.outputs[2] = convert_data_tensor(present_zp_layout); + } + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(1)}, // kv_new_token + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(0)}, // compressed_kv_present + {1, out_offsets_map.at(2)}, // compression_scale_present + }; + + if (has_zp_output_buffer) { + out_tensor_to_offset_map.emplace(2, out_offsets_map.at(3)); // compression_zp_present + } + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + return params; + } + + static kernel_params_t get_compression_scale_update_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + auto params = get_default_params(impl_param, is_shape_agnostic); + + const auto concat_axis = 2; + params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank()); + + auto inputs_count = 1; + auto comp_scale_past_layout = impl_param.input_layouts[3]; + auto comp_scale_present_layout = impl_param.output_layouts[2]; + + params.inputs.resize(inputs_count); + params.inputs[0] = convert_data_tensor(comp_scale_past_layout); + params.outputs[0] = convert_data_tensor(comp_scale_present_layout); + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(3)}, // compression_scale_past + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(2)}, // compression_scale_present + }; + + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + return params; + } + static std::unique_ptr create(const typed_program_node& arg, const kernel_impl_params& impl_param) { std::vector kernels_data; auto concat_kernel_params = get_concat_kernel_params(impl_param, impl_param.is_dynamic()); auto& concat_kernel_selector = kernel_selector_t::Instance(); kernels_data.push_back(concat_kernel_selector.get_best_kernel(concat_kernel_params)); const bool indirect = impl_param.typed_desc()->indirect; + const bool compressed = impl_param.typed_desc()->compressed; if (indirect) { auto bt_update_kernel_params = get_bt_update_kernel_params(impl_param, false); auto& bt_update_kernel_selector = bt_kernel_selector_t::Instance(); kernels_data.push_back(bt_update_kernel_selector.get_best_kernel(bt_update_kernel_params)); } + + if (compressed) { + auto dq_kernel_params = 
get_dq_update_kernel_params(impl_param, impl_param.is_dynamic()); + auto& dq_kernel_selector = dq_kernel_selector_t::Instance(); + kernels_data.push_back(dq_kernel_selector.get_best_kernel(dq_kernel_params)); + + auto concat_scale_zp_kernel_params = get_compression_scale_update_kernel_params(impl_param, impl_param.is_dynamic()); + auto& concat_scale_zp_kernel_selector = kernel_selector_t::Instance(); + kernels_data.push_back(concat_scale_zp_kernel_selector.get_best_kernel(concat_scale_zp_kernel_params)); + } return cldnn::make_unique(kernels_data); } @@ -307,13 +483,26 @@ struct kv_cache_impl : multi_stage_primitive { (_kernels_data[concat_stage].update_dispatch_data_func)(params, _kernels_data[concat_stage]); _kernels_data[concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(0).count() == 0; + + if (impl_param.typed_desc()->compressed) { + // In case of KV-cache with compression enabled, skip second concat's kernel as new token data append will + // be handled by dynamic quantization kernel + // However, allow execution of the first token for the case if KV-cache can't be optimized (if optimization is disabled, or + // variables memory was reallocated and we have to copy past KV-cache to new memory) + _kernels_data[concat_stage].kernels[1].skip_execution = true; + + // Update dynamic quantization parameters + auto comp_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, impl_param.is_dynamic()); + (_kernels_data[scale_concat_stage].update_dispatch_data_func)(comp_scale_kernel_params, _kernels_data[scale_concat_stage]); + _kernels_data[scale_concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(3).count() == 0; + } } }; namespace detail { attach_kv_cache_impl::attach_kv_cache_impl() { - auto types = { data_types::f16, data_types::f32 }; + auto types = { data_types::i8, data_types::f16, data_types::f32 }; auto formats = { format::bfyx }; implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index b33871110ec6b4..1516ac66293eac 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -133,6 +133,20 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_causal; + if (desc->is_kv_compressed) { + const auto& group_sizes = desc->quantization_config.group_sizes; + const auto non_compressed_dims = std::count(group_sizes.begin(), group_sizes.end(), 1); + + config.per_head_quantization = (group_sizes.size() - non_compressed_dims) == 1; + config.is_kv_compressed = desc->is_kv_compressed; + config.use_asymmetric_quantization = desc->quantization_config.is_asymmetric_quantization(); + config.combine_scales_and_zp = desc->combine_scales_and_zp; + } + return config; } @@ -229,6 +253,15 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_kv_compressed) { + data_inputs_num -= 2; // key and value compression scales are handled separately + + has_zp_input_buffers = desc->quantization_config.is_asymmetric_quantization() && !desc->combine_scales_and_zp; + if (has_zp_input_buffers) + data_inputs_num -= 2; // key and value compression zp are handled separately + } + params.inputs.resize(data_inputs_num); for (size_t i = 0; i < data_inputs_num; i++) { params.inputs[i] = 
convert_data_tensor(impl_param.get_input_layout(i)); @@ -246,15 +279,41 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveindirect_axis; } - params.set_dynamic_shape_offsets(); + if (desc->is_kv_compressed) { + params.key_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num)); + params.value_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 1)); - // Need to adjust sdpa kernel offset to consider beam table input - if (has_indirect_inputs(impl_param)) { - auto out_offset = params.outputs[0].get_dynamic_shape_offset(); - if (indirect) - params.beam_table.SetDynamicShapeOffset(out_offset); + if (has_zp_input_buffers) { + params.key_cache_comp_zp = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 2)); + params.value_cache_comp_zp = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 3)); + } + } - params.outputs[0].SetDynamicShapeOffset(out_offset + kernel_selector::DataTensor::max_rank()); + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + std::map in_tensor_to_offset_map; + for (size_t i = 0; i < data_inputs_num; i++) { + in_tensor_to_offset_map[i] = in_offsets_map.at(i); + } + + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(0)}, + }; + + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + if (desc->is_kv_compressed) { + params.key_cache_comp_scale.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num)); + params.value_cache_comp_scale.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 1)); + + if (has_zp_input_buffers) { + params.key_cache_comp_zp.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 2)); + params.value_cache_comp_zp.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 3)); + } + } + + if (indirect && has_indirect_inputs(impl_param)) { + params.beam_table.SetDynamicShapeOffset(get_beam_table_id(desc)); } return params; @@ -300,6 +359,7 @@ attach_scaled_dot_product_attention_impl::attach_scaled_dot_product_attention_im auto types = { data_types::f32, data_types::f16, + data_types::i8, }; auto formats = { diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index 49dd62c6332549..bb8ebd093fa696 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,10 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(const layout &act_layout, uint64_t group_size); + static std::vector __calc_output_layouts(const layout &act_layout, + const dynamic_quantize::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h index f3aa4de5ec34e1..da0a9397433f89 100644 --- a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h @@ -62,6 +62,12 @@ class typed_primitive_inst : public typed_primitive_inst_base= 0 ? 
sequence_axis : past_layout_rank + sequence_axis; } + static int64_t get_scale_zp_sequence_axis() { + // The order of scales and zero points is fixed, so use constant axis + const auto scale_zp_concat_axis = 2; + return scale_zp_concat_axis; + } + static int64_t get_max_pad(const layout& target_layout, size_t buffer_size, int64_t sequence_axis, std::string target_name = "") { if (buffer_size == 0) return 0; diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index ba5363f09f194b..8105a8bc07dec3 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -237,7 +237,7 @@ struct program_node { } void merge_output_padding(padding const& padd, size_t idx = 0) { - set_output_padding(padding::max(padd, output_layouts[idx].data_padding)); + set_output_padding(padding::max(padd, output_layouts[idx].data_padding), idx); } // only calculated output layout (for external usage), does not modify/use cached output layout nor invalidate users diff --git a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h index 74f9ffff581b87..ea0c8b82bb21fa 100644 --- a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h @@ -33,9 +33,14 @@ class typed_primitive_inst : public typed_primitive_inst_base static std::vector calc_output_layouts(read_value_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); - const auto& default_layout = desc->output_layout; + std::vector output_layouts; - return { impl_param.state_layout.value_or(default_layout) }; + for (size_t i = 0; i < desc->num_outputs; i++) { + const auto& default_layout = desc->output_layouts[i]; + output_layouts.push_back(impl_param.state_layouts.size() > i ? impl_param.state_layouts[i] : default_layout); + } + + return output_layouts; } static layout calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param); diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 66a874b9b153ec..1e5c38634eef59 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -36,16 +36,24 @@ std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& /*no std::vector input_shapes = {impl_param.get_input_layout(0).get(), impl_param.get_input_layout(1).get()}; - if (desc->num_outputs > 1) + if (desc->indirect) { input_shapes.push_back(impl_param.get_input_layout(2).get()); + } + + if (desc->compressed) { + input_shapes.push_back(impl_param.get_input_layout(3).get()); - std::vector output_shapes = shape_infer(&op, input_shapes); + if (desc->get_compression_zp_inputs_num() > 0) { + input_shapes.push_back(impl_param.get_input_layout(4).get()); + } + } - static const std::map ports_map = {{0, 0}, {1, 2}}; + std::vector output_shapes = desc->compressed ? 
shape_infer(&op, input_shapes, desc->quantization_config, desc->scales_zp_output_order, desc->combine_scales_and_zp) + : shape_infer(&op, input_shapes); std::vector out_layouts; for (size_t i = 0; i < desc->num_outputs; i++) { - auto out_type = desc->output_data_types[i].value_or(impl_param.get_input_layout(ports_map.at(i)).data_type); + auto out_type = desc->output_data_types[i].value(); out_layouts.emplace_back(output_shapes[i], out_type, impl_param.get_output_layout(i).format); } @@ -64,6 +72,9 @@ std::string kv_cache_inst::to_string(const kv_cache_node& node) { kv_cache_info.add("concat axis", node.get_primitive()->concat_axis); kv_cache_info.add("gather axis", node.get_primitive()->gather_axis); kv_cache_info.add("indirect", node.get_primitive()->indirect); + kv_cache_info.add("compressed", node.get_primitive()->compressed); + kv_cache_info.add("combine_scales_and_zp", node.get_primitive()->combine_scales_and_zp); + kv_cache_info.add("scales_zp_output_order", node.get_primitive()->scales_zp_output_order); node_info->add("kv_cache info", kv_cache_info); std::stringstream primitive_description; node_info->dump(primitive_description); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index f90d4e34b08cc2..a8f8ecbe528850 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -40,6 +40,7 @@ #include "graph_optimizer/prepare_buffer_fusing.h" #include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" #include "intel_gpu/graph/network.hpp" #include "intel_gpu/graph/serialization/set_serializer.hpp" #include "intel_gpu/runtime/engine.hpp" @@ -294,30 +295,54 @@ void primitive_inst::update_shape() { auto prim = get_node().as().get_primitive(); const auto& variable_id = prim->variable_id; auto& variable = get_network().get_variable(variable_id); - // Initial variable shape is taken from variable itself - auto new_layout = variable.get_layout(); - // If variable is not set and we have an initializer - use it's shape as shape of variable - if (!variable.is_set() && _impl_params->input_layouts.size() == 1) { - new_layout = _impl_params->get_input_layout(0); - } + auto update_state_layout = [&](ov::intel_gpu::VariableStateBase& variable, layout new_layout, size_t layout_idx) { + // If variable is not set and we have an initializer - use it's shape as shape of variable + if (!variable.is_set() && _impl_params->input_layouts.size() > layout_idx) { + new_layout = _impl_params->get_input_layout(layout_idx); + } - // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 - if (new_layout.is_dynamic()) { - auto pshape = new_layout.get_partial_shape(); - for (auto& d : pshape) { - if (d.is_dynamic()) { - d = 0; + // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 + if (new_layout.is_dynamic()) { + auto pshape = new_layout.get_partial_shape(); + for (auto& d : pshape) { + if (d.is_dynamic()) { + d = 0; + } } + new_layout.set_partial_shape(pshape); } - new_layout.set_partial_shape(pshape); - } - variable.set_layout(new_layout); + variable.set_layout(new_layout); - if (!_impl_params->state_layout.has_value() || _impl_params->state_layout.value() != new_layout) { - _impl_params->state_layout = new_layout; - input_shape_changed = true; + if (_impl_params->state_layouts[layout_idx] != 
new_layout) { + _impl_params->state_layouts[layout_idx] = new_layout; + GPU_DEBUG_TRACE_DETAIL << "Update " << layout_idx << " layout: " << new_layout.to_short_string() << "\n"; + input_shape_changed = true; + } + }; + + if (_impl_params->state_layouts.empty()) + _impl_params->state_layouts.resize(1); + + // Initial variable shape is taken from variable itself + auto new_layout = variable.get_layout(); + update_state_layout(variable, new_layout, 0); + + if (prim->num_outputs > 1) { + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _impl_params->state_layouts.resize(compressed_cache_variable->has_zp_state() ? 3 : 2); + + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + auto new_scales_layout = compressed_cache_variable->get_compression_scale_state()->get_layout(); + update_state_layout(*scales_state, new_scales_layout, 1); + + if (compressed_cache_variable->has_zp_state()) { + auto scales_state = compressed_cache_variable->get_compression_zp_state(); + auto new_zp_layout = compressed_cache_variable->get_compression_zp_state()->get_layout(); + update_state_layout(*scales_state, new_zp_layout, 2); + } + } } } @@ -462,6 +487,14 @@ void primitive_inst::update_shape() { auto& variable = get_network().get_variable(desc->variable_id); // Custom output layout update as update_output_layout handles paddings incorrectly for optimized out read_value + kv_cache pattern _impl_params->output_layouts[0] = variable.get_layout(); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _impl_params->output_layouts[1] = compressed_cache_variable->get_compression_scale_state()->get_layout(); + + if (compressed_cache_variable->has_zp_state()) { + _impl_params->output_layouts[2] = compressed_cache_variable->get_compression_zp_state()->get_layout(); + } + } } if (get_node().is_type()) { @@ -544,6 +577,15 @@ event::ptr primitive_inst::realloc_if_needed() { << ", variable layout " << variable.get_layout().to_short_string() << ")" << std::endl; _outputs[0] = variable.get_memory(); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _outputs[2] = compressed_cache_variable->get_compression_scale_state()->get_memory(); + + if (compressed_cache_variable->has_zp_state()) { + _outputs[3] = compressed_cache_variable->get_compression_zp_state()->get_memory(); + } + } + // To record shape predictor for (size_t j = 0; j < _impl_params->output_layouts.size(); ++j) sp.predict_preallocation_shape(id(), _impl_params->output_layouts[j], true, j); @@ -563,11 +605,27 @@ event::ptr primitive_inst::realloc_if_needed() { GPU_DEBUG_TRACE_DETAIL << id() << ": Update variable (ptr: " << variable.get_memory()->buffer_ptr() << ", actual_size:" << variable.get_actual_mem_size() << " bytes" << ", variable layout:" << variable.get_layout().to_short_string() << ")" << std::endl; + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + compressed_cache_variable->get_compression_scale_state()->set_layout(_impl_params->output_layouts[1]); + + if (compressed_cache_variable->has_zp_state()) { + compressed_cache_variable->get_compression_zp_state()->set_layout(_impl_params->output_layouts[2]); + } + } } // For nodes that can be optimized, variable memory is used as output memory // so there is no need for output memory reallocation if (can_be_optimized()) { _max_output_layout_count[0] = variable.get_actual_mem_size() / dt_sizes_in_B[0]; + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + const size_t scale_idx = _node->is_type() ? 
1 : 2; // kv_cache or read_value + _max_output_layout_count[scale_idx] = compressed_cache_variable->get_compression_scale_state()->get_actual_mem_size() / dt_sizes_in_B[1]; + if (compressed_cache_variable->has_zp_state()) { + _max_output_layout_count[scale_idx + 1] = compressed_cache_variable->get_compression_zp_state()->get_actual_mem_size() / dt_sizes_in_B[2]; + } + } GPU_DEBUG_PROFILED_STAGE_MEMALLOC_INFO("can_be_optimized"); return ev; } @@ -646,7 +704,12 @@ event::ptr primitive_inst::realloc_if_needed() { // dynamic quantization is only applied to activation of FC if (get_node().is_type()) { - auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], 0); + const auto& desc = get_node().as().get_primitive(); + auto dyn_quan_scale_layout = + dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], + desc->quantization_config, + desc->scales_zp_output_order, + desc->combine_scales_and_zp); GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " << dyn_quan_scale_layout[1].to_short_string() << std::endl; updated_params.output_layouts[1] = dyn_quan_scale_layout[1]; @@ -671,13 +734,24 @@ event::ptr primitive_inst::realloc_if_needed() { // update layout to ensure that it repsects paddings for correct allocation size if (_node_output_layout.data_padding.is_dynamic()) { - auto current_dims = updated_layouts[0].get_padded_dims(); + auto update_padding = [](layout& orig_layout) { + auto current_dims = orig_layout.get_padded_dims(); + + std::vector current_buf_shape; + current_buf_shape.reserve(current_dims.size()); + std::transform(current_dims.begin(), current_dims.end(), + std::back_inserter(current_buf_shape), [](const tensor::value_type& el) { return static_cast(el); }); + orig_layout = layout(ov::PartialShape(current_buf_shape), orig_layout.data_type, orig_layout.format); + }; - std::vector current_buf_shape; - current_buf_shape.reserve(current_dims.size()); - std::transform(current_dims.begin(), current_dims.end(), - std::back_inserter(current_buf_shape), [](const tensor::value_type& el) { return static_cast(el); }); - updated_layouts[0] = layout(ov::PartialShape(current_buf_shape), updated_layouts[0].data_type, updated_layouts[0].format); + update_padding(updated_layouts[0]); + + // Update scales and zero points buffers paddings, skipping beam_table + if (_node->is_type()) { + for (size_t i = 2; i < updated_layouts.size(); ++i) { + update_padding(updated_layouts[i]); + } + } } int32_t tmp_prealloc_count = get_prealloc_iter_num(); @@ -690,13 +764,14 @@ event::ptr primitive_inst::realloc_if_needed() { for (size_t i = 0; i < updated_layouts.size(); ++i) { bool reclaim = 0; size_t required_buffer_size = 0; - if (_node->is_type() && i == 0) { + if (_node->is_type() && i != 1) { // Relax reclaiming condition for kv cache const auto& desc = _node->as().get_primitive(); auto prealloc_shape = updated_layouts[i].get_shape(); const auto shape_rank = prealloc_shape.size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + const auto seq_axis = i == 0 ? 
kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank) + : kv_cache_inst::get_scale_zp_sequence_axis(); + prealloc_shape[seq_axis] += tmp_prealloc_count; required_buffer_size = std::accumulate(prealloc_shape.begin(), prealloc_shape.end(), size_t(1), std::multiplies()); } else { @@ -723,11 +798,12 @@ event::ptr primitive_inst::realloc_if_needed() { for (size_t i = 0; i < actual_layouts.size(); ++i) { bool can_reuse_buffer = (_outputs[i] && updated_layouts[i].get_linear_size() <= _max_output_layout_count[i]); std::pair prealloc_info; - if (_node->is_type() && i == 0) { + if (_node->is_type() && i != 1) { const auto& desc = _node->as().get_primitive(); - auto shape_rank = updated_layouts[i].get_shape().size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + const auto shape_rank = updated_layouts[i].get_shape().size(); + const auto seq_axis = i == 0 ? kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank) + : kv_cache_inst::get_scale_zp_sequence_axis(); + prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], false, i, tmp_prealloc_count, seq_axis); } else { prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], can_reuse_buffer, i, tmp_prealloc_count); @@ -743,19 +819,21 @@ event::ptr primitive_inst::realloc_if_needed() { GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer[" << i << "] - " << actual_layouts[i].get_linear_size() << "/" << _max_output_layout_count[i] << std::endl; - if (_node->is_type() && (i == 0)) { + if (_node->is_type() && i != 1) { // kv_cache has already assigned memory. // No need to reinterpret output memory but need to update padding const auto& desc = _node->as().get_primitive(); auto& present_layout = _impl_params->output_layouts[i]; const auto present_layout_rank = present_layout.get_partial_shape().size(); - const auto sequence_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank); + const auto sequence_axis = i == 0 ? kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank) + : kv_cache_inst::get_scale_zp_sequence_axis();; + auto max_pad = kv_cache_inst::get_max_pad(present_layout, _max_output_layout_count[i], sequence_axis, - "present_layout"); + i == 0 ? "present_layout" : "present_scales_layout"); kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); - GPU_DEBUG_TRACE_DETAIL << _impl_params->output_layouts[i].to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << i << ". 
" << _impl_params->output_layouts[i].to_string() << std::endl; set_shape_change(); } else { _outputs[i] = _network.get_engine().reinterpret_buffer(*_outputs[i], actual_layouts[i]); @@ -816,6 +894,22 @@ event::ptr primitive_inst::realloc_if_needed() { sequence_axis, "present_layout"); if (max_pad > 0) { + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto present_scales_layout = _impl_params->output_layouts[2]; + const auto sequence_axis = kv_cache_inst::get_scale_zp_sequence_axis();; + + kv_cache_inst::update_pad(present_scales_layout, max_pad, sequence_axis); + if (!axis_is_outer_most) { + _impl_params->output_layouts[2] = present_scales_layout; + } + + compressed_cache_variable->get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_memory(_outputs[3], present_zp_layout); + } + } + kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); if (!axis_is_outer_most) { GPU_DEBUG_TRACE_DETAIL << id() << ": Update impl with new output padding" << std::endl; @@ -836,12 +930,32 @@ event::ptr primitive_inst::realloc_if_needed() { << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_memory(_outputs[0], present_layout); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto present_scales_layout = _impl_params->output_layouts[2]; + + compressed_cache_variable->get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_memory(_outputs[3], present_zp_layout); + } + } } } else { GPU_DEBUG_TRACE_DETAIL << id() << ": Update variable " << variable.get_name() << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_layout(present_layout); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto present_scales_layout = _impl_params->output_layouts[2]; + + compressed_cache_variable->get_compression_scale_state()->set_layout(present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_layout(present_zp_layout); + } + } } } @@ -1017,8 +1131,8 @@ bool primitive_inst::update_impl(bool use_async_compilation) { } void primitive_inst::update_paddings() { - auto reset_pad = [](kernel_impl_params& params, const program_node* node) { - params.output_layouts[0].data_padding = node->get_output_layout(0).data_padding; + auto reset_pad = [](kernel_impl_params& params, const program_node* node, size_t idx = 0) { + params.output_layouts[idx].data_padding = node->get_output_layout(idx).data_padding; }; if (_node->is_type() || _node->is_type()) { auto variable_id = _node->is_type() ? (_node->as().get_primitive()->variable_id) @@ -1030,6 +1144,15 @@ void primitive_inst::update_paddings() { primitive_inst* inst = this; while (inst) { reset_pad(*inst->_impl_params, inst->_node); + if (inst == this) { + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + const size_t scale_idx = _node->is_type() ? 
1 : 2; + reset_pad(*inst->_impl_params, inst->_node, scale_idx); + if (compressed_cache_variable->has_zp_state()) { + reset_pad(*inst->_impl_params, inst->_node, scale_idx + 1); + } + } + } auto& users = inst->_node->get_users(); if (users.size() == 1 && users.front()->get_output_layout(0).data_padding.is_dynamic()) { inst = inst->get_user_insts().front(); @@ -1155,11 +1278,40 @@ void primitive_inst::do_runtime_in_place_kv_cache() { GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_layout's pad : " << present_layout.to_string() << std::endl; auto& variable = get_network().get_variable(desc->variable_info.variable_id); variable.set_layout(present_layout); + + if (desc->compressed) { + auto compressed_cache_variable = dynamic_cast(&variable); + auto& present_scales_layout = _impl_params->output_layouts[2]; + const auto sequence_axis = kv_cache_inst::get_scale_zp_sequence_axis(); + kv_cache_inst::update_pad(present_scales_layout, max_pad - new_seq_len, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_scale_layout's pad : " << present_scales_layout.to_string() << std::endl; + + compressed_cache_variable->get_compression_scale_state()->set_layout(present_scales_layout); + if (desc->get_compression_zp_inputs_num() > 0) { + auto& present_zp_layout = _impl_params->output_layouts[3]; + kv_cache_inst::update_pad(present_zp_layout, max_pad - new_seq_len, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_zp_layout's pad : " << present_scales_layout.to_string() << std::endl; + + compressed_cache_variable->get_compression_zp_state()->set_layout(present_scales_layout); + } + } + GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << "Updated variable with present_layout" << variable.get_layout().to_string() << " is_set = " << variable.is_set() << std::endl; if (past_layout.data_padding._upper_size[sequence_axis] > 0 && variable.is_set()) { kv_cache_inst::update_pad(past_layout, max_pad, sequence_axis); _impl_params->_can_be_optimized = true; + + if (desc->compressed) { + auto& past_scale_layout = _impl_params->input_layouts[3]; + const auto sequence_axis = kv_cache_inst::get_scale_zp_sequence_axis(); + kv_cache_inst::update_pad(past_scale_layout, max_pad, sequence_axis); + + if (desc->get_compression_zp_inputs_num() > 0) { + auto& past_zp_layout = _impl_params->input_layouts[4]; + kv_cache_inst::update_pad(past_zp_layout, max_pad, sequence_axis); + } + } GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << " Updated past layout's pad : " << past_layout.to_string() << std::endl; } } diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index fc9648b90e444c..a9bb7c665f177b 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -90,7 +90,8 @@ void program_node::replace_dependency(size_t idx, std::pair const program_node::get_input_layouts() const { std::vector layouts; for (size_t i = 0; i < dependencies.size(); i++) { - layouts.push_back(get_input_layout(i)); + auto input_layout = get_input_layout(i); + layouts.push_back(input_layout); } return layouts; } diff --git a/src/plugins/intel_gpu/src/graph/read_value.cpp b/src/plugins/intel_gpu/src/graph/read_value.cpp index bf6e730e8a808b..1d6657b9bf8ac4 100644 --- a/src/plugins/intel_gpu/src/graph/read_value.cpp +++ 
b/src/plugins/intel_gpu/src/graph/read_value.cpp @@ -2,8 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "read_value_inst.h" #include "primitive_type_base.h" + +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" + #include #include @@ -16,7 +19,7 @@ read_value_inst::typed_primitive_inst(network& network, const read_value_node& n } layout read_value_inst::calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param) { - return impl_param.typed_desc()->output_layout; + return impl_param.typed_desc()->output_layouts[0]; } std::string read_value_inst::to_string(const read_value_node& node) { @@ -45,5 +48,25 @@ void read_value_inst::update_output_memory() { GPU_DEBUG_TRACE_DETAIL << " - layout " << variable.get_layout().to_string() << std::endl; GPU_DEBUG_TRACE_DETAIL << " - actual_size " << variable.get_actual_mem_size() << " bytes" << std::endl; set_output_memory(variable.get_memory(), false, 0); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + set_output_memory(scales_state->get_memory(), false, 1); + + GPU_DEBUG_TRACE_DETAIL << id() << " Update output memory with variable " << scales_state->get_name() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - ptr : " << scales_state->get_memory()->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - layout " << scales_state->get_layout().to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - actual_size " << scales_state->get_actual_mem_size() << " bytes" << std::endl; + + if (compressed_cache_variable->has_zp_state()) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + set_output_memory(zp_state->get_memory(), false, 2); + + GPU_DEBUG_TRACE_DETAIL << id() << " Update output memory with variable " << zp_state->get_name() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - ptr : " << zp_state->get_memory()->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - layout " << zp_state->get_layout().to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - actual_size " << zp_state->get_actual_mem_size() << " bytes" << std::endl; + } + } } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp index e8e213ad97011a..14aea98822a393 100644 --- a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp @@ -87,6 +87,14 @@ std::string scaled_dot_product_attention_inst::to_string(scaled_dot_product_atte json_composite scaled_dot_product_attention_info; scaled_dot_product_attention_info.add("input id", input.id()); scaled_dot_product_attention_info.add("is_causal", desc->is_causal); + scaled_dot_product_attention_info.add("is_kv_compressed", desc->is_kv_compressed); + scaled_dot_product_attention_info.add("combine_scales_and_zp", desc->combine_scales_and_zp); + scaled_dot_product_attention_info.add("group_size", desc->quantization_config.group_sizes); + scaled_dot_product_attention_info.add("is_asymmetric_quantization", desc->quantization_config.is_asymmetric_quantization()); + scaled_dot_product_attention_info.add("quantization_dt", desc->quantization_config.quantization_dt); + scaled_dot_product_attention_info.add("scale_dt", desc->quantization_config.scale_dt); + scaled_dot_product_attention_info.add("zp_dt", desc->quantization_config.zp_dt); + 
scaled_dot_product_attention_info.add("indirect_axis", desc->indirect_axis); scaled_dot_product_attention_info.add("has_attn_mask_input", desc->has_attn_mask_input); scaled_dot_product_attention_info.add("has_scale_input", desc->has_scale_input); scaled_dot_product_attention_info.add("input_q_transpose_order", desc->input_q_transpose_order); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl new file mode 100644 index 00000000000000..b5ecfd6dfc9c14 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl @@ -0,0 +1,129 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/common.cl" +#include "include/batch_headers/sub_group_block_read.cl" +#include "include/batch_headers/sub_group_block_write.cl" +#include "include/batch_headers/sub_group_shuffle.cl" + + +#if OUTPUT_DIMS != 4 +#error "dynamic_quantize_gpu_opt.cl: Unsupported output dimension" +#endif + +#define VLOAD_N CAT(vload, VEC_SIZE) +#define VSTORE_N CAT(vstore, VEC_SIZE) +#define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE) +#define AS_TYPE_N_(type, n, x) as_##type##n(x) +#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) +#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x) + + +inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { + return OUTPUT1_GET_INDEX(b, f, y, x); +} + +inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { +#ifdef SCALES_OUTPUT_ORDER + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR SCALES_OUTPUT_ORDER); +#else + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#endif +} + +#define SUBGROUP_SIZE 16 +#define INNERMOST_DIM_VALUE INPUT0_SIZE_X +#define INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, 1, ptr, offset) +#define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, 1, ptr, offset, val) + +__attribute__((reqd_work_group_size(SUBGROUP_SIZE, SUBGROUPS_NUMBER, 1))) +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(dynamic_quantize_gpu_kv_cache)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global OUTPUT1_TYPE* output_scale +#if ASYMMETRIC_QUANTIZATION && !GROUP_SCALES_WITH_ZP + , __global OUTPUT2_TYPE* output_zp +#endif +#ifdef APPEND_MODE + , const uint axis_offset +#endif + ) +{ + const uint sglid = get_sub_group_local_id(); + const uint grouped_indexes = get_global_id(1); + const uint batch_indexes = get_global_id(2); + + DECLARE_BATCHED_DIMS_INDEXES(batch_indexes); + DECLARE_GROUPED_DIMS_INDEXES(grouped_indexes); + + // int8 arg = (0, 0, 1, 1, 2, 2, 3, 3); + // int acc = intel_sub_group_i8_i8_matrix_mad_k32(as_int(sglid), arg, as_int(sglid)); + // int intel_sub_group_i8_i8_matrix_mad_k32(int a, int8 b, int acc); + + // the innermost dimension is always handled in the loop inside the kernel + const uint x = 0; + + half max_value = INPUT0_VAL_MIN; + half min_value = INPUT0_VAL_MAX; + + // if (acc == 787) { + // max_value = INPUT0_VAL_MIN + 1; + // } + + half val[INNERMOST_DIM_VALUE / SUBGROUP_SIZE]; + + const uint input_offset = INPUT0_GET_INDEX(b, f, y, x); + unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) { + val[i] = 
INPUT_BLOCK_READ(input, input_offset + i * SUBGROUP_SIZE);
+#if ASYMMETRIC_QUANTIZATION
+        max_value = fmax(max_value, val[i]);
+        min_value = fmin(min_value, val[i]);
+#else
+        max_value = fmax(max_value, fabs(val[i]));
+#endif
+    }
+
+#if ASYMMETRIC_QUANTIZATION
+    min_value = work_group_reduce_min(min_value);
+    max_value = work_group_reduce_max(max_value);
+    OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value));
+    OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_value * scale) - CHAR_MAX;
+#else
+    max_value = work_group_reduce_max(max_value);
+    OUTPUT1_TYPE scale = 127.0h / max_value;
+#endif
+
+#ifdef APPEND_MODE
+    APPEND_AXIS_NAME += axis_offset;
+#endif
+
+    const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+    unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) {
+#if ASYMMETRIC_QUANTIZATION
+        OUTPUT_TYPE res = convert_char(val[i] * scale + zp);
+#else
+        OUTPUT_TYPE res = convert_char(val[i] * scale);
+#endif
+        OUTPUT_BLOCK_WRITE(output, output_offset + i * SUBGROUP_SIZE, res);
+    }
+
+    const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x);
+
+    if (grouped_indexes == 0 && sglid == 0) {
+#if ASYMMETRIC_QUANTIZATION
+        output_scale[scale_idx] = 1.0h / scale;
+#if GROUP_SCALES_WITH_ZP
+        output_scale[scale_idx + 1] = zp;
+#else
+        output_zp[scale_idx] = zp;
+#endif
+#else
+        output_scale[scale_idx] = 1.0h / scale;
+#endif
+    }
+}
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl
new file mode 100644
index 00000000000000..9d4965be101d69
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl
@@ -0,0 +1,130 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "include/batch_headers/fetch_data.cl"
+#include "include/batch_headers/common.cl"
+#include "include/batch_headers/sub_group_block_read.cl"
+#include "include/batch_headers/sub_group_block_write.cl"
+#include "include/batch_headers/sub_group_shuffle.cl"
+
+
+#if OUTPUT_DIMS != 4
+#error "dynamic_quantize_gpu_opt_generic.cl: Unsupported output dimension"
+#endif
+
+#define VLOAD_N CAT(vload, VEC_SIZE)
+#define VSTORE_N CAT(vstore, VEC_SIZE)
+#define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE)
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x)
+
+
+inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) {
+    return OUTPUT1_GET_INDEX(b, f, y, x);
+}
+
+inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) {
+#ifdef SCALES_OUTPUT_ORDER
+    return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR SCALES_OUTPUT_ORDER);
+#else
+    return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x);
+#endif
+}
+
+#define SUBGROUP_SIZE 16
+#define INNERMOST_DIM_VALUE INPUT0_SIZE_X
+#define INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, 1, ptr, offset)
+#define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, 1, ptr, offset, val)
+
+__attribute__((reqd_work_group_size(SUBGROUP_SIZE, SUBGROUPS_NUMBER, 1)))
+REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE)
+KERNEL(dynamic_quantize_gpu_opt_generic)(
+    OPTIONAL_SHAPE_INFO_ARG
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output,
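+    // Quantization parameters produced per group by this kernel:
+    //   symmetric:  scale = 127 / max(|x|),                      q = convert_char(x * scale)
+    //   asymmetric: scale = (CHAR_MAX - CHAR_MIN) / (max - min), zp = -min * scale - CHAR_MAX
+    // The buffers below receive the de-quantization scale (1 / scale) and, for asymmetric mode,
+    // the zero point: interleaved into output_scale when GROUP_SCALES_WITH_ZP is defined,
+    // otherwise written to the dedicated output_zp buffer.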
+    __global OUTPUT1_TYPE* output_scale
+#if ASYMMETRIC_QUANTIZATION && !GROUP_SCALES_WITH_ZP
+    , __global OUTPUT2_TYPE* output_zp
+#endif
+#ifdef APPEND_MODE
+    , const uint axis_offset
+#endif
+    )
+{
+    const uint sglid = get_sub_group_local_id();
+    const uint grouped_indexes = get_global_id(1);
+    const uint batch_indexes = get_global_id(2);
+
+    DECLARE_BATCHED_DIMS_INDEXES(batch_indexes);
+    DECLARE_GROUPED_DIMS_INDEXES(grouped_indexes);
+
+    // the innermost dimension is always handled in the loop inside the kernel
+    const uint x = 0;
+
+    half max_value = INPUT0_VAL_MIN;
+    half min_value = INPUT0_VAL_MAX;
+
+    half val[INNERMOST_DIM_VALUE / SUBGROUP_SIZE];
+
+    const uint input_offset = INPUT0_GET_INDEX(b, f, y, x);
+    unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) {
+        val[i] = INPUT_BLOCK_READ(input, input_offset + i * SUBGROUP_SIZE);
+#if ASYMMETRIC_QUANTIZATION
+        max_value = fmax(max_value, val[i]);
+        min_value = fmin(min_value, val[i]);
+#else
+        max_value = fmax(max_value, fabs(val[i]));
+#endif
+    }
+
+#if ASYMMETRIC_QUANTIZATION
+    min_value = work_group_reduce_min(min_value);
+    max_value = work_group_reduce_max(max_value);
+    OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value));
+    OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_value * scale) - CHAR_MAX;
+#else
+    max_value = work_group_reduce_max(max_value);
+    OUTPUT1_TYPE scale = 127.0h / max_value;
+#endif
+
+#ifdef APPEND_MODE
+    APPEND_AXIS_NAME += axis_offset;
+#endif
+
+    const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+    unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) {
+#if ASYMMETRIC_QUANTIZATION
+        OUTPUT_TYPE res = convert_char(val[i] * scale + zp);
+#else
+        OUTPUT_TYPE res = convert_char(val[i] * scale);
+#endif
+        OUTPUT_BLOCK_WRITE(output, output_offset + i * SUBGROUP_SIZE, res);
+    }
+
+    const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x);
+
+    if (grouped_indexes == 0 && sglid == 0) {
+        output_scale[scale_idx] = 1.0h / scale;
+#if ASYMMETRIC_QUANTIZATION && GROUP_SCALES_WITH_ZP
+        output_scale[scale_idx + 1] = zp;
+#elif ASYMMETRIC_QUANTIZATION
+        output_zp[scale_idx] = zp;
+#endif
+    }
+}
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl
index 436276a67e48c0..e7e2f803105760 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl
@@ -8,6 +8,18 @@
 #error "dynamic_quantize_gpu_ref.cl: Unsupported output dimension"
 #endif
 
+inline uint
FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { + return OUTPUT1_GET_INDEX(b, f, y, x); +} + +inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { +#ifdef SCALES_OUTPUT_ORDER + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR SCALES_OUTPUT_ORDER); +#else + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#endif +} + KERNEL(dynamic_quantize_gpu_ref)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, @@ -15,14 +27,27 @@ KERNEL(dynamic_quantize_gpu_ref)( __global OUTPUT1_TYPE* output_scale) { const uint bf = (uint)get_global_id(0); - const uint b = (uint)get_global_id(0) / INPUT0_FEATURE_NUM; - const uint f = (uint)get_global_id(0) % INPUT0_FEATURE_NUM; + const uint b = bf / INPUT0_FEATURE_NUM; + const uint f = bf % INPUT0_FEATURE_NUM; const uint y = (uint)get_global_id(1); - const uint scale_idx = OUTPUT1_GET_INDEX(b, f, y, 0); + const uint x = (uint)get_global_id(2); +#ifdef SCALES_OUTPUT_ORDER + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#else + const uint scale_idx = OUTPUT1_GET_INDEX_SAFE(b, f, y, x); +#endif half max_val = 0.0001h; - for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { - const uint offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); + for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { + for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 1 : INPUT0_SIZE_Y); y_off++) { +#if GROUP_SIZE_DIM3 == 1 + const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + half val = input[offset]; + half abs_val = fabs(val); + max_val = fmax(max_val, abs_val); +#else + const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + offset + x * 8)); @@ -34,12 +59,25 @@ KERNEL(dynamic_quantize_gpu_ref)( x *= 8; for (; x < INPUT0_SIZE_X; x++) max_val = fmax(max_val, fabs(input[offset + x])); +#endif + } + } } half scale = 127.0h / max_val; - for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { - const uint in_offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); - const uint out_offset = OUTPUT_GET_INDEX(b, f, y + y_off, 0); + for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { + for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 
1 : INPUT0_SIZE_Y); y_off++) { +#if GROUP_SIZE_DIM3 == 1 + const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + + half val = input[in_offset]; + val *= scale; + output[out_offset] = convert_char(val); +#else + const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); + const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + in_offset + x * 8)); @@ -49,6 +87,9 @@ KERNEL(dynamic_quantize_gpu_ref)( x *= 8; for (; x < INPUT0_SIZE_X; x++) output[out_offset + x] = convert_char(input[in_offset + x] * scale); +#endif + } + } } output_scale[scale_idx] = 1.0h / scale; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 948bd3c0f1a305..6f39487e3014be 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -118,12 +118,21 @@ inline uint FUNC(get_bt_index_value)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uin #define VALUE_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT2_TYPE, 1, ptr, offset) #define SUBGROUPS_PER_WG (HEAD_SIZE * SG_SCALE_FACTOR / SUBGROUP_SIZE) +#if IS_KV_COMPRESSED +#if COMPRESSED_PER_HEAD + #define GET_COMPRESSION_INDEX(INPUT, b, f, y, x) GET_DATA_INDEX(INPUT, (b), (f), (y), (0)); +#else + #define GET_COMPRESSION_INDEX(INPUT, b, f, y, x) GET_DATA_INDEX(INPUT, (b), (0), (y), (0)); +#endif +#endif + #ifdef SDPA_STAGE_0 #if TARGET_SEQ_LEN_BLOCK_SIZE == 1 /* This version is used for 2nd token */ REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +__attribute__((reqd_work_group_size(1, 1, HEAD_SIZE * SG_SCALE_FACTOR))) KERNEL(sdpa_opt)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* query_input, @@ -136,6 +145,10 @@ KERNEL(sdpa_opt)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -149,7 +162,18 @@ KERNEL(sdpa_opt)( const uint b1_idx = batch_idx % NUM_HEADS; /* HEADS_NUM dim */ const uint target_seq_idx = get_global_id(1); const uint lid = get_local_id(2); + +#if SG_SCALE_FACTOR == 2 + const uint head_size_idx = lid % HEAD_SIZE; +#elif SG_SCALE_FACTOR == 1 const uint head_size_idx = lid; +#else + #error "sdpa_opt.cl: Unsupported scale factor" +#endif + +#if SUBGROUPS_PER_WG > SUBGROUP_SIZE + #error "sdpa_opt.cl: Number of subgroups per work group should be less than subgroup_size +#endif const uint sgid = get_sub_group_id(); const uint sglid = get_sub_group_local_id(); @@ -199,13 +223,19 @@ KERNEL(sdpa_opt)( uint query_offset = INPUT0_GET_INDEX(b0_idx, b1_idx, target_seq_idx, (sgid * SUBGROUP_SIZE)); const uint query_pitch = QUERY_STEP_LOCAL; #endif - for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { - #define QUERY_BLOCK_SIZE 1 +#if SG_SCALE_FACTOR == 2 + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { +#else + { +#endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + #define QUERY_BLOCK_SIZE 1 - INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); - query_local[query_local_offset] = val * scale_val; - query_local_offset += QUERY_STEP_LOCAL; - query_offset += query_pitch; + INPUT0_TYPE 
val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); + query_local[query_local_offset] = val * scale_val; + query_local_offset += QUERY_STEP_LOCAL; + query_offset += query_pitch; + } } #undef QUERY_BLOCK_SIZE #undef QUERY_STEP @@ -216,28 +246,45 @@ KERNEL(sdpa_opt)( // Main Gemm1 calculation loop // Each SG performs element-wise multiplications of Q[HEAD_SIZE]xK[HEAD_SIZE] values // HEAD_SIZE / SUBGROUPS_PER_WG times in the loop and saves the result to the qk_local SLM buffer - for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE / SUBGROUP_SIZE)) { -#ifdef INPUT1_DIMS_ORDER + for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE / SUBGROUP_SIZE) * SG_SCALE_FACTOR) { #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_key)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0)]; #else const uint b_idx = b0_idx; #endif - const uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); + +#ifdef INPUT1_DIMS_ORDER + uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); #else - const uint key_offset = INPUT1_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, 0); + uint key_offset = INPUT1_GET_INDEX(b_idx, b1_idx, start_partition_idx + seq_len, 0); #endif INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); + KEY_COMPRESSION_SCALE_TYPE comp_scale = key_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE comp_zp = key_scale[comp_offset + 1]; +#endif +#endif uint head_idx_index = 0; #define KEY_BLOCK_SIZE 8 for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) - KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK key_vec_packed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed) - comp_zp) * comp_scale; +#elif IS_KV_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed)) * comp_scale; +#else + KEY_BLOCK key_vals = key_vec_packed; +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -258,9 +305,18 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, 
KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) - KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK key_vec_packed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed) - comp_zp) * comp_scale; +#elif IS_KV_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed)) * comp_scale; +#else + KEY_BLOCK key_vals = key_vec_packed; +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -281,9 +337,18 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) - KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK key_vec_packed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed) - comp_zp) * comp_scale; +#elif IS_KV_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed)) * comp_scale; +#else + KEY_BLOCK key_vals = key_vec_packed; +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -304,9 +369,18 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) - KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK key_vec_packed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed) - comp_zp) * comp_scale; +#elif IS_KV_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vec_packed)) * comp_scale; +#else + KEY_BLOCK key_vals = key_vec_packed; +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -335,7 +409,7 @@ KERNEL(sdpa_opt)( const uint seq_idx_end = 1; for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { // Iterate over all values QK values in SLM and apply scale and attention mask - for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE)) { + for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE * SG_SCALE_FACTOR)) { // Read 
value from SLM and apply scale qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; @@ -388,7 +462,7 @@ KERNEL(sdpa_opt)( SOFTMAX_ACCUMULATOR_TYPE exp_sum[TARGET_SEQ_LEN_BLOCK_SIZE] = {SOFTMAX_ACCUMULATOR_VAL_ZERO}; const uint qk_num_per_wi = CEIL_DIV(partition_seq_len, SUBGROUPS_PER_WG * SUBGROUP_SIZE); for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) { - const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + head_size_idx; + const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + lid; if (local_data_idx < partition_seq_len) { for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { SOFTMAX_ACCUMULATOR_TYPE qk_new = native_exp(TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) - qk_max[seq_idx]); @@ -420,7 +494,7 @@ KERNEL(sdpa_opt)( // const SOFTMAX_ACCUMULATOR_TYPE inv_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ONE / exp_sum[seq_idx]; for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) { - const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + sgid * SUBGROUP_SIZE + sglid; + const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + lid; if (local_data_idx < partition_seq_len) { for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { SOFTMAX_ACCUMULATOR_TYPE qk_new = TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) / exp_sum[seq_idx]; @@ -434,7 +508,7 @@ KERNEL(sdpa_opt)( { // If the number of partitions is greater than 1, save exm_sums and max_logits to the temporary buffers // Use single WI in the WG, since all the WIs have the same value - if (num_of_partitions > 1 && head_size_idx == 0) { + if (num_of_partitions > 1 && lid == 0) { for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) + b1_idx * (TARGET_SEQ_LEN * num_of_partitions) + @@ -463,15 +537,32 @@ KERNEL(sdpa_opt)( #endif #endif - for (uint seq_len = 0; seq_len < partition_seq_len / SUBGROUP_SIZE; seq_len++) { +#if SG_SCALE_FACTOR > 1 + const uint seq_len_start = (sgid / (HEAD_SIZE / SUBGROUP_SIZE)) * (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR / SUBGROUP_SIZE); + const uint seq_len_end = min(seq_len_start + (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR / SUBGROUP_SIZE), partition_seq_len / SUBGROUP_SIZE); +#else + const uint seq_len_start = 0; + const uint seq_len_end = partition_seq_len / SUBGROUP_SIZE; +#endif + + for (uint seq_len = seq_len_start; seq_len < seq_len_end; seq_len++) { #ifdef BEAM_TABLE_TYPE - uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE)]; + const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE)]; uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE); #else + const uint b_idx = b0_idx; #ifdef INPUT2_DIMS_ORDER - uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); #else - uint value_offset = 
INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); + uint value_offset = INPUT2_GET_INDEX(b_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); +#endif +#endif + +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; #endif #endif @@ -482,9 +573,17 @@ KERNEL(sdpa_opt)( unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, value_offset); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed - sub_group_broadcast(comp_zp, i)) * sub_group_broadcast(comp_scale, i); +#elif IS_KV_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed * sub_group_broadcast(comp_scale, i)); +#else + INPUT2_TYPE value_val = value_packed; #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); @@ -496,17 +595,30 @@ KERNEL(sdpa_opt)( } } - const uint seq_len_leftovers_start = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; - for (uint seq_len = seq_len_leftovers_start; seq_len < partition_seq_len; seq_len++) { -#ifdef INPUT2_DIMS_ORDER + +#if SG_SCALE_FACTOR > 1 + if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) { +#endif + + for (uint seq_len = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; seq_len < partition_seq_len; seq_len++) { #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx)]; #else const uint b_idx = b0_idx; #endif + +#ifdef INPUT2_DIMS_ORDER const uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx); #else - const uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, head_size_idx); + const uint value_offset = INPUT2_GET_INDEX(b_idx, b1_idx, start_partition_idx + seq_len, head_size_idx); +#endif + +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; +#endif #endif OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; @@ -514,15 +626,42 @@ KERNEL(sdpa_opt)( qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; } - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, value_offset); +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + const VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed - comp_zp) * comp_scale; +#elif IS_KV_COMPRESSED + const 
VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed * comp_scale); +#else + const INPUT2_TYPE value_val = value_packed; +#endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc[seq_idx] = mad(qk_val[seq_idx], value_val, acc[seq_idx]); } } +#if SG_SCALE_FACTOR > 1 + } // if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) +#endif + +#if SG_SCALE_FACTOR > 1 + if ((partition_seq_len > (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR)) || (partition_seq_len % SUBGROUP_SIZE != 0)) { + if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) { + // Reuse query_local SLM to sum-up results between two groups of subgroups + query_local[head_size_idx] = acc[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { + acc[0] += query_local[head_size_idx]; + } + } +#endif + // If the number of partitions is greater than 1, save results to the temporary buffer; // otherwise, save results directly to the main output. +#if SG_SCALE_FACTOR > 1 + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { +#endif if (num_of_partitions > 1) { const uint seq_idx_end = 1; for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { @@ -542,6 +681,9 @@ KERNEL(sdpa_opt)( output[output_offset] = acc[seq_idx]; } } +#if SG_SCALE_FACTOR > 1 + } // if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { +#endif } // Gemm2 calculation end } @@ -582,6 +724,12 @@ KERNEL(sdpa_opt)( #define ATTN_SCALE_BUFFER_ARG #endif +// Applying scales to query input improves the accuracy, but leads to performance drop for FP16 KV-cache case, +// so use it only for compressed version +#if IS_KV_COMPRESSED +#define APPLY_SCALES_TO_QUERY 1 +#endif + #define MASK_VECTOR_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) inline MASK_VECTOR_TYPE FUNC(load_attn_mask)(OPTIONAL_SHAPE_INFO_ARG @@ -675,6 +823,10 @@ KERNEL(sdpa_opt)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -743,12 +895,22 @@ KERNEL(sdpa_opt)( #endif uint query_local_offset = head_size_idx * TARGET_SEQ_LEN_BLOCK_SIZE; +#if APPLY_SCALES_TO_QUERY +#if HAS_SCALE_INPUT + const INPUT0_TYPE scale_val = *scale; +#else + const INPUT0_TYPE scale_val = TO_INPUT0_TYPE(STATIC_SCALE_VALUE); +#endif +#else + const INPUT0_TYPE scale_val = INPUT0_VAL_ONE; +#endif + if (cur_target_seq_len_size != TARGET_SEQ_LEN_BLOCK_SIZE) { if (sgid * SUBGROUP_SIZE < HEAD_SIZE) { for (uint seq_idx = 0; seq_idx < cur_target_seq_len_size; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -759,7 +921,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -769,7 +931,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -780,7 
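In the single-token Gemm2 path above, when SG_SCALE_FACTOR > 1 a sequence partition is split between two groups of subgroups and their partial accumulators are summed through the reused query_local SLM buffer. A rough host-side model of that split-and-reduce, assuming a factor of 2; it only illustrates the arithmetic, not the actual work-item mapping:

    #include <algorithm>
    #include <cstddef>
    #include <numeric>
    #include <vector>

    // Model of splitting one partition of the sequence between `scale_factor`
    // groups of work-items and reducing their partial sums, as the
    // SG_SCALE_FACTOR > 1 path does through shared local memory.
    float accumulate_split(const std::vector<float>& contributions, size_t scale_factor = 2) {
        const size_t chunk = (contributions.size() + scale_factor - 1) / scale_factor;
        float total = 0.0f;
        for (size_t group = 0; group < scale_factor; ++group) {
            const size_t begin = group * chunk;
            const size_t end = std::min(begin + chunk, contributions.size());
            // Each group accumulates its own sub-range...
            float partial = std::accumulate(contributions.begin() + begin,
                                            contributions.begin() + end, 0.0f);
            // ...and the partials are then summed (the kernel does this via SLM).
            total += partial;
        }
        return total;
    }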
+942,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -788,7 +950,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -833,6 +995,7 @@ KERNEL(sdpa_opt)( const uint b_idx = beam_table[FUNC_CALL(get_bt_index_key)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, seq_len + sglid, 0)]; const uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, seq_len + sglid, 0); #else + const uint b_idx = b0_idx; #ifdef INPUT1_DIMS_ORDER uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, seq_len, 0); uint key_offset_next_seq = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, seq_len + 1, 0); @@ -862,10 +1025,17 @@ KERNEL(sdpa_opt)( PA_BUFFERS); if (seq_len_calc_size >= SUBGROUP_SIZE) { +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); + KEY_COMPRESSION_SCALE_TYPE comp_scale = key_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE comp_zp = key_scale[comp_offset + 1]; +#endif +#endif __attribute__((opencl_unroll_hint(1))) for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); - #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) QUERY_VEC queries_vec; uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; @@ -876,9 +1046,17 @@ KERNEL(sdpa_opt)( unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { #ifdef BEAM_TABLE_TYPE - INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); + const INPUT1_TYPE key_packed = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); #else - INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); + const INPUT1_TYPE key_packed = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE key_vals = (TO_KEY_COMPRESSION_SCALE_TYPE(key_packed) - sub_group_broadcast(comp_zp, key_row_idx)) * sub_group_broadcast(comp_scale, key_row_idx); +#elif IS_KV_COMPRESSED + KEY_COMPRESSION_SCALE_TYPE key_vals = (TO_KEY_COMPRESSION_SCALE_TYPE(key_packed) * sub_group_broadcast(comp_scale, key_row_idx)); +#else + INPUT1_TYPE key_vals = key_packed; #endif unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { @@ -887,12 +1065,29 @@ KERNEL(sdpa_opt)( } } } else if (seq_len_calc_size > 0) { +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, seq_len + min(sglid, (uint)seq_len_calc_size - 1), 0); + // const 
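The APPLY_SCALES_TO_QUERY path above multiplies the softmax scale into the query values once, while they are staged into SLM, instead of scaling every accumulated QK value later; the two orderings are mathematically equivalent. A minimal C++ check of that equivalence (names are illustrative):

    #include <cassert>
    #include <cmath>
    #include <vector>

    // Scaling Q before the dot product gives the same result as scaling the
    // accumulated QK value afterwards: (s*Q) . K == s * (Q . K).
    float scaled_dot(const std::vector<float>& q, const std::vector<float>& k, float scale) {
        float pre = 0.0f, post = 0.0f;
        for (size_t i = 0; i < q.size(); ++i) {
            pre += (q[i] * scale) * k[i];  // scale folded into the query up front
            post += q[i] * k[i];           // scale applied to the result instead
        }
        post *= scale;
        assert(std::fabs(pre - post) <= 1e-4f * std::fmax(1.0f, std::fabs(post)));  // equal up to rounding
        return pre;
    }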
uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); + KEY_COMPRESSION_SCALE_TYPE comp_scale = key_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE comp_zp = key_scale[comp_offset + 1]; +#endif +#endif __attribute__((opencl_unroll_hint(1))) for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { - #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); - #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset) + #define QUERY_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#if IS_KV_COMPRESSED + #define KEY_UNPACKED_TYPE KEY_COMPRESSION_SCALE_TYPE + #define KEY_UNPACKED_VEC_TYPE MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define TO_KEY_UNPACKED_TYPE(val) TO_KEY_COMPRESSION_SCALE_TYPE(val) +#else + #define KEY_UNPACKED_TYPE INPUT1_TYPE + #define KEY_UNPACKED_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define TO_KEY_UNPACKED_TYPE(val) TO_INPUT1_TYPE(val) +#endif - QUERY_VEC queries_vec; + QUERY_VEC_TYPE queries_vec; uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; unroll_for (uint q_row_idx = 0; q_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; q_row_idx++) { queries_vec[q_row_idx] = slm_query[query_local_offset]; @@ -900,27 +1095,37 @@ KERNEL(sdpa_opt)( } #ifndef LOAD_KEY_LEFTOVERS_IN_CALC_LOOP - QUERY_VEC key_vec = 0; + KEY_UNPACKED_VEC_TYPE key_vec = 0; unroll_for (uint key_row_idx = 0; key_row_idx < seq_len_calc_size; key_row_idx++) { - #ifdef BEAM_TABLE_TYPE - key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); - #else - key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); - #endif +#ifdef BEAM_TABLE_TYPE + key_vec[key_row_idx] = TO_KEY_UNPACKED_TYPE(KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index)); +#else + key_vec[key_row_idx] = TO_KEY_UNPACKED_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + key_vec[key_row_idx] = (key_vec[key_row_idx] - sub_group_broadcast(comp_zp, key_row_idx)) * sub_group_broadcast(comp_scale, key_row_idx); +#elif IS_KV_COMPRESSED + key_vec[key_row_idx] *= sub_group_broadcast(comp_scale, key_row_idx); +#endif } #endif unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { #ifdef LOAD_KEY_LEFTOVERS_IN_CALC_LOOP - #ifdef BEAM_TABLE_TYPE - INPUT1_TYPE key_vals = 0; - if (key_row_idx < seq_len_calc_size) - key_vals = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); - #else - INPUT1_TYPE key_vals = 0; - if (key_row_idx < seq_len_calc_size) - key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); - #endif + KEY_UNPACKED_TYPE key_vals = 0; + if (key_row_idx < seq_len_calc_size) { +#ifdef BEAM_TABLE_TYPE + key_vals = TO_KEY_UNPACKED_TYPE(KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index)); +#else + key_vals = TO_KEY_UNPACKED_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); +#endif + } +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + key_vals = (key_vals - 
sub_group_broadcast(comp_zp, key_row_idx)) * sub_group_broadcast(comp_scale, key_row_idx); +#elif IS_KV_COMPRESSED + key_vals *= sub_group_broadcast(comp_scale, key_row_idx); +#endif #else #define key_vals key_vec[key_row_idx] #endif @@ -933,12 +1138,14 @@ KERNEL(sdpa_opt)( { unroll_for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { +#if !APPLY_SCALES_TO_QUERY #if HAS_SCALE_INPUT const OUTPUT_TYPE scale_val = *scale; #else const OUTPUT_TYPE scale_val = TO_OUTPUT_TYPE(STATIC_SCALE_VALUE); #endif qk_acc[i] *= scale_val; +#endif #ifdef HAS_ALIBI const int alibi_val = (1 - SOURCE_SEQ_LEN) + seq_len + i; @@ -1037,6 +1244,7 @@ KERNEL(sdpa_opt)( const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len) + sglid, sgid * SUBGROUP_SIZE)]; const uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + (seq_len) + sglid, sgid * SUBGROUP_SIZE); #else + const uint b_idx = b0_idx; #ifdef INPUT2_DIMS_ORDER uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len), head_size_idx); #else @@ -1050,12 +1258,28 @@ KERNEL(sdpa_opt)( qk_val[seq_idx] = slm_qk_vals[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len + sglid]; } +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len + sglid, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; +#endif +#endif unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); +#else + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, value_offset); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed - sub_group_broadcast(comp_zp, i)) * sub_group_broadcast(comp_scale, i); +#elif IS_KV_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed * sub_group_broadcast(comp_scale, i)); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + INPUT2_TYPE value_val = value_packed; #endif + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); } @@ -1085,12 +1309,21 @@ KERNEL(sdpa_opt)( const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE)]; uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE); #else + const uint b_idx = b0_idx; #ifdef INPUT2_DIMS_ORDER uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); #else uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx); #endif #endif +#endif + +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, 
b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; +#endif #endif MAKE_VECTOR_TYPE(OUTPUT_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) qk_val; @@ -1100,9 +1333,17 @@ KERNEL(sdpa_opt)( unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); +#else + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, value_offset); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed - sub_group_broadcast(comp_zp, i)) * sub_group_broadcast(comp_scale, i); +#elif IS_KV_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed * sub_group_broadcast(comp_scale, i)); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + INPUT2_TYPE value_val = value_packed; #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); @@ -1136,19 +1377,37 @@ KERNEL(sdpa_opt)( const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, sgid * SUBGROUP_SIZE)]; const uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, sgid * SUBGROUP_SIZE); #else + const uint b_idx = b0_idx; #ifdef INPUT2_DIMS_ORDER uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len_leftovers_start, head_size_idx); #else uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len_leftovers_start, head_size_idx); #endif #endif +#endif + +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + min(seq_len_leftovers_start + sglid, seq_len_end - 1), 0); + // const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len_leftovers_start + sglid, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; +#endif #endif for (uint seq_len_idx = 0; seq_len_idx < partition_seq_len - seq_len_leftovers_start; seq_len_idx++) { #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, seq_len_idx)); + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, seq_len_idx)); +#else + const INPUT2_TYPE value_packed = VALUE_BLOCK_READ(value_input, value_offset); +#endif + +#if IS_KV_COMPRESSED && USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed - sub_group_broadcast(comp_zp, seq_len_idx)) * sub_group_broadcast(comp_scale, seq_len_idx); +#elif IS_KV_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_packed * sub_group_broadcast(comp_scale, seq_len_idx)); #else - INPUT2_TYPE value_val = 
VALUE_BLOCK_READ(value_input, value_offset); + INPUT2_TYPE value_val = value_packed; #endif for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl index 83e3c7c7e9fef1..682af11777012f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl @@ -112,6 +112,15 @@ inline uint FUNC(get_bt_index_value)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uin #endif #define APPLY_SCALE_TO_QUERY 1 +#define HAS_KV_CACHE_ZP_INPUT USE_ASYMMETRIC_QUANTIZATION && !COMBINE_SCALES_AND_ZP + +#if IS_KV_COMPRESSED +#if COMPRESSED_PER_HEAD + #define GET_COMPRESSION_INDEX(INPUT, b, f, y, x) GET_DATA_INDEX(INPUT, (b), (f), (y), (0)); +#else + #define GET_COMPRESSION_INDEX(INPUT, b, f, y, x) GET_DATA_INDEX(INPUT, (b), (0), (y), (0)); +#endif +#endif KERNEL(sdpa_ref)( OPTIONAL_SHAPE_INFO_ARG @@ -125,6 +134,14 @@ KERNEL(sdpa_ref)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#if HAS_KV_CACHE_ZP_INPUT + const __global KEY_COMPRESSION_ZP_TYPE* key_zp, + const __global VALUE_COMPRESSION_ZP_TYPE* val_zp, +#endif +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -162,7 +179,24 @@ KERNEL(sdpa_ref)( #else INPUT0_TYPE q_val = query_input[query_offset]; #endif - INPUT1_TYPE k_val = key_input[key_offset]; + + INPUT1_TYPE k_val_packed = key_input[key_offset]; +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1 / BROADCAST_GROUP_SIZE, s, 0); + KEY_COMPRESSION_SCALE_TYPE comp_scale = key_scale[comp_offset]; + +#if USE_ASYMMETRIC_QUANTIZATION && HAS_KV_CACHE_ZP_INPUT + KEY_COMPRESSION_SCALE_TYPE comp_zp = key_zp[comp_offset]; +#elif USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = key_scale[comp_offset + 1]; +#else + KEY_COMPRESSION_SCALE_TYPE comp_zp = 0; +#endif + KEY_COMPRESSION_SCALE_TYPE k_val = ((k_val_packed - comp_zp) * comp_scale); + +#else + INPUT1_TYPE k_val = k_val_packed; +#endif acc += q_val * k_val; } @@ -236,7 +270,24 @@ KERNEL(sdpa_ref)( #endif uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1, 0, 0, s, head_size_idx); - acc += tmp_buf[tmp_buf_offset] * value_input[value_offset]; + const INPUT2_TYPE value_packed = value_input[value_offset]; +#if IS_KV_COMPRESSED + const uint comp_offset = GET_COMPRESSION_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1 / BROADCAST_GROUP_SIZE, s, 0); + VALUE_COMPRESSION_SCALE_TYPE comp_scale = val_scale[comp_offset]; + +#if USE_ASYMMETRIC_QUANTIZATION && HAS_KV_CACHE_ZP_INPUT + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_zp[comp_offset]; +#elif USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE comp_zp = val_scale[comp_offset + 1]; +#else + VALUE_COMPRESSION_SCALE_TYPE comp_zp = 0; +#endif + VALUE_COMPRESSION_SCALE_TYPE value = ((value_packed - comp_zp) * comp_scale); +#else + INPUT2_TYPE value = value_packed; +#endif + + acc += tmp_buf[tmp_buf_offset] * value; } uint output_offset = OUTPUT_GET_INDEX(b0, b1, target_seq_idx, head_size_idx); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_kv_cache.h 
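The sdpa_ref changes above read the compression parameters in one of two layouts: zero points either arrive as the separate key_zp / val_zp inputs (HAS_KV_CACHE_ZP_INPUT), or, when COMBINE_SCALES_AND_ZP is set, sit next to the scale in the same buffer, which is why the kernel reads them at comp_offset + 1. A small C++ sketch of both access patterns, assuming a [token, 2] layout for the combined case; the struct and function names are illustrative:

    #include <cstddef>
    #include <vector>

    struct ScaleZp {
        float scale;
        float zp;
    };

    // Combined layout: scale and zero point are adjacent per token, so the
    // zero point lives at comp_offset + 1 (as in the kernel's combined mode).
    ScaleZp read_combined(const std::vector<float>& comp, size_t token_idx) {
        const size_t comp_offset = token_idx * 2;
        return {comp[comp_offset], comp[comp_offset + 1]};
    }

    // Separate layout: scales and zero points come as two independent tensors
    // indexed by the same per-token offset.
    ScaleZp read_separate(const std::vector<float>& scales,
                          const std::vector<float>& zps,
                          size_t token_idx) {
        return {scales[token_idx], zps[token_idx]};
    }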
b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_kv_cache.h new file mode 100644 index 00000000000000..ac6870a37a1728 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_kv_cache.h @@ -0,0 +1,30 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "kernel_base_opencl.h" +#include "dynamic_quantize_kernel_ref.h" + +namespace kernel_selector { +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// dynamic_quantize_params +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +class DynamicQuantizeKernelKVCache : public KernelBaseOpenCL { +public: + DynamicQuantizeKernelKVCache() : KernelBaseOpenCL("dynamic_quantize_gpu_kv_cache") {} + virtual ~DynamicQuantizeKernelKVCache() {} + + virtual JitConstants GetJitConstants(const dynamic_quantize_params& params) const; + virtual CommonDispatchData SetDefault(const dynamic_quantize_params& params) const; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + Datatype GetAccumulatorType(const dynamic_quantize_params& params) const; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params&) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index 6a678770e85d72..b610ac2076def4 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -154,6 +154,15 @@ bool DynamicQuantizeKernelOpt::Validate(const Params& params) const { if (dq_params.inputs[0].GetPaddedVal() != 0 || dq_params.outputs[0].GetPaddedVal() != 0) return false; + if (dq_params.append_axis != -1) + return false; + + if (dq_params.group_sizes.back() != UINT64_MAX) + return false; + + if (!dq_params.scales_output_order.empty()) + return false; + return true; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp new file mode 100644 index 00000000000000..d0c99484e3f52e --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp @@ -0,0 +1,285 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "dynamic_quantize_kernel_kv_cache.h" +#include "kernel_selector_utils.h" +#include + + +static constexpr size_t subgroup_size = 16; + +namespace kernel_selector { +static Tensor::NDims get_normalized_dims(const DataTensor& tensor) { + auto dims = tensor.GetDims(); + std::reverse(dims.begin(), dims.end()); + + return dims; +} + +static size_t get_elements_number_per_batch(const dynamic_quantize_params& params) { + const auto& group_sizes = params.group_sizes; + const auto& input_dims = get_normalized_dims(params.inputs[0]); + + size_t 
total_elements_number = 1; + for (size_t i = 0; i < group_sizes.size(); i++) { + if (group_sizes[i] != UINT64_MAX) { + total_elements_number *= input_dims[i].v; + } + } + + return total_elements_number; +} + +static size_t get_elements_number_per_group(const dynamic_quantize_params& params) { + const auto& group_sizes = params.group_sizes; + const auto& input_dims = get_normalized_dims(params.inputs[0]); + + size_t total_elements_number = 1; + for (size_t i = 0; i < group_sizes.size(); i++) { + if (group_sizes[i] == UINT64_MAX) { + total_elements_number *= input_dims[i].v; + } else { + total_elements_number *= group_sizes[i]; + } + } + + return total_elements_number; +} + +static std::string generate_dims_indexes_calculation(std::vector> dims) { + std::reverse(dims.begin(), dims.end()); // reorder dims in order from innermost to outermost dimensions + + auto generate_calc_function = [&](std::string data_type, std::string index_var, size_t dim_idx) { + std::string index_calc_str; + index_calc_str += "" + data_type + " " + dims[dim_idx].first + " = "; + index_calc_str += "(" + index_var + " / "; + index_calc_str += "(1"; + for (size_t i = 0; i < dim_idx; i++) { + index_calc_str += " * " + dims[i].second; + } + index_calc_str += ")) % " + dims[dim_idx].second + ";"; + + return index_calc_str; + }; + + std::stringstream indexes_calc_str; + for (size_t i = 0; i < dims.size(); i++) { + indexes_calc_str << generate_calc_function("uint", "data_idx", i); + } + + return indexes_calc_str.str(); +} + +static size_t get_per_iter_elements_number(const dynamic_quantize_params& params) { + const auto maxWorkGroupSize = params.engineInfo.maxWorkGroupSize; + const auto total_grouped_elements = get_elements_number_per_group(params); + + if (total_grouped_elements % maxWorkGroupSize == 0) + return maxWorkGroupSize; + + if (total_grouped_elements < maxWorkGroupSize) + return total_grouped_elements; + + return 0; +} + +ParamsKey DynamicQuantizeKernelKVCache::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::INT8); + k.EnableDifferentTypes(); + k.EnableAllInputLayout(); + k.EnableAllOutputLayout(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDynamicShapesSupport(); + return k; +} + +JitConstants DynamicQuantizeKernelKVCache::GetJitConstants(const dynamic_quantize_params& params) const { + JitConstants jit = MakeBaseParamsJitConstants(params); + + const std::vector> default_dims = {{"b", "INPUT0_BATCH_NUM"}, + {"f", "INPUT0_FEATURE_NUM"}, + {"y", "INPUT0_SIZE_Y"}, + {"x", "INPUT0_SIZE_X"}}; + + const auto& group_sizes = params.group_sizes; + std::vector> batch_dims, grouped_dims; + for (size_t i = 0; i < group_sizes.size(); i++) { + if (group_sizes[i] == 1) { + batch_dims.push_back(default_dims[i]); + } else { + grouped_dims.push_back(default_dims[i]); + } + } + + const auto& input_dims = get_normalized_dims(params.inputs[0]); + const auto total_grouped_elements = get_elements_number_per_group(params); + const auto per_iter_elements_number = get_per_iter_elements_number(params); + const auto total_subgroups_number = total_grouped_elements / input_dims.back().v; + + // Drop the last dimensions, since it will be processed in the kernel's loop + grouped_dims.pop_back(); + + const bool append_mode = params.append_axis != -1; + std::pair append_axis_info = {}; + if (append_mode) { + jit.AddConstant(MakeJitConstant("APPEND_MODE", append_mode)); + jit.AddConstant(MakeJitConstant("APPEND_AXIS_NAME", 
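generate_dims_indexes_calculation() above emits OpenCL source that unflattens a linear index into per-dimension indexes, innermost dimension first: each index is the flat index divided by the product of all inner dimensions, modulo the current dimension. A host-side C++ equivalent of the emitted arithmetic (dims are passed innermost-first, matching the std::reverse at the top of the helper):

    #include <cstddef>
    #include <vector>

    // Decompose a flat index into per-dimension indexes, matching the code that
    // generate_dims_indexes_calculation() generates:
    //   idx_i = (flat / (1 * d_0 * ... * d_{i-1})) % d_i, with d_0 the innermost dim.
    std::vector<size_t> decompose_index(size_t flat, const std::vector<size_t>& dims_inner_first) {
        std::vector<size_t> idx(dims_inner_first.size());
        size_t stride = 1;
        for (size_t i = 0; i < dims_inner_first.size(); ++i) {
            idx[i] = (flat / stride) % dims_inner_first[i];
            stride *= dims_inner_first[i];
        }
        return idx;
    }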
default_dims[params.append_axis].first)); + } + + jit.AddConstant(MakeJitConstant("DECLARE_BATCHED_DIMS_INDEXES(data_idx)", generate_dims_indexes_calculation(batch_dims))); + jit.AddConstant(MakeJitConstant("DECLARE_GROUPED_DIMS_INDEXES(data_idx)", generate_dims_indexes_calculation(grouped_dims))); + jit.AddConstant(MakeJitConstant("SUBGROUPS_NUMBER", total_subgroups_number)); + + const auto iterations_number = total_grouped_elements / per_iter_elements_number; + + jit.AddConstant(MakeJitConstant("ITERATIONS_NUMBER", iterations_number)); + jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization)); + jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp)); + + bool rearrange_scales_order = false; + const auto& scales_output_order = params.scales_output_order; + if (!scales_output_order.empty()) { + for (size_t i = 0; i < scales_output_order.size(); i++) { + if (i != scales_output_order[i]) { + rearrange_scales_order = true; + break; + } + } + } + + if (rearrange_scales_order) { + const std::array default_dim_order = {'b', 'f', 'y', 'x'}; + std::stringstream ss; + for (size_t i = 0; i < scales_output_order.size(); i++) { + ss << default_dim_order[scales_output_order[i]]; + + if (i + 1 != scales_output_order.size()) + ss << ", "; + } + + jit.AddConstant(MakeJitConstant("SCALES_OUTPUT_ORDER", ss.str())); + } + + for (size_t i = 0; i < group_sizes.size(); i++) { + jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i])); + } + + return jit; +} + +CommonDispatchData DynamicQuantizeKernelKVCache::SetDefault(const dynamic_quantize_params& params) const { + CommonDispatchData dispatchData; + + const auto& input_dims = get_normalized_dims(params.inputs[0]); + const auto total_batched_elements = get_elements_number_per_batch(params); + const auto total_grouped_elements = get_elements_number_per_group(params); + const auto total_subgroups_number = total_grouped_elements / input_dims.back().v; + + dispatchData.gws = {subgroup_size, total_subgroups_number, total_batched_elements}; + dispatchData.lws = {subgroup_size, total_subgroups_number, 1}; + + return dispatchData; +} + +void DynamicQuantizeKernelKVCache::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); + kd.kernels[0].params.workGroups.global = dispatchData.gws; + kd.kernels[0].params.workGroups.local = dispatchData.lws; + kd.kernels[0].skip_execution = false; + + if (prim_params.append_axis != -1) { + kd.kernels[0].params.scalars.clear(); + + ScalarDescriptor axis_offset; + axis_offset.t = ScalarDescriptor::Types::UINT32; + axis_offset.v.u32 = static_cast(prim_params.axis_offset); + kd.kernels[0].params.scalars.push_back(axis_offset); + } + }; +} + +KernelsData DynamicQuantizeKernelKVCache::GetKernelsData(const Params& params) const { + assert(params.GetType() == KernelType::DYNAMIC_QUANTIZE); + + if (!Validate(params)) + return {}; + + const dynamic_quantize_params& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + + KernelData kd = KernelData::Default(params); + + auto cldnn_jit = GetJitConstants(prim_params); + auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params); + auto jit = CreateJit(kernelName, cldnn_jit, 
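The kernel dispatched above produces one scale (and, in asymmetric mode, one zero point) per group of elements defined by group_sizes, where a group size of 1 keeps a dimension "batched" and UINT64_MAX folds the whole dimension into the group. As a reference for what each group reduction yields, here is a host-side sketch of the symmetric int8 variant using the usual max-abs scheme; the exact formula and rounding used by the kernel may differ, so treat this as an assumption:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    struct QuantizedGroup {
        std::vector<int8_t> data;
        float scale;
    };

    // Symmetric per-group int8 quantization: every element of the group shares
    // one scale computed from the group's maximum absolute value.
    QuantizedGroup quantize_group_symmetric(const std::vector<float>& group) {
        float max_abs = 0.0f;
        for (float v : group)
            max_abs = std::max(max_abs, std::fabs(v));
        const float scale = max_abs > 0.0f ? max_abs / 127.0f : 1.0f;

        QuantizedGroup out;
        out.scale = scale;
        out.data.reserve(group.size());
        for (float v : group)
            out.data.push_back(static_cast<int8_t>(std::lround(v / scale)));
        return out;
    }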
entry_point); + + GetUpdateDispatchDataFunc(kd); + + auto& kernel = kd.kernels[0]; + FillCLKernelData(kernel, + dispatchData, + params.engineInfo, + kernelName, + jit, + entry_point, + EXE_MODE_DEFAULT, + false, + false, + 1, + GetFusedPrimitiveInputsCount(params), + static_cast(prim_params.outputs.size()), + prim_params.is_shape_agnostic); + + if (prim_params.append_axis != -1) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0}); + + return {kd}; +} + +KernelsPriority DynamicQuantizeKernelKVCache::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_3; +} + +bool DynamicQuantizeKernelKVCache::Validate(const Params& params) const { + if (!KernelBaseOpenCL::Validate(params)) + return false; + + const auto& dq_params = static_cast(params); + + const auto& group_sizes = dq_params.group_sizes; + const auto& input_dims = get_normalized_dims(dq_params.inputs[0]); + const size_t non_compressed_dims_number = std::count(group_sizes.begin(), group_sizes.end(), 1); + + if (non_compressed_dims_number == group_sizes.size()) + return false; + + for (size_t i = 0; i < group_sizes.size(); i++) { + if (group_sizes[i] != 1 && input_dims[i].is_dynamic) { + return false; + } + } + + // Last dimension should be static, reduced by group_sizes configuration and divisible by 16 + if (group_sizes.back() == 1 || input_dims.back().is_dynamic || input_dims.back().v % subgroup_size != 0) + return false; + + // Limit the size of the innermost dimension + if (input_dims.back().v > 256) + return false; + + // In case of HEADS_NUM * HEAD_SIZE group size, check that it fits into the supported workgroup size limit + if (get_elements_number_per_group(dq_params) / input_dims.back().v >= params.engineInfo.maxWorkGroupSize / subgroup_size) + return false; + + return true; +} +} // namespace kernel_selector + diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index 3b214848e2f8ad..04090a58f85e00 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -26,6 +26,36 @@ JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_pa jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + bool rearrange_scales = false; + const auto& scales_output_order = params.scales_output_order; + if (!scales_output_order.empty()) { + for (size_t i = 0; i < scales_output_order.size(); i++) { + if (i != scales_output_order[i]) { + rearrange_scales = true; + break; + } + } + } + + if (rearrange_scales) { + const std::array default_dim_order = {'b', 'f', 'y', 'x'}; + + std::stringstream ss; + for (size_t i = 0; i < scales_output_order.size(); i++) { + ss << default_dim_order[scales_output_order[i]]; + + if (i + 1 != scales_output_order.size()) + ss << ", "; + } + + jit.AddConstant(MakeJitConstant("SCALES_OUTPUT_ORDER", ss.str())); + } + + const auto& group_sizes = params.group_sizes; + for (size_t i = 0; i < group_sizes.size(); i++) { + jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i])); + } + return jit; } @@ -34,7 +64,14 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p CommonDispatchData dispatchData; OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d 
tensor"); - dispatchData.gws = {params.outputs[0].Batch().v * params.outputs[0].Feature().v, 1, 1}; + + const auto& group_sizes = params.group_sizes; + auto batch_size = group_sizes[0] == 1 ? params.outputs[0].Batch().v : 1; + auto feature_size = group_sizes[1] == 1 ? params.outputs[0].Feature().v : 1; + auto y_size = group_sizes[2] == 1 ? params.outputs[0].Y().v : 1; + auto x_size = group_sizes[3] == 1 ? params.outputs[0].X().v : 1; + + dispatchData.gws = {batch_size * feature_size, y_size, x_size}; dispatchData.lws = {1, 1, 1}; return dispatchData; @@ -94,6 +131,10 @@ bool DynamicQuantizeKernelRef::Validate(const Params& params) const { if (!KernelBaseOpenCL::Validate(params)) return false; + const auto& prim_params = static_cast(params); + if (prim_params.group_sizes.size() != 4) + return false; + return true; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h index c46b6b2685a940..d437d6ab6eb1f6 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h @@ -13,6 +13,13 @@ namespace kernel_selector { struct dynamic_quantize_params : public base_params { dynamic_quantize_params() : base_params(KernelType::DYNAMIC_QUANTIZE) {} size_t fc_ifm_size = 0; + + int64_t append_axis = -1; + int64_t axis_offset = -1; + std::vector group_sizes; + std::vector scales_output_order; + bool use_asymmetric_quantization = false; + bool combine_scales_and_zp = false; }; class DynamicQuantizeKernelRef : public KernelBaseOpenCL { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp index 6ca9fbd2f5bd76..d38cf6ad2b4e52 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp @@ -5,11 +5,13 @@ #include "dynamic_quantize_kernel_selector.h" #include "dynamic_quantize_kernel_ref.h" #include "dynamic_quantize_kernel_opt.h" +#include "dynamic_quantize_kernel_kv_cache.h" namespace kernel_selector { dynamic_quantize_kernel_selector::dynamic_quantize_kernel_selector() { Attach(); Attach(); + Attach(); } KernelsData dynamic_quantize_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp index 7556debd29df00..e2a538750d1615 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp @@ -4,6 +4,7 @@ #include "sdpa_kernel_base.h" #include "kernel_selector_utils.h" +#include "intel_gpu/runtime/debug_configuration.hpp" namespace kernel_selector { @@ -73,6 +74,8 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { jit.AddConstant(MakeJitConstant("DO_BROADCAST_KEY_VALUE", GetBroadcastInputStr(params.inputs[0].GetDims().size(), params.conf.broadcast_axis, params.conf.group_size))); + } else { + 
jit.AddConstant(MakeJitConstant("BROADCAST_GROUP_SIZE", 1)); } jit.AddConstant(MakeJitConstant("IS_CAUSAL", params.conf.is_causal)); @@ -81,6 +84,21 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", params.inputs.size() > 4)); } + jit.AddConstant(MakeJitConstant("IS_KV_COMPRESSED", params.conf.is_kv_compressed)); + + if (params.conf.is_kv_compressed) { + jit.AddConstant(MakeJitConstant("USE_ASYMMETRIC_QUANTIZATION", params.conf.use_asymmetric_quantization)); + jit.AddConstant(MakeJitConstant("COMBINE_SCALES_AND_ZP", params.conf.combine_scales_and_zp)); + jit.AddConstant(MakeJitConstant("COMPRESSED_PER_HEAD", params.conf.per_head_quantization)); + jit.AddConstant(MakeJitConstant("KEY_COMPRESSION_SCALE", params.key_cache_comp_scale)); + jit.AddConstant(MakeJitConstant("VALUE_COMPRESSION_SCALE", params.value_cache_comp_scale)); + + if (params.conf.use_asymmetric_quantization && !params.conf.combine_scales_and_zp) { + jit.AddConstant(MakeJitConstant("KEY_COMPRESSION_ZP", params.key_cache_comp_zp)); + jit.AddConstant(MakeJitConstant("VALUE_COMPRESSION_ZP", params.value_cache_comp_zp)); + } + } + auto is_default_order = [](const std::vector& order) { for (size_t i = 0; i < order.size(); i++) if (order[i] != static_cast(i)) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index 6ea8d85527d19d..8770c6b9a83e8b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -88,6 +88,10 @@ struct sdpa_configuration { bool is_causal = false; bool has_alibi_input = false; + bool is_kv_compressed = false; + bool use_asymmetric_quantization = false; + bool combine_scales_and_zp = false; + bool per_head_quantization = false; // Paged Attention configuration bool is_paged_attention = false; @@ -110,6 +114,10 @@ struct sdpa_params : public base_params { int64_t indirect_axis = -1; DataTensor beam_table; + DataTensor key_cache_comp_scale; + DataTensor key_cache_comp_zp; + DataTensor value_cache_comp_scale; + DataTensor value_cache_comp_zp; sdpa_configuration conf; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index 04ebf2f2165973..d615b97d75e0a5 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -342,6 +342,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const { if (params.conf.head_size > 256) return false; + if (params.conf.is_kv_compressed) + return false; + return true; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp index 2f0174d0a45912..52a23261e46042 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp @@ -4,6 +4,7 @@ #include "sdpa_kernel_opt.h" #include "kernel_selector_utils.h" +#include "common_types.h" #include #include @@ -26,6 +27,19 @@ static size_t get_sg_number_scale_factor(const sdpa_params& sdpa_params, size_t if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize) { 
return optimal_scale_factor; } + } else if (kernel_type == KernelsTypes::SINGLE_TOKEN) { + int USE_SCALE_FACTOR = 1; + if (const auto env_var = std::getenv("USE_SCALE_FACTOR")) { + std::istringstream ss(env_var); + ss >> USE_SCALE_FACTOR; + } + if (USE_SCALE_FACTOR) { + const size_t optimal_scale_factor = 2; + if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize && + sdpa_params.conf.head_size * optimal_scale_factor / subgroup_size <= subgroup_size) { + return optimal_scale_factor; + } + } } return 1; @@ -126,6 +140,7 @@ static std::string GetKernelName(std::string base_name, KernelsTypes type, const ParamsKey SDPAKernelOpt::GetSupportedKey() const { ParamsKey k; + k.EnableInputDataType(Datatype::INT8); k.EnableInputDataType(Datatype::F16); k.EnableInputDataType(Datatype::F32); k.EnableInputDataType(Datatype::INT32); @@ -154,6 +169,9 @@ bool SDPAKernelOpt::Validate(const Params& p) const { if (params.conf.head_size < 1 || params.conf.head_size % subgroup_size != 0) return false; + if (params.conf.use_asymmetric_quantization && !params.conf.combine_scales_and_zp) + return false; + return true; } @@ -231,10 +249,11 @@ CommonDispatchData SDPAKernelOpt::SetDefault(const sdpa_params& params, size_t k const size_t target_seq_len_block_size = kernel_idx == 1 ? get_target_seq_len_block_size() : 1; if (kernel_idx == KernelsTypes::SINGLE_TOKEN) { + const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx); dispatch_data.gws = { batch_size * heads_num, CeilDiv(target_seq_len, target_seq_len_block_size), - head_size * num_of_partitions }; - dispatch_data.lws = { 1, 1, head_size }; + head_size * num_of_partitions * sg_num_scale }; + dispatch_data.lws = { 1, 1, head_size * sg_num_scale }; } else if (kernel_idx == KernelsTypes::MULTI_TOKENS) { const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx); dispatch_data.gws = { batch_size * heads_num, @@ -307,8 +326,20 @@ KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const { static_cast(prim_params.outputs.size()), prim_params.is_shape_agnostic); - if (prim_params.indirect_axis != -1 && kernel_idx != KernelsTypes::FINALIZATION) - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(prim_params.inputs.size())}); + auto beam_table_idx = prim_params.inputs.size(); + if (prim_params.conf.is_kv_compressed && kernel_idx != KernelsTypes::FINALIZATION) { + auto key_cache_compression_scale_idx = static_cast(prim_params.inputs.size()); + auto value_cache_compression_scale_idx = static_cast(prim_params.inputs.size() + 1); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx}); + + beam_table_idx += 2; + } + + if (prim_params.indirect_axis != -1 && kernel_idx != KernelsTypes::FINALIZATION) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(beam_table_idx)}); + } kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp index 579c4bc06c17e2..0d551883b6c385 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp +++ 
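When KV-cache compression is enabled, GetKernelsData above appends the key and value compression-scale tensors right after the regular inputs and shifts the beam-table argument index past them. A small sketch of that index bookkeeping, with argument descriptors reduced to plain indices (illustrative only):

    #include <cstddef>
    #include <vector>

    // Compute the extra kernel argument indices appended after `num_inputs`
    // regular inputs, mirroring the ordering used above:
    //   [inputs...] [key_scale] [value_scale] [beam_table?]
    std::vector<size_t> extra_argument_indices(size_t num_inputs,
                                               bool kv_compressed,
                                               bool indirect) {
        std::vector<size_t> args;
        size_t beam_table_idx = num_inputs;
        if (kv_compressed) {
            args.push_back(num_inputs);      // key cache compression scale
            args.push_back(num_inputs + 1);  // value cache compression scale
            beam_table_idx += 2;
        }
        if (indirect)
            args.push_back(beam_table_idx);  // beam table comes after the scales
        return args;
    }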
b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp @@ -12,6 +12,7 @@ namespace kernel_selector { ParamsKey SDPAKernelRef::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::INT8); k.EnableInputDataType(Datatype::F32); // beam table input k.EnableInputDataType(Datatype::INT32); @@ -74,8 +75,26 @@ KernelsData SDPAKernelRef::GetKernelsData(const Params& params) const { "", false, false, static_cast(prim_params.inputs.size()), GetFusedPrimitiveInputsCount(params), 1, prim_params.is_shape_agnostic); - if (prim_params.indirect_axis != -1) - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(prim_params.inputs.size())}); + auto beam_table_idx = prim_params.inputs.size(); + if (prim_params.conf.is_kv_compressed) { + auto key_cache_compression_scale_idx = static_cast(prim_params.inputs.size()); + auto value_cache_compression_scale_idx = static_cast(prim_params.inputs.size() + 1); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx}); + + if (prim_params.conf.use_asymmetric_quantization && !prim_params.conf.combine_scales_and_zp) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx + 2}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx + 2}); + beam_table_idx += 2; + } + + beam_table_idx += 2; + } + + if (prim_params.indirect_axis != -1) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(beam_table_idx)}); + } kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp index e65fd7fd10976b..01052520a8e91d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp @@ -15,9 +15,18 @@ namespace kernel_selector { sdpa_kernel_selector::sdpa_kernel_selector() { Attach(); Attach(); + int USE_OPT_SDPA = 0; + if (const auto env_var = std::getenv("USE_SDPA_OPT")) { + std::istringstream ss(env_var); + ss >> USE_OPT_SDPA; + } + + if (!USE_OPT_SDPA) { #ifdef ENABLE_ONEDNN_FOR_GPU - Attach(); + Attach(); + std::cout << "micro_sdpa added\n"; #endif + } } KernelsData sdpa_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp index 7574b664b6b4b7..8173a29c1b35f8 100644 --- a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp @@ -152,5 +152,64 @@ VariableState::Ptr VariableStateIndirectKVCache::get_beam_table_state() const { return m_hidden_states[1]; } +VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed( + const VariableStateInfo& info, + std::shared_ptr context, + std::shared_ptr shape_predictor, + const std::vector& output_layouts, + size_t beam_idx, + size_t concat_idx, + bool has_zp_state = false) + : VariableStateIndirectKVCache(info, context, shape_predictor, beam_idx, concat_idx), + m_has_zp_state(has_zp_state) { + 
OPENVINO_ASSERT((has_zp_state && output_layouts.size() == 3) || + (!has_zp_state && output_layouts.size() == 2), + "[GPU] Unexpected number of output layouts for VariableStateIndirectKVCacheCompressed"); + + const auto compression_scale_layout = output_layouts[1]; + VariableStateInfo compression_scale_state_info(info.m_id + "/comp_scale", compression_scale_layout); + m_hidden_states.push_back(std::make_shared<VariableState>(compression_scale_state_info, context, shape_predictor)); + + if (has_zp_state) { + const auto compression_zp_layout = output_layouts[2]; + VariableStateInfo compression_zp_state_info(info.m_id + "/comp_zp", compression_zp_layout); + m_hidden_states.push_back(std::make_shared<VariableState>(compression_zp_state_info, context, shape_predictor)); + } + + OPENVINO_ASSERT((!m_has_zp_state && m_hidden_states.size() == 3) || (m_has_zp_state && m_hidden_states.size() == 4), + "[GPU] VariableStateIndirectKVCacheCompressed expects 3 or 4 internal states to be initialized, " + "actual number is ", m_hidden_states.size()); +} + +VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_scale_state() const { + return m_hidden_states[2]; +} + +void VariableStateIndirectKVCacheCompressed::set_compression_scale_layout(const cldnn::layout& new_layout) { + m_hidden_states[2]->set_layout(new_layout); +} + +VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_zp_state() const { + OPENVINO_ASSERT(m_has_zp_state); + return m_hidden_states[3]; +} + +void VariableStateIndirectKVCacheCompressed::set_compression_zp_layout(const cldnn::layout& new_layout) { + OPENVINO_ASSERT(m_has_zp_state); + m_hidden_states[3]->set_layout(new_layout); +} + +bool VariableStateIndirectKVCacheCompressed::has_zp_state() const { + return m_has_zp_state; +} + +void VariableStateIndirectKVCacheCompressed::set_state(const ov::SoPtr<ov::ITensor>& state) { + OPENVINO_THROW("[GPU] set_state API is supported only when KV-cache compression is disabled"); +} + +ov::SoPtr<ov::ITensor> VariableStateIndirectKVCacheCompressed::get_state() const { + OPENVINO_THROW("[GPU] get_state API is supported only when KV-cache compression is disabled"); +} + } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index 0373251e45c051..04d60c1e430396 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -3,31 +3,53 @@ // #include "ov_ops/dynamic_quantize.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/primitives/dynamic_quantize.hpp" + +namespace ov { +namespace op { +namespace internal { +using DynamicQuantizeExtended = ov::intel_gpu::op::DynamicQuantize; +} // namespace internal +} // namespace op +} // namespace ov + namespace ov { namespace intel_gpu { -static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptr<ov::op::internal::DynamicQuantize>& op) { +static void CreateDynamicQuantize(ProgramBuilder &p, + const std::shared_ptr<ov::op::internal::DynamicQuantize> &op, + const ov::op::internal::QuantizationConfig& config, + const std::vector<uint64_t>& scales_zp_output_order, + bool combine_scales_and_zp) { validate_inputs_count(op, {1}); auto inputs = p.GetInputInfo(op); std::string primitive_name = layer_type_name_ID(op); - auto group_sizes = op->get_group_sizes(); - for (size_t i = 0; i < group_sizes.size() - 1; i++) - OPENVINO_ASSERT(group_sizes[i] == 1, "Not supported group size at ", i, ": ",
group_sizes[i]); - - OPENVINO_ASSERT(group_sizes.back() == UINT64_MAX, "Not supported group size: ", group_sizes.back()); auto prim = cldnn::dynamic_quantize(primitive_name, - inputs[0], - op->get_group_sizes().back(), - get_output_data_types(op)); + inputs[0], + config, + combine_scales_and_zp, + scales_zp_output_order); + + prim.num_outputs = op->get_output_size(); + p.add_primitive(*op, prim); } +static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptr& op) { + CreateDynamicQuantize(p, op, op->get_quantization_config(), {}, false); +} + +static void CreateDynamicQuantizeExtendedOp(ProgramBuilder& p, const std::shared_ptr& op) { + CreateDynamicQuantize(p, op, op->get_quantization_config(), op->get_scales_zp_output_order(), op->get_combine_scales_and_zp()); +} + REGISTER_FACTORY_IMPL(internal, DynamicQuantize); +REGISTER_FACTORY_IMPL(internal, DynamicQuantizeExtended); } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp index c2ee336e48bf06..bfa4df330c2fc8 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp @@ -22,19 +22,26 @@ namespace intel_gpu { namespace { void CreateKVCacheOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {2, 3}); + validate_inputs_count(op, {2, 3, 4, 5}); auto inputs = p.GetInputInfo(op); int64_t rank = op->get_input_partial_shape(0).size(); auto prim = cldnn::kv_cache(layer_type_name_ID(op), - inputs, - op->get_variable()->get_info(), - ov::util::normalize(op->get_concat_axis(), rank), - ov::util::normalize(op->get_gather_axis(), rank), - op->get_indirect()); + inputs, + op->get_variable()->get_info(), + ov::util::normalize(op->get_concat_axis(), rank), + ov::util::normalize(op->get_gather_axis(), rank), + op->get_indirect()); prim.num_outputs = op->get_output_size(); prim.output_data_types = get_output_data_types(op); + if (op->get_kv_compressed()) { + prim.compressed = true; + prim.combine_scales_and_zp = op->get_combine_scales_and_zp(); + prim.quantization_config = op->get_quantization_config(); + prim.scales_zp_output_order = op->get_scales_zp_output_order(); + } + p.add_primitive(*op, prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp index 0ba780183b186d..a2ef2ad50bfcce 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp @@ -62,11 +62,13 @@ static void CreateSDPAOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {4, 5, 6}); auto inputs = p.GetInputInfo(op); auto layerName = layer_type_name_ID(op); bool is_causal = op->get_causal(); + const auto compression_inputs = op->get_compression_inputs_num(); + validate_inputs_count(op, {4 + compression_inputs, 5 + compression_inputs, 6 + compression_inputs}); + int64_t indirect_axis = op->get_indirect_axis(); auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, inputs, @@ -75,7 +77,10 @@ static void CreateIndirectSDPAOp(ProgramBuilder& p, const std::shared_ptrget_input0_transpose_order(), op->get_input1_transpose_order(), op->get_input2_transpose_order(), - op->get_output_transpose_order()); + op->get_output_transpose_order(), + op->get_kv_compressed(), + op->get_combine_scales_and_zp(), + op->get_quantization_config()); p.add_primitive(*op, 
sdpa_prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp index d655e297e4a2c6..a4354c51092ac8 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // + #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "openvino/core/type/element_type.hpp" @@ -9,6 +10,7 @@ #include "openvino/op/read_value.hpp" #include "transformations/rt_info/original_precision_attribute.hpp" #include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" #include "intel_gpu/primitives/assign.hpp" #include "intel_gpu/primitives/read_value.hpp" @@ -16,6 +18,7 @@ namespace ov { namespace op { namespace internal { using ReadValue = ov::intel_gpu::op::ReadValue; +using ReadValues = ov::intel_gpu::op::ReadValues; } // namespace internal } // namespace op } // namespace ov @@ -39,7 +42,7 @@ void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr CreateVariableAccessPrimitive(p, op, op->get_variable_id()); } +void CreateReadValuesOp(ProgramBuilder& p, const std::shared_ptr& op) { + std::vector variable_layouts; + for (size_t i = 0; i < op->get_output_size(); i++) { + const auto output_pshape = op->get_output_partial_shape(i); + const auto output_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(i)); + const auto output_format = cldnn::format::get_default_format(output_pshape.size()); + variable_layouts.emplace_back(output_pshape, output_dtype, output_format); + } + + auto inputs = p.GetInputInfo(op); + auto user_specified_type = get_original_precision(op); + auto prim = cldnn::read_value{layer_type_name_ID(op), + inputs, + op->get_variable_id(), + variable_layouts, + user_specified_type}; + + p.add_primitive(*op, prim); +} + } // namespace REGISTER_FACTORY_IMPL(v3, Assign); @@ -89,6 +112,7 @@ REGISTER_FACTORY_IMPL(v6, Assign); REGISTER_FACTORY_IMPL(v3, ReadValue); REGISTER_FACTORY_IMPL(v6, ReadValue); REGISTER_FACTORY_IMPL(internal, ReadValue); +REGISTER_FACTORY_IMPL(internal, ReadValues); } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 58e99e037fb931..26771117e2e786 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -8,6 +8,7 @@ #include "openvino/core/validation_util.hpp" #include "intel_gpu/primitives/kv_cache.hpp" +#include "intel_gpu/primitives/read_value.hpp" #include "intel_gpu/plugin/usm_host_tensor.hpp" #include "intel_gpu/plugin/sync_infer_request.hpp" #include "intel_gpu/plugin/remote_context.hpp" @@ -646,19 +647,40 @@ void SyncInferRequest::allocate_states() { bool indirect_kv_cache = false; int64_t beam_axis = 0; int64_t concat_axis = 0; + bool compressed = false; + bool has_zp_state = false; auto kv_cache_shape = vi.second.m_layout.get_partial_shape(); + std::vector states_layouts; for (auto& p : state_prims) { if (auto kv_cache_prim = dynamic_cast(p)) { indirect_kv_cache = kv_cache_prim->indirect; beam_axis = ov::util::normalize(kv_cache_prim->gather_axis, kv_cache_shape.size()); concat_axis = ov::util::normalize(kv_cache_prim->concat_axis, kv_cache_shape.size()); + compressed = kv_cache_prim->compressed; + has_zp_state = kv_cache_prim->get_compression_zp_inputs_num() > 0; + } else if (auto read_value = 
dynamic_cast(p)) { + states_layouts = read_value->output_layouts; } } - if (indirect_kv_cache) { - m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor, beam_axis, concat_axis)); + if (compressed) { + m_variables.emplace(vi.first, std::make_shared(vi.second, + m_context, + m_shape_predictor, + states_layouts, + beam_axis, + concat_axis, + has_zp_state)); + } else if (indirect_kv_cache) { + m_variables.emplace(vi.first, std::make_shared(vi.second, + m_context, + m_shape_predictor, + beam_axis, + concat_axis)); } else { - m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor)); + m_variables.emplace(vi.first, std::make_shared(vi.second, + m_context, + m_shape_predictor)); } } } diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index eb16213bcb936c..28e0e7b74c71ae 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -61,7 +61,14 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size auto rank = m_fc->get_input_partial_shape(0).size(); std::vector shape_group_size(rank, 1); shape_group_size.back() = group_size; - auto dyn_quan = std::make_shared(m_data, shape_group_size, element::f16); + + ov::op::internal::QuantizationConfig config; + config.quantization_dt = element::i8; + config.mode = ov::op::internal::QuantizationConfig::QuantizationMode::Symmetric; + config.scale_dt = element::f16; + config.group_sizes = shape_group_size; + + auto dyn_quan = std::make_shared(m_data, config); auto optional_w_zp = m_fc->get_input_size() > 4 ? 
m_fc->get_input_node_shared_ptr(4) : std::make_shared(); auto output_type = m_fc->get_output_type(); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp new file mode 100644 index 00000000000000..65e7cae3a73b95 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -0,0 +1,297 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "kv_cache_compression.hpp" + +#include "intel_gpu/op/kv_cache.hpp" +#include "intel_gpu/op/indirect_sdpa.hpp" +#include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" +#include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" + +#include "openvino/core/node_vector.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/sink.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/op/label.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/visualize_tree.hpp" +#include "transformations/utils/utils.hpp" + +#include + +namespace ov { +namespace intel_gpu { + +namespace { +std::vector get_variable_infos(const ov::op::util::VariableInfo& data_variable_info, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false) { + std::vector infos; + + // Add initial data variable info + infos.push_back(data_variable_info); + + // Infer DQ shapes + ov::intel_gpu::op::DynamicQuantize dq; + auto dq_shapes = + ov::intel_gpu::op::DynamicQuantize::shape_infer(&dq, {data_variable_info.data_shape}, config, scales_zp_output_order, combine_scales_and_zp); + + const auto variable_id = data_variable_info.variable_id; + const auto scale_shape = dq_shapes[1]; + const auto scale_dt = config.scale_dt; + + // Add scales variable info + infos.push_back(ov::op::util::VariableInfo{scale_shape, scale_dt, variable_id}); + + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) { + // Add zero points variable info + const auto zp_dt = config.zp_dt; + infos.push_back(ov::op::util::VariableInfo{scale_shape, zp_dt, variable_id}); + } + + return infos; +} + +std::shared_ptr + update_past_read_value(std::shared_ptr past_rv_node, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false) { + auto variable = past_rv_node->get_variable(); + variable->update_data_type(config.quantization_dt); + + auto variable_infos = get_variable_infos(past_rv_node->get_variable()->get_info(), config, scales_zp_output_order, combine_scales_and_zp); + auto new_past_rv_node = std::make_shared(); + + if (past_rv_node->get_input_size() == 0) { + new_past_rv_node = std::make_shared(past_rv_node->get_variable(), variable_infos); + } else { + auto initializer_dyn_quantization = std::make_shared(past_rv_node->get_input_node_shared_ptr(0), + config, + scales_zp_output_order, + combine_scales_and_zp); + 
initializer_dyn_quantization->set_friendly_name(past_rv_node->get_input_node_shared_ptr(0)->get_friendly_name() + "_dyn_quan"); + + OutputVector initializer_outputs = { initializer_dyn_quantization->output(0), initializer_dyn_quantization->output(1) }; + + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) + initializer_outputs.push_back(initializer_dyn_quantization->output(2)); + + new_past_rv_node = std::make_shared(initializer_outputs, past_rv_node->get_variable(), variable_infos); + } + + ov::copy_runtime_info(past_rv_node, new_past_rv_node); + past_rv_node->output(0).replace(new_past_rv_node->output(0)); + + return new_past_rv_node; +} + +std::shared_ptr + update_kv_cache(std::shared_ptr past_rv_node, + std::shared_ptr kv_cache_node, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false) { + OutputVector kv_cache_inputs = { past_rv_node->output(0), + kv_cache_node->get_input_node_shared_ptr(1), + kv_cache_node->get_input_node_shared_ptr(2), + past_rv_node->output(1) }; + + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) + kv_cache_inputs.push_back(past_rv_node->output(2)); + + auto new_kv_cache = std::make_shared(kv_cache_inputs, + kv_cache_node->get_variable(), + kv_cache_node->get_concat_axis(), + kv_cache_node->get_gather_axis(), + combine_scales_and_zp, + config, + scales_zp_output_order); + + new_kv_cache->set_friendly_name(kv_cache_node->get_friendly_name()); + ov::copy_runtime_info(kv_cache_node, new_kv_cache); + + return new_kv_cache; +} +} // namespace + +class KVCacheCompressionMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("KVCacheCompressionMatcher", "0"); + KVCacheCompressionMatcher(ov::element::Type compression_dt); +}; + +KVCacheCompressionMatcher::KVCacheCompressionMatcher(ov::element::Type compression_dt) { + using namespace ov::pass::pattern; + + if (compression_dt != element::i8) + return; + + auto quantization_mode = ov::op::internal::QuantizationConfig::QuantizationMode::Asymmetric; + bool combine_scales_and_zp = quantization_mode == ov::op::internal::QuantizationConfig::QuantizationMode::Asymmetric; + + GPU_DEBUG_LOG << "KV-cache compression configuration: " + << "dt=" << compression_dt << ", " + << "asym=" << (quantization_mode == ov::op::internal::QuantizationConfig::QuantizationMode::Asymmetric) << ", " + << "single_buffer_for_scales_and_zp=" << combine_scales_and_zp << "\n"; + + auto query = any_input(); + + auto key_past = wrap_type(); + auto key_new_token = any_input(); + auto key_beam_idx = any_input(); + auto key_cache = wrap_type({key_past, key_new_token, key_beam_idx}); + + auto value_past = wrap_type(); + auto value_new_token = any_input(); + auto value_beam_idx = any_input(); + auto value_cache = wrap_type({value_past, value_new_token, value_beam_idx}); + + auto input_attn_mask = any_input(); + auto input_scale = any_input(); + auto input_beam_table = any_input(); + + auto sdpa_without_attn_mask_m = wrap_type({ query, key_cache, value_cache, input_beam_table }); + auto sdpa_with_attn_mask_m = wrap_type({ query, key_cache, value_cache, input_attn_mask, input_beam_table }); + auto sdpa_with_attn_mask_and_scale_m = + wrap_type({ query, key_cache, value_cache, input_attn_mask, input_scale, input_beam_table }); + + auto sdpa = std::make_shared(OutputVector{sdpa_without_attn_mask_m, sdpa_with_attn_mask_m, sdpa_with_attn_mask_and_scale_m}); + + ov::matcher_pass_callback callback = 
[OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + if (transformation_callback(m.get_match_root())) { + return false; + } + + const auto& pattern_map = m.get_pattern_value_map(); + + auto query_node = pattern_map.at(query).get_node_shared_ptr(); + + auto key_new_token_node = pattern_map.at(key_new_token).get_node_shared_ptr(); + auto key_cache_node = std::dynamic_pointer_cast(pattern_map.at(key_cache).get_node_shared_ptr()); + auto value_cache_node = std::dynamic_pointer_cast(pattern_map.at(value_cache).get_node_shared_ptr()); + auto sdpa_node = std::dynamic_pointer_cast(m.get_match_root()); + + auto key_past_rv_node = std::dynamic_pointer_cast(pattern_map.at(key_past).get_node_shared_ptr()); + auto value_past_rv_node = std::dynamic_pointer_cast(pattern_map.at(value_past).get_node_shared_ptr()); + + auto data_rank = key_cache_node->get_input_partial_shape(0).size(); + auto get_shape_group_sizes = [&](const std::vector& transposed_order) { + std::vector group_sizes(data_rank, 1); + std::vector order = transposed_order; + if (transposed_order.size() != data_rank) { + order.resize(data_rank); + std::iota(order.begin(), order.end(), 0); + } + + group_sizes[order[data_rank - 1]] = UINT64_MAX; + group_sizes[order[1]] = UINT64_MAX; + + return group_sizes; + }; + + // Reorder scales in static order: [batch, num_heads, seq_len, head_size] + auto get_scales_output_order = [&](const std::vector& transposed_order) { + std::vector scales_zp_output_order(data_rank); + scales_zp_output_order[0] = transposed_order[0]; + scales_zp_output_order[1] = transposed_order[1]; + scales_zp_output_order[2] = transposed_order[2]; + scales_zp_output_order[3] = transposed_order[3]; + + return scales_zp_output_order; + }; + + auto group_sizes = get_shape_group_sizes(sdpa_node->get_input1_transpose_order()); + auto scales_zp_output_order = get_scales_output_order(sdpa_node->get_input1_transpose_order()); + + ov::op::internal::QuantizationConfig config; + config.mode = quantization_mode; + config.group_sizes = group_sizes; + config.quantization_dt = element::i8; + config.scale_dt = query_node->get_output_element_type(0); + + if (config.is_asymmetric_quantization()) + config.zp_dt = query_node->get_output_element_type(0); + + key_past_rv_node = update_past_read_value(key_past_rv_node, config, scales_zp_output_order, combine_scales_and_zp); + value_past_rv_node = update_past_read_value(value_past_rv_node, config, scales_zp_output_order, combine_scales_and_zp); + + auto new_key_cache = update_kv_cache(key_past_rv_node, key_cache_node, config, scales_zp_output_order, combine_scales_and_zp); + auto new_value_cache = update_kv_cache(value_past_rv_node, value_cache_node, config, scales_zp_output_order, combine_scales_and_zp); + + OutputVector sdpa_inputs; + // Add Query, Key, Value, attention_mask, scale inputs + for (size_t i = 0; i < sdpa_node->get_input_size() - 1; i++) + sdpa_inputs.push_back(sdpa_node->get_input_node_shared_ptr(i)); + + // Replace Key and Value inputs with compressed ones + sdpa_inputs[1] = new_key_cache->output(0); + sdpa_inputs[2] = new_value_cache->output(0); + + // Add Key and Value compression scales + sdpa_inputs.push_back(new_key_cache->output(2)); + sdpa_inputs.push_back(new_value_cache->output(2)); + + // Add Key and Value compression zero points + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) { + sdpa_inputs.push_back(new_key_cache->output(3)); + sdpa_inputs.push_back(new_value_cache->output(3)); + } + + auto input0_transpose_order = 
sdpa_node->get_input0_transpose_order(); + auto input1_transpose_order = sdpa_node->get_input1_transpose_order(); + auto input2_transpose_order = sdpa_node->get_input2_transpose_order(); + auto output_transpose_order = sdpa_node->get_output_transpose_order(); + + auto new_sdpa = std::make_shared(sdpa_inputs, + new_key_cache->output(1), + sdpa_node->get_causal(), + sdpa_node->get_indirect_axis(), + input0_transpose_order, + input1_transpose_order, + input2_transpose_order, + output_transpose_order, + config, + combine_scales_and_zp, + sdpa_node->get_output_type()); + + new_key_cache->set_friendly_name(key_cache_node->get_friendly_name()); + ov::copy_runtime_info(key_cache_node, new_key_cache); + + new_value_cache->set_friendly_name(value_cache_node->get_friendly_name()); + ov::copy_runtime_info(value_cache_node, new_value_cache); + + new_sdpa->set_friendly_name(sdpa_node->get_friendly_name()); + ov::copy_runtime_info(sdpa_node, new_sdpa); + + ov::replace_node(sdpa_node, new_sdpa); + return true; + }; + + auto m = std::make_shared(sdpa, "KVCacheCompressionMatcher"); + this->register_matcher(m, callback); +} + +bool KVCacheCompression::run_on_model(const std::shared_ptr& m) { + return pass::GraphRewrite::run_on_model(m); +} + +KVCacheCompression::KVCacheCompression(ov::element::Type compression_dt) { + add_matcher(compression_dt); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp new file mode 100644 index 00000000000000..1587021a03ed36 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + + +/// Add dynamic quantization node and fuse it with KV cache operation +/// +/// ┌───────────┐ ┌─────────────┐ ┌───────────┐ ┌─────────────┐ +/// │ New Key │ │ New Value │ │ New Key │ │ New Value │ +/// └──────┬────┘ └──────┬──────┘ └──────┬────┘ └──────┬──────┘ +/// │ │ │ │ +/// │ f16 │ f16 │ f16 │ f16 +/// │ │ ==> │ │ +/// ┌─────────┐ ┌────────┴─────────┐ ┌────────┴───────────┐ ┌─────────┐ ┌────────┴─────────┐ ┌────────┴───────────┐ +/// │ Query │ │ KV cache │ │ KV cache │ │ Query │ │ KV cache + DQ │ │ KV cache + DQ │ +/// | | | (Key) | (Value) | | | | (Key) | | (Value) | +/// └───┬─────┘ └────────┬─────────┘ └────────┬───────────┘ └────┬────┘ └────────┬─────────┘ └────────┬───────────┘ +/// │ │ │ │ │ │ +/// │ f16 │ f16 │ f16 │ f16 i8:data │ f16:scale i8:data │ f16:scale +/// │ │ │ │ │ │ +/// │ │ │ │ │ │ +/// │ ┌────┴───┐ │ │ ┌────┴───┐ │ +/// └─────────────┤ SDPA ├─────────────────┘ └─────────────┤ SDPA ├────────────────────┘ +/// └────────┘ └────────┘ + +class KVCacheCompression : public ov::pass::GraphRewrite { +public: + OPENVINO_RTTI("KVCacheCompression", "0"); + KVCacheCompression(ov::element::Type compression_dt); + + bool run_on_model(const std::shared_ptr& m) override; +}; + + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp new file mode 100644 index 00000000000000..ebb4163922fca8 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2024 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/op/dynamic_quantize.hpp" + +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/validation_util.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +DynamicQuantize::DynamicQuantize(const Output& data, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp) + : ov::op::internal::DynamicQuantize(data, config, combine_scales_and_zp || config.mode == QuantizationConfig::QuantizationMode::Symmetric ? 2 : 3), + m_combine_scales_and_zp(combine_scales_and_zp), + m_scales_zp_output_order(scales_zp_output_order) { + if (m_scales_zp_output_order.empty()) { + m_scales_zp_output_order.resize(data.get_partial_shape().size()); + std::iota(m_scales_zp_output_order.begin(), m_scales_zp_output_order.end(), 0); + } + + OPENVINO_ASSERT(data.get_partial_shape().size() == m_scales_zp_output_order.size()); + validate_and_infer_types(); +} + +void DynamicQuantize::validate_and_infer_types() { + std::vector input_shapes = {get_input_partial_shape(0)}; + + auto out_shapes = shape_infer(this, input_shapes, m_config, m_scales_zp_output_order, m_combine_scales_and_zp); + set_output_type(0, m_config.quantization_dt, out_shapes[0]); + set_output_type(1, m_config.scale_dt, out_shapes[1]); + + if (m_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) + set_output_type(2, m_config.zp_dt, out_shapes[2]); +} + +std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_config, m_scales_zp_output_order, m_combine_scales_and_zp); +} + +std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, + const std::vector& input_shapes, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp) { + std::vector out_shapes = ov::op::internal::DynamicQuantize::shape_infer(op, input_shapes, config); + const auto is_asymmetric = config.is_asymmetric_quantization(); + if (is_asymmetric && combine_scales_and_zp) { + out_shapes.pop_back(); // drop zero_points shape + } + + auto transpose_shape = [](const ov::PartialShape& shape, const std::vector& scales_zp_output_order) { + auto transposed_shape = shape; + for (size_t i = 0; i < scales_zp_output_order.size(); i++) { + OPENVINO_ASSERT(scales_zp_output_order[i] < transposed_shape.size()); + transposed_shape[i] = shape[scales_zp_output_order[i]]; + } + + return transposed_shape; + }; + + // Transpose scales and zero points + for (size_t i = 1; i < out_shapes.size(); i++) { + out_shapes[i] = transpose_shape(out_shapes[i], scales_zp_output_order); + } + + if (is_asymmetric && combine_scales_and_zp) { + // TODO: currently scales and zero points are supposed to be combined over the last dimension only + const auto combine_axis = out_shapes[1].size() - 1; + OPENVINO_ASSERT(config.group_sizes[scales_zp_output_order[combine_axis]] != 1); + + out_shapes[1][combine_axis] *= 2; // (scale, zero_point) pairs + } + + return out_shapes; +} + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp index 681c88119efd95..a900c99eb6a4af 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp @@ -25,30 
+25,65 @@ IndirectSDPA::IndirectSDPA(const OutputVector& data_inputs, validate_and_infer_types(); } +IndirectSDPA::IndirectSDPA(const OutputVector& data_inputs, + const ov::Output& beam_table, + const bool is_causal, + const int64_t indirect_axis, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool combine_scales_and_zp, + const ov::element::Type output_type) + : ov::intel_gpu::op::SDPA(data_inputs, is_causal, order_q, order_k, order_v, order_out, quantization_config, combine_scales_and_zp, output_type) + , m_indirect_axis(indirect_axis) { + auto beam_table_idx = data_inputs.size(); + set_argument(beam_table_idx, beam_table); + validate_and_infer_types(); +} + std::shared_ptr IndirectSDPA::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); // Exclude beam_table input OutputVector data_inputs(new_args.begin(), new_args.end() - 1); - return std::make_shared(data_inputs, - new_args.back(), - m_is_causal, - m_indirect_axis, - m_order_q, - m_order_k, - m_order_v, - m_order_out, - m_output_type); + if (m_compressed) { + return std::make_shared(data_inputs, + new_args.back(), + m_is_causal, + m_indirect_axis, + m_order_q, + m_order_k, + m_order_v, + m_order_out, + m_output_type); + } else { + return std::make_shared(data_inputs, + new_args.back(), + m_is_causal, + m_indirect_axis, + m_order_q, + m_order_k, + m_order_v, + m_order_out, + m_quantization_config, + m_combine_scales_and_zp, + m_output_type); + } } void IndirectSDPA::validate_and_infer_types() { const auto input_size = get_input_size(); + + const auto compression_inputs = get_compression_inputs_num(); NODE_VALIDATION_CHECK(this, - input_size == 4 || input_size == 5 || input_size == 6, + input_size >= 4 + compression_inputs && input_size <= 6 + compression_inputs, "Number of inputs is incorrect. 
Current value is: ", input_size, - ", expected 4, 5 or 6."); + ", expected 4, 5 or 6 data inputs and ", compression_inputs, " KV-cache compression related inputs"); + std::vector input_shapes; for (size_t i = 0; i < input_size - 1; i++) { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp index a598e556a8f05d..7759e4ab65d459 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp @@ -24,6 +24,7 @@ KVCache::KVCache(const Output& past, , m_concat_axis(concat_axis) , m_gather_axis(gather_axis) , m_indirect(true) + , m_compressed(false) , m_output_type(output_type) { m_variable = past_variable; if (m_indirect) @@ -40,28 +41,86 @@ KVCache::KVCache(const Output& past, , m_concat_axis(concat_axis) , m_gather_axis(0) , m_indirect(false) + , m_compressed(false) , m_output_type(output_type) { m_variable = past_variable; validate_and_infer_types(); } +KVCache::KVCache(const OutputVector& inputs, + const std::shared_ptr& past_variable, + int64_t concat_axis, + int64_t gather_axis, + bool combine_scales_and_zp, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const ov::element::Type output_type) + : Op(inputs) + , m_concat_axis(concat_axis) + , m_gather_axis(gather_axis) + , m_indirect(true) + , m_compressed(true) + , m_combine_scales_and_zp(combine_scales_and_zp) + , m_quantization_config(config) + , m_scales_zp_output_order(scales_zp_output_order) + , m_output_type(output_type) { + OPENVINO_ASSERT(m_quantization_config.quantization_dt == ov::element::i8, + "[GPU] Only I8 data type is currently supported for KV-cache compression"); + + m_variable = past_variable; + size_t output_size = 3; + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) + output_size++; // add zp output + + set_output_size(output_size); + validate_and_infer_types(); +} + bool KVCache::visit_attributes(ov::AttributeVisitor& visitor) { visitor.on_attribute("concat_axis", m_concat_axis); visitor.on_attribute("gather_axis", m_gather_axis); visitor.on_attribute("indirect", m_indirect); visitor.on_attribute("output_type", m_output_type); + visitor.on_attribute("compressed", m_compressed); return true; } void KVCache::validate_and_infer_types() { - auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; + auto output_type = m_output_type; + if (m_compressed) { + output_type = m_quantization_config.quantization_dt; + } else if (m_output_type == ov::element::undefined) { + output_type = get_input_element_type(0); + } + std::vector input_shapes = {m_variable->get_info().data_shape, get_input_partial_shape(1)}; - if (get_output_size() == 2) + if (m_indirect) { input_shapes.push_back(get_input_partial_shape(2)); - auto shapes = shape_infer(this, input_shapes); - set_output_type(0, output_type, shapes[0]); + } + + if (m_compressed) { + input_shapes.push_back(get_input_partial_shape(3)); + + if (m_quantization_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) + input_shapes.push_back(get_input_partial_shape(4)); + } + + auto shapes = m_compressed ? 
shape_infer(this, input_shapes, m_quantization_config, m_scales_zp_output_order, m_combine_scales_and_zp) + : shape_infer(this, input_shapes); + + size_t out_ports = 0; + set_output_type(out_ports++, output_type, shapes[0]); + if (m_indirect) { - set_output_type(1, get_input_element_type(2), shapes[1]); + set_output_type(out_ports++, get_input_element_type(2), shapes[1]); + } + + if (m_compressed) { + set_output_type(out_ports++, m_quantization_config.scale_dt, shapes[2]); + + if (m_quantization_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) { + set_output_type(out_ports++, m_quantization_config.scale_dt, shapes[3]); + } } } @@ -74,7 +133,7 @@ std::shared_ptr KVCache::clone_with_new_inputs(const ov::OutputVector& new m_concat_axis, m_output_type); - } else { + } else if (new_args.size() == 3) { return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), @@ -82,16 +141,53 @@ std::shared_ptr KVCache::clone_with_new_inputs(const ov::OutputVector& new m_concat_axis, m_gather_axis, m_output_type); + } else { + return std::make_shared(new_args, + m_variable, + m_concat_axis, + m_gather_axis, + m_combine_scales_and_zp, + m_quantization_config, + m_scales_zp_output_order, + m_output_type); + } +} + +std::vector shape_infer(const KVCache* op, + const std::vector& input_shapes, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + bool combine_scales_and_zp) { + std::vector out_shapes = shape_infer(op, input_shapes); + + if (op->get_output_size() >= 3) { + ov::intel_gpu::op::DynamicQuantize op; + auto quantized_data_shapes = + ov::intel_gpu::op::DynamicQuantize::shape_infer(&op, { input_shapes[1] }, config, scales_zp_output_order, combine_scales_and_zp); + + const auto scales_concat_axis = 2; + ov::PartialShape compression_scale_shape = input_shapes[3]; + compression_scale_shape[scales_concat_axis] += quantized_data_shapes[1][scales_concat_axis]; + out_shapes[2] = compression_scale_shape; + + // add zp output + if (quantized_data_shapes.size() == 3) { + ov::PartialShape compression_zp_shape = input_shapes[4]; + compression_zp_shape[scales_concat_axis] += quantized_data_shapes[2][scales_concat_axis]; + out_shapes[3] = compression_zp_shape; + } } + + return out_shapes; } -std::vector shape_infer(const KVCache* op, std::vector input_shapes) { +std::vector shape_infer(const KVCache* op, const std::vector& input_shapes) { std::vector out_shapes; out_shapes.resize(op->get_output_size()); const auto& gather_axis = op->get_gather_axis(); const auto& concat_axis = ov::util::normalize(op->get_concat_axis(), input_shapes[0].size()); - if (op->get_output_size() == 2) { + if (op->get_output_size() >= 2) { out_shapes[0] = input_shapes[0]; out_shapes[0][gather_axis] = input_shapes[2][0]; out_shapes[0][concat_axis] += input_shapes[1][concat_axis]; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp index 5438a6e2e695b5..6cd7f778c71b3b 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp @@ -3,6 +3,7 @@ // #include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" #include "openvino/core/partial_shape.hpp" namespace ov { @@ -28,16 +29,14 @@ bool ReadValue::visit_attributes(ov::AttributeVisitor& visitor) { return true; } -void ReadValue::validate_and_infer_types() { - OPENVINO_ASSERT(m_variable, "Variable is not initialized."); 
- const auto& variable_info = m_variable->get_info(); +void ReadValue::validate_and_infer_types(size_t output_idx, const ov::op::util::VariableInfo& variable_info) { const auto& variable_type = variable_info.data_type; const auto& variable_shape = variable_info.data_shape; // If no inputs provided, it means this ReadValue doesn't have initial subgraph. This is valid. - if (get_input_size() > 0) { - const auto& initial_type = get_input_element_type(0); - const auto& initial_shape = get_input_partial_shape(0); + if (get_input_size() > output_idx) { + const auto& initial_type = get_input_element_type(output_idx); + const auto& initial_shape = get_input_partial_shape(output_idx); // Variable's shape/type determine a permissible range of values for shape/type inferred from initial_subgraph. // If initial_subgraph is set, then we need to check that shape/type inferred from initial_subgraph @@ -64,19 +63,25 @@ void ReadValue::validate_and_infer_types() { // dynamic rank/type can be derived from the IRs generated via the prev versions of OV, // but dynamic rank/type are not supported in plugins, // so we are trying to fix them here using the rank/type of ReadValue 1st input, if it exists - if (get_input_size() > 0 && variable_info.data_shape.rank().is_dynamic() && - variable_info.data_type.is_dynamic()) { - set_output_type(0, initial_type, initial_shape); + if (variable_info.data_shape.rank().is_dynamic() && variable_info.data_type.is_dynamic()) { + set_output_type(output_idx, initial_type, initial_shape); return; } } - set_output_type(0, variable_type, variable_shape); + set_output_type(output_idx, variable_type, variable_shape); +} + +void ReadValue::validate_and_infer_types() { + OPENVINO_ASSERT(m_variable, "Variable is not initialized."); + const auto& variable_info = m_variable->get_info(); + + validate_and_infer_types(0, variable_info); } std::shared_ptr ReadValue::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - switch (new_args.size()) { + switch (new_args.size()) { case 0: return std::make_shared(m_variable); case 1: @@ -89,6 +94,62 @@ std::shared_ptr ReadValue::clone_with_new_inputs(const ov::OutputVector& n } } +ReadValues::ReadValues(const std::shared_ptr& variable, + const std::vector& internal_states_infos) + : ReadValue(variable) + , m_internal_states_infos(internal_states_infos) { + OPENVINO_ASSERT(!internal_states_infos.empty()); + set_output_size(internal_states_infos.size()); + validate_and_infer_types(); +} + +ReadValues::ReadValues(const std::vector>& variable_initializers, + const std::shared_ptr& variable, + const std::vector& internal_states_infos) + : ReadValue(variable_initializers, variable) + , m_internal_states_infos(internal_states_infos) { + OPENVINO_ASSERT(!internal_states_infos.empty()); + set_output_size(internal_states_infos.size()); + validate_and_infer_types(); +} + +bool ReadValues::visit_attributes(ov::AttributeVisitor& visitor) { + visitor.on_attribute("variable_id", m_variable); + + auto variable_info = m_variable->get_info(); + visitor.on_attribute("variable_type", variable_info.data_type); + visitor.on_attribute("variable_shape", variable_info.data_shape); + m_variable->update(variable_info); + return true; +} + +void ReadValues::validate_and_infer_types() { + OPENVINO_ASSERT(m_variable, "Variable is not initialized."); + + for (size_t i = 0; i < get_output_size(); i++) { + ReadValue::validate_and_infer_types(i, m_internal_states_infos[i]); + } +} + +std::shared_ptr 
ReadValues::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + OPENVINO_ASSERT(new_args.empty() || new_args.size() == m_internal_states_infos.size(), + "Unable to clone ReadValues op (name=", this->get_friendly_name(), "). ", + "Incorrect number of inputs. Expected: 0 or ", m_internal_states_infos.size(), ". ", + "Actual: ", new_args.size(), "."); + + if (new_args.size() > 0) { + return std::make_shared(new_args, m_variable, m_internal_states_infos); + } else { + return std::make_shared(m_variable, m_internal_states_infos); + } +} + +std::vector ReadValues::get_all_internal_states_info() const { + return m_internal_states_infos; +} + } // namespace op } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp index 57d2899e2b2e77..65930d5feb6d0a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp @@ -26,7 +26,31 @@ SDPA::SDPA(const OutputVector& inputs, , m_order_k(order_k) , m_order_v(order_v) , m_order_out(order_out) - , m_output_type(output_type) { + , m_output_type(output_type) + , m_compressed(false) { + set_arguments(inputs); + set_causal(is_causal); + validate_and_infer_types(); +} + +SDPA::SDPA(const OutputVector& inputs, + const bool is_causal, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool combine_scales_and_zp, + const ov::element::Type output_type) + : m_is_causal(is_causal) + , m_order_q(order_q) + , m_order_k(order_k) + , m_order_v(order_v) + , m_order_out(order_out) + , m_output_type(output_type) + , m_compressed(true) + , m_combine_scales_and_zp(combine_scales_and_zp) + , m_quantization_config(quantization_config) { set_arguments(inputs); set_causal(is_causal); validate_and_infer_types(); @@ -46,11 +70,13 @@ std::shared_ptr SDPA::clone_with_new_inputs(const ov::OutputVector& ne void SDPA::validate_and_infer_types() { const auto input_size = get_input_size(); + + const auto compression_inputs = get_compression_inputs_num(); NODE_VALIDATION_CHECK(this, - input_size == 3 || input_size == 4 || input_size == 5, + input_size >= 3 + compression_inputs && input_size <= 5 + compression_inputs, "Number of inputs is incorrect. 
Current value is: ", input_size, - ", expected 3, 4 or 5."); + ", expected 3, 4 or 5 data inputs and ", compression_inputs, " KV-cache compression related inputs"); std::vector input_shapes; for (size_t i = 0; i < input_size; i++) { @@ -77,6 +103,18 @@ bool SDPA::visit_attributes(ov::AttributeVisitor &visitor) { return true; } +size_t SDPA::get_compression_inputs_num() const { + size_t compression_inputs = 0; + if (m_compressed) { + compression_inputs += 2; // 2 * scales + + if (m_quantization_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) + compression_inputs += 2; // 2 * zp + } + + return compression_inputs; +} + std::vector shape_infer(const SDPA* op, std::vector input_shapes, const std::vector& order_q, diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index f97b7fae126b47..305e21a5000149 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -68,6 +68,7 @@ #include "plugin/transformations/swiglu_fusion.hpp" #include "plugin/transformations/transpose_fusion.hpp" #include "plugin/transformations/indirect_kv_cache.hpp" +#include "plugin/transformations/kv_cache_compression.hpp" #include "plugin/transformations/convert_convolution.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp" @@ -853,6 +854,10 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); + + auto kv_cache_compression_dt = config.get_property(ov::hint::kv_cache_precision); + manager.register_pass(kv_cache_compression_dt); + manager.register_pass(); // This pass should be done after asymmetric quantization matching as it can move zp subtraction upper in the graph diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index 5f943564d6f50e..5c3b3ee0c970f9 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -183,6 +183,7 @@ static void print_help_messages() { message_list.emplace_back("OV_GPU_DisableRuntimeSkipReorder", "Disable runtime skip reorder."); message_list.emplace_back("OV_GPU_DisablePrimitiveFusing", "Disable primitive fusing"); message_list.emplace_back("OV_GPU_DisableFakeAlignment", "Disable fake alignment"); + message_list.emplace_back("OV_GPU_KVCacheCompression", "Enable/Disable KV-cache compression"); message_list.emplace_back("OV_GPU_DynamicQuantizeLayersWithoutOnednn", "Enable Dynamic quantization for specified Fully connected layers only, " "separated by space. Support case-insensitive and regular expression. 
For example .*fully_connected.*"); message_list.emplace_back("OV_GPU_DynamicQuantizeGroupSize", "Specify a group size of dynamic quantization to enable " @@ -253,6 +254,7 @@ debug_configuration::debug_configuration() , disable_runtime_skip_reorder(0) , disable_primitive_fusing(0) , disable_fake_alignment(0) + , use_kv_cache_compression(-1) , dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET) , disable_horizontal_fc_fusion(0) { #ifdef GPU_DEBUG_CONFIG @@ -305,6 +307,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder); get_gpu_debug_env_var("DisablePrimitiveFusing", disable_primitive_fusing); get_gpu_debug_env_var("DisableFakeAlignment", disable_fake_alignment); + get_gpu_debug_env_var("KVCacheCompression", use_kv_cache_compression); get_gpu_debug_env_var("DynamicQuantizeGroupSize", dynamic_quantize_group_size); get_gpu_debug_env_var("DisableHorizontalFCFusion", disable_horizontal_fc_fusion); std::string dump_iteration_str; diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 7661444cc4fd7b..09a979c495f207 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -58,6 +58,7 @@ void ExecutionConfig::set_default() { std::make_tuple(ov::cache_mode, ov::CacheMode::OPTIMIZE_SPEED), std::make_tuple(ov::cache_encryption_callbacks, EncryptionCallbacks{}), std::make_tuple(ov::hint::dynamic_quantization_group_size, 32), + std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined), std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false), std::make_tuple(ov::weights_path, ""), @@ -209,6 +210,14 @@ void ExecutionConfig::apply_debug_options(const cldnn::device_info& info) { else set_property(ov::hint::dynamic_quantization_group_size(debug_config->dynamic_quantize_group_size)); } + + GPU_DEBUG_IF(debug_config->use_kv_cache_compression != -1) { + GPU_DEBUG_IF(debug_config->use_kv_cache_compression == 1) { + set_property(ov::hint::kv_cache_precision(ov::element::i8)); + } else { + set_property(ov::hint::kv_cache_precision(ov::element::undefined)); + } + } } void ExecutionConfig::apply_hints(const cldnn::device_info& info) { diff --git a/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp b/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp index 4b24fb996b3f3f..105963d1b09d73 100644 --- a/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp +++ b/src/plugins/intel_gpu/tests/unit/dynamic_execution/stateful_model.cpp @@ -197,7 +197,7 @@ TEST(stateful_model, check_dynamic_pad_for_kv_cache) { auto input_kv_lay = layout{info.data_shape, info.data_type, format::bfyx}; topology topology(input_layout("beam_idx", input_beam_idx_lay), input_layout("present", input_present_lay), - read_value("kv_cache", std::vector{}, info.variable_id, input_kv_lay), + read_value("kv_cache", std::vector{}, info.variable_id, {input_kv_lay}), gather("gather", input_info("kv_cache"), input_info("beam_idx"), @@ -224,7 +224,7 @@ TEST(stateful_model, check_dynamic_pad_for_kv_cache) { auto pad = tensor(0); pad.batch[0] = 1; - + { std::vector dynamic_pad_mask; const auto& dynamic_pad_dims = read_value_inst->get_output_layout(0).data_padding._dynamic_dims_mask; diff --git a/src/plugins/intel_gpu/tests/unit/shape_infer/read_value_si_test.cpp b/src/plugins/intel_gpu/tests/unit/shape_infer/read_value_si_test.cpp index 
194bc0244f86f0..2000d826ddfad6 100644 --- a/src/plugins/intel_gpu/tests/unit/shape_infer/read_value_si_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/shape_infer/read_value_si_test.cpp @@ -32,7 +32,7 @@ TEST_P(read_value_test, shape_infer) { auto& engine = get_test_engine(); - const auto variable_layout = p.input_layout; + const std::vector variable_layout = {p.input_layout}; auto input_layout_prim = std::make_shared("input", p.input_layout); auto inputs = std::vector{ input_info("input") }; diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp index c1686e359e91a0..1aaa5df26998fa 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp @@ -38,6 +38,8 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { auto input_ps = is_4d ? ov::PartialShape{ batch_num, 1, 1, ifm_num } : ov::PartialShape{ batch_num, ifm_num}; auto dyn_input_ps = is_4d ? ov::PartialShape{ -1, 1, 1, ifm_num } : ov::PartialShape{ -1, ifm_num}; auto input_mem = engine.allocate_memory({ input_ps, data_types::f32, format::bfyx }); + auto group_sizes = std::vector(input_ps.size(), 1); + group_sizes.back() = 32; auto input_data = rg.generate_random_1d(batch_num * ifm_num, -16.0f, 16.0f); set_values(input_mem, input_data); @@ -48,8 +50,14 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { auto in_layout = is_dynamic ? layout{ dyn_input_ps, data_types::f16, format::bfyx } : layout{ input_ps, data_types::f16, format::bfyx }; + dynamic_quantize::QuantizationConfig dq_config; + dq_config.mode = dynamic_quantize::QuantizationConfig::QuantizationMode::Symmetric; + dq_config.quantization_dt = data_types::i8; + dq_config.scale_dt = data_types::f16; + dq_config.group_sizes = group_sizes; + auto reorder_1 = reorder("reorder_1", input_info("input"), layout{ input_ps, data_types::f16, format::bfyx }); - auto dyn_quan_prim = dynamic_quantize("dyn_quan_prim", input_info("reorder_1"), 32, {data_types::f16, data_types::i8}); + auto dyn_quan_prim = dynamic_quantize("dyn_quan_prim", input_info("reorder_1"), dq_config); auto reorder_2 = reorder("reorder_2", input_info("dyn_quan_prim"), layout{ input_ps, data_types::f16, format::bfyx }); // Implemented dynamic quantize kernel diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/variable.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/variable.cpp index 3bf1a771512ae8..59e31547602252 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/variable.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/variable.cpp @@ -35,7 +35,7 @@ struct variable_test : public ::testing::TestWithParam> { topology topology; topology.add(input_layout("input", input_data->get_layout())); - topology.add(read_value{"read_value", { input_info("input") }, "v0", variable_layout}); + topology.add(read_value{"read_value", { input_info("input") }, "v0", { variable_layout }}); topology.add(eltwise{"sum", { input_info("input"), input_info("read_value") }, eltwise_mode::sum, {}, variable_layout.data_type}); topology.add(assign{"assign", { input_info("sum") }, "v0", variable_layout}); @@ -129,7 +129,7 @@ void test_exception_on_wrong_layout(bool is_caching_test) { topology topology; topology.add(input_layout("input", input_data->get_layout())); - topology.add(read_value{"read_value", { input_info("input") }, "v0", variable_layout}); + topology.add(read_value{"read_value", { 
input_info("input") }, "v0", { variable_layout }}); topology.add(input_layout("wrong_input", wrong_input_data->get_layout())); topology.add(assign{"assign", { input_info("wrong_input") }, "v0", wrong_layout}); @@ -218,14 +218,14 @@ void test_variables_are_preserved_across_inferences(bool is_caching_test) { topology.add(assign{"assign_2", { input_info("input_2") }, "v2", variable_layout}); topology.add(data("dummy1", dummy1)); - topology.add(read_value{"read_value_1", { input_info("dummy1") }, "v1", variable_layout}); - topology.add(read_value{"read_value_2", { input_info("dummy1") }, "v2", variable_layout}); + topology.add(read_value{"read_value_1", { input_info("dummy1") }, "v1", { variable_layout }}); + topology.add(read_value{"read_value_2", { input_info("dummy1") }, "v2", { variable_layout }}); topology.add(eltwise{"sum", { input_info("read_value_1"), input_info("read_value_2") }, eltwise_mode::sum, {}, variable_layout.data_type}); topology.add(assign{"assign_result", { input_info("sum") }, "v_result", variable_layout}); topology.add(data("dummy2", dummy2)); - topology.add(read_value{"read_result", { input_info("dummy2") }, "v_result", variable_layout}); + topology.add(read_value{"read_result", { input_info("dummy2") }, "v_result", { variable_layout }}); cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test);