From 7a0aa3e6d2529707c43df336f10b5c35631d4b55 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 17 Oct 2024 18:48:59 +0400 Subject: [PATCH] WIP: [GPU] Some fixes --- .../include/ov_ops/dynamic_quantize.hpp | 54 +- .../src/ov_ops/dynamic_quantize.cpp | 36 +- .../intel_gpu/graph/kernel_impl_params.hpp | 2 +- .../include/intel_gpu/op/dynamic_quantize.hpp | 57 ++ .../include/intel_gpu/op/indirect_sdpa.hpp | 12 + .../include/intel_gpu/op/kv_cache.hpp | 35 +- .../include/intel_gpu/op/read_value.hpp | 7 + .../include/intel_gpu/op/read_values.hpp | 41 ++ .../intel_gpu/include/intel_gpu/op/sdpa.hpp | 22 + .../plugin/multi_tensor_variable_state.hpp | 27 +- .../intel_gpu/plugin/primitives_list.hpp | 2 + .../intel_gpu/primitives/dynamic_quantize.hpp | 40 +- .../include/intel_gpu/primitives/kv_cache.hpp | 24 +- .../intel_gpu/primitives/read_value.hpp | 27 +- .../scaled_dot_product_attention.hpp | 31 +- .../intel_gpu/runtime/debug_configuration.hpp | 1 + .../intel_gpu/src/graph/dynamic_quantize.cpp | 37 +- .../graph_optimizer/build_implementations.cpp | 3 + .../graph_optimizer/prepare_buffer_fusing.cpp | 59 ++- .../src/graph/impls/cpu/read_value.cpp | 26 +- .../src/graph/impls/ocl/dynamic_quantize.cpp | 10 + .../impls/ocl/kernel_selector_helper.cpp | 6 +- .../graph/impls/ocl/kernel_selector_helper.h | 2 +- .../src/graph/impls/ocl/kv_cache.cpp | 238 ++++++++- .../ocl/scaled_dot_product_attention.cpp | 82 ++- .../src/graph/include/dynamic_quantize_inst.h | 5 +- .../src/graph/include/kv_cache_inst.h | 3 + .../src/graph/include/program_node.h | 2 +- .../src/graph/include/read_value_inst.h | 11 +- src/plugins/intel_gpu/src/graph/kv_cache.cpp | 44 +- .../intel_gpu/src/graph/primitive_inst.cpp | 249 +++++++-- .../intel_gpu/src/graph/program_node.cpp | 8 +- .../intel_gpu/src/graph/read_value.cpp | 39 +- .../graph/scaled_dot_product_attention.cpp | 8 + .../cl_kernels/dynamic_quantize_gpu_opt.cl | 2 +- .../dynamic_quantize_gpu_opt_generic.cl | 130 +++++ .../cl_kernels/dynamic_quantize_gpu_ref.cl | 101 +++- .../kernel_selector/cl_kernels/sdpa_opt.cl | 485 ++++++++++++++++-- .../kernel_selector/cl_kernels/sdpa_ref.cl | 51 ++ .../dynamic_quantize_kernel_opt_generic.cpp | 327 ++++++++++++ .../dynamic_quantize_kernel_opt_generic.h | 30 ++ .../dynamic_quantize_kernel_ref.cpp | 57 +- .../dynamic_quantize_kernel_ref.h | 7 + .../dynamic_quantize_kernel_selector.cpp | 12 +- .../kernels/sdpa/sdpa_kernel_base.cpp | 23 + .../kernels/sdpa/sdpa_kernel_base.h | 7 + .../kernels/sdpa/sdpa_kernel_micro.cpp | 3 + .../kernels/sdpa/sdpa_kernel_opt.cpp | 40 +- .../kernels/sdpa/sdpa_kernel_ref.cpp | 26 +- .../kernels/sdpa/sdpa_kernel_selector.cpp | 20 +- .../plugin/multi_tensor_variable_state.cpp | 61 +++ .../src/plugin/ops/dynamic_quantize.cpp | 36 +- .../intel_gpu/src/plugin/ops/kv_cache.cpp | 19 +- .../ops/scaled_dot_product_attention.cpp | 9 +- .../intel_gpu/src/plugin/ops/variable.cpp | 47 +- .../src/plugin/sync_infer_request.cpp | 35 +- .../dynamic_quantize_fully_connected.cpp | 9 +- .../transformations/kv_cache_compression.cpp | 349 +++++++++++++ .../transformations/kv_cache_compression.hpp | 43 ++ .../transformations/op/dynamic_quantize.cpp | 86 ++++ .../transformations/op/indirect_sdpa.cpp | 57 +- .../plugin/transformations/op/kv_cache.cpp | 114 +++- .../plugin/transformations/op/read_value.cpp | 83 ++- .../src/plugin/transformations/op/sdpa.cpp | 44 +- .../src/plugin/transformations_pipeline.cpp | 3 + .../src/runtime/debug_configuration.cpp | 6 +- 66 files changed, 3330 insertions(+), 242 deletions(-) create mode 
100644 src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp create mode 100644 src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp index 69c148305fb94f..643a726ffbcee6 100644 --- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp +++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp @@ -11,32 +11,66 @@ namespace ov { namespace op { namespace internal { +struct QuantizationConfig { + enum class QuantizationMode { + Symmetric, + Asymmetric + }; + + QuantizationMode mode = QuantizationMode::Symmetric; + element::Type quantization_dt = element::undefined; + element::Type scale_dt = element::undefined; + element::Type zp_dt = element::undefined; + std::vector group_sizes = {}; + + bool operator==(const QuantizationConfig& rhs) const { + return mode == rhs.mode && + quantization_dt == rhs.quantization_dt && + scale_dt == rhs.scale_dt && + zp_dt == rhs.zp_dt && + group_sizes == rhs.group_sizes; + } + + bool is_asymmetric_quantization() const { + return mode == QuantizationMode::Asymmetric; + } +}; + /// \brief Operator performing Dynamic Quantize class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op { public: - OPENVINO_OP("DynamicQuantize", "gpu_opset"); - + OPENVINO_OP("DynamicQuantize", "ie_internal_opset"); DynamicQuantize() = default; /// \brief Constructs an DynamicQuantize operation. 
/// /// \param data Input tensor with data - /// \param group_sizes Group sizes for dynamic quantization - /// \param dt_scale Data type for scale output - DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale); + /// \param config Dynamic quantization configuration + DynamicQuantize(const Output& data, const QuantizationConfig& config); void validate_and_infer_types() override; std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + const std::vector& get_group_sizes() const { - return m_group_sizes; + return m_config.group_sizes; + }; + + QuantizationConfig::QuantizationMode get_quantization_mode() const { + return m_config.mode; }; + + QuantizationConfig get_quantization_config() const { + return m_config; + }; + static std::vector shape_infer(const DynamicQuantize* op, const std::vector& input_shapes, - const std::vector& group_sizes); + const QuantizationConfig& config); + +protected: + DynamicQuantize(const Output& data, const QuantizationConfig& config, size_t outputs_number); -private: - std::vector m_group_sizes; - element::Type m_dt_scale; + QuantizationConfig m_config; }; } // namespace internal diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp index 74c0498e9a4425..3bf75a0dbb7895 100644 --- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp +++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp @@ -13,35 +13,42 @@ namespace ov { namespace op { namespace internal { -DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale) +DynamicQuantize::DynamicQuantize(const Output& data, const QuantizationConfig& config, size_t outputs_number) : Op({data}), - m_group_sizes(std::move(group_sizes)), - m_dt_scale(dt_scale) { - OPENVINO_ASSERT(data.get_partial_shape().rank() == m_group_sizes.size(), + m_config(config) { + OPENVINO_ASSERT(data.get_partial_shape().rank() == m_config.group_sizes.size(), "FC input rank should be same as the rank of group_size ", data.get_tensor_ptr()->get_partial_shape().rank(), " / ", - m_group_sizes.size()); - set_output_size(2); + m_config.group_sizes.size()); + set_output_size(outputs_number); +} + +DynamicQuantize::DynamicQuantize(const Output& data, const QuantizationConfig& config) + : DynamicQuantize(data, config, config.mode == QuantizationConfig::QuantizationMode::Symmetric ? 
2 : 3) { validate_and_infer_types(); } void DynamicQuantize::validate_and_infer_types() { std::vector input_shapes = {get_input_partial_shape(0)}; - auto out_shapes = shape_infer(this, input_shapes, m_group_sizes); - set_output_type(0, element::i8, out_shapes[0]); - set_output_type(1, m_dt_scale, out_shapes[1]); + auto out_shapes = shape_infer(this, input_shapes, m_config); + set_output_type(0, m_config.quantization_dt, out_shapes[0]); + set_output_type(1, m_config.scale_dt, out_shapes[1]); + + if (m_config.is_asymmetric_quantization()) + set_output_type(2, m_config.zp_dt, out_shapes[2]); } std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - return std::make_shared(new_args.at(0), m_group_sizes, m_dt_scale); + return std::make_shared(new_args.at(0), m_config); } std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, const std::vector& input_shapes, - const std::vector& group_sizes) { + const QuantizationConfig& config) { + const auto& group_sizes = config.group_sizes; std::vector out_shapes; out_shapes.push_back(input_shapes[0]); @@ -52,7 +59,7 @@ std::vector DynamicQuantize::shape_infer(const DynamicQuantize " / ", group_sizes.size()); for (size_t i = 0; i < scale_shape.size(); i++) { - if (scale_shape[i].is_dynamic()) + if (scale_shape[i].is_dynamic() || scale_shape[i] == 0) continue; if (group_sizes[i] == UINT64_MAX) @@ -63,6 +70,11 @@ std::vector DynamicQuantize::shape_infer(const DynamicQuantize } } out_shapes.push_back(scale_shape); + + // Add zero points shape + if (config.is_asymmetric_quantization()) + out_shapes.push_back(scale_shape); + return out_shapes; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp index 3e8887fbb2f7ee..72623f6d120955 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp @@ -53,7 +53,7 @@ struct kernel_impl_params final { optional_layout weights_zero_points_layout = optional_layout(); optional_layout activations_zero_points_layout = optional_layout(); optional_layout compensation_layout = optional_layout(); - optional_layout state_layout = optional_layout(); + std::vector state_layouts; std::map memory_deps = {}; size_t primary_input_idx = 0; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp new file mode 100644 index 00000000000000..007231daf4de15 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/dynamic_quantize.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/op/op.hpp" +#include "ov_ops/dynamic_quantize.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +class DynamicQuantize : public ov::op::internal::DynamicQuantize { +public: + OPENVINO_OP("DynamicQuantize", "gpu_opset"); + + using QuantizationConfig = ov::op::internal::QuantizationConfig; + + DynamicQuantize() = default; + /// \brief Constructs an DynamicQuantize operation. 
+ /// + /// \param data Input tensor with data + /// \param config Dynamic quantization configuration + /// \param scales_zp_output_order Non-default order of scales + /// \param combine_scales_and_zp Save scales and zero points into a single buffer as (scale, zp) pairs + DynamicQuantize(const Output& data, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order = {}, + const bool combine_scales_and_zp = false); + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + const std::vector& get_scales_zp_output_order() const { + return m_scales_zp_output_order; + }; + + bool get_combine_scales_and_zp() const { + return m_combine_scales_and_zp; + }; + + static std::vector shape_infer(const DynamicQuantize* op, + const std::vector& input_shapes, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false); + +private: + bool m_combine_scales_and_zp = false; + std::vector m_scales_zp_output_order; +}; + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp index b4d34a3975af6b..4ce90a685690e5 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp @@ -29,6 +29,18 @@ class IndirectSDPA : public ov::intel_gpu::op::SDPA { const std::vector& order_out, const ov::element::Type output_type = ov::element::undefined); + IndirectSDPA(const OutputVector& data_inputs, + const ov::Output& beam_table, + const bool is_causal, + const int64_t indirect_axis, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool combine_scales_and_zp, + const ov::element::Type output_type = ov::element::undefined); + bool visit_attributes(ov::AttributeVisitor &visitor) override; void validate_and_infer_types() override; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp index 402ff6e46c1607..e6dc5cd7c5ae94 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp @@ -7,6 +7,7 @@ #include "openvino/op/op.hpp" #include "openvino/op/util/variable.hpp" #include "openvino/op/util/variable_extension.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" namespace ov { namespace intel_gpu { @@ -18,20 +19,31 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { public: OPENVINO_OP("KVCache", "gpu_opset"); + using QuantizationConfig = ov::op::internal::QuantizationConfig; + KVCache() = default; KVCache(const Output& past, const Output& new_token_data, - const Output& beam_idx, const std::shared_ptr& past_values, int64_t concat_axis, - int64_t gather_axis, const ov::element::Type output_type = ov::element::undefined); KVCache(const Output& past, const Output& new_token_data, + const Output& beam_idx, + const std::shared_ptr& past_values, + int64_t concat_axis, + int64_t gather_axis, + const ov::element::Type output_type = ov::element::undefined); + + KVCache(const OutputVector& inputs, const std::shared_ptr& past_values, int64_t concat_axis, + int64_t gather_axis, + bool combine_scales_and_zp, + const QuantizationConfig& config, + const std::vector&
scales_zp_output_order, const ov::element::Type output_type = ov::element::undefined); bool visit_attributes(ov::AttributeVisitor& visitor) override; @@ -53,14 +65,31 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { bool get_indirect() const { return m_indirect; } + bool get_kv_compressed() const { return m_compressed; } + bool get_combine_scales_and_zp() const { return m_combine_scales_and_zp; } + QuantizationConfig get_quantization_config() const { return m_quantization_config; } + std::vector get_scales_zp_output_order() const { return m_scales_zp_output_order; } + private: int64_t m_concat_axis = 0; int64_t m_gather_axis = 0; bool m_indirect = false; + + bool m_compressed = false; + bool m_combine_scales_and_zp = false; + QuantizationConfig m_quantization_config = {}; + std::vector m_scales_zp_output_order = {}; + ov::element::Type m_output_type; }; -std::vector shape_infer(const KVCache* op, std::vector input_shapes); +std::vector shape_infer(const KVCache* op, const std::vector& input_shapes); + +std::vector shape_infer(const KVCache* op, + const std::vector& input_shapes, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_output_order = {}, + bool combine_scales_and_zp = false); } // namespace op } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp index a9c47f3fa39fa6..419c18118229ff 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp @@ -26,6 +26,7 @@ class ReadValue : public ov::op::Op, public ov::op::util::VariableExtension { bool visit_attributes(ov::AttributeVisitor& visitor) override; void validate_and_infer_types() override; + void validate_and_infer_types(size_t output_idx, const ov::op::util::VariableInfo& variable_info); std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; @@ -33,6 +34,12 @@ class ReadValue : public ov::op::Op, public ov::op::util::VariableExtension { OPENVINO_ASSERT(m_variable, "Variable is not initialized. Variable_id is unavailable"); return m_variable->get_info().variable_id; } + +protected: + ReadValue(const std::vector>& variable_initializers, const std::shared_ptr& variable) + : Op(variable_initializers) { + m_variable = variable; + } }; } // namespace op diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp new file mode 100644 index 00000000000000..90e1f885254346 --- /dev/null +++ b/src/plugins/intel_gpu/include/intel_gpu/op/read_values.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/op/read_value.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +/// \brief This operation handles the OpenVINO GPU Plugin's custom variable representation (which can store multiple states in a single variable) at the graph level. 
+class ReadValues : public ReadValue { +public: + OPENVINO_OP("ReadValues", "gpu_opset"); + + ReadValues() = default; + + ReadValues(const std::shared_ptr& variable, + const std::vector& internal_states_infos); + + ReadValues(const OutputVector& variable_initializers, + const std::shared_ptr& variable, + const std::vector& internal_states_infos); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::vector get_all_internal_states_info() const; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + +private: + std::vector m_internal_states_infos; +}; + +} // namespace op +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp index 9f5d4dad16efd7..741644208cfe76 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp @@ -8,6 +8,7 @@ #include "openvino/core/partial_shape.hpp" #include "openvino/op/op.hpp" #include "openvino/op/scaled_dot_product_attention.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" namespace ov { namespace intel_gpu { @@ -17,6 +18,8 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { public: OPENVINO_OP("SDPA", "gpu_opset"); + using QuantizationConfig = ov::op::internal::QuantizationConfig; + SDPA() = default; SDPA(const OutputVector& inputs, @@ -27,6 +30,16 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { const std::vector& order_out, const ov::element::Type output_type = ov::element::undefined); + SDPA(const OutputVector& inputs, + const bool is_causal, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool m_combine_scales_and_zp, + const ov::element::Type output_type = ov::element::undefined); + bool visit_attributes(ov::AttributeVisitor &visitor) override; void validate_and_infer_types() override; @@ -41,6 +54,11 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { std::vector get_output_transpose_order() const { return m_order_out; } ov::element::Type get_output_type() const { return m_output_type; } + bool get_kv_compressed() const { return m_compressed; } + bool get_combine_scales_and_zp() const { return m_combine_scales_and_zp; } + QuantizationConfig get_quantization_config() const { return m_quantization_config; } + size_t get_compression_inputs_num() const; + static std::vector default_order(size_t rank) { std::vector order(rank); std::iota(order.begin(), order.end(), 0); @@ -54,6 +72,10 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { std::vector m_order_v; std::vector m_order_out; ov::element::Type m_output_type; + + bool m_compressed = false; + bool m_combine_scales_and_zp = false; + QuantizationConfig m_quantization_config = {}; }; std::vector shape_infer(const SDPA* op, diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp index 0cad36f62e47b9..d1777d6478eef1 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp @@ -41,10 +41,35 @@ class VariableStateIndirectKVCache : public MultiTensorState { VariableState::Ptr get_beam_table_state() const; ov::PartialShape 
get_beam_table_shape(const ov::PartialShape& kv_cache_shape); -private: +protected: size_t m_beam_axis = 0; size_t m_concat_axis = 0; }; +// This is multi-tensor state for Indirect KV-Cache + Gemm pattern +// Internally it stores KV Cache state + Beam Table state + compression scales state + (optional compression zero points) +class VariableStateIndirectKVCacheCompressed : public VariableStateIndirectKVCache { +public: + VariableStateIndirectKVCacheCompressed(const VariableStateInfo& info, + std::shared_ptr context, + std::shared_ptr shape_predictor, + size_t beam_idx, + size_t concat_idx, + bool has_zp_state); + using Ptr = std::shared_ptr; + + void set_state(const ov::SoPtr& state) override; + ov::SoPtr get_state() const override; + + VariableState::Ptr get_compression_scale_state() const; + void set_compression_scale_layout(const cldnn::layout& new_layout); + + VariableState::Ptr get_compression_zp_state() const; + void set_compression_zp_layout(const cldnn::layout& new_layout); + bool has_zp_state() const; + +private: + bool m_has_zp_state = false; +}; } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 1dbd769444b1aa..947a010efe0377 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -284,6 +284,7 @@ REGISTER_FACTORY(internal, RMS); REGISTER_FACTORY(internal, GatherCompressed); REGISTER_FACTORY(internal, KVCache); REGISTER_FACTORY(internal, ReadValue); +REGISTER_FACTORY(internal, ReadValues); REGISTER_FACTORY(internal, Gemm); REGISTER_FACTORY(internal, SwiGLU); REGISTER_FACTORY(internal, IndirectGemm); @@ -293,4 +294,5 @@ REGISTER_FACTORY(internal, SDPA); REGISTER_FACTORY(internal, IndirectSDPA); REGISTER_FACTORY(internal, RoPE); REGISTER_FACTORY(internal, DynamicQuantize); +REGISTER_FACTORY(internal, DynamicQuantizeExtended); REGISTER_FACTORY(internal, PagedAttentionExtension); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index d93e2f86eed144..e0a920c5c4cf7d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -5,6 +5,8 @@ #pragma once #include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + namespace cldnn { /// @brief Dynamic Quantize primitive @@ -12,26 +14,41 @@ namespace cldnn { struct dynamic_quantize : public primitive_base { CLDNN_DECLARE_PRIMITIVE(dynamic_quantize); - dynamic_quantize() : primitive_base("", {}), group_size(0) {} + using QuantizationConfig = ov::op::internal::QuantizationConfig; + + dynamic_quantize() : primitive_base("", {}) {} /// @brief Constructs dynamic_quantize primitive /// @param id This primitive id /// @param input Input primitive id - /// @param group_size Quantization group size + /// @param group_sizes Quantization group size /// @param data_type Output data type of quantized /// @param output_size Output data size of the primitive dynamic_quantize(const primitive_id& id, const input_info& input, - const uint64_t group_size, - const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) - : primitive_base(id, {input}, 2, data_types), - group_size(group_size) {} + const QuantizationConfig& config, + const bool 
combine_scales_and_zp = false, + const std::vector& scales_zp_output_order = {}) + : primitive_base(id, {input}) + , combine_scales_and_zp(combine_scales_and_zp) + , quantization_config(config) + , scales_zp_output_order(scales_zp_output_order) {} - uint64_t group_size = 0; + bool combine_scales_and_zp = false; + QuantizationConfig quantization_config; + std::vector scales_zp_output_order = {}; size_t hash() const override { size_t seed = primitive::hash(); - seed = hash_combine(seed, group_size); + // TODO: add more parameters + seed = hash_range(seed, scales_zp_output_order.begin(), scales_zp_output_order.end()); + seed = hash_range(seed, quantization_config.group_sizes.begin(), quantization_config.group_sizes.end()); + seed = hash_combine(seed, combine_scales_and_zp); + seed = hash_combine(seed, quantization_config.mode); + // seed = hash_combine(seed, quantization_config.quantization_dt); + // seed = hash_combine(seed, quantization_config.scale_dt); + // seed = hash_combine(seed, quantization_config.zp_dt); + return seed; } @@ -40,18 +57,19 @@ struct dynamic_quantize : public primitive_base { return false; auto rhs_casted = downcast(rhs); + // TODO: add more parameters - return group_size == rhs_casted.group_size; + return scales_zp_output_order == rhs_casted.scales_zp_output_order || + quantization_config == rhs_casted.quantization_config; } void save(BinaryOutputBuffer& ob) const override { primitive_base::save(ob); - ob << group_size; + // TODO: add more parameters } void load(BinaryInputBuffer& ib) override { primitive_base::load(ib); - ib >> group_size; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp index f87041dcff66d6..2086c21033f400 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp @@ -3,10 +3,14 @@ // #pragma once + +#include "primitive.hpp" + #include "openvino/core/partial_shape.hpp" #include "openvino/core/type/element_type.hpp" #include "openvino/op/util/variable.hpp" -#include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + #include namespace cldnn { @@ -14,6 +18,8 @@ namespace cldnn { struct kv_cache : public primitive_base { CLDNN_DECLARE_PRIMITIVE(kv_cache) + using QuantizationConfig = ov::op::internal::QuantizationConfig; + kv_cache() : primitive_base("", {}) {} kv_cache(const primitive_id& id, @@ -33,11 +39,18 @@ struct kv_cache : public primitive_base { int64_t gather_axis = 0; bool indirect = false; + bool compressed = false; + bool combine_scales_and_zp = false; + QuantizationConfig quantization_config; + std::vector scales_zp_output_order = {}; + size_t hash() const override { size_t seed = primitive::hash(); seed = hash_combine(seed, concat_axis); seed = hash_combine(seed, gather_axis); seed = hash_combine(seed, indirect); + seed = hash_combine(seed, compressed); + // TODO: add here return seed; } @@ -50,7 +63,10 @@ struct kv_cache : public primitive_base { return variable_info == rhs_casted.variable_info && concat_axis == rhs_casted.concat_axis && gather_axis == rhs_casted.gather_axis && - indirect == rhs_casted.indirect; + indirect == rhs_casted.indirect && + compressed == rhs_casted.compressed && + quantization_config == rhs_casted.quantization_config; + // TODO: add here } void save(BinaryOutputBuffer& ob) const override { @@ -62,6 +78,8 @@ struct kv_cache : public primitive_base { ob << concat_axis; ob << gather_axis; ob << 
indirect; + ob << compressed; + // TODO: add here } void load(BinaryInputBuffer& ib) override { @@ -76,6 +94,8 @@ struct kv_cache : public primitive_base { ib >> concat_axis; ib >> gather_axis; ib >> indirect; + ib >> compressed; + // TODO: add here } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp index 7d9e919f56cf13..26465692ef6352 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp @@ -22,19 +22,23 @@ struct read_value : public primitive_base { /// @param id This primitive id /// @param inputs Input parameters ids /// @param variable_id Variable id - /// @param output_layout Memory layout + /// @param output_layouts Memory layouts read_value(const primitive_id& id, const std::vector& inputs, const std::string& variable_id, - const layout& output_layout, + const std::vector& output_layouts, const ov::element::Type& user_specified_type = ov::element::undefined) - : primitive_base(id, inputs, 1, {optional_data_type{output_layout.data_type}}), + : primitive_base(id, inputs, output_layouts.size()), variable_id{variable_id}, - output_layout{output_layout}, - user_specified_type(user_specified_type) {} + output_layouts{output_layouts}, + user_specified_type(user_specified_type) { + for (size_t output_idx = 0; output_idx < output_layouts.size(); output_idx++) { + output_data_types[output_idx] = optional_data_type(output_layouts[output_idx].data_type); + } + } std::string variable_id; - layout output_layout; + std::vector output_layouts; ov::element::Type user_specified_type; bool operator==(const primitive& rhs) const override { @@ -51,7 +55,9 @@ struct read_value : public primitive_base { primitive_base::save(ob); ov::element::Type_t data_type = user_specified_type; ob << variable_id; - ob << output_layout; + ob << output_layouts.size(); + for (const auto& layout : output_layouts) + ob << layout; ob << make_data(&data_type, sizeof(ov::element::Type_t)); } @@ -59,7 +65,12 @@ struct read_value : public primitive_base { primitive_base::load(ib); ov::element::Type_t data_type = ov::element::Type_t::undefined; ib >> variable_id; - ib >> output_layout; + size_t output_layouts_size; + ib >> output_layouts_size; + output_layouts.resize(output_layouts_size); + for (size_t i = 0; i < output_layouts_size; i++) { + ib >> output_layouts[i]; + } ib >> make_data(&data_type, sizeof(ov::element::Type_t)); user_specified_type = data_type; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp index d66012bfac8889..5b77c7e6c8551d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp @@ -5,16 +5,20 @@ #pragma once #include "primitive.hpp" +#include "ov_ops/dynamic_quantize.hpp" + namespace cldnn { struct scaled_dot_product_attention : public primitive_base { CLDNN_DECLARE_PRIMITIVE(scaled_dot_product_attention) + using QuantizationConfig = ov::op::internal::QuantizationConfig; + scaled_dot_product_attention() : primitive_base("", {}) {} /// @brief Constructs scaled_dot_product_attention primitive. /// @param id This primitive id. - /// @param inputs Input data primitives id (query, keys, values, [attention_mask], [scale]). 
+ /// @param inputs Input data primitives id (query, keys, values, [attention_mask], [scale], [keys scales], [keys zp], [values scales], [values zp]). /// @param is_causal If true, assumes causal attention masking. In this case attention_mask input is ignored. scaled_dot_product_attention(const primitive_id& id, const std::vector inputs, @@ -23,18 +27,30 @@ struct scaled_dot_product_attention : public primitive_base& input_q_transpose_order = {}, const std::vector& input_k_transpose_order = {}, const std::vector& input_v_transpose_order = {}, - const std::vector& output_transpose_order = {}) + const std::vector& output_transpose_order = {}, + bool is_kv_compressed = false, + bool combine_scales_and_zp = false, + const QuantizationConfig& quantization_config = {}) : primitive_base(id, inputs) , is_causal(is_causal) , indirect_axis(indirect_axis) + , is_kv_compressed(is_kv_compressed) + , combine_scales_and_zp(combine_scales_and_zp) + , quantization_config(quantization_config) , input_q_transpose_order(input_q_transpose_order) , input_k_transpose_order(input_k_transpose_order) , input_v_transpose_order(input_v_transpose_order) , output_transpose_order(output_transpose_order) { auto data_inputs_num = inputs.size(); - if (indirect_axis != -1) + if (indirect_axis != -1) { data_inputs_num--; + } + if (is_kv_compressed) { + data_inputs_num -= 2; // scales + if (quantization_config.is_asymmetric_quantization() && !combine_scales_and_zp) + data_inputs_num -= 2; // zp + } has_attn_mask_input = data_inputs_num > 3; has_scale_input = data_inputs_num > 4; } @@ -44,6 +60,10 @@ struct scaled_dot_product_attention : public primitive_base input_q_transpose_order; std::vector input_k_transpose_order; std::vector input_v_transpose_order; @@ -52,6 +72,7 @@ struct scaled_dot_product_attention : public primitive_base(rhs); return is_causal == rhs_casted.is_causal && + is_kv_compressed == rhs_casted.is_kv_compressed && has_attn_mask_input == rhs_casted.has_attn_mask_input && has_scale_input == rhs_casted.has_scale_input && indirect_axis == rhs_casted.indirect_axis && @@ -81,6 +103,7 @@ struct scaled_dot_product_attention : public primitive_base::save(ob); ob << is_causal; + ob << is_kv_compressed; ob << has_attn_mask_input; ob << has_scale_input; ob << indirect_axis; @@ -88,11 +111,13 @@ struct scaled_dot_product_attention : public primitive_base::load(ib); ib >> is_causal; + ib >> is_kv_compressed; ib >> has_attn_mask_input; ib >> has_scale_input; ib >> indirect_axis; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index fbc8ae84c36a29..7ac2cbbbef543b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -145,6 +145,7 @@ class debug_configuration { std::vector dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size int disable_horizontal_fc_fusion; // Disable fc horizontal fusion + int enable_kv_cache_compression; // Enable KV cache compression std::set dump_iteration; // Dump n-th execution of network. 
std::vector load_layers_raw_dump; // List of layers to load dumped raw binary and filenames static const debug_configuration *get_instance(); diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 5c945f4c2d389c..0f0ffbac9cea4e 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "ov_ops/dynamic_quantize.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" #include "dynamic_quantize_inst.h" #include "primitive_type_base.h" @@ -22,29 +22,42 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no } template -std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size) { - ov::op::internal::DynamicQuantize op; +std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, + const dynamic_quantize::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp) { + ov::intel_gpu::op::DynamicQuantize op; auto output_format = act_layout.format; std::vector input_shapes = { act_layout.get(), }; - std::vector shape_group_size(act_layout.get().size(), 1); - shape_group_size.back() = group_size; + auto output_shapes = ov::intel_gpu::op::DynamicQuantize::shape_infer(&op, input_shapes, config, scales_zp_output_order, combine_scales_and_zp); + GPU_DEBUG_TRACE_DETAIL << "shape infer dynamic" << output_shapes[0] << " " << output_shapes[1] << "\n"; - auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, shape_group_size); + std::vector output_layouts = { layout(output_shapes[0], config.quantization_dt, output_format), + layout(output_shapes[1], config.scale_dt, output_format) }; - return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; + const bool is_asymmetric = config.is_asymmetric_quantization(); + if (is_asymmetric && !combine_scales_and_zp) { + output_layouts.emplace_back(layout(output_shapes[2], config.zp_dt, output_format)); + } + + return output_layouts; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size); +template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, + const dynamic_quantize::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); const auto& input_layout = impl_param.get_input_layout(); - return __calc_output_layouts(input_layout, UINT64_MAX /* TODO: handle group_size here */); + + return __calc_output_layouts(input_layout, desc->quantization_config, desc->scales_zp_output_order, desc->combine_scales_and_zp); } template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, @@ -56,6 +69,12 @@ std::string dynamic_quantize_inst::to_string(dynamic_quantize_node const& node) std::stringstream primitive_description; + json_composite dynamic_quantize_info; + // TOOD: Update info + // dynamic_quantize_info.add("activation dt", desc->get_output_data_type(0).value_or(data_types::undefined)); + // dynamic_quantize_info.add("scale dt", 
desc->get_output_data_type(1).value_or(data_types::undefined)); + + node_info->add("dynamic_quantize info", dynamic_quantize_info); node_info->dump(primitive_description); return primitive_description.str(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp index 4c1b1008434144..84f1b26507f19e 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp @@ -6,6 +6,7 @@ #include "program_helpers.h" #include "intel_gpu/runtime/itt.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" using namespace cldnn; @@ -19,6 +20,7 @@ void build_implementations::run(program& p) { for (auto& n : p.get_processing_order()) { if (auto impl = n->get_selected_impl()) { auto params = n->get_kernel_impl_params(); + GPU_DEBUG_TRACE << "add_kernels_source: " << params->desc->id << std::endl; cache.add_kernels_source(*params, impl->get_kernels_source()); } } @@ -26,6 +28,7 @@ void build_implementations::run(program& p) { for (auto& n : p.get_processing_order()) { if (auto impl = n->get_selected_impl()) { auto params = n->get_kernel_impl_params(); + GPU_DEBUG_TRACE << "init_kernels: " << params->desc->id << std::endl; impl->init_kernels(cache, *params); impl->reset_kernels_source(); } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 9f2895a008cfa8..64a3512afe546b 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -420,10 +420,13 @@ bool crop_in_place_optimization::can_crop_be_optimized_simple_data_format(const } static bool can_read_value_be_optimize(const read_value_node& node) { - if (node.get_users().size() == 1) + GPU_DEBUG_TRACE_DETAIL << "Check " << node.id() << " users:" << node.get_users().size() << "\n"; + std::unordered_set unique_users(node.get_users().begin(), node.get_users().end()); + + if (unique_users.size() == 1) return true; - const auto non_shape_of_users_count = std::count_if(node.get_users().begin(), node.get_users().end(), [](const program_node* user) { + const auto non_shape_of_users_count = std::count_if(unique_users.begin(), unique_users.end(), [](const program_node* user) { return !user->is_type(); }); if (non_shape_of_users_count <= 1) @@ -847,8 +850,24 @@ void prepare_buffer_fusing::run(program& p) { if (!rv_prim) return; - if (kv_out_layout.data_type != rv_prim->get_output_layout().data_type) + int DISABLE_KV_OPT = 0; + static bool warned = false; + if (const auto env_var = std::getenv("DISABLE_KV_OPT")) { + std::istringstream ss(env_var); + ss >> DISABLE_KV_OPT; + + if (!warned) { + std::cout << "DISABLE_KV_OPT=" << DISABLE_KV_OPT << " for kv_cache\n"; + warned = true; + } + } + + if (kv_out_layout.data_type != rv_prim->get_output_layout().data_type || DISABLE_KV_OPT) { + GPU_DEBUG_TRACE_DETAIL << node.id() << " can't be optimized due to data type mismatch (or DISABLE_KV_OPT): " << kv_out_layout.to_short_string() << " vs " << rv_prim->get_output_layout().to_short_string() << "\n"; return; + } else { + GPU_DEBUG_TRACE_DETAIL << node.id() << " can be optimized, data types match: " << kv_out_layout.to_short_string() << " vs " << rv_prim->get_output_layout().to_short_string() << "\n"; + } auto concat_axis = node.get_primitive()->concat_axis; @@ -857,21 +876,43
@@ void prepare_buffer_fusing::run(program& p) { padding::DynamicDimsMask info_dynamic_pad; info_dynamic_pad[concat_axis] = 1; kv_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad; + GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout before before " << node.get_output_layout(false, 0) << "\n"; node.set_output_layout(kv_out_layout); node.can_share_buffer(false); + GPU_DEBUG_TRACE_DETAIL << node.id() << " 0th output layout after " << node.get_output_layout(false, 0) << "\n"; - auto update_dep = [&info_dynamic_pad](program_node* dep) { - auto prev_layout = dep->get_output_layout(); + auto update_dep = [](program_node* dep, padding::DynamicDimsMask& info_dynamic_pad, size_t idx) { + auto prev_layout = dep->get_output_layout(true, idx); prev_layout.data_padding._dynamic_dims_mask = info_dynamic_pad; - dep->set_output_layout(prev_layout); + dep->set_output_layout(prev_layout, true, idx); dep->can_share_buffer(false); }; if (rv_prim) { - update_dep(rv_prim); + update_dep(rv_prim, info_dynamic_pad, 0); } if (gather_prim) { - update_dep(gather_prim); + update_dep(gather_prim, info_dynamic_pad, 0); + } + + const auto& desc = node.get_primitive(); + if (desc->compressed) { + auto update_scale_zp = [&](size_t kv_cache_output_idx, size_t read_value_output_idx) { + auto scales_out_layout = node.get_output_layout(false, kv_cache_output_idx); + + const size_t scales_zp_concat_axis = 2; + padding::DynamicDimsMask info_dynamic_pad_scales; + info_dynamic_pad_scales[scales_zp_concat_axis] = 1; + scales_out_layout.data_padding._dynamic_dims_mask = info_dynamic_pad_scales; + node.set_output_layout(scales_out_layout, true, kv_cache_output_idx); + + update_dep(rv_prim, info_dynamic_pad_scales, read_value_output_idx); + }; + + update_scale_zp(2, 1); + if (desc->quantization_config.is_asymmetric_quantization() && !desc->combine_scales_and_zp) { + update_scale_zp(3, 2); + } } } }); @@ -905,7 +946,7 @@ void prepare_buffer_fusing::run(program& p) { // TODO: Allow optimizations for the case above too. Looks like it can be achieved by more careful // topological sort (i.e. 
if we ensure that all read_value users are completed before assign is run) node.can_be_optimized(can_read_value_be_optimize(node)); - GPU_DEBUG_TRACE_DETAIL << "[prepare_buffer_fusing] : " << node.id() << " can be optimized" << std::endl; + GPU_DEBUG_TRACE_DETAIL << "[prepare_buffer_fusing] : " << node.id() << " can be optimized = " << node.can_be_optimized() << std::endl; }); } } diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp index 6c16618ac816d0..6d70be9c046cf8 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/read_value.cpp @@ -6,6 +6,8 @@ #include "impls/registry/implementation_map.hpp" #include "register.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" + namespace cldnn { namespace cpu { @@ -64,7 +66,29 @@ struct read_value_impl : public typed_primitive_impl { } if (!instance.can_be_optimized()) { - return instance.output_memory(0).copy_from(stream, *variable.get_memory(), false); + GPU_DEBUG_TRACE_DETAIL << "Copying variable's memory to new read_value's buffer output\n"; + std::vector res_events; + res_events.push_back(instance.output_memory(0).copy_from(stream, *variable.get_memory(), false)); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + res_events.push_back(instance.output_memory(1).copy_from(stream, *scales_state->get_memory(), false)); + + if (compressed_cache_variable->has_zp_state()) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + res_events.push_back(instance.output_memory(2).copy_from(stream, *zp_state->get_memory(), false)); + } + } + + // auto desc = instance.get_impl_params()->typed_desc(); + // if (desc->compressed) { + // auto multi_tensor_variable = downcast(variable); + // auto scales_variable = multi_tensor_variable.get_compression_scale_state(); + + // res_events.push_back(instance.output_memory(1).copy_from(stream, *scales_variable->get_memory(), false)); + // } + + return stream.aggregate_events(res_events, res_events.size() > 1); } return instance.get_network().get_stream().create_user_event(true); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp index 91f141ae062723..3af726f8d43823 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -37,6 +37,16 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { auto params = get_default_params(impl_param, is_shape_agnostic); params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); + if (impl_param.output_layouts.size() > 2) { + params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(2))); + } + + const auto& desc = impl_param.typed_desc(); + params.group_sizes = desc->quantization_config.group_sizes; + params.scales_output_order = desc->scales_zp_output_order; + params.use_asymmetric_quantization = desc->quantization_config.is_asymmetric_quantization(); + params.combine_scales_and_zp = desc->combine_scales_and_zp; + return params; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 762118ec5b1af2..19fff28a0e611e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -828,7 +828,7 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) { } } -kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset) { +kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset, const data_types dt) { const auto& pad = l.data_padding; const auto& vals_original = l.get_partial_shape(); @@ -868,7 +868,9 @@ kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor v pitch *= (reserved_in_mem_count + lp + up); } - return kernel_selector::data_tensor(vec, to_data_type(l.data_type), ks_layout); + data_types new_dt = (dt == data_types::undefined) ? l.data_type : dt; + + return kernel_selector::data_tensor(vec, to_data_type(new_dt), ks_layout); } kernel_selector::weights_tensor convert_weights_tensor(const layout& l, bool is_grouped) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h index 3ddb5bf8793c29..6c36e9002640c3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h @@ -101,7 +101,7 @@ kernel_selector::data_layout to_data_layout(format f); cldnn::format from_data_layout(kernel_selector::data_layout l); kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped); cldnn::format::type from_weights_layout(kernel_selector::weights_layout l); -kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset = tensor {}); +kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset = tensor {}, const data_types dt = data_types::undefined); kernel_selector::weights_tensor convert_weights_tensor(const layout& l, bool is_grouped = false); layout from_weights_tensor(const kernel_selector::weights_tensor& t); kernel_selector::activation_function get_kernel_selector_activation_param(activation_func activation_func); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index e4e4adfbb15452..7498215b1682ba 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -9,10 +9,13 @@ #include "multi_stage_primitive.hpp" #include "kv_cache_inst.h" +#include "dynamic_quantize_inst.h" #include "concatenation/concatenation_kernel_selector.h" #include "concatenation/concatenation_kernel_base.h" #include "beam_table_update/beam_table_update_kernel_selector.hpp" #include "beam_table_update/beam_table_update_kernel_ref.hpp" +#include "dynamic_quantize/dynamic_quantize_kernel_selector.h" +#include "dynamic_quantize/dynamic_quantize_kernel_opt_generic.h" #include "openvino/core/dimension.hpp" namespace cldnn { @@ -57,6 +60,9 @@ struct kv_cache_impl : multi_stage_primitive { using bt_kernel_selector_t = kernel_selector::beam_table_update_kernel_selector; using bt_kernel_params_t = kernel_selector::beam_table_update_params; + using dq_kernel_selector_t = kernel_selector::dynamic_quantize_kernel_selector; + using dq_kernel_params_t = kernel_selector::dynamic_quantize_params; + DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::ocl::kv_cache_impl) std::unique_ptr clone() const override { @@ -65,9 +71,12 @@ struct kv_cache_impl : multi_stage_primitive { const size_t concat_stage = 0; const size_t beam_table_stage = 1; + 
const size_t scale_concat_stage = 2; + const size_t dq_concat_stage = 3; cldnn::memory::ptr beam_table_prev = nullptr; cldnn::memory::ptr beam_table_new = nullptr; + // cldnn::memory::ptr compression_scale = nullptr; void load(BinaryInputBuffer& ib) override { parent::load(ib); @@ -75,11 +84,17 @@ struct kv_cache_impl : multi_stage_primitive { auto& kernel_selector = kernel_selector_t::Instance(); auto kernel_impl = kernel_selector.GetImplementation(_kernels_data[concat_stage].kernelName); kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[concat_stage]); - if (_kernels_data.size() == 2) { + if (_kernels_data.size() >= 2) { auto& bt_kernel_selector = bt_kernel_selector_t::Instance(); auto bt_kernel_impl = bt_kernel_selector.GetImplementation(_kernels_data[beam_table_stage].kernelName); bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[beam_table_stage]); } + // FIXME: indirectness and compression are orthogonal feature. + if (_kernels_data.size() == 3) { + auto& scale_kernel_selector = kernel_selector_t::Instance(); + auto scale_kernel_impl = scale_kernel_selector.GetImplementation(_kernels_data[scale_concat_stage].kernelName); + scale_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[scale_concat_stage]); + } } } void set_arguments_impl(kv_cache_inst& instance) override {} @@ -93,6 +108,16 @@ struct kv_cache_impl : multi_stage_primitive { } else if (stage == beam_table_stage) { args.inputs = { beam_table_prev, instance.input_memory_ptr(2) }; args.outputs = { beam_table_new }; + } else if (stage == scale_concat_stage) { + // FIXME: indirectness and compression are orthogonal feature. + args.inputs = { instance.input_memory_ptr(3) }; // [past, new, beam_table, past_scale, new_scale] + args.outputs = { instance.output_memory_ptr(2) }; + } else if (stage == dq_concat_stage) { + args.inputs = { instance.input_memory_ptr(1) }; // [past, new, beam_table, past_scale, new_scale] + args.outputs = { instance.output_memory_ptr(0) }; + for (size_t i = 2; i < instance.outputs_memory_count(); i++) { + args.outputs.push_back(instance.output_memory_ptr(i)); + } } return args; @@ -139,6 +164,7 @@ struct kv_cache_impl : multi_stage_primitive { } event::ptr execute_impl(const std::vector& events, kv_cache_inst& instance) override { + GPU_DEBUG_TRACE_DETAIL << "Execute kv-cache: " << instance.get_impl_params()->_can_be_optimized << " " << instance.get_impl_params()->get_input_layout(3).to_short_string() << "\n"; const bool can_be_optimized = instance.get_impl_params()->_can_be_optimized; auto& stream = instance.get_network().get_stream(); auto& engine = instance.get_network().get_engine(); @@ -146,6 +172,17 @@ struct kv_cache_impl : multi_stage_primitive { auto& variable = instance.get_network().get_variable(desc->variable_info.variable_id); std::vector res_events; + if (desc->compressed) { + // In case of KV-cache with compression enabled, skip second concat's kernel as new token data append will + // be handled by dynamic quantization kernel + // However, allow execution of the first token for the case if KV-cache can't be optimized (if optimization is disabled, or + // variables memory was reallocated and we have to copy past KV-cache to new memory) + _kernels_data[concat_stage].kernels[1].skip_execution = true; + if (!_kernels_data[concat_stage].kernels[0].skip_execution) { + GPU_DEBUG_TRACE_DETAIL << "Run copy of data!\n"; + } + } + execute_stage(events, instance, res_events, concat_stage); const auto& impl_param = *instance.get_impl_params(); @@ -183,14 +220,30 @@ struct kv_cache_impl : 
multi_stage_primitive { beam_table_state->set(); } + if (desc->compressed) { + + execute_stage(events, instance, res_events, scale_concat_stage); + + auto dq_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic()); + (_kernels_data[dq_concat_stage].update_dispatch_data_func)(dq_params, _kernels_data[dq_concat_stage]); + execute_stage(events, instance, res_events, dq_concat_stage); + + auto compressed_cache_variable = dynamic_cast(&variable); + compressed_cache_variable->get_compression_scale_state()->set(); + + if (desc->quantization_config.is_asymmetric_quantization() && !desc->combine_scales_and_zp) { + compressed_cache_variable->get_compression_zp_state()->set(); + } + } + variable.set(); if (can_be_optimized) { GPU_DEBUG_TRACE_DETAIL << desc->id << " : Output is same as variable memory! Skip copying " << std::endl; // When primitive is optimized, concat kernel writes directly to variable memory return stream.aggregate_events(res_events, res_events.size() > 1); } else { - // Othwerise, we need to copy result from out buffer to state memory - GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable meomry" << std::endl; + // Otherwise, we need to copy result from out buffer to state memory + GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable memory" << std::endl; stream.enqueue_barrier(); auto out = instance.get_network().get_engine().reinterpret_buffer(instance.output_memory(0), variable.get_memory()->get_layout()); @@ -213,6 +266,21 @@ struct kv_cache_impl : multi_stage_primitive { return layout{beam_table_shape, impl_param.output_layouts[1].data_type, format::get_default_format(beam_table_shape.size())}; } + static layout get_compression_scale_layout(const kernel_impl_params& impl_param) { + // FIXME: it is implemented in multiple places + GPU_DEBUG_GET_INSTANCE(debug_config); + const auto& primitive = impl_param.typed_desc(); + auto kv_layout = impl_param.get_input_layout(0); + auto kv_shape = kv_layout.get_partial_shape(); + auto comp_scale_shape = ov::PartialShape(std::vector(kv_shape.size(), 1)); + comp_scale_shape[0] = kv_shape[0]; + comp_scale_shape[1] = kv_shape[1]; + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression + comp_scale_shape[2] = kv_shape[2]; + } + return layout{comp_scale_shape, impl_param.output_layouts[2].data_type, format::get_default_format(comp_scale_shape.size())}; + } + static kernel_params_t get_concat_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { const auto& primitive = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_shape_agnostic); @@ -221,7 +289,9 @@ struct kv_cache_impl : multi_stage_primitive { const auto inputs_count = 2; params.inputs.resize(inputs_count); for (size_t i = 0; i < inputs_count; ++i) { - params.inputs[i] = convert_data_tensor(impl_param.input_layouts[i]); + auto tmp = impl_param.input_layouts[i]; + tmp.data_type = data_types::i8; + params.inputs[i] = convert_data_tensor(tmp); } params.axis = convert_axis(axis, impl_param.get_output_layout().get_rank()); @@ -229,6 +299,21 @@ struct kv_cache_impl : multi_stage_primitive { const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, beam_table_past] const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present] + + GPU_DEBUG_TRACE_DETAIL << "Concat output start offset: " << in_offsets_map.size() << " " << out_offsets_map.size() << "\n"; + + + // for 
(const auto& in_offset : in_offsets_map) { + // if (impl_param.input_layouts.size() > in_offset.first) + // std::cout << in_offset.first << ". " << impl_param.input_layouts[in_offset.first].to_short_string() << ", input, offset=" << in_offset.second << "\n"; + // else + // std::cout << in_offset.first << ". NOPE " << ", input, offset=" << in_offset.second << "\n"; + // } + + // for (const auto& out_offset : out_offsets_map) { + // std::cout << out_offset.first << ". " << impl_param.output_layouts[out_offset.first].to_short_string() << ", output, offset=" << out_offset.second << "\n"; + // } + std::map in_tensor_to_offset_map = { {0, in_offsets_map.at(0)}, {1, in_offsets_map.at(1)}, @@ -237,6 +322,8 @@ struct kv_cache_impl : multi_stage_primitive { {0, out_offsets_map.at(0)}, }; + GPU_DEBUG_TRACE_DETAIL << "Concat output start offset: " << primitive->id << " " << out_offsets_map.at(0) << " layout: " << impl_param.output_layouts[0].to_string() << "\n"; + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); return params; @@ -264,10 +351,12 @@ struct kv_cache_impl : multi_stage_primitive { params.is_state_set = is_state_set; params.indirect_axis = indirect_axis; - const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, beam_table_past]] - const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present] + const bool compressed = impl_param.typed_desc()->compressed; + const auto beam_table_past_idx = compressed ? 4 : 3; + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, compression_scale_past, beam_table_past]] + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present, compression_scale_present] std::map in_tensor_to_offset_map = { - {0, in_offsets_map.at(3)}, // beam_table_past + {0, in_offsets_map.at(beam_table_past_idx)}, // beam_table_past {1, in_offsets_map.at(2)}, // beam_idx }; std::map out_tensor_to_offset_map = { @@ -279,21 +368,148 @@ struct kv_cache_impl : multi_stage_primitive { return params; } + static dq_kernel_params_t get_dq_update_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param, is_shape_agnostic); + + params.append_axis = primitive->concat_axis; + params.group_sizes = primitive->quantization_config.group_sizes; + params.scales_output_order = primitive->scales_zp_output_order; + params.use_asymmetric_quantization = primitive->quantization_config.is_asymmetric_quantization(); + params.combine_scales_and_zp = primitive->combine_scales_and_zp; + + if (!is_shape_agnostic) { + const auto& past_kv_cache_shape = impl_param.input_layouts[0].get_partial_shape(); + params.axis_offset = past_kv_cache_shape[primitive->concat_axis].get_length(); + } else { + params.axis_offset = 0; + } + + auto inputs_count = 1; + auto outputs_count = 2; + params.inputs.resize(inputs_count); + params.outputs.resize(outputs_count); + + auto current_token_layout = impl_param.input_layouts[1]; + auto present_layout = impl_param.output_layouts[0]; + auto present_scales_layout = impl_param.output_layouts[2]; + params.inputs[0] = convert_data_tensor(current_token_layout); + params.outputs[0] = convert_data_tensor(present_layout); + params.outputs[1] = convert_data_tensor(present_scales_layout); + + const bool has_zp_output_buffer = 
primitive->quantization_config.is_asymmetric_quantization() && !primitive->combine_scales_and_zp; + if (has_zp_output_buffer) { + auto present_zp_layout = impl_param.output_layouts[3]; + params.outputs.resize(outputs_count + 1); + params.outputs[2] = convert_data_tensor(present_zp_layout); + } + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + + // FIXME: need to handle the index properly when indirect is off + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(1)}, // kv_new_token + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(0)}, // kv_present + {1, out_offsets_map.at(2)}, // compression_scale_present + }; + + GPU_DEBUG_TRACE_DETAIL << "DQ shapes: " << current_token_layout.to_short_string() << " " << present_layout.to_short_string() << " " << present_scales_layout.to_short_string() << "\n"; + GPU_DEBUG_TRACE_DETAIL << "DQ: Dynamic shape in0 " << in_offsets_map.at(1) << "\n"; + GPU_DEBUG_TRACE_DETAIL << "DQ: Dynamic shape out " << out_offsets_map.at(0) << "\n"; + GPU_DEBUG_TRACE_DETAIL << "DQ: Dynamic shape out " << out_offsets_map.at(2) << "\n"; + + if (has_zp_output_buffer) { + out_tensor_to_offset_map.emplace(2, out_offsets_map.at(3)); + GPU_DEBUG_TRACE_DETAIL << "DQ: Dynamic shape out " << out_offsets_map.at(3) << "\n"; + } + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + return params; + } + + + static kernel_params_t get_compression_scale_update_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param, is_shape_agnostic); + + const auto concat_axis = 2; + params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank()); + + auto inputs_count = 1; + auto comp_scale_past_layout = impl_param.input_layouts[3]; + // auto comp_scale_new_layout = impl_param.input_layouts[4]; // <-- this should be replaced with inner layout + + auto comp_scale_present_layout = impl_param.output_layouts[2]; + + GPU_DEBUG_TRACE_DETAIL << "Past scale: " << comp_scale_past_layout.to_short_string() << "\n"; + // GPU_DEBUG_TRACE_DETAIL << "New scale: " << comp_scale_new_layout.to_short_string() << "\n"; + GPU_DEBUG_TRACE_DETAIL << "Present scale: " << comp_scale_present_layout.to_short_string() << "\n"; + + params.inputs.resize(inputs_count); + params.inputs[0] = convert_data_tensor(comp_scale_past_layout); + // params.inputs[1] = convert_data_tensor(comp_scale_new_layout); + params.outputs[0] = convert_data_tensor(comp_scale_present_layout); + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + + // FIXME: need to handle the index properly when indirect is off + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(3)}, // compression_scale_past + // {1, in_offsets_map.at(4)}, // compression_scale_new + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(2)}, // compression_scale_present + }; + + GPU_DEBUG_TRACE_DETAIL << "Dynamic shape in0 " << in_offsets_map.at(3) << "\n"; + // GPU_DEBUG_TRACE_DETAIL << "Dynamic shape in1 " << in_offsets_map.at(4) << "\n"; + GPU_DEBUG_TRACE_DETAIL << "Dynamic shape offset " << out_offsets_map.at(2) << "\n"; + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + return params; + } + +
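+    // Note: a brief summary of the kernel stages used when KV-cache compression is enabled (kernel/port
+    // indices assume the indirect case wired above; see the FIXME about handling indices when indirect is off):
+    //   concat_stage       - concatenation of past and new KV data; with compression enabled its second kernel
+    //                        (new-token append) is skipped in execute_impl() and dq_concat_stage writes the new data instead.
+    //   beam_table_stage   - beam table update for the indirect KV-cache.
+    //   scale_concat_stage - copies past compression scales (input 3) into the present scales buffer (output 2).
+    //   dq_concat_stage    - quantizes the new token data (input 1) and appends it to the present KV buffer (output 0),
+    //                        writing the corresponding scales (and zero points, if any) to outputs 2+.
+    // The quantization applied by the dynamic quantization kernels is roughly:
+    //   symmetric : scale = 127 / max(|x|),            q = convert_char(x * scale)
+    //   asymmetric: scale = 255 / (max(x) - min(x)),   zp = -min(x) * scale - 127,   q = convert_char(x * scale + zp)
+    // with 1/scale stored in the scales buffer, so dequantization is x ~= (q - zp) * stored_scale (zp == 0 for symmetric).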
static std::unique_ptr create(const typed_program_node& arg, const kernel_impl_params& impl_param) { std::vector kernels_data; + // if (arg.id().find("kvcache:__module.model.transformer.h.0.attn/aten::cat/Concat_4") != std::string::npos) + // std::cout << "mingyuki: create " << arg.id() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select concat\n"; + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select concat\n"; auto concat_kernel_params = get_concat_kernel_params(impl_param, impl_param.is_dynamic()); auto& concat_kernel_selector = kernel_selector_t::Instance(); kernels_data.push_back(concat_kernel_selector.get_best_kernel(concat_kernel_params)); const bool indirect = impl_param.typed_desc()->indirect; + const bool compressed = impl_param.typed_desc()->compressed; + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select beam table\n"; + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select beam table\n"; if (indirect) { auto bt_update_kernel_params = get_bt_update_kernel_params(impl_param, false); auto& bt_update_kernel_selector = bt_kernel_selector_t::Instance(); kernels_data.push_back(bt_update_kernel_selector.get_best_kernel(bt_update_kernel_params)); } + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select DQ\n"; + GPU_DEBUG_TRACE_DETAIL << "KVCACHE Select DQ\n"; + if (compressed) { + auto comp_scale_update_kernel_params = get_compression_scale_update_kernel_params(impl_param, impl_param.is_dynamic()); + auto& comp_scale_update_kernel_selector = kernel_selector_t::Instance(); + kernels_data.push_back(comp_scale_update_kernel_selector.get_best_kernel(comp_scale_update_kernel_params)); + + // kernels_data.push_back(kernel_selector::kernel_data()); + + auto dq_kernel_params = get_dq_update_kernel_params(impl_param, impl_param.is_dynamic()); + auto& dq_kernel_selector = dq_kernel_selector_t::Instance(); + kernels_data.push_back(dq_kernel_selector.get_best_kernel(dq_kernel_params)); + } return cldnn::make_unique(kernels_data); } void update_dispatch_data(const kernel_impl_params& impl_param) override { + GPU_DEBUG_TRACE_DETAIL << "update_dispatch_data kv-cache: " << impl_param._can_be_optimized << " " << impl_param.get_input_layout(3).to_short_string() << "\n"; // If model loaded from cache, params are not initialized, so we create a new object and reuse it in the future if (_kernels_data[concat_stage].params == nullptr) { _kernels_data[concat_stage].params = std::make_shared(get_concat_kernel_params(impl_param, true)); @@ -307,13 +523,19 @@ struct kv_cache_impl : multi_stage_primitive { (_kernels_data[concat_stage].update_dispatch_data_func)(params, _kernels_data[concat_stage]); _kernels_data[concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(0).count() == 0; + + if (impl_param.typed_desc()->compressed) { + auto comp_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, impl_param.is_dynamic()); + (_kernels_data[scale_concat_stage].update_dispatch_data_func)(comp_scale_kernel_params, _kernels_data[scale_concat_stage]); + _kernels_data[scale_concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(3).count() == 0; + } } }; namespace detail { attach_kv_cache_impl::attach_kv_cache_impl() { - auto types = { data_types::f16, data_types::f32 }; + auto types = { data_types::i8, data_types::f16, data_types::f32 }; auto formats = { format::bfyx }; implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp 
b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index b33871110ec6b4..998ca892dc4558 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -83,14 +83,19 @@ struct scaled_dot_product_attention_impl : multi_stage_primitivebuffer_ptr() << "\n"; args.inputs.push_back(instance.input_memory_ptr(i)); } + GPU_DEBUG_TRACE_DETAIL << "instance.has_fused_primitives(): " << instance.has_fused_primitives() << "\n"; if (instance.has_fused_primitives()) { size_t count = instance.get_fused_mem_count(); for (size_t i = 0; i < count; i++) { @@ -98,7 +103,9 @@ struct scaled_dot_product_attention_impl : multi_stage_primitivebuffer_ptr() << "\n"; args.outputs.push_back(instance.output_memory_ptr(i)); } @@ -133,6 +140,20 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive& events, scaled_dot_product_attention_inst& instance) override { + GPU_DEBUG_TRACE_DETAIL << "SDPA inputs \n"; + for (size_t i = 0; i < instance.inputs_memory_count(); i++) + GPU_DEBUG_TRACE_DETAIL << "SDPA input: " << instance.input_memory_ptr(i)->buffer_ptr() << "\n"; if (need_indirect_load(instance)) return execute_stage(events, instance, indirect_sdpa); else @@ -216,6 +240,12 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_causal; + config.is_kv_compressed = desc->is_kv_compressed; + config.use_asymmetric_quantization = desc->quantization_config.is_asymmetric_quantization(); + config.combine_scales_and_zp = desc->combine_scales_and_zp; + + GPU_DEBUG_TRACE << "Set is_kv_compressed to " << config.is_kv_compressed << "\n"; + GPU_DEBUG_TRACE << "Set is_asym_compressed to " << config.use_asymmetric_quantization << "\n"; return config; } @@ -229,6 +259,15 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_kv_compressed) { + data_inputs_num -= 2; // key and value compression scales are handled separately + + has_zp_input_buffers = desc->quantization_config.is_asymmetric_quantization() && !desc->combine_scales_and_zp; + if (has_zp_input_buffers) + data_inputs_num -= 2; // key and value compression zp are handled separately + } + params.inputs.resize(data_inputs_num); for (size_t i = 0; i < data_inputs_num; i++) { params.inputs[i] = convert_data_tensor(impl_param.get_input_layout(i)); @@ -246,15 +285,43 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveindirect_axis; } - params.set_dynamic_shape_offsets(); + if (desc->is_kv_compressed) { + params.key_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num)); + params.value_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 1)); - // Need to adjust sdpa kernel offset to consider beam table input - if (has_indirect_inputs(impl_param)) { - auto out_offset = params.outputs[0].get_dynamic_shape_offset(); - if (indirect) - params.beam_table.SetDynamicShapeOffset(out_offset); + GPU_DEBUG_TRACE_DETAIL << "Add Scales params\n"; + if (has_zp_input_buffers) { + GPU_DEBUG_TRACE_DETAIL << "Add zp params\n"; + params.key_cache_comp_zp = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 2)); + params.value_cache_comp_zp = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 3)); + } + } - params.outputs[0].SetDynamicShapeOffset(out_offset + kernel_selector::DataTensor::max_rank()); + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + std::map in_tensor_to_offset_map; + for (size_t i = 0; 
i < data_inputs_num; i++) { + in_tensor_to_offset_map[i] = in_offsets_map.at(i); + } + + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(0)}, + }; + + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + if (desc->is_kv_compressed) { + params.key_cache_comp_scale.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num)); + params.value_cache_comp_scale.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 1)); + + if (has_zp_input_buffers) { + params.key_cache_comp_zp.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 2)); + params.value_cache_comp_zp.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 3)); + } + } + + if (indirect && has_indirect_inputs(impl_param)) { + params.beam_table.SetDynamicShapeOffset(get_beam_table_id(desc)); } return params; @@ -300,6 +367,7 @@ attach_scaled_dot_product_attention_impl::attach_scaled_dot_product_attention_im auto types = { data_types::f32, data_types::f16, + data_types::i8, }; auto formats = { diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index 49dd62c6332549..bb8ebd093fa696 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,10 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(const layout &act_layout, uint64_t group_size); + static std::vector __calc_output_layouts(const layout &act_layout, + const dynamic_quantize::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h index f3aa4de5ec34e1..b8f05ac3272353 100644 --- a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h @@ -26,6 +26,7 @@ struct typed_program_node : public typed_program_node_base { std::vector get_shape_info_input_layouts() const override { std::vector res; + GPU_DEBUG_TRACE_DETAIL << "get_shape_info_input_layouts, get_dependencies.size()=" << get_dependencies().size() << "\n"; for (size_t i = 0; i < get_dependencies().size(); i++) { const auto& d = get_dependency_with_port(i); res.push_back(d.first->get_output_layout(false, d.second)); @@ -35,6 +36,8 @@ struct typed_program_node : public typed_program_node_base { res.push_back(layout(ov::PartialShape::dynamic(4), data_types::i32, format::bfyx)); } + GPU_DEBUG_TRACE_DETAIL << "Total shape info input layouts: " << res.size() << "\n"; + return res; } }; diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index 029755c4733fe4..323d630732b5c4 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -232,7 +232,7 @@ struct program_node { } void merge_output_padding(padding const& padd, size_t idx = 0) { - set_output_padding(padding::max(padd, output_layouts[idx].data_padding)); + set_output_padding(padding::max(padd, output_layouts[idx].data_padding), 
idx); } // only calculated output layout (for external usage), does not modify/use cached output layout nor invalidate users diff --git a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h index 74f9ffff581b87..0a33b41e940565 100644 --- a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h @@ -33,9 +33,16 @@ class typed_primitive_inst : public typed_primitive_inst_base static std::vector calc_output_layouts(read_value_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); - const auto& default_layout = desc->output_layout; + std::vector output_layouts; - return { impl_param.state_layout.value_or(default_layout) }; + for (size_t i = 0; i < desc->num_outputs; i++) { + if (impl_param.state_layouts.size() > i) + GPU_DEBUG_TRACE_DETAIL << "Use state " << impl_param.state_layouts[i].to_short_string() << "\n"; + const auto& default_layout = desc->output_layouts[i]; + output_layouts.push_back(impl_param.state_layouts.size() > i ? impl_param.state_layouts[i] : default_layout); + } + + return output_layouts; } static layout calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param); diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 95cdd587cdf175..1733743af996ce 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -10,6 +10,7 @@ #include "primitive_type_base.h" #include #include +#include "to_string_utils.h" namespace cldnn { GPU_DEFINE_PRIMITIVE_TYPE_ID(kv_cache) @@ -25,7 +26,7 @@ layout kv_cache_inst::calc_output_layout(const kv_cache_node& node, kernel_impl_ } template -std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& /*node*/, kernel_impl_params const& impl_param) { +std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& node, kernel_impl_params const& impl_param) { auto desc = impl_param.typed_desc(); ov::intel_gpu::op::KVCache op; @@ -35,16 +36,30 @@ std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& /*no std::vector input_shapes = {impl_param.get_input_layout(0).get(), impl_param.get_input_layout(1).get()}; - if (desc->num_outputs > 1) + if (desc->indirect) { input_shapes.push_back(impl_param.get_input_layout(2).get()); + } + + if (desc->compressed) { + input_shapes.push_back(impl_param.get_input_layout(3).get()); + + if (desc->quantization_config.is_asymmetric_quantization() && !desc->combine_scales_and_zp) { + input_shapes.push_back(impl_param.get_input_layout(4).get()); + } + } + + GPU_DEBUG_TRACE_DETAIL << "kv_cache combine_scales_and_zp=" << desc->combine_scales_and_zp << "\n"; + std::vector output_shapes = desc->compressed ? 
shape_infer(&op, input_shapes, desc->quantization_config, desc->scales_zp_output_order, desc->combine_scales_and_zp) + : shape_infer(&op, input_shapes); - std::vector output_shapes = shape_infer(&op, input_shapes); + if (desc->num_outputs == 3) + GPU_DEBUG_TRACE_DETAIL << desc->id << " scales output calculated shape: " << output_shapes[2] << "\n"; static const std::map ports_map = {{0, 0}, {1, 2}}; std::vector out_layouts; for (size_t i = 0; i < desc->num_outputs; i++) { - auto out_type = desc->output_data_types[i].value_or(impl_param.get_input_layout(ports_map.at(i)).data_type); + auto out_type = desc->output_data_types[i].value(); out_layouts.emplace_back(output_shapes[i], out_type, impl_param.get_output_layout(i).format); } @@ -63,6 +78,7 @@ std::string kv_cache_inst::to_string(const kv_cache_node& node) { kv_cache_info.add("concat axis", node.get_primitive()->concat_axis); kv_cache_info.add("gather axis", node.get_primitive()->gather_axis); kv_cache_info.add("indirect", node.get_primitive()->indirect); + kv_cache_info.add("compressed", node.get_primitive()->compressed); node_info->add("kv_cache info", kv_cache_info); std::stringstream primitive_description; node_info->dump(primitive_description); @@ -92,6 +108,7 @@ void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) { size_t i = 0; // [kv_state, kv_new_token, [beam_idx, bt_past]] + // FIXME: do we need to handle compression scale value? for (i = 0; i < _node->get_dependencies().size(); i++) { const auto& node_in_lay = _node->get_input_layout(i); const auto& runtime_in_lay = params.input_layouts[i]; @@ -113,10 +130,27 @@ void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) { bt_layout.set_partial_shape(bt_shape); } - GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i << "]" << std::endl; + GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i++ << "]" << std::endl; fill_shape_info_data(bt_layout, bt_state->get_initial_layout(), shape_info_ptr, offset); } + // if (params.typed_desc()->compressed) { + // auto& var = dynamic_cast(get_network().get_variable(variable_id())); + // const auto& scale_state = var.get_compression_scale_state(); + // auto scale_layout = scale_state->get_layout(); + // if (scale_layout.is_dynamic()) { + // auto bt_shape = scale_layout.get_partial_shape(); + // for (auto& d : bt_shape) { + // if (d.is_dynamic()) + // d = 0; + // } + // scale_layout.set_partial_shape(bt_shape); + // } + + // GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i++ << "]" << std::endl; + // fill_shape_info_data(scale_layout, scale_state->get_initial_layout(), shape_info_ptr, offset); + // } + for (size_t i = 0; i < _node->get_output_layouts().size(); i++) { GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for output[" << i << "]" << std::endl; const auto& node_out_lay = _node->get_output_layout(i); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index d69a2767c018f6..272172ff6c7659 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -37,6 +37,7 @@ #include "graph_optimizer/prepare_buffer_fusing.h" #include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" #include "intel_gpu/graph/network.hpp" #include "intel_gpu/graph/serialization/set_serializer.hpp" #include "intel_gpu/runtime/engine.hpp" @@ -292,30 +293,91 @@ void 
primitive_inst::update_shape() { auto prim = get_node().as().get_primitive(); const auto& variable_id = prim->variable_id; auto& variable = get_network().get_variable(variable_id); - // Initial variable shape is taken from variable itself - auto new_layout = variable.get_layout(); - // If variable is not set and we have an initializer - use it's shape as shape of variable - if (!variable.is_set() && _impl_params->input_layouts.size() == 1) { - new_layout = _impl_params->get_input_layout(0); - } + auto update_state_layout = [&](ov::intel_gpu::VariableStateBase& variable, layout new_layout, size_t layout_idx) { + // If variable is not set and we have an initializer - use it's shape as shape of variable + if (!variable.is_set() && _impl_params->input_layouts.size() >= layout_idx) { + new_layout = _impl_params->get_input_layout(layout_idx); + } - // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 - if (new_layout.is_dynamic()) { - auto pshape = new_layout.get_partial_shape(); - for (auto& d : pshape) { - if (d.is_dynamic()) { - d = 0; + // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 + if (new_layout.is_dynamic()) { + auto pshape = new_layout.get_partial_shape(); + for (auto& d : pshape) { + if (d.is_dynamic()) { + d = 0; + } } + new_layout.set_partial_shape(pshape); } - new_layout.set_partial_shape(pshape); - } - variable.set_layout(new_layout); + variable.set_layout(new_layout); - if (!_impl_params->state_layout.has_value() || _impl_params->state_layout.value() != new_layout) { - _impl_params->state_layout = new_layout; - input_shape_changed = true; + if (_impl_params->state_layouts[layout_idx] != new_layout) { + _impl_params->state_layouts[layout_idx] = new_layout; + GPU_DEBUG_TRACE_DETAIL << "Update " << layout_idx << " layout: " << new_layout.to_short_string() << "\n"; + input_shape_changed = true; + } + }; + + if (_impl_params->state_layouts.empty()) + _impl_params->state_layouts.resize(1); + + // Initial variable shape is taken from variable itself + auto new_layout = variable.get_layout(); + update_state_layout(variable, new_layout, 0); + + // // If variable is not set and we have an initializer - use it's shape as shape of variable + // if (!variable.is_set() && _impl_params->input_layouts.size() >= 1) { + // new_layout = _impl_params->get_input_layout(0); + // } + + // // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 + // auto replace_dynamic_dims = [](layout& layout) { + // if (layout.is_dynamic()) { + // auto pshape = layout.get_partial_shape(); + // for (auto& d : pshape) { + // if (d.is_dynamic()) { + // d = 0; + // } + // } + // layout.set_partial_shape(pshape); + // } + // }; + // replace_dynamic_dims(new_layout); + + // GPU_DEBUG_TRACE_DETAIL << id() << " set new layout " << new_layout.to_short_string() << "\n"; + // variable.set_layout(new_layout); + + // if (_impl_params->state_layouts.empty()) { + // _impl_params->state_layouts.resize(1); + // } + + // if (_impl_params->state_layouts[0] != new_layout) { + // _impl_params->state_layouts[0] = new_layout; + // input_shape_changed = true; + // } + + if (prim->num_outputs > 1) { + // _impl_params->state_layouts.resize(2); + GPU_DEBUG_TRACE_DETAIL << "This readvalue check\n"; + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + 
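+                // For a compressed KV-cache the variable is a multi-tensor state: state_layouts[0] tracks the KV data,
+                // state_layouts[1] the compression scales, and state_layouts[2] the zero points (the latter exists only
+                // for asymmetric quantization when scales and zero points are not combined into one buffer).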
_impl_params->state_layouts.resize(compressed_cache_variable->has_zp_state() ? 3 : 2); + GPU_DEBUG_TRACE_DETAIL << "Has multi tensor variable" << _impl_params->state_layouts.size() << "!\n"; + + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + auto new_scales_layout = compressed_cache_variable->get_compression_scale_state()->get_layout(); + update_state_layout(*scales_state, new_scales_layout, 1); + + if (compressed_cache_variable->has_zp_state()) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + auto new_zp_layout = compressed_cache_variable->get_compression_zp_state()->get_layout(); + update_state_layout(*zp_state, new_zp_layout, 2); + + GPU_DEBUG_TRACE_DETAIL << "Set new zp layout " << new_zp_layout.to_short_string() << "\n"; + GPU_DEBUG_TRACE_DETAIL << "states[2] " << _impl_params->state_layouts[2].to_short_string() << "\n"; + } + } } } @@ -460,6 +522,17 @@ void primitive_inst::update_shape() { auto& variable = get_network().get_variable(desc->variable_id); // Custom output layout update as update_output_layout handles paddings incorrectly for optimized out read_value + kv_cache pattern _impl_params->output_layouts[0] = variable.get_layout(); + GPU_DEBUG_TRACE_DETAIL << "Update ReadValue output0 " << _impl_params->output_layouts[0].to_short_string() << "\n"; + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _impl_params->output_layouts[1] = compressed_cache_variable->get_compression_scale_state()->get_layout(); + GPU_DEBUG_TRACE_DETAIL << "Update ReadValue output1 " << _impl_params->output_layouts[1].to_short_string() << "\n"; + + if (compressed_cache_variable->has_zp_state()) { + GPU_DEBUG_TRACE_DETAIL << "Update ReadValue output2 " << _impl_params->output_layouts[1].to_short_string() << "\n"; + _impl_params->output_layouts[2] = compressed_cache_variable->get_compression_zp_state()->get_layout(); + } + } } if (get_node().is_type()) { @@ -542,6 +615,15 @@ event::ptr primitive_inst::realloc_if_needed() { << ", variable layout " << variable.get_layout().to_short_string() << ")" << std::endl; _outputs[0] = variable.get_memory(); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _outputs[2] = compressed_cache_variable->get_compression_scale_state()->get_memory(); + + if (compressed_cache_variable->has_zp_state()) { + _outputs[3] = compressed_cache_variable->get_compression_zp_state()->get_memory(); + } + } + // To record shape predictor for (size_t j = 0; j < _impl_params->output_layouts.size(); ++j) sp.predict_preallocation_shape(id(), _impl_params->output_layouts[j], true, j); @@ -644,7 +726,12 @@ event::ptr primitive_inst::realloc_if_needed() { // dynamic quantization is only applied to activation of FC if (get_node().is_type()) { - auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], 0); + const auto& desc = get_node().as().get_primitive(); + auto dyn_quan_scale_layout = + dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], + desc->quantization_config, + desc->scales_zp_output_order, + desc->combine_scales_and_zp); GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " << dyn_quan_scale_layout[1].to_short_string() << std::endl; updated_params.output_layouts[1] = dyn_quan_scale_layout[1]; @@ -688,13 +775,19 @@ event::ptr primitive_inst::realloc_if_needed() { for (size_t i = 0; i < updated_layouts.size(); ++i) { bool reclaim = 0; size_t required_buffer_size = 0; - if (_node->is_type() && i == 0) 
{ + if (_node->is_type() && (i == 0 || i == 2 || i == 3)) { // Relax reclaiming condition for kv cache const auto& desc = _node->as().get_primitive(); auto prealloc_shape = updated_layouts[i].get_shape(); const auto shape_rank = prealloc_shape.size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + auto seq_axis = 0; + if (i == 0) { + // seq_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank); + seq_axis = static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + } else if (i == 2 || i == 3) { + seq_axis = 2; + } + prealloc_shape[seq_axis] += tmp_prealloc_count; required_buffer_size = std::accumulate(prealloc_shape.begin(), prealloc_shape.end(), size_t(1), std::multiplies()); } else { @@ -720,12 +813,19 @@ event::ptr primitive_inst::realloc_if_needed() { for (size_t i = 0; i < actual_layouts.size(); ++i) { bool can_reuse_buffer = (_outputs[i] && updated_layouts[i].get_linear_size() <= _max_output_layout_count[i]); + std::pair prealloc_info; - if (_node->is_type() && i == 0) { + if (_node->is_type() && (i == 0 || i == 2 || i == 3)) { const auto& desc = _node->as().get_primitive(); auto shape_rank = updated_layouts[i].get_shape().size(); - auto seq_axis = - static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + auto seq_axis = 0; + if (i == 0) { + // seq_axis = static_cast(desc->concat_axis >= 0 ? desc->concat_axis : shape_rank + desc->concat_axis); + seq_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, shape_rank); + } else if (i == 2 || i == 3) { + seq_axis = 2; + } + prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], false, i, tmp_prealloc_count, seq_axis); } else { prealloc_info = sp.predict_preallocation_shape(id(), updated_layouts[i], can_reuse_buffer, i, tmp_prealloc_count); @@ -741,19 +841,20 @@ event::ptr primitive_inst::realloc_if_needed() { GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer[" << i << "] - " << actual_layouts[i].get_linear_size() << "/" << _max_output_layout_count[i] << std::endl; - if (_node->is_type() && (i == 0)) { + if (_node->is_type() && (i == 0 || i == 2 || i == 3)) { // kv_cache has already assigned memory. // No need to reinterpret output memory but need to update padding const auto& desc = _node->as().get_primitive(); auto& present_layout = _impl_params->output_layouts[i]; const auto present_layout_rank = present_layout.get_partial_shape().size(); - const auto sequence_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank); + const auto sequence_axis = i == 0 ? kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank) : 2; + GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << present_layout.to_short_string() << " " << _max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(present_layout, _max_output_layout_count[i], sequence_axis, - "present_layout"); + i == 0 ? "present_layout" : "present_scales_layout"); kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); - GPU_DEBUG_TRACE_DETAIL << _impl_params->output_layouts[i].to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << i << ". 
" << _impl_params->output_layouts[i].to_string() << std::endl; set_shape_change(); } else { _outputs[i] = _network.get_engine().reinterpret_buffer(*_outputs[i], actual_layouts[i]); @@ -809,12 +910,35 @@ event::ptr primitive_inst::realloc_if_needed() { } if (present_layout.data_padding._dynamic_dims_mask[sequence_axis] == 1) { // Apply padding of variable to make it be optimized in the next iteration + GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << present_layout.to_short_string() << " " << _max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(present_layout, _max_output_layout_count[0], sequence_axis, "present_layout"); if (max_pad > 0) { + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + const auto sequence_axis = 2; + GPU_DEBUG_TRACE_DETAIL << id() << " is kv_cache => set the variable with newly allocated output memory" + << std::endl; + + kv_cache_inst::update_pad(present_scales_layout, max_pad, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "Updated scales pad (" << max_pad << " " << sequence_axis << "): " << present_scales_layout.to_string() << "\n"; + if (!axis_is_outer_most) { + _impl_params->output_layouts[2] = present_scales_layout; + } + + compressed_cache_variable->get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_memory(_outputs[3], present_zp_layout); + } + } + kv_cache_inst::update_pad(present_layout, max_pad, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "Updated data pad (" << max_pad << " " << sequence_axis << "): " << present_layout.to_string() << "\n"; if (!axis_is_outer_most) { GPU_DEBUG_TRACE_DETAIL << id() << ": Update impl with new output padding" << std::endl; set_shape_change(); @@ -834,12 +958,34 @@ event::ptr primitive_inst::realloc_if_needed() { << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_memory(_outputs[0], present_layout); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + compressed_cache_variable->get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_memory(_outputs[3], present_zp_layout); + } + } } } else { GPU_DEBUG_TRACE_DETAIL << id() << ": Update variable " << variable.get_name() << "'s layout with allocated kv cache output: " << present_layout.to_short_string() << " (is_set = " << variable.is_set() << ") " << std::endl; variable.set_layout(present_layout); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto present_scales_layout = _impl_params->output_layouts[2]; + + compressed_cache_variable->get_compression_scale_state()->set_layout(present_scales_layout); + if (compressed_cache_variable->has_zp_state()) { + auto present_zp_layout = present_scales_layout; + compressed_cache_variable->get_compression_zp_state()->set_layout(present_zp_layout); + } + } } } @@ -1202,6 +1348,13 @@ void 
primitive_inst::do_runtime_in_place_kv_cache() { return; } const auto& desc = _node->as().get_primitive(); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Original layouts\n"; + GPU_DEBUG_TRACE_DETAIL << _impl_params->input_layouts[0] << "\n"; + GPU_DEBUG_TRACE_DETAIL << _impl_params->input_layouts[3] << "\n"; + } + auto& past_layout = _impl_params->input_layouts[0]; auto& new_layout = _impl_params->input_layouts[1]; auto& present_layout = _impl_params->output_layouts[0]; @@ -1220,6 +1373,7 @@ void primitive_inst::do_runtime_in_place_kv_cache() { GPU_DEBUG_TRACE_DETAIL << "[do runtime kv_cache opt] " << id() << " initial present_layout : " << present_layout.to_string() << std::endl; GPU_DEBUG_TRACE_DETAIL << "[do runtime kv_cache opt] " << id() << " initial past_layout : " << past_layout.to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << past_layout.to_short_string() << " " << _deps[0].first->_max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(past_layout, _deps[0].first->_max_output_layout_count[0], sequence_axis, "past_layout"); const auto new_seq_len = static_cast(new_layout.get_shape()[sequence_axis]); // In chatbot scenario, when chat history must be stored in kvcache, new_seq_len may not be 1 even if max_pad is greater than 0 @@ -1228,11 +1382,45 @@ void primitive_inst::do_runtime_in_place_kv_cache() { GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_layout's pad : " << present_layout.to_string() << std::endl; auto& variable = get_network().get_variable(desc->variable_info.variable_id); variable.set_layout(present_layout); + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[1]!\n"; + auto compressed_cache_variable = dynamic_cast(&variable); + auto& present_scales_layout = _impl_params->output_layouts[2]; + const auto sequence_axis = 2; + kv_cache_inst::update_pad(present_scales_layout, max_pad - new_seq_len, sequence_axis); + GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_scale_layout's pad : " << present_scales_layout.to_string() << std::endl; + + compressed_cache_variable->get_compression_scale_state()->set_layout(present_scales_layout); + if (_impl_params->input_layouts.size() > 4) { + auto& present_zp_layout = _impl_params->output_layouts[3]; + kv_cache_inst::update_pad(present_zp_layout, max_pad - new_seq_len, sequence_axis); + + compressed_cache_variable->get_compression_zp_state()->set_layout(present_scales_layout); + } + } + GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << "Updated variable with present_layout" << variable.get_layout().to_string() << " is_set = " << variable.is_set() << std::endl; if (past_layout.data_padding._upper_size[sequence_axis] > 0 && variable.is_set()) { kv_cache_inst::update_pad(past_layout, max_pad, sequence_axis); _impl_params->_can_be_optimized = true; + + GPU_DEBUG_TRACE_DETAIL << "Updated data layout (" << max_pad << " " << sequence_axis << "): " << _impl_params->input_layouts[0] << "\n"; + + if (desc->compressed) { + GPU_DEBUG_TRACE_DETAIL << "Compressed case[2]!\n"; + auto& past_scale_layout = _impl_params->input_layouts[3]; + const auto sequence_axis = 2; + kv_cache_inst::update_pad(past_scale_layout, max_pad, sequence_axis); + + if (_impl_params->input_layouts.size() >= 4) { + auto& past_zp_layout = _impl_params->input_layouts[4]; + const auto sequence_axis = 2; + kv_cache_inst::update_pad(past_zp_layout, max_pad, sequence_axis); + } + 
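+            // The compressed past scale/zp layouts use sequence axis 2, so they receive the same upper padding as the
+            // KV data above; this keeps the in-place optimization consistent for the compressed buffers and lets the
+            // dynamic quantization kernel append new tokens into the preallocated space.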
GPU_DEBUG_TRACE_DETAIL << "Updated scales layout (" << max_pad << " " << sequence_axis << "): " << _impl_params->input_layouts[3] << "\n"; + } GPU_DEBUG_TRACE_DETAIL << "[do_runtime_in_place_kv_cache] " << id() << " Updated past layout's pad : " << past_layout.to_string() << std::endl; } } @@ -1883,6 +2071,9 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool _dynamic_impl = _impl->clone(); } } + if (_node) { + GPU_DEBUG_TRACE_DETAIL << _node->type()->to_string(*_node) << "\n"; + } _impl_params->strm = _network.get_stream_ptr(); for (size_t i = 0; i < get_node().get_output_layouts().size(); ++i) { if (_outputs.size() > i) { diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 3c21800c66d938..047e04f88d7e8a 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -90,7 +90,8 @@ void program_node::replace_dependency(size_t idx, std::pair const program_node::get_input_layouts() const { std::vector layouts; for (size_t i = 0; i < dependencies.size(); i++) { - layouts.push_back(get_input_layout(i)); + auto input_layout = get_input_layout(i); + layouts.push_back(input_layout); } return layouts; } @@ -434,10 +435,15 @@ layout program_node::get_non_padded_output_layout(bool invalidate_users_if_chang } bool program_node::set_output_layout(layout& new_layout, bool invalidate_users_if_changed, size_t idx) { + // GPU_DEBUG_TRACE_DETAIL << "TEST: " << padding::max(new_layout.data_padding, output_layouts[idx].data_padding)._dynamic_dims_mask << "\n"; + merge_output_padding(new_layout.data_padding, idx); + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[1] " << new_layout.to_string() << "\n"; + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[2] " << output_layouts[idx].data_padding._dynamic_dims_mask << "\n"; OPENVINO_ASSERT(idx < output_layouts.size(), id(), " has invalid index : index is ", std::to_string(idx), " but output_layouts length is ", std::to_string(output_layouts.size())); new_layout.data_padding = output_layouts[idx].data_padding; + // GPU_DEBUG_TRACE_DETAIL << "Merged padding[3] " << new_layout.to_string() << "\n"; bool changed = (new_layout != output_layouts[idx]); if (changed && invalidate_users_if_changed) // output_layout has changed! 
invalidate users invalidate_users(); diff --git a/src/plugins/intel_gpu/src/graph/read_value.cpp b/src/plugins/intel_gpu/src/graph/read_value.cpp index bf6e730e8a808b..21518d60604b65 100644 --- a/src/plugins/intel_gpu/src/graph/read_value.cpp +++ b/src/plugins/intel_gpu/src/graph/read_value.cpp @@ -2,8 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // -#include +#include "read_value_inst.h" #include "primitive_type_base.h" + +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" + #include #include @@ -16,7 +19,7 @@ read_value_inst::typed_primitive_inst(network& network, const read_value_node& n } layout read_value_inst::calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param) { - return impl_param.typed_desc()->output_layout; + return impl_param.typed_desc()->output_layouts[0]; } std::string read_value_inst::to_string(const read_value_node& node) { @@ -45,5 +48,37 @@ void read_value_inst::update_output_memory() { GPU_DEBUG_TRACE_DETAIL << " - layout " << variable.get_layout().to_string() << std::endl; GPU_DEBUG_TRACE_DETAIL << " - actual_size " << variable.get_actual_mem_size() << " bytes" << std::endl; set_output_memory(variable.get_memory(), false, 0); + + auto desc = _impl_params->typed_desc(); + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + auto scales_state = compressed_cache_variable->get_compression_scale_state(); + set_output_memory(scales_state->get_memory(), false, 1); + + GPU_DEBUG_TRACE_DETAIL << id() << " Update output memory with variable " << scales_state->get_name() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - ptr : " << scales_state->get_memory()->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - layout " << scales_state->get_layout().to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - actual_size " << scales_state->get_actual_mem_size() << " bytes" << std::endl; + + if (compressed_cache_variable->has_zp_state()) { + auto zp_state = compressed_cache_variable->get_compression_zp_state(); + set_output_memory(zp_state->get_memory(), false, 2); + + GPU_DEBUG_TRACE_DETAIL << id() << " Update output memory with variable " << zp_state->get_name() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - ptr : " << zp_state->get_memory()->buffer_ptr() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - layout " << zp_state->get_layout().to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << " - actual_size " << zp_state->get_actual_mem_size() << " bytes" << std::endl; + } + } + // if (desc->compressed) { + // auto multi_tensor_variable = downcast(variable); + // auto scales_variable = multi_tensor_variable.get_compression_scale_state(); + + // GPU_DEBUG_TRACE_DETAIL << id() << " Update output memory with variable " << scales_variable->get_name() << std::endl; + // GPU_DEBUG_TRACE_DETAIL << " - ptr : " << scales_variable->get_memory()->buffer_ptr() << std::endl; + // GPU_DEBUG_TRACE_DETAIL << " - layout " << scales_variable->get_layout().to_string() << std::endl; + // GPU_DEBUG_TRACE_DETAIL << " - actual_size " << scales_variable->get_actual_mem_size() << " bytes" << std::endl; + // set_output_memory(scales_variable->get_memory(), false, 1); + // } + GPU_DEBUG_TRACE_DETAIL << id() << variable_id() << std::endl; } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp index e8e213ad97011a..14aea98822a393 100644 --- a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp +++ 
b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp @@ -87,6 +87,14 @@ std::string scaled_dot_product_attention_inst::to_string(scaled_dot_product_atte json_composite scaled_dot_product_attention_info; scaled_dot_product_attention_info.add("input id", input.id()); scaled_dot_product_attention_info.add("is_causal", desc->is_causal); + scaled_dot_product_attention_info.add("is_kv_compressed", desc->is_kv_compressed); + scaled_dot_product_attention_info.add("combine_scales_and_zp", desc->combine_scales_and_zp); + scaled_dot_product_attention_info.add("group_size", desc->quantization_config.group_sizes); + scaled_dot_product_attention_info.add("is_asymmetric_quantization", desc->quantization_config.is_asymmetric_quantization()); + scaled_dot_product_attention_info.add("quantization_dt", desc->quantization_config.quantization_dt); + scaled_dot_product_attention_info.add("scale_dt", desc->quantization_config.scale_dt); + scaled_dot_product_attention_info.add("zp_dt", desc->quantization_config.zp_dt); + scaled_dot_product_attention_info.add("indirect_axis", desc->indirect_axis); scaled_dot_product_attention_info.add("has_attn_mask_input", desc->has_attn_mask_input); scaled_dot_product_attention_info.add("has_scale_input", desc->has_scale_input); scaled_dot_product_attention_info.add("input_q_transpose_order", desc->input_q_transpose_order); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl index 628bc69f3886df..073061789c597e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt.cl @@ -16,7 +16,7 @@ #define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x) REQD_SUB_GROUP_SIZE(SIMD) -KERNEL(dynamic_quantize_gpu_opt)( +KERNEL(dynamic_quantize_gpu_opt_generic)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl new file mode 100644 index 00000000000000..9d4965be101d69 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl @@ -0,0 +1,130 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/fetch_data.cl" +#include "include/batch_headers/common.cl" +#include "include/batch_headers/sub_group_block_read.cl" +#include "include/batch_headers/sub_group_block_write.cl" +#include "include/batch_headers/sub_group_shuffle.cl" + + +#if OUTPUT_DIMS != 4 +#error "dynamic_quantize_gpu_opt.cl: Unsupported output dimension" +#endif + +#define VLOAD_N CAT(vload, VEC_SIZE) +#define VSTORE_N CAT(vstore, VEC_SIZE) +#define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE) +#define AS_TYPE_N_(type, n, x) as_##type##n(x) +#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x) +#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x) + + +inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { + return OUTPUT1_GET_INDEX(b, f, y, x); +} + +inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { +#ifdef SCALES_OUTPUT_ORDER + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR 
SCALES_OUTPUT_ORDER); +#else + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#endif +} + +#define SUBGROUP_SIZE 16 +#define INNERMOST_DIM_VALUE INPUT0_SIZE_X +#define INPUT_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT0_TYPE, 1, ptr, offset) +#define OUTPUT_BLOCK_WRITE(ptr, offset, val) BLOCK_WRITEN(OUTPUT_TYPE, 1, ptr, offset, val) + +__attribute__((reqd_work_group_size(SUBGROUP_SIZE, SUBGROUPS_NUMBER, 1))) +REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +KERNEL(dynamic_quantize_gpu_opt_generic)( + OPTIONAL_SHAPE_INFO_ARG + const __global INPUT0_TYPE* input, + __global OUTPUT_TYPE* output, + __global OUTPUT1_TYPE* output_scale +#if ASYMMETRIC_QUANTIZATION && !GROUP_SCALES_WITH_ZP + , __global OUTPUT2_TYPE* output_zp +#endif +#ifdef APPEND_MODE + , const uint axis_offset +#endif + ) +{ + const uint sglid = get_sub_group_local_id(); + const uint grouped_indexes = get_global_id(1); + const uint batch_indexes = get_global_id(2); + + DECLARE_BATCHED_DIMS_INDEXES(batch_indexes); + DECLARE_GROUPED_DIMS_INDEXES(grouped_indexes); + + // the innermost dimension is always handled in the loop inside the kernel + const uint x = 0; + + half max_value = INPUT0_VAL_MIN; + half min_value = INPUT0_VAL_MAX; + + half val[INNERMOST_DIM_VALUE / SUBGROUP_SIZE]; + + const uint input_offset = INPUT0_GET_INDEX(b, f, y, x); + unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) { + val[i] = INPUT_BLOCK_READ(input, input_offset + i * SUBGROUP_SIZE); +#if ASYMMETRIC_QUANTIZATION + max_value = fmax(max_value, val[i]); + min_value = fmin(min_value, val[i]); +#else + max_value = fmax(max_value, fabs(val[i])); +#endif + } + +#if ASYMMETRIC_QUANTIZATION + min_value = work_group_reduce_min(min_value); + max_value = work_group_reduce_max(max_value); + OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value)); + OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_value * scale) - CHAR_MAX; +#else + max_value = work_group_reduce_max(max_value); + OUTPUT1_TYPE scale = 127.0h / max_value; +#endif + +#ifdef APPEND_MODE + APPEND_AXIS_NAME += axis_offset; +#endif + + const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x); + unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) { +#if ASYMMETRIC_QUANTIZATION + OUTPUT_TYPE res = convert_char(val[i] * scale + zp); +#else + OUTPUT_TYPE res = convert_char(val[i] * scale); +#endif + OUTPUT_BLOCK_WRITE(output, output_offset + i * SUBGROUP_SIZE, res); + } + + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); + + if (grouped_indexes == 0 && sglid == 0) { +#ifdef APPEND_MODE +#if GROUP_SCALES_WITH_ZP + // half result0 = (convert_half(convert_char(val[0] * scale + zp)) - zp) * (1.0h / scale); + // half result1 = (convert_half(convert_char(val[1] * scale + zp)) - zp) * (1.0h / scale); + // half result2 = (convert_half(convert_char(val[2] * scale + zp)) - zp) * (1.0h / scale); + // half result3 = (convert_half(convert_char(val[3] * scale + zp)) - zp) * (1.0h / scale); + // printf("Save scale_idx=%d, axis_offset=%d; scale=%f; zp=%f, min=%f, max=%f; orig=(%f %f %f %f), compressed=(%d %d %d %d), decompressed=(%f %f)\n", scale_idx, axis_offset, scale, zp, min_value, max_value, + // val[0], val[1], val[2], val[3], + // convert_char(val[0] * scale + zp), convert_char(val[1] * scale + zp), convert_char(val[2] * scale + zp), convert_char(val[3] * scale + zp), + // result0, + // result1); +#endif +#endif + output_scale[scale_idx] = 1.0h / scale; +#if ASYMMETRIC_QUANTIZATION && GROUP_SCALES_WITH_ZP 
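+    // When scales and zero points share one buffer (GROUP_SCALES_WITH_ZP), the zero point is stored right after
+    // its scale. The value stored above is 1/scale, so the consumer dequantizes as x ~= (q - zp) * stored_scale.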
+ output_scale[scale_idx + 1] = zp; +#elif ASYMMETRIC_QUANTIZATION + output_zp[scale_idx] = zp; +#endif + } +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl index 436276a67e48c0..d771cb7110be8f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl @@ -8,6 +8,59 @@ #error "dynamic_quantize_gpu_ref.cl: Unsupported output dimension" #endif +/* +TODO: check this coniguration: +GPU_Debug: primitive_inst.cpp:1921:primitive_inst: +{ + dynamic_quantize info : + { + scale dt : f16, + activation dt : i8, + group size : 1,18446744073709551615,1,18446744073709551615, + } + implementation : dynamic_quantize_gpu_ref, + cl dump_ info : + { + kernel_entry : dynamic_quantize_gpu_ref_17256100832148678061_0_0__sa, + batch_hash : 8176231137263359740, + } + ptr : node_213002368, + id : dynamicquantize:__module.model.layers.9.self_attn/aten::cat/Concat_3_init_dyn_quan, + optimized : false, + type : dynamic_quantize, + valid output layout : true, + output layouts : + { + 1 : f16:bfyx:?x1x0x1:nopad, + 0 : i8:bfyx:?x32x0x128:nopad, + } + dependant_shape_of_nodes_ids : , + fused primitives : + { + } + constant : false, + in_shape_of_subgraph : 0, + in data flow : true, + output : false, + preferred impl : any, + dependencies : 219236192(0), + users : 207254784,207254784, +} + + */ + +inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { + return OUTPUT1_GET_INDEX(b, f, y, x); +} + +inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { +#ifdef SCALES_OUTPUT_ORDER + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR SCALES_OUTPUT_ORDER); +#else + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#endif +} + KERNEL(dynamic_quantize_gpu_ref)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, @@ -15,14 +68,27 @@ KERNEL(dynamic_quantize_gpu_ref)( __global OUTPUT1_TYPE* output_scale) { const uint bf = (uint)get_global_id(0); - const uint b = (uint)get_global_id(0) / INPUT0_FEATURE_NUM; - const uint f = (uint)get_global_id(0) % INPUT0_FEATURE_NUM; + const uint b = bf / INPUT0_FEATURE_NUM; + const uint f = bf % INPUT0_FEATURE_NUM; const uint y = (uint)get_global_id(1); - const uint scale_idx = OUTPUT1_GET_INDEX(b, f, y, 0); + const uint x = (uint)get_global_id(2); +#ifdef SCALES_OUTPUT_ORDER + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#else + const uint scale_idx = OUTPUT1_GET_INDEX_SAFE(b, f, y, x); +#endif half max_val = 0.0001h; - for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { - const uint offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); + for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { + for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 
1 : INPUT0_SIZE_Y); y_off++) { +#if GROUP_SIZE_DIM3 == 1 + const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + half val = input[offset]; + half abs_val = fabs(val); + max_val = fmax(max_val, abs_val); +#else + const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + offset + x * 8)); @@ -34,21 +100,40 @@ KERNEL(dynamic_quantize_gpu_ref)( x *= 8; for (; x < INPUT0_SIZE_X; x++) max_val = fmax(max_val, fabs(input[offset + x])); +#endif + } + } } half scale = 127.0h / max_val; - for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { - const uint in_offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); - const uint out_offset = OUTPUT_GET_INDEX(b, f, y + y_off, 0); + for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { + for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 1 : INPUT0_SIZE_Y); y_off++) { +#if GROUP_SIZE_DIM3 == 1 + const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + + half val = input[in_offset]; + val *= scale; + output[out_offset] = convert_char(val); +#else + const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); + const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + in_offset + x * 8)); val *= scale; + // TODO: why it's _rtz instead of _rte? vstore8(convert_char8(val), 0, output + out_offset + x * 8); + // vstore8(convert_char8_rte(val), 0, output + out_offset + x * 8); } x *= 8; for (; x < INPUT0_SIZE_X; x++) output[out_offset + x] = convert_char(input[in_offset + x] * scale); + // output[out_offset + x] = convert_char_rte(input[in_offset + x] * scale); +#endif + } + } } output_scale[scale_idx] = 1.0h / scale; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 948bd3c0f1a305..f9aa12aa257eb0 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -17,6 +17,8 @@ // max_logits [batch, heads_num, q_len, partition_idx] // tmp_out [batch, heads_num, q_len, partition_idx, head_size] +ulong __attribute__((overloadable)) intel_get_cycle_counter( void ); + inline uint FUNC(get_input0_index_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint w, uint z, uint y, uint x) { #if INPUT0_SIMPLE return GET_DATA_INDEX_6D(INPUT0, b, f, w, z, y, x); @@ -124,6 +126,7 @@ inline uint FUNC(get_bt_index_value)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uin /* This version is used for 2nd token */ REQD_SUB_GROUP_SIZE(SUBGROUP_SIZE) +__attribute__((reqd_work_group_size(1, 1, HEAD_SIZE * SG_SCALE_FACTOR))) KERNEL(sdpa_opt)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* query_input, @@ -136,6 +139,10 @@ KERNEL(sdpa_opt)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -149,7 +156,14 @@ KERNEL(sdpa_opt)( const uint b1_idx = batch_idx % NUM_HEADS; 
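    /* KV-cache compression note (an illustrative summary of the offset computations used below,
       not a normative layout description): with IS_KV_COMPRESSED the key/value inputs hold INT8
       data and key_scale/val_scale hold per-token f16 dequantization parameters, indexed as
       (batch, kv-head or 0, token, 0):
         - COMPRESSED_PER_HEAD: one entry per (batch, kv head, token); the query head index is
           divided by BROADCAST_GROUP_SIZE so grouped query heads share their KV head's scale;
         - otherwise: one entry per (batch, token);
         - with USE_ASYMMETRIC_QUANTIZATION and combined storage, the zero point is stored right
           after the scale (offset + 1).
       Dequantization is x ~= (q - zp) * scale. */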
/* HEADS_NUM dim */ const uint target_seq_idx = get_global_id(1); const uint lid = get_local_id(2); + + #if SG_SCALE_FACTOR == 2 + const uint head_size_idx = lid % HEAD_SIZE; + #elif SG_SCALE_FACTOR == 1 const uint head_size_idx = lid; + #else + #error "Unsupported scale factor" + #endif const uint sgid = get_sub_group_id(); const uint sglid = get_sub_group_local_id(); @@ -199,13 +213,19 @@ KERNEL(sdpa_opt)( uint query_offset = INPUT0_GET_INDEX(b0_idx, b1_idx, target_seq_idx, (sgid * SUBGROUP_SIZE)); const uint query_pitch = QUERY_STEP_LOCAL; #endif - for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { - #define QUERY_BLOCK_SIZE 1 + #if SG_SCALE_FACTOR == 2 + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { + #else + { + #endif + for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { + #define QUERY_BLOCK_SIZE 1 - INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); - query_local[query_local_offset] = val * scale_val; - query_local_offset += QUERY_STEP_LOCAL; - query_offset += query_pitch; + INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, QUERY_BLOCK_SIZE, query_input, query_offset); + query_local[query_local_offset] = val * scale_val; + query_local_offset += QUERY_STEP_LOCAL; + query_offset += query_pitch; + } } #undef QUERY_BLOCK_SIZE #undef QUERY_STEP @@ -216,28 +236,69 @@ KERNEL(sdpa_opt)( // Main Gemm1 calculation loop // Each SG performs element-wise multiplications of Q[HEAD_SIZE]xK[HEAD_SIZE] values // HEAD_SIZE / SUBGROUPS_PER_WG times in the loop and saves the result to the qk_local SLM buffer - for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE / SUBGROUP_SIZE)) { + for (uint seq_len = sgid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE / SUBGROUP_SIZE) * SG_SCALE_FACTOR) { #ifdef INPUT1_DIMS_ORDER #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_key)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0)]; #else const uint b_idx = b0_idx; #endif - const uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); + uint key_offset = FUNC_CALL(get_input1_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, 0); #else - const uint key_offset = INPUT1_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, 0); + uint key_offset = INPUT1_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, 0); #endif + // if (start_partition_idx < 8096) + // key_offset = 0; INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; +#if IS_KV_COMPRESSED +#ifndef INPUT1_DIMS_ORDER + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, b1_idx, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); + +#else + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, 0, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + seq_len, 0); +#endif + KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE key_comp_zp = key_scale[key_scale_comp_offset + 1]; +#endif +#endif + + // ulong timer_start = intel_get_cycle_counter(); uint head_idx_index = 0; #define KEY_BLOCK_SIZE 8 for (; head_idx_index 
+ (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + +#if IS_KV_COMPRESSED + #define KEY_BLOCK_READ_TEST(ptr, offset) BLOCK_READN(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define KEY_BLOCK_NEW MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + + + #define KEY_BLOCK_READ_TEST1(ptr, offset) KEY_BLOCK_READ(ptr, offset) + #define KEY_BLOCK_NEW1 KEY_BLOCK + // KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + // KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; + KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale; +#else + KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#endif +#else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -258,9 +319,20 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) +#if IS_KV_COMPRESSED + KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale; +#else + KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#endif +#else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -281,9 +353,20 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) +#if IS_KV_COMPRESSED + KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if USE_ASYMMETRIC_QUANTIZATION + KEY_BLOCK_UNCOMPRESSED key_vals = 
(TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale;
+#else
+        KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale;
+#endif
+#else
         KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index);
+#endif
 
         uint query_offset = head_idx_index + sglid;
         unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) {
@@ -304,9 +387,20 @@ KERNEL(sdpa_opt)(
     for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) {
         #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset);
         #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE)
+        #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE)
+        #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val)
         #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE)
 
+#if IS_KV_COMPRESSED
+        KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index);
+#if USE_ASYMMETRIC_QUANTIZATION
+        KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale;
+#else
+        KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale;
+#endif
+#else
         KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index);
+#endif
 
         uint query_offset = head_idx_index + sglid;
         unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) {
@@ -320,6 +414,12 @@
         }
     }
 
+    // ulong timer_end = intel_get_cycle_counter();
+    // ulong time = timer_end - timer_start;
+    // if (batch_idx == 0 && sglid == 0) {
+    //     printf("part=%d, sgid=%d, seq_len=%d: %d\n", partition_idx, sgid, seq_len, time);
+    // }
+
     // Sum up all accumulators accross single SG and save result to SLM
     unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) {
         acc[seq_idx] = sub_group_reduce_add(acc[seq_idx]);
@@ -335,7 +435,7 @@ KERNEL(sdpa_opt)(
         const uint seq_idx_end = 1;
         for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) {
             // Iterate over all values QK values in SLM and apply scale and attention mask
-            for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE)) {
+            for (uint seq_len = sgid * SUBGROUP_SIZE + sglid; seq_len < partition_seq_len; seq_len += (HEAD_SIZE * SG_SCALE_FACTOR)) {
                 // Read value from SLM and apply scale
                 qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len];
 
@@ -378,6 +478,10 @@
         for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) {
             qk_max[seq_idx] = SOFTMAX_ACCUMULATOR_VAL_MIN;
 
+            #if SUBGROUPS_PER_WG > SUBGROUP_SIZE
+                #error "Number of subgroups per work group must not exceed the subgroup size"
+            #endif
+
             if (sglid < SUBGROUPS_PER_WG)
                 qk_max[seq_idx] = qk_max_vals[seq_idx * SUBGROUPS_PER_WG + sglid];
 
@@ -388,7 +492,7 @@
         SOFTMAX_ACCUMULATOR_TYPE exp_sum[TARGET_SEQ_LEN_BLOCK_SIZE] = {SOFTMAX_ACCUMULATOR_VAL_ZERO};
         const uint qk_num_per_wi = CEIL_DIV(partition_seq_len, SUBGROUPS_PER_WG * SUBGROUP_SIZE);
         for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) {
-            const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + head_size_idx;
+            const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + lid;
             if (local_data_idx < partition_seq_len) {
                 for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) {
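                    /* The surrounding two passes over qk_local implement the usual numerically
                     * stable softmax, sketched here for reference:
                     *   m   = max_j qk_j      (per-subgroup maxima reduced through SLM above)
                     *   e_j = exp(qk_j - m)
                     *   sum = sum_j e_j       (per-subgroup sums reduced through SLM below)
                     *   p_j = e_j / sum
                     * When num_of_partitions > 1, m and sum are also stored to max_logits /
                     * exp_sums so the finalization kernel can merge the partitions. */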
SOFTMAX_ACCUMULATOR_TYPE qk_new = native_exp(TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) - qk_max[seq_idx]);
@@ -411,6 +515,10 @@ KERNEL(sdpa_opt)(
         unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) {
             exp_sum[seq_idx] = SOFTMAX_ACCUMULATOR_VAL_ZERO;
 
+            #if SUBGROUPS_PER_WG > SUBGROUP_SIZE
+                #error "Number of subgroups per work group must not exceed the subgroup size"
+            #endif
+
             if (sglid < SUBGROUPS_PER_WG)
                 exp_sum[seq_idx] = qk_sum_vals[seq_idx * SUBGROUPS_PER_WG + sglid];
 
@@ -420,7 +528,7 @@
             // const SOFTMAX_ACCUMULATOR_TYPE inv_exp_sum = SOFTMAX_ACCUMULATOR_VAL_ONE / exp_sum[seq_idx];
             for (uint qk_idx = 0; qk_idx < qk_num_per_wi; qk_idx++) {
-                const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + sgid * SUBGROUP_SIZE + sglid;
+                const uint local_data_idx = qk_idx * (SUBGROUPS_PER_WG * SUBGROUP_SIZE) + lid;
                 if (local_data_idx < partition_seq_len) {
                     for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) {
                         SOFTMAX_ACCUMULATOR_TYPE qk_new = TO_SOFTMAX_ACCUMULATOR_TYPE(qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + local_data_idx]) / exp_sum[seq_idx];
@@ -434,7 +542,7 @@
     {
         // If the number of partitions is greater than 1, save exm_sums and max_logits to the temporary buffers
        // Use single WI in the WG, since all the WIs have the same value
-        if (num_of_partitions > 1 && head_size_idx == 0) {
+        if (num_of_partitions > 1 && lid == 0) {
             for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) {
                 const uint exp_sums_offset = b0_idx * (NUM_HEADS * TARGET_SEQ_LEN * num_of_partitions) +
                                              b1_idx * (TARGET_SEQ_LEN * num_of_partitions) +
@@ -463,8 +571,18 @@
 #endif
 #endif
 
-        for (uint seq_len = 0; seq_len < partition_seq_len / SUBGROUP_SIZE; seq_len++) {
+#if SG_SCALE_FACTOR > 1
+        // SUBGROUPS_PER_WG (HEAD_SIZE * SG_SCALE_FACTOR / SUBGROUP_SIZE)
+        const uint seq_len_start = (sgid / (HEAD_SIZE / SUBGROUP_SIZE)) * (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR / SUBGROUP_SIZE);
+        const uint seq_len_end = min(seq_len_start + (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR / SUBGROUP_SIZE), partition_seq_len / SUBGROUP_SIZE);
+#else
+        const uint seq_len_start = 0;
+        const uint seq_len_end = partition_seq_len / SUBGROUP_SIZE;
+#endif
+
+        for (uint seq_len = seq_len_start; seq_len < seq_len_end; seq_len++) {
 #ifdef BEAM_TABLE_TYPE
+            // TODO: Handle beam search
             uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE)];
             uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, sgid * SUBGROUP_SIZE);
 #else
@@ -473,6 +591,27 @@
 #else
             uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE), head_size_idx);
 #endif
+#endif
+
+            // if (start_partition_idx < 8096)
+            //     value_offset = 0;
+
+#if IS_KV_COMPRESSED
+#ifndef BEAM_TABLE_TYPE
+            const uint b_idx = b0_idx;
+#endif
+#ifdef COMPRESSED_PER_HEAD
+            // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1]
+            // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0);
+            const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx +
(seq_len * SUBGROUP_SIZE) + sglid, 0); +#else + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif #endif OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; @@ -481,10 +620,29 @@ KERNEL(sdpa_opt)( } unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { +#if IS_KV_COMPRESSED + + #define VALUE_BLOCK_READ_TEST(ptr, offset) BLOCK_READN(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE, ptr, offset); + #define VALUE_BLOCK_NEW MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + + #define VALUE_BLOCK_READ_TEST1(ptr, offset) VALUE_BLOCK_READ(ptr, offset) + #define VALUE_BLOCK_NEW1 INPUT2_TYPE +#ifdef BEAM_TABLE_TYPE + INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); +#else + INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, value_offset); +#endif +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_val_compressed - sub_group_broadcast(value_comp_zp, i)) * sub_group_broadcast(value_comp_scale, i); +#else + VALUE_COMPRESSION_SCALE_TYPE value_val = value_val_compressed * sub_group_broadcast(value_comp_scale, i); +#endif +#else #ifdef BEAM_TABLE_TYPE INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); #else INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); +#endif #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); @@ -496,33 +654,86 @@ KERNEL(sdpa_opt)( } } - const uint seq_len_leftovers_start = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; - for (uint seq_len = seq_len_leftovers_start; seq_len < partition_seq_len; seq_len++) { + +#if SG_SCALE_FACTOR > 1 + if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) { +#endif + + for (uint seq_len = (partition_seq_len / SUBGROUP_SIZE) * SUBGROUP_SIZE; seq_len < partition_seq_len; seq_len++) { #ifdef INPUT2_DIMS_ORDER #ifdef BEAM_TABLE_TYPE const uint b_idx = beam_table[FUNC_CALL(get_bt_index_value)(OPTIONAL_SHAPE_INFO_TENSOR b0_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx)]; #else const uint b_idx = b0_idx; #endif - const uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx); + uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1_idx, 0, 0, start_partition_idx + seq_len, head_size_idx); #else const uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, head_size_idx); #endif + // if (start_partition_idx < 8096) + // value_offset = 0; + +#if IS_KV_COMPRESSED +#ifndef INPUT2_DIMS_ORDER + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1] + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len, 0, 0, b1_idx, 0); + const uint value_scale_comp_offset = 
GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); +#else + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len, 0, 0, 0, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + seq_len, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif +#endif + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; } +#if IS_KV_COMPRESSED + INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, value_offset); +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_val_compressed - value_comp_zp) * value_comp_scale; +#else + VALUE_COMPRESSION_SCALE_TYPE value_val = value_val_compressed * value_comp_scale; +#endif +#else INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); +#endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc[seq_idx] = mad(qk_val[seq_idx], value_val, acc[seq_idx]); } } +#if SG_SCALE_FACTOR > 1 + } // if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) +#endif + +#if SG_SCALE_FACTOR > 1 + if ((partition_seq_len > (SEQ_LEN_PARTITION_SIZE / SG_SCALE_FACTOR)) || (partition_seq_len % SUBGROUP_SIZE != 0)) { + if (sgid >= HEAD_SIZE / SUBGROUP_SIZE) { + query_local[head_size_idx] = acc[0]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { + acc[0] += query_local[head_size_idx]; + } + } +#endif + // If the number of partitions is greater than 1, save results to the temporary buffer; // otherwise, save results directly to the main output. 
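    /* SG_SCALE_FACTOR recap (describes the reduction just above): when SG_SCALE_FACTOR == 2 the
     * work group runs HEAD_SIZE * 2 / SUBGROUP_SIZE subgroups, the two halves accumulate Gemm2
     * partial results over different halves of the sequence partition, the upper half parks its
     * partial sums in SLM (query_local is reused as scratch), and the lower half adds them in
     * before writing the final output below. */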
+#if SG_SCALE_FACTOR > 1 + if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { +#endif if (num_of_partitions > 1) { const uint seq_idx_end = 1; for (uint seq_idx = 0; seq_idx < seq_idx_end; seq_idx++) { @@ -542,6 +753,9 @@ KERNEL(sdpa_opt)( output[output_offset] = acc[seq_idx]; } } +#if SG_SCALE_FACTOR > 1 + } // if (sgid < HEAD_SIZE / SUBGROUP_SIZE) { +#endif } // Gemm2 calculation end } @@ -675,6 +889,10 @@ KERNEL(sdpa_opt)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -743,12 +961,18 @@ KERNEL(sdpa_opt)( #endif uint query_local_offset = head_size_idx * TARGET_SEQ_LEN_BLOCK_SIZE; +#if HAS_SCALE_INPUT + const OUTPUT_TYPE scale_val = *scale; +#else + const OUTPUT_TYPE scale_val = TO_OUTPUT_TYPE(STATIC_SCALE_VALUE); +#endif + if (cur_target_seq_len_size != TARGET_SEQ_LEN_BLOCK_SIZE) { if (sgid * SUBGROUP_SIZE < HEAD_SIZE) { for (uint seq_idx = 0; seq_idx < cur_target_seq_len_size; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -759,7 +983,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -769,7 +993,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -780,7 +1004,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < (TARGET_SEQ_LEN_BLOCK_SIZE / SG_SCALE_FACTOR); seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -788,7 +1012,7 @@ KERNEL(sdpa_opt)( unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { INPUT0_TYPE val = BLOCK_READN(INPUT0_TYPE, 1, query_input, query_offset); - slm_query[query_local_offset] = val; + slm_query[query_local_offset] = val * scale_val; query_offset += query_pitch; query_local_offset++; } @@ -862,10 +1086,28 @@ KERNEL(sdpa_opt)( PA_BUFFERS); if (seq_len_calc_size >= SUBGROUP_SIZE) { +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); +#else + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, seq_len + sglid, 0); +#endif + KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; +#if 
USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE key_comp_zp = key_scale[key_scale_comp_offset + 1]; +#endif + // printf("[0]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); +#endif + __attribute__((opencl_unroll_hint(1))) for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); - #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) QUERY_VEC queries_vec; uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; @@ -875,10 +1117,23 @@ KERNEL(sdpa_opt)( } unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { +#if IS_KV_COMPRESSED +#ifdef BEAM_TABLE_TYPE + INPUT1_TYPE key_vals_compressed = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); +#else + INPUT1_TYPE key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); +#endif +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE key_vals = (TO_KEY_COMPRESSION_SCALE_TYPE(key_vals_compressed) - sub_group_broadcast(key_comp_zp, key_row_idx)) * sub_group_broadcast(key_comp_scale, key_row_idx); +#else + KEY_COMPRESSION_SCALE_TYPE key_vals = TO_KEY_COMPRESSION_SCALE_TYPE(key_vals_compressed) * sub_group_broadcast(key_comp_scale, key_row_idx); +#endif +#else #ifdef BEAM_TABLE_TYPE INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); #else INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); +#endif #endif unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { @@ -887,12 +1142,34 @@ KERNEL(sdpa_opt)( } } } else if (seq_len_calc_size > 0) { +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx,b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); +#else + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, seq_len + sglid, 0); +#endif + KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + KEY_COMPRESSION_SCALE_TYPE key_comp_zp = key_scale[key_scale_comp_offset + 1]; +#endif + // printf("[1]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); +#endif __attribute__((opencl_unroll_hint(1))) for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { - #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); - #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset) + #define QUERY_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#if IS_KV_COMPRESSED + #define KEY_VEC_TYPE MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#else + #define KEY_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#endif - QUERY_VEC queries_vec; + QUERY_VEC_TYPE 
queries_vec; uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; unroll_for (uint q_row_idx = 0; q_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; q_row_idx++) { queries_vec[q_row_idx] = slm_query[query_local_offset]; @@ -900,18 +1177,49 @@ KERNEL(sdpa_opt)( } #ifndef LOAD_KEY_LEFTOVERS_IN_CALC_LOOP - QUERY_VEC key_vec = 0; + KEY_VEC_TYPE key_vec = 0; unroll_for (uint key_row_idx = 0; key_row_idx < seq_len_calc_size; key_row_idx++) { + #if IS_KV_COMPRESSED + #ifdef BEAM_TABLE_TYPE + key_vec[key_row_idx] = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index)); + #else + key_vec[key_row_idx] = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); + #endif +#if USE_ASYMMETRIC_QUANTIZATION + key_vec[key_row_idx] = (key_vec[key_row_idx] - sub_group_broadcast(key_comp_zp, key_row_idx)) * sub_group_broadcast(key_comp_scale, key_row_idx); +#else + key_vec[key_row_idx] *= sub_group_broadcast(key_comp_scale, key_row_idx); +#endif + #else #ifdef BEAM_TABLE_TYPE key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); #else key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); + #endif #endif } #endif unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { #ifdef LOAD_KEY_LEFTOVERS_IN_CALC_LOOP + #if IS_KV_COMPRESSED + #ifdef BEAM_TABLE_TYPE + KEY_COMPRESSION_SCALE_TYPE key_vals = 0; + if (key_row_idx < seq_len_calc_size) + key_vals = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index)); + #else + KEY_COMPRESSION_SCALE_TYPE key_vals = 0; + if (key_row_idx < seq_len_calc_size) { + key_vals = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); + // printf("_%d %d %d. Loads key [%d] = %f\n", get_global_id(0), get_global_id(1), get_global_id(2), key_offset + key_row_idx * key_pitch + head_idx_index, key_vals); + } + #endif +#if USE_ASYMMETRIC_QUANTIZATION + key_vals = (key_vals - sub_group_broadcast(key_comp_zp, key_row_idx)) * sub_group_broadcast(key_comp_scale, key_row_idx); +#else + key_vals *= sub_group_broadcast(key_comp_scale, key_row_idx); +#endif + #else #ifdef BEAM_TABLE_TYPE INPUT1_TYPE key_vals = 0; if (key_row_idx < seq_len_calc_size) @@ -921,6 +1229,7 @@ KERNEL(sdpa_opt)( if (key_row_idx < seq_len_calc_size) key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); #endif + #endif #else #define key_vals key_vec[key_row_idx] #endif @@ -933,12 +1242,7 @@ KERNEL(sdpa_opt)( { unroll_for (uint i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { -#if HAS_SCALE_INPUT - const OUTPUT_TYPE scale_val = *scale; -#else - const OUTPUT_TYPE scale_val = TO_OUTPUT_TYPE(STATIC_SCALE_VALUE); -#endif - qk_acc[i] *= scale_val; + #ifdef HAS_ALIBI const int alibi_val = (1 - SOURCE_SEQ_LEN) + seq_len + i; @@ -947,6 +1251,8 @@ KERNEL(sdpa_opt)( qk_acc[i] = INPUT0_MIN_FUNC(INPUT0_MAX_FUNC(qk_acc[i], INPUT0_VAL_MIN), INPUT0_VAL_MAX); + // printf("%d %d %d. 
qk_acc=%v16f\n", get_global_id(0), get_global_id(1), get_global_id(2), qk_acc); + qk_max = SOFTMAX_ACCUMULATOR_MAX_FUNC(qk_max, TO_SOFTMAX_ACCUMULATOR_TYPE(qk_acc[i])); } } @@ -1006,6 +1312,16 @@ KERNEL(sdpa_opt)( barrier(CLK_LOCAL_MEM_FENCE); } + // if (get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0) { + // for (int i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { + // printf("row %d: ", i); + // for (int j = 0; j < 16; j++) { + // printf("%f ", slm_qk_vals[SEQ_LEN_PARTITION_SIZE * i + j]); + // } + // printf("\n"); + // } + // } + { // QK*V calculation MAKE_VECTOR_TYPE(OUTPUT_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) acc_output_res = OUTPUT_VAL_ZERO; @@ -1050,15 +1366,45 @@ KERNEL(sdpa_opt)( qk_val[seq_idx] = slm_qk_vals[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len + sglid]; } +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, b1_idx, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len) + sglid, 0); +#else + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, 0, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + (seq_len) + sglid, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif + // printf("[0]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); +#endif + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { +#define UNCOMPPRESED_VALUE_TYPE half +#define TO_UNCOMPPRESED_VALUE_TYPE(val) convert_half(val) #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i))); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, value_offset)); +#endif + +#if IS_KV_COMPRESSED +#if USE_ASYMMETRIC_QUANTIZATION + value_val = (value_val - sub_group_broadcast(value_comp_zp, i)) * sub_group_broadcast(value_comp_scale, i); +#else + value_val *= sub_group_broadcast(value_comp_scale, i); +#endif #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); } +#undef UNCOMPPRESED_VALUE_TYPE +#undef TO_UNCOMPPRESED_VALUE_TYPE #ifndef BEAM_TABLE_TYPE value_offset += value_pitch; @@ -1093,20 +1439,51 @@ KERNEL(sdpa_opt)( #endif #endif +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); +#else + // const uint 
value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif + // printf("[1]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); +#endif + MAKE_VECTOR_TYPE(OUTPUT_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) qk_val; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { qk_val[seq_idx] = slm_qk_vals[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len * SUBGROUP_SIZE + sglid]; } unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { +#define UNCOMPPRESED_VALUE_TYPE half +#define TO_UNCOMPPRESED_VALUE_TYPE(val) convert_half(val) #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i))); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, value_offset)); #endif + +#if IS_KV_COMPRESSED +#if USE_ASYMMETRIC_QUANTIZATION + value_val = (value_val - sub_group_broadcast(value_comp_zp, i)) * sub_group_broadcast(value_comp_scale, i); +#else + value_val *= sub_group_broadcast(value_comp_scale, i); +#endif +#endif + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); } +#undef UNCOMPPRESED_VALUE_TYPE +#undef TO_UNCOMPPRESED_VALUE_TYPE #ifndef BEAM_TABLE_TYPE value_offset += value_pitch; @@ -1144,17 +1521,47 @@ KERNEL(sdpa_opt)( #endif #endif +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, b1_idx, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len_leftovers_start + sglid, 0); +#else + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, 0, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + seq_len_leftovers_start + sglid, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if USE_ASYMMETRIC_QUANTIZATION + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif + // printf("[2]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); +#endif + for (uint seq_len_idx = 0; seq_len_idx < partition_seq_len - seq_len_leftovers_start; seq_len_idx++) { +#define UNCOMPPRESED_VALUE_TYPE half +#define TO_UNCOMPPRESED_VALUE_TYPE(val) convert_half(val) #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, seq_len_idx)); + 
UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, seq_len_idx))); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, value_offset)); +#endif + +#if IS_KV_COMPRESSED +#if USE_ASYMMETRIC_QUANTIZATION + value_val = (value_val - sub_group_broadcast(value_comp_zp, seq_len_idx)) * sub_group_broadcast(value_comp_scale, seq_len_idx); +#else + value_val *= sub_group_broadcast(value_comp_scale, seq_len_idx); +#endif #endif for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], seq_len_idx), value_val, acc_output_res[seq_idx]); } +#undef UNCOMPPRESED_VALUE_TYPE +#undef TO_UNCOMPPRESED_VALUE_TYPE #ifndef BEAM_TABLE_TYPE value_offset += value_pitch; #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl index 83e3c7c7e9fef1..8efec101151dd2 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl @@ -111,7 +111,9 @@ inline uint FUNC(get_bt_index_value)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uin } #endif + #define APPLY_SCALE_TO_QUERY 1 +#define HAS_KV_CACHE_ZP_INPUT USE_ASYMMETRIC_QUANTIZATION && !COMBINE_SCALES_AND_ZP KERNEL(sdpa_ref)( OPTIONAL_SHAPE_INFO_ARG @@ -125,6 +127,14 @@ KERNEL(sdpa_ref)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if USE_ASYMMETRIC_QUANTIZATION + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#if HAS_KV_CACHE_ZP_INPUT + const __global KEY_COMPRESSION_ZP_TYPE* key_zp, + const __global VALUE_COMPRESSION_ZP_TYPE* val_zp, +#endif +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -162,7 +172,28 @@ KERNEL(sdpa_ref)( #else INPUT0_TYPE q_val = query_input[query_offset]; #endif +#if IS_KV_COMPRESSED + INPUT1_TYPE k_val_comp = key_input[key_offset]; + half k_val = (half)k_val_comp; +#ifdef COMPRESSED_PER_HEAD + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1 / BROADCAST_GROUP_SIZE, s, 0); +#else + // const uint key_scale_comp_offset = s; + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, s, 0); +#endif +#if USE_ASYMMETRIC_QUANTIZATION +#if HAS_KV_CACHE_ZP_INPUT + k_val = (k_val - key_zp[key_scale_comp_offset]) * key_scale[key_scale_comp_offset]; +#else + k_val = (k_val - key_scale[key_scale_comp_offset + 1]) * key_scale[key_scale_comp_offset]; +#endif + +#else + k_val *= key_scale[key_scale_comp_offset]; +#endif +#else INPUT1_TYPE k_val = key_input[key_offset]; +#endif acc += q_val * k_val; } @@ -236,7 +267,27 @@ KERNEL(sdpa_ref)( #endif uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1, 0, 0, s, head_size_idx); +#if IS_KV_COMPRESSED + INPUT2_TYPE __value = value_input[value_offset]; + half value = (half)__value; + #ifdef COMPRESSED_PER_HEAD + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1 / BROADCAST_GROUP_SIZE, s, 0); + #else + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, s, 0); + #endif +#if USE_ASYMMETRIC_QUANTIZATION +#if HAS_KV_CACHE_ZP_INPUT + value = (value - val_zp[value_scale_comp_offset]) * 
val_scale[value_scale_comp_offset];
+#else
+            value = (value - val_scale[value_scale_comp_offset + 1]) * val_scale[value_scale_comp_offset];
+#endif
+#else
+            value *= val_scale[value_scale_comp_offset];
+#endif
+            acc += tmp_buf[tmp_buf_offset] * value;
+#else
             acc += tmp_buf[tmp_buf_offset] * value_input[value_offset];
+#endif
         }
 
         uint output_offset = OUTPUT_GET_INDEX(b0, b1, target_seq_idx, head_size_idx);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp
new file mode 100644
index 00000000000000..6e422f3174cf73
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp
@@ -0,0 +1,327 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "dynamic_quantize_kernel_opt_generic.h"
+#include "kernel_selector_utils.h"
+#include <string>
+
+
+static constexpr size_t subgroup_size = 16;
+
+namespace kernel_selector {
+static Tensor::NDims get_normalized_dims(const DataTensor& tensor) {
+    auto dims = tensor.GetDims();
+    std::reverse(dims.begin(), dims.end());
+
+    return dims;
+}
+
+static size_t get_elements_number_per_batch(const dynamic_quantize_params& params) {
+    const auto& group_sizes = params.group_sizes;
+    const auto& input_dims = get_normalized_dims(params.inputs[0]);
+
+    auto total_elements_number = 1;
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] != UINT64_MAX) {
+            GPU_DEBUG_TRACE_DETAIL << "Multiply " << input_dims[i].v << "\n";
+            total_elements_number *= input_dims[i].v;
+        }
+    }
+
+    return total_elements_number;
+}
+
+static size_t get_elements_number_per_group(const dynamic_quantize_params& params) {
+    const auto& group_sizes = params.group_sizes;
+    const auto& input_dims = get_normalized_dims(params.inputs[0]);
+
+    auto total_elements_number = 1;
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] == UINT64_MAX) {
+            GPU_DEBUG_TRACE_DETAIL << "-> Multiply " << input_dims[i].v << "\n";
+            total_elements_number *= input_dims[i].v;
+        } else {
+            GPU_DEBUG_TRACE_DETAIL << "=> Multiply " << group_sizes[i] << "\n";
+            total_elements_number *= group_sizes[i];
+        }
+    }
+
+    return total_elements_number;
+}
+
+static std::string generate_dims_indexes_calculation(std::vector<std::pair<std::string, std::string>> dims,
+                                                     std::pair<std::string, std::string> append_axis_info = {}) {
+    std::reverse(dims.begin(), dims.end());  // reorder dims in order from innermost to outermost dimensions
+
+    auto generate_calc_function = [&](std::string data_type, std::string index_var, size_t dim_idx) {
+        std::string index_calc_str;
+        index_calc_str += "" + data_type + " " + dims[dim_idx].first + " = ";
+        index_calc_str += "((" + index_var + " / ";
+        index_calc_str += "(1";
+        for (size_t i = 0; i < dim_idx; i++) {
+            index_calc_str += " * " + dims[i].second;
+        }
+        index_calc_str += ")) % " + dims[dim_idx].second + ")";
+
+        if (append_axis_info.first == dims[dim_idx].first) {
+            index_calc_str += " + " + append_axis_info.second;
+        }
+
+        index_calc_str += ";";
+        return index_calc_str;
+    };
+
+    std::stringstream indexes_calc_str;
+    for (size_t i = 0; i < dims.size(); i++) {
+        indexes_calc_str << generate_calc_function("uint", "data_idx", i);
+    }
+
+    return indexes_calc_str.str();
+}
+
+// static size_t get_innermost_group_size(const dynamic_quantize_params& params) {
+//     const auto& group_sizes = params.group_sizes;
+//     const auto& input_dims = get_normalized_dims(params.inputs[0]);
+
+//     for (size_t i = group_sizes.size(); i > 0; i--) {
+//         if (group_sizes[i - 1] == UINT64_MAX) {
+//             return input_dims[i - 1].v;
+//         } else if (group_sizes[i - 1] != 1) {
+//             return group_sizes[i - 1];
+//         }
+//     }
+
+//     return 1;
+// }
+
+// static size_t get_match_vector_size(const dynamic_quantize_params& params) {
+//     // const auto input_dt = BytesPerElement(params.inputs[0].GetDType());
+//     auto block_sizes = { 8, 4, 2 };
+
+//     for (auto block_size : block_sizes) {
+//         if (((params.inputs[0].X().v * params.inputs[0].Y().v) / simd) % block_size == 0) {
+//             return block_size;
+//         }
+//     }
+
+//     return 1;
+// }
+
+static size_t get_per_iter_elements_number(const dynamic_quantize_params& params) {
+    const auto maxWorkGroupSize = params.engineInfo.maxWorkGroupSize;
+    const auto total_grouped_elements = get_elements_number_per_group(params);
+
+    if (total_grouped_elements % maxWorkGroupSize == 0)
+        return maxWorkGroupSize;
+
+    if (total_grouped_elements < maxWorkGroupSize)
+        return total_grouped_elements;
+
+    return 0;
+}
+
+ParamsKey DynamicQuantizeKernelOptGeneric::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableDifferentTypes();
+    k.EnableAllInputLayout();
+    k.EnableAllOutputLayout();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBatching();
+    k.EnableDynamicShapesSupport();
+    return k;
+}
+
+JitConstants DynamicQuantizeKernelOptGeneric::GetJitConstants(const dynamic_quantize_params& params) const {
+    JitConstants jit = MakeBaseParamsJitConstants(params);
+
+    const std::vector<std::pair<std::string, std::string>> default_dims = {{"b", "INPUT0_BATCH_NUM"},
+                                                                           {"f", "INPUT0_FEATURE_NUM"},
+                                                                           {"y", "INPUT0_SIZE_Y"},
+                                                                           {"x", "INPUT0_SIZE_X"}};
+
+    const auto& group_sizes = params.group_sizes;
+    std::vector<std::pair<std::string, std::string>> batch_dims, grouped_dims;
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] == 1)
+            batch_dims.push_back(default_dims[i]);
+        else
+            grouped_dims.push_back(default_dims[i]);
+    }
+
+    const auto& input_dims = get_normalized_dims(params.inputs[0]);
+    const auto total_grouped_elements = get_elements_number_per_group(params);
+    const auto per_iter_elements_number = get_per_iter_elements_number(params);
+    const auto total_subgroups_number = total_grouped_elements / input_dims.back().v;
+
+    // drop the last dimension, since it is processed inside the kernel
+    grouped_dims.pop_back();
+
+    const bool append_mode = params.append_axis != -1;
+    std::pair<std::string, std::string> append_axis_info = {};
+    if (append_mode) {
+        // TODO: confirm whether append_axis_info is needed here; the kernel currently applies axis_offset via APPEND_AXIS_NAME instead
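        // Illustrative note: in append mode the kernel also receives a runtime axis_offset scalar
        // and performs "APPEND_AXIS_NAME += axis_offset" on the .cl side, so newly quantized
        // tokens are written past the data already stored in the KV cache. For reference, with
        // batch dims {b, f} the generated DECLARE_BATCHED_DIMS_INDEXES(data_idx) macro expands
        // to roughly:
        //   uint f = ((data_idx / (1)) % INPUT0_FEATURE_NUM);
        //   uint b = ((data_idx / (1 * INPUT0_FEATURE_NUM)) % INPUT0_BATCH_NUM);
        // (append_axis_info would additionally append "+ axis_offset" to the append axis, but it
        // is currently left empty, see the commented-out line below).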
+        // append_axis_info = { default_dims[params.append_axis].first, "axis_offset" };  // axis and input scalar name
+
+        jit.AddConstant(MakeJitConstant("APPEND_MODE", append_mode));
+        jit.AddConstant(MakeJitConstant("APPEND_AXIS_NAME", default_dims[params.append_axis].first));
+    }
+
+    jit.AddConstant(MakeJitConstant("DECLARE_BATCHED_DIMS_INDEXES(data_idx)", generate_dims_indexes_calculation(batch_dims, append_axis_info)));
+    jit.AddConstant(MakeJitConstant("DECLARE_GROUPED_DIMS_INDEXES(data_idx)", generate_dims_indexes_calculation(grouped_dims)));
+    jit.AddConstant(MakeJitConstant("SUBGROUPS_NUMBER", total_subgroups_number));
+
+    const auto iterations_number = total_grouped_elements / per_iter_elements_number;
+
+    jit.AddConstant(MakeJitConstant("ITERATIONS_NUMBER", iterations_number));
+    jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization));
+    jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp));
+
+    bool rearrange_scales_order = false;
+    const auto& scales_output_order = params.scales_output_order;
+    if (!scales_output_order.empty()) {
+        for (size_t i = 0; i < scales_output_order.size(); i++) {
+            if (i != scales_output_order[i]) {
+                rearrange_scales_order = true;
+                break;
+            }
+        }
+    }
+
+    if (rearrange_scales_order) {
+        const std::array<char, 4> default_dim_order = {'b', 'f', 'y', 'x'};
+
+        std::stringstream ss;
+        for (size_t i = 0; i < scales_output_order.size(); i++) {
+            ss << default_dim_order[scales_output_order[i]];
+
+            if (i + 1 != scales_output_order.size())
+                ss << ", ";
+        }
+
+        jit.AddConstant(MakeJitConstant("SCALES_OUTPUT_ORDER", ss.str()));
+        GPU_DEBUG_TRACE_DETAIL << "SCALES_OUTPUT_ORDER: " << ss.str() << "\n";
+    }
+
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i]));
+    }
+
+    return jit;
+}
+
+CommonDispatchData DynamicQuantizeKernelOptGeneric::SetDefault(const dynamic_quantize_params& params) const {
+    CommonDispatchData dispatchData;
+
+    const auto& input_dims = get_normalized_dims(params.inputs[0]);
+    const auto total_batched_elements = get_elements_number_per_batch(params);
+    const auto total_grouped_elements = get_elements_number_per_group(params);
+    const auto total_subgroups_number = total_grouped_elements / input_dims.back().v;
+    // const auto per_iter_elements_number = get_per_iter_elements_number(params);
+
+    // TODO: add check that input_dims.back().v / SUBGROUP_SIZE is enough to allocate private array inside kernel
+
+    dispatchData.gws = {subgroup_size, total_subgroups_number, total_batched_elements};
+    dispatchData.lws = {subgroup_size, total_subgroups_number, 1};
+
+    return dispatchData;
+}
+
+void DynamicQuantizeKernelOptGeneric::GetUpdateDispatchDataFunc(KernelData& kd) const {
+    kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) {
+        const auto& prim_params = static_cast<const dynamic_quantize_params&>(params);
+        auto dispatchData = SetDefault(prim_params);
+        OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");
+        kd.kernels[0].params.workGroups.global = dispatchData.gws;
+        kd.kernels[0].params.workGroups.local = dispatchData.lws;
+        kd.kernels[0].skip_execution = false;
+
+        if (prim_params.append_axis != -1) {
+            kd.kernels[0].params.scalars.clear();
+
+            ScalarDescriptor axis_offset;
+            axis_offset.t = ScalarDescriptor::Types::UINT32;
+            axis_offset.v.u32 = static_cast<uint32_t>(prim_params.axis_offset);
+            kd.kernels[0].params.scalars.push_back(axis_offset);
+        }
+
+        GPU_DEBUG_TRACE_DETAIL << "Update Dispatch data DynamicQuantizeKernelOptGeneric gws : " << dispatchData.gws[0] << ", "
+                               << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl;
+    };
+}
+
+KernelsData DynamicQuantizeKernelOptGeneric::GetKernelsData(const Params& params) const {
+    assert(params.GetType() == KernelType::DYNAMIC_QUANTIZE);
+
+    if (!Validate(params))
+        return {};
+
+    const dynamic_quantize_params& prim_params = static_cast<const dynamic_quantize_params&>(params);
+    auto dispatchData = SetDefault(prim_params);
+
+    KernelData kd = KernelData::Default<dynamic_quantize_params>(params);
+
+    auto cldnn_jit = GetJitConstants(prim_params);
+    auto entry_point = GetEntryPoint(kernelName, prim_params.layerID, params);
+    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+    GetUpdateDispatchDataFunc(kd);
+
+    auto& kernel = kd.kernels[0];
+    FillCLKernelData(kernel,
+                     dispatchData,
+                     params.engineInfo,
+                     kernelName,
+                     jit,
+                     entry_point,
+                     EXE_MODE_DEFAULT,
+                     false,
+                     false,
+                     1,
+                     GetFusedPrimitiveInputsCount(params),
+                     static_cast<int>(prim_params.outputs.size()),
+                     prim_params.is_shape_agnostic);
+
+    if (prim_params.append_axis != -1)
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0});
+
+    return {kd};
+}
+
+KernelsPriority DynamicQuantizeKernelOptGeneric::GetKernelsPriority(const Params& /*params*/) const {
+    return FORCE_PRIORITY_1;
+}
+
+bool DynamicQuantizeKernelOptGeneric::Validate(const Params& params) const {
+    if (!KernelBaseOpenCL::Validate(params))
+        return false;
+
+    const auto& dq_params = static_cast<const dynamic_quantize_params&>(params);
+
+    const auto& group_sizes = dq_params.group_sizes;
+    const auto& input_dims = get_normalized_dims(dq_params.inputs[0]);
+    const size_t non_compressed_dims_number = std::count(group_sizes.begin(), group_sizes.end(), 1);
+
+    if (non_compressed_dims_number == group_sizes.size())
+        return false;
+
+    for (size_t i = 0; i < group_sizes.size(); i++) {
+        if (group_sizes[i] != 1 && input_dims[i].is_dynamic) {
+            return false;
+        }
+    }
+
+    // last dimension should be static, reduced by group_sizes configuration and divisible by 16
+    if (group_sizes.back() == 1 || input_dims.back().is_dynamic || input_dims.back().v % subgroup_size != 0)
+        return false;
+
+    if (dq_params.inputs[0].GetPaddedVal() != 0 || dq_params.outputs[0].GetPaddedVal() != 0)
+        return false;
+
+    return true;
+}
+}  // namespace kernel_selector
+
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h
new file mode 100644
index 00000000000000..f5148ccc1d9a1f
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.h
@@ -0,0 +1,30 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "kernel_base_opencl.h"
+#include "dynamic_quantize_kernel_ref.h"
+
+namespace kernel_selector {
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// dynamic_quantize_params
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class DynamicQuantizeKernelOptGeneric : public KernelBaseOpenCL {
+public:
+    DynamicQuantizeKernelOptGeneric() : KernelBaseOpenCL("dynamic_quantize_gpu_opt_generic") {}
+    virtual ~DynamicQuantizeKernelOptGeneric() {}
+
+    virtual JitConstants GetJitConstants(const dynamic_quantize_params& params) const;
+ virtual CommonDispatchData SetDefault(const dynamic_quantize_params& params) const; + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + Datatype GetAccumulatorType(const dynamic_quantize_params& params) const; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params&) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; +}; +} // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index 3b214848e2f8ad..87a89e7c043b10 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -26,6 +26,47 @@ JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_pa jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + auto print_arr = [&](const std::vector& vec, size_t max_len, std::string name) { + std::stringstream ss; + for (size_t i = 0; i < std::min(max_len, vec.size()); i++) { + ss << vec[i] << ", "; + } + GPU_DEBUG_TRACE_DETAIL << "Array " << name << " (len=" << vec.size() << ") content: " << ss.str() << "\n"; + }; + + bool rearrange_scales = false; + const auto& scales_output_order = params.scales_output_order; + if (!scales_output_order.empty()) { + for (size_t i = 0; i < scales_output_order.size(); i++) { + if (i != scales_output_order[i]) { + rearrange_scales = true; + break; + } + } + } + + if (rearrange_scales) { + const std::array default_dim_order = {'b', 'f', 'y', 'x'}; + + std::stringstream ss; + for (size_t i = 0; i < scales_output_order.size(); i++) { + ss << default_dim_order[scales_output_order[i]]; + + if (i + 1 != scales_output_order.size()) + ss << ", "; + } + + jit.AddConstant(MakeJitConstant("SCALES_OUTPUT_ORDER", ss.str())); + GPU_DEBUG_TRACE_DETAIL << "SCALES_OUTPUT_ORDER: " << ss.str() << "\n"; + } + + print_arr(params.group_sizes, params.group_sizes.size(), "group_sizes"); + + const auto& group_sizes = params.group_sizes; + for (size_t i = 0; i < group_sizes.size(); i++) { + jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i])); + } + return jit; } @@ -34,7 +75,17 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p CommonDispatchData dispatchData; OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); - dispatchData.gws = {params.outputs[0].Batch().v * params.outputs[0].Feature().v, 1, 1}; + + const auto& group_sizes = params.group_sizes; + auto batch_size = group_sizes[0] == 1 ? params.outputs[0].Batch().v : 1; + auto feature_size = group_sizes[1] == 1 ? params.outputs[0].Feature().v : 1; + auto y_size = group_sizes[2] == 1 ? params.outputs[0].Y().v : 1; + auto x_size = group_sizes[3] == 1 ? 
params.outputs[0].X().v : 1; + + dispatchData.gws = {batch_size * feature_size, y_size, x_size}; + // GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression + // dispatchData.gws[1] = params.outputs[0].Y().v; + // } dispatchData.lws = {1, 1, 1}; return dispatchData; @@ -94,6 +145,10 @@ bool DynamicQuantizeKernelRef::Validate(const Params& params) const { if (!KernelBaseOpenCL::Validate(params)) return false; + const auto& prim_params = static_cast(params); + if (prim_params.group_sizes.size() != 4) + return false; + return true; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h index ce52ed9fb19714..bdfd9f61291d21 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h @@ -12,6 +12,13 @@ namespace kernel_selector { /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct dynamic_quantize_params : public base_params { dynamic_quantize_params() : base_params(KernelType::DYNAMIC_QUANTIZE) {} + + int64_t append_axis = -1; + int64_t axis_offset = -1; + std::vector group_sizes; + std::vector scales_output_order; + bool use_asymmetric_quantization = false; + bool combine_scales_and_zp = false; }; class DynamicQuantizeKernelRef : public KernelBaseOpenCL { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp index 6ca9fbd2f5bd76..dd5ff4acaf06b1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp @@ -5,11 +5,21 @@ #include "dynamic_quantize_kernel_selector.h" #include "dynamic_quantize_kernel_ref.h" #include "dynamic_quantize_kernel_opt.h" +#include "dynamic_quantize_kernel_opt_generic.h" namespace kernel_selector { dynamic_quantize_kernel_selector::dynamic_quantize_kernel_selector() { Attach(); - Attach(); + int USE_REF_DQ = 0; + if (const auto env_var = std::getenv("USE_REF_DQ")) { + std::istringstream ss(env_var); + ss >> USE_REF_DQ; + } + + if (!USE_REF_DQ) { + Attach(); + } + // Attach(); } KernelsData dynamic_quantize_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp index 7556debd29df00..1b5deec382e460 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp @@ -4,6 +4,7 @@ #include "sdpa_kernel_base.h" #include "kernel_selector_utils.h" +#include "intel_gpu/runtime/debug_configuration.hpp" namespace kernel_selector { @@ -66,6 +67,7 @@ static std::string GetBroadcastInputStr(const size_t input_rank, const int64_t a } JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { + GPU_DEBUG_GET_INSTANCE(debug_config); auto jit = MakeBaseParamsJitConstants(params); if (params.conf.broadcast_axis != -1) { @@ 
-73,6 +75,8 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { jit.AddConstant(MakeJitConstant("DO_BROADCAST_KEY_VALUE", GetBroadcastInputStr(params.inputs[0].GetDims().size(), params.conf.broadcast_axis, params.conf.group_size))); + } else { + jit.AddConstant(MakeJitConstant("BROADCAST_GROUP_SIZE", 1)); } jit.AddConstant(MakeJitConstant("IS_CAUSAL", params.conf.is_causal)); @@ -81,6 +85,24 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", params.inputs.size() > 4)); } + jit.AddConstant(MakeJitConstant("IS_KV_COMPRESSED", params.conf.is_kv_compressed)); + jit.AddConstant(MakeJitConstant("USE_ASYMMETRIC_QUANTIZATION", params.conf.use_asymmetric_quantization)); + jit.AddConstant(MakeJitConstant("COMBINE_SCALES_AND_ZP", params.conf.combine_scales_and_zp)); + + if (params.conf.is_kv_compressed) { + jit.AddConstant(MakeJitConstant("KEY_COMPRESSION_SCALE", params.key_cache_comp_scale)); + jit.AddConstant(MakeJitConstant("VALUE_COMPRESSION_SCALE", params.value_cache_comp_scale)); + + if (params.conf.use_asymmetric_quantization && !params.conf.combine_scales_and_zp) { + jit.AddConstant(MakeJitConstant("KEY_COMPRESSION_ZP", params.key_cache_comp_zp)); + jit.AddConstant(MakeJitConstant("VALUE_COMPRESSION_ZP", params.value_cache_comp_zp)); + } + } + + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // FIXME: it should be placed in params + jit.AddConstant(MakeJitConstant("COMPRESSED_PER_HEAD", 1)); + } + auto is_default_order = [](const std::vector& order) { for (size_t i = 0; i < order.size(); i++) if (order[i] != static_cast(i)) @@ -140,6 +162,7 @@ bool SDPAKernelBase::Validate(const Params& p) const { if (params.outputs[0].Dimentions() != 4) return false; + // FIXME: i8 input is supported only when kv cache is compressed return true; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index 6ea8d85527d19d..ded25bc8c8ed9c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -88,6 +88,9 @@ struct sdpa_configuration { bool is_causal = false; bool has_alibi_input = false; + bool is_kv_compressed = false; + bool use_asymmetric_quantization = false; + bool combine_scales_and_zp = false; // Paged Attention configuration bool is_paged_attention = false; @@ -110,6 +113,10 @@ struct sdpa_params : public base_params { int64_t indirect_axis = -1; DataTensor beam_table; + DataTensor key_cache_comp_scale; + DataTensor key_cache_comp_zp; + DataTensor value_cache_comp_scale; + DataTensor value_cache_comp_zp; sdpa_configuration conf; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index 04ebf2f2165973..d615b97d75e0a5 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -342,6 +342,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const { if (params.conf.head_size > 256) return false; + if (params.conf.is_kv_compressed) + return false; + return true; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp index 2f0174d0a45912..d3c1b82002308c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp @@ -4,6 +4,7 @@ #include "sdpa_kernel_opt.h" #include "kernel_selector_utils.h" +#include "common_types.h" #include #include @@ -26,6 +27,18 @@ static size_t get_sg_number_scale_factor(const sdpa_params& sdpa_params, size_t if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize) { return optimal_scale_factor; } + } else if (kernel_type == KernelsTypes::SINGLE_TOKEN) { + int USE_SCALE_FACTOR = 0; + if (const auto env_var = std::getenv("USE_SCALE_FACTOR")) { + std::istringstream ss(env_var); + ss >> USE_SCALE_FACTOR; + } + if (USE_SCALE_FACTOR) { + const size_t optimal_scale_factor = 2; + if (sdpa_params.conf.head_size * optimal_scale_factor <= sdpa_params.engineInfo.maxWorkGroupSize) { + return optimal_scale_factor; + } + } } return 1; @@ -126,6 +139,7 @@ static std::string GetKernelName(std::string base_name, KernelsTypes type, const ParamsKey SDPAKernelOpt::GetSupportedKey() const { ParamsKey k; + k.EnableInputDataType(Datatype::INT8); k.EnableInputDataType(Datatype::F16); k.EnableInputDataType(Datatype::F32); k.EnableInputDataType(Datatype::INT32); @@ -154,6 +168,9 @@ bool SDPAKernelOpt::Validate(const Params& p) const { if (params.conf.head_size < 1 || params.conf.head_size % subgroup_size != 0) return false; + if (params.conf.use_asymmetric_quantization && !params.conf.combine_scales_and_zp) + return false; + return true; } @@ -231,10 +248,11 @@ CommonDispatchData SDPAKernelOpt::SetDefault(const sdpa_params& params, size_t k const size_t target_seq_len_block_size = kernel_idx == 1 ? 
get_target_seq_len_block_size() : 1; if (kernel_idx == KernelsTypes::SINGLE_TOKEN) { + const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx); dispatch_data.gws = { batch_size * heads_num, CeilDiv(target_seq_len, target_seq_len_block_size), - head_size * num_of_partitions }; - dispatch_data.lws = { 1, 1, head_size }; + head_size * num_of_partitions * sg_num_scale }; + dispatch_data.lws = { 1, 1, head_size * sg_num_scale }; } else if (kernel_idx == KernelsTypes::MULTI_TOKENS) { const size_t sg_num_scale = get_sg_number_scale_factor(params, kernel_idx); dispatch_data.gws = { batch_size * heads_num, @@ -307,8 +325,20 @@ KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const { static_cast(prim_params.outputs.size()), prim_params.is_shape_agnostic); - if (prim_params.indirect_axis != -1 && kernel_idx != KernelsTypes::FINALIZATION) - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(prim_params.inputs.size())}); + auto beam_table_idx = prim_params.inputs.size(); + if (prim_params.conf.is_kv_compressed && kernel_idx != KernelsTypes::FINALIZATION) { + auto key_cache_compression_scale_idx = static_cast(prim_params.inputs.size()); + auto value_cache_compression_scale_idx = static_cast(prim_params.inputs.size() + 1); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx}); + + beam_table_idx += 2; + } + + if (prim_params.indirect_axis != -1 && kernel_idx != KernelsTypes::FINALIZATION) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(beam_table_idx)}); + } kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); @@ -391,6 +421,6 @@ void SDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { } KernelsPriority SDPAKernelOpt::GetKernelsPriority(const Params& params) const { - return params.engineInfo.supports_immad ? 
FORCE_PRIORITY_2 : FORCE_PRIORITY_1; + return FORCE_PRIORITY_1; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp index 579c4bc06c17e2..d6fcad69b2beb8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp @@ -12,10 +12,12 @@ namespace kernel_selector { ParamsKey SDPAKernelRef::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::INT8); k.EnableInputDataType(Datatype::F32); // beam table input k.EnableInputDataType(Datatype::INT32); + // FIXME: support for compressed KV cache k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); @@ -74,8 +76,28 @@ KernelsData SDPAKernelRef::GetKernelsData(const Params& params) const { "", false, false, static_cast(prim_params.inputs.size()), GetFusedPrimitiveInputsCount(params), 1, prim_params.is_shape_agnostic); - if (prim_params.indirect_axis != -1) - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(prim_params.inputs.size())}); + auto beam_table_idx = prim_params.inputs.size(); + if (prim_params.conf.is_kv_compressed) { + auto key_cache_compression_scale_idx = static_cast(prim_params.inputs.size()); + auto value_cache_compression_scale_idx = static_cast(prim_params.inputs.size() + 1); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx}); + GPU_DEBUG_TRACE_DETAIL << "sdpa_ref add scales args\n"; + + if (prim_params.conf.use_asymmetric_quantization && !prim_params.conf.combine_scales_and_zp) { + GPU_DEBUG_TRACE_DETAIL << "sdpa_ref add zero points args\n"; + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx + 2}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx + 2}); + beam_table_idx += 2; + } + + beam_table_idx += 2; + } + + if (prim_params.indirect_axis != -1) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(beam_table_idx)}); + } kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp index e65fd7fd10976b..6d157fe901efcb 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp @@ -13,11 +13,21 @@ namespace kernel_selector { sdpa_kernel_selector::sdpa_kernel_selector() { - Attach(); - Attach(); -#ifdef ENABLE_ONEDNN_FOR_GPU - Attach(); -#endif + int USE_REF_SDPA = 0; + if (const auto env_var = std::getenv("USE_REF_SDPA")) { + std::istringstream ss(env_var); + ss >> USE_REF_SDPA; + } + + if (!USE_REF_SDPA) { + Attach(); + Attach(); + #ifdef ENABLE_ONEDNN_FOR_GPU + // Attach(); + #endif + } else { + Attach(); + } } KernelsData sdpa_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp index 
7574b664b6b4b7..7ff2b14f79b569 100644 --- a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp @@ -152,5 +152,66 @@ VariableState::Ptr VariableStateIndirectKVCache::get_beam_table_state() const { return m_hidden_states[1]; } +VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed( + const VariableStateInfo& info, + std::shared_ptr context, + std::shared_ptr shape_predictor, + size_t beam_idx, + size_t concat_idx, + bool has_zp_state = false) + : VariableStateIndirectKVCache(info, context, shape_predictor, beam_idx, concat_idx), + m_has_zp_state(has_zp_state) { + const auto compression_scale_shape = ov::PartialShape::dynamic(info.m_layout.get_partial_shape().size()); + + // TODO: set proper data type + cldnn::layout compression_scale_layout(compression_scale_shape, + ov::element::f16, + cldnn::format::bfyx); + VariableStateInfo compression_scale_state_info(info.m_id + "/comp_scale", compression_scale_layout); + m_hidden_states.push_back(std::make_shared(compression_scale_state_info, context, shape_predictor)); + + if (has_zp_state) { + cldnn::layout compression_zp_layout(compression_scale_shape, + ov::element::f16, + cldnn::format::bfyx); + VariableStateInfo compression_scale_state_info(info.m_id + "/comp_zp", compression_zp_layout); + m_hidden_states.push_back(std::make_shared(compression_scale_state_info, context, shape_predictor)); + } + + OPENVINO_ASSERT((!m_has_zp_state && m_hidden_states.size() == 3) || (m_has_zp_state && m_hidden_states.size() == 4), + "[GPU] VariableStateIndirectKVCacheCompressed expects 3 or 4 internal states to be initialized, " + "actual number is ", m_hidden_states.size()); +} + +VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_scale_state() const { + return m_hidden_states[2]; +} + +void VariableStateIndirectKVCacheCompressed::set_compression_scale_layout(const cldnn::layout& new_layout) { + m_hidden_states[2]->set_layout(new_layout); +} + +VariableState::Ptr VariableStateIndirectKVCacheCompressed::get_compression_zp_state() const { + OPENVINO_ASSERT(m_has_zp_state); + return m_hidden_states[3]; +} + +void VariableStateIndirectKVCacheCompressed::set_compression_zp_layout(const cldnn::layout& new_layout) { + OPENVINO_ASSERT(m_has_zp_state); + m_hidden_states[3]->set_layout(new_layout); +} + +bool VariableStateIndirectKVCacheCompressed::has_zp_state() const { + return m_has_zp_state; +} + +void VariableStateIndirectKVCacheCompressed::set_state(const ov::SoPtr& state) { + OPENVINO_THROW("[GPU] set_state API is supported only when KV-cache compression is disabled"); +} + +ov::SoPtr VariableStateIndirectKVCacheCompressed::get_state() const { + OPENVINO_THROW("[GPU] get_state API is supported only when KV-cache compression is disabled"); +} + } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index 0373251e45c051..0fd0f61fafddaa 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -3,10 +3,20 @@ // #include "ov_ops/dynamic_quantize.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/primitives/dynamic_quantize.hpp" + +namespace ov { +namespace op { +namespace internal { +using DynamicQuantizeExtended = 
ov::intel_gpu::op::DynamicQuantize; +} // namespace internal +} // namespace op +} // namespace ov + namespace ov { namespace intel_gpu { @@ -15,19 +25,33 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_quantization_config()); + + p.add_primitive(*op, prim); +} + +static void CreateDynamicQuantizeExtendedOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {1}); + auto inputs = p.GetInputInfo(op); + std::string primitive_name = layer_type_name_ID(op); + auto group_sizes = op->get_group_sizes(); - for (size_t i = 0; i < group_sizes.size() - 1; i++) - OPENVINO_ASSERT(group_sizes[i] == 1, "Not supported group size at ", i, ": ", group_sizes[i]); - OPENVINO_ASSERT(group_sizes.back() == UINT64_MAX, "Not supported group size: ", group_sizes.back()); auto prim = cldnn::dynamic_quantize(primitive_name, - inputs[0], - op->get_group_sizes().back(), - get_output_data_types(op)); + inputs[0], + op->get_quantization_config(), + op->get_combine_scales_and_zp(), + op->get_scales_zp_output_order()); + + prim.num_outputs = op->get_output_size(); + p.add_primitive(*op, prim); } REGISTER_FACTORY_IMPL(internal, DynamicQuantize); +REGISTER_FACTORY_IMPL(internal, DynamicQuantizeExtended); } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp index c2ee336e48bf06..bfa4df330c2fc8 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp @@ -22,19 +22,26 @@ namespace intel_gpu { namespace { void CreateKVCacheOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {2, 3}); + validate_inputs_count(op, {2, 3, 4, 5}); auto inputs = p.GetInputInfo(op); int64_t rank = op->get_input_partial_shape(0).size(); auto prim = cldnn::kv_cache(layer_type_name_ID(op), - inputs, - op->get_variable()->get_info(), - ov::util::normalize(op->get_concat_axis(), rank), - ov::util::normalize(op->get_gather_axis(), rank), - op->get_indirect()); + inputs, + op->get_variable()->get_info(), + ov::util::normalize(op->get_concat_axis(), rank), + ov::util::normalize(op->get_gather_axis(), rank), + op->get_indirect()); prim.num_outputs = op->get_output_size(); prim.output_data_types = get_output_data_types(op); + if (op->get_kv_compressed()) { + prim.compressed = true; + prim.combine_scales_and_zp = op->get_combine_scales_and_zp(); + prim.quantization_config = op->get_quantization_config(); + prim.scales_zp_output_order = op->get_scales_zp_output_order(); + } + p.add_primitive(*op, prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp index 0ba780183b186d..a2ef2ad50bfcce 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp @@ -62,11 +62,13 @@ static void CreateSDPAOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {4, 5, 6}); auto inputs = p.GetInputInfo(op); auto layerName = layer_type_name_ID(op); bool is_causal = op->get_causal(); + const auto compression_inputs = op->get_compression_inputs_num(); + validate_inputs_count(op, {4 + compression_inputs, 5 + compression_inputs, 6 + compression_inputs}); + int64_t indirect_axis = op->get_indirect_axis(); auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, inputs, @@ -75,7 +77,10 @@ static void 
CreateIndirectSDPAOp(ProgramBuilder& p, const std::shared_ptrget_input0_transpose_order(), op->get_input1_transpose_order(), op->get_input2_transpose_order(), - op->get_output_transpose_order()); + op->get_output_transpose_order(), + op->get_kv_compressed(), + op->get_combine_scales_and_zp(), + op->get_quantization_config()); p.add_primitive(*op, sdpa_prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp index 9d7d6854009316..05c4ad64946272 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // + #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "openvino/core/type/element_type.hpp" @@ -9,6 +10,7 @@ #include "openvino/op/read_value.hpp" #include "transformations/rt_info/original_precision_attribute.hpp" #include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" #include "intel_gpu/primitives/assign.hpp" #include "intel_gpu/primitives/read_value.hpp" @@ -16,6 +18,7 @@ namespace ov { namespace op { namespace internal { using ReadValue = ov::intel_gpu::op::ReadValue; +using ReadValues = ov::intel_gpu::op::ReadValues; } // namespace internal } // namespace op } // namespace ov @@ -39,12 +42,33 @@ void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr +// void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr &op, +// const std::string &variable_id) { +// const auto output_pshape = op->get_output_partial_shape(0); +// const auto output_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(0)); +// const auto output_format = cldnn::format::get_default_format(output_pshape.size()); + +// const auto variable_layout = cldnn::layout{ output_pshape, output_dtype, output_format }; + +// auto inputs = p.GetInputInfo(op); +// auto user_specified_type = get_original_precision(op); +// const auto prim = T_PRIMITIVE{layer_type_name_ID(op), +// inputs, +// variable_id, +// variable_layout, +// user_specified_type, +// true}; + +// p.add_primitive(*op, prim); +// } + void CreateReadValueOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {0, 1}); CreateVariableAccessPrimitive(p, op, op->get_variable_id()); @@ -70,6 +94,26 @@ void CreateAssignOp(ProgramBuilder& p, const std::shared_ptr CreateVariableAccessPrimitive(p, op, op->get_variable_id()); } +void CreateReadValuesOp(ProgramBuilder& p, const std::shared_ptr& op) { + std::vector variable_layouts; + for (size_t i = 0; i < op->get_output_size(); i++) { + const auto output_pshape = op->get_output_partial_shape(i); + const auto output_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(i)); + const auto output_format = cldnn::format::get_default_format(output_pshape.size()); + variable_layouts.emplace_back(output_pshape, output_dtype, output_format); + } + + auto inputs = p.GetInputInfo(op); + auto user_specified_type = get_original_precision(op); + auto prim = cldnn::read_value{layer_type_name_ID(op), + inputs, + op->get_variable_id(), + variable_layouts, + user_specified_type}; + + p.add_primitive(*op, prim); +} + } // namespace REGISTER_FACTORY_IMPL(v3, Assign); @@ -77,6 +121,7 @@ REGISTER_FACTORY_IMPL(v6, Assign); REGISTER_FACTORY_IMPL(v3, ReadValue); REGISTER_FACTORY_IMPL(v6, ReadValue); REGISTER_FACTORY_IMPL(internal, ReadValue); +REGISTER_FACTORY_IMPL(internal, ReadValues); } // namespace 
intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 346b4471779593..df14d9cfefb7a0 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -643,16 +643,24 @@ void SyncInferRequest::allocate_states() { bool indirect_kv_cache = false; int64_t beam_axis = 0; int64_t concat_axis = 0; + bool compressed = false; + bool has_zp_state = false; auto kv_cache_shape = vi.second.m_layout.get_partial_shape(); for (auto& p : state_prims) { if (auto kv_cache_prim = dynamic_cast(p)) { indirect_kv_cache = kv_cache_prim->indirect; beam_axis = ov::util::normalize(kv_cache_prim->gather_axis, kv_cache_shape.size()); concat_axis = ov::util::normalize(kv_cache_prim->concat_axis, kv_cache_shape.size()); + compressed = kv_cache_prim->compressed; + if (compressed && !kv_cache_prim->combine_scales_and_zp) { + has_zp_state = kv_cache_prim->quantization_config.is_asymmetric_quantization(); + } } } - if (indirect_kv_cache) { + if (compressed) { + m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor, beam_axis, concat_axis, has_zp_state)); + } else if (indirect_kv_cache) { m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor, beam_axis, concat_axis)); } else { m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor)); @@ -719,6 +727,18 @@ std::vector SyncInferRequest::prepare_input(const std::string auto user_tensor = user_tensor_wrapper.ptr; auto element_type = user_tensor->get_element_type(); + // auto print_arr = [&](int64_t* vec, size_t max_len, std::string name) { + // std::stringstream ss; + // for (size_t i = 0; i < max_len; i++) { + // ss << vec[i] << ", "; + // } + // std::cout << "Array " << name << " (len=" << max_len << ") content: " << ss.str() << "\n"; + // }; + + // if (internal_name == "parameter:input_ids") { + // print_arr(user_tensor->data(), user_tensor->get_size(), "parameter:input_ids"); + // } + auto remote_tensor_impl_ptr = std::dynamic_pointer_cast(user_tensor); auto iremote_tensor_ptr = std::dynamic_pointer_cast(user_tensor); auto usm_host_ptr = std::dynamic_pointer_cast(user_tensor); @@ -737,6 +757,19 @@ std::vector SyncInferRequest::prepare_input(const std::string auto& engine = m_graph->get_engine(); auto& stream = network->get_stream(); + // auto print_arr = [&](int64_t* vec, size_t max_len, std::string name) { + // std::stringstream ss; + // for (size_t i = 0; i < max_len; i++) { + // ss << vec[i] << ", "; + // } + // std::cout << "Array " << name << " (len=" << max_len << ") content: " << ss.str() << "\n"; + // }; + + + // if (internal_name == "parameter:input_ids") { + // print_arr(user_tensor->data(), user_tensor->get_size(), "parameter:input_ids"); + // } + auto need_lockable_mem = network->does_node_need_lockable_output(internal_name); OPENVINO_ASSERT(pshape.compatible(ov::PartialShape(user_tensor->get_shape())) || is_batched_input(port), diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index eb16213bcb936c..28e0e7b74c71ae 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -61,7 +61,14 @@ 
DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size auto rank = m_fc->get_input_partial_shape(0).size(); std::vector shape_group_size(rank, 1); shape_group_size.back() = group_size; - auto dyn_quan = std::make_shared(m_data, shape_group_size, element::f16); + + ov::op::internal::QuantizationConfig config; + config.quantization_dt = element::i8; + config.mode = ov::op::internal::QuantizationConfig::QuantizationMode::Symmetric; + config.scale_dt = element::f16; + config.group_sizes = shape_group_size; + + auto dyn_quan = std::make_shared(m_data, config); auto optional_w_zp = m_fc->get_input_size() > 4 ? m_fc->get_input_node_shared_ptr(4) : std::make_shared(); auto output_type = m_fc->get_output_type(); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp new file mode 100644 index 00000000000000..d38e3c5a510c2b --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -0,0 +1,349 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "kv_cache_compression.hpp" + +#include "intel_gpu/op/kv_cache.hpp" +#include "intel_gpu/op/indirect_sdpa.hpp" +#include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" +#include "intel_gpu/op/dynamic_quantize.hpp" +#include "intel_gpu/plugin/common_utils.hpp" + +#include "openvino/core/node_vector.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/sink.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/op/label.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/visualize_tree.hpp" +#include "transformations/utils/utils.hpp" + +#include + +namespace ov { +namespace intel_gpu { + +namespace { +std::vector get_variable_infos(const ov::op::util::VariableInfo& data_variable_info, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false) { + std::vector infos; + + // add initial data variable info + infos.push_back(data_variable_info); + + // infer DQ shapes + ov::intel_gpu::op::DynamicQuantize dq; + auto dq_shapes = + ov::intel_gpu::op::DynamicQuantize::shape_infer(&dq, {data_variable_info.data_shape}, config, scales_zp_output_order, combine_scales_and_zp); + + const auto variable_id = data_variable_info.variable_id; + const auto scale_shape = dq_shapes[1]; + const auto scale_dt = config.scale_dt; + + // add scales variable info + infos.push_back(ov::op::util::VariableInfo{scale_shape, scale_dt, variable_id}); + + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) { + // add zero points variable info + const auto zp_dt = config.zp_dt; + infos.push_back(ov::op::util::VariableInfo{scale_shape, zp_dt, variable_id}); + } + + return infos; +} + +std::shared_ptr + update_past_read_value(std::shared_ptr past_rv_node, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false) { + auto variable = past_rv_node->get_variable(); + 
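+    // The original ReadValue over the f16 KV state is replaced below with a multi-output ReadValues op:
+    // output 0 keeps the (now quantized) KV data, output 1 exposes the per-group scales and, for
+    // asymmetric quantization with separate outputs, output 2 exposes the zero points. Rough sketch,
+    // with shapes intentionally schematic (grouped dimensions collapse to 1 in the scale/zp outputs):
+    //
+    //   before: ReadValue(var)   -> data (f16)
+    //   after:  ReadValues(var)  -> data (config.quantization_dt)
+    //                               scales (config.scale_dt)
+    //                              [zero points (config.zp_dt), only if asymmetric and not combined with scales]
+    //
+    // If the ReadValue has an initializer subgraph, the initializer is quantized with a DynamicQuantize
+    // node so that the initial state already matches the compressed layout.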
variable->update_data_type(config.quantization_dt); + + auto variable_infos = get_variable_infos(past_rv_node->get_variable()->get_info(), config, scales_zp_output_order, combine_scales_and_zp); + auto new_past_rv_node = std::make_shared(); + + if (past_rv_node->get_input_size() == 0) { + new_past_rv_node = std::make_shared(past_rv_node->get_variable(), variable_infos); + } else { + auto initializer_dyn_quantization = std::make_shared(past_rv_node->get_input_node_shared_ptr(0), + config, + scales_zp_output_order, + combine_scales_and_zp); + initializer_dyn_quantization->set_friendly_name(past_rv_node->get_input_node_shared_ptr(0)->get_friendly_name() + "_dyn_quan"); + + OutputVector initializer_outputs = { initializer_dyn_quantization->output(0), initializer_dyn_quantization->output(1) }; + + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) + initializer_outputs.push_back( initializer_dyn_quantization->output(2) ); + + new_past_rv_node = std::make_shared(initializer_outputs, past_rv_node->get_variable(), variable_infos); + } + + ov::copy_runtime_info(past_rv_node, new_past_rv_node); + past_rv_node->output(0).replace(new_past_rv_node->output(0)); + + return new_past_rv_node; +} + +std::shared_ptr + update_kv_cache(std::shared_ptr past_rv_node, + std::shared_ptr kv_cache_node, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp = false) { + OutputVector kv_cache_inputs = { past_rv_node->output(0), + kv_cache_node->get_input_node_shared_ptr(1), + kv_cache_node->get_input_node_shared_ptr(2), + past_rv_node->output(1) }; + + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) + kv_cache_inputs.push_back(past_rv_node->output(2)); + + auto new_kv_cache = std::make_shared(kv_cache_inputs, + kv_cache_node->get_variable(), + kv_cache_node->get_concat_axis(), + kv_cache_node->get_gather_axis(), + combine_scales_and_zp, + config, + scales_zp_output_order); + + new_kv_cache->set_friendly_name(kv_cache_node->get_friendly_name()); + ov::copy_runtime_info(kv_cache_node, new_kv_cache); + + return new_kv_cache; +} +} + +class KVCacheCompressionMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("KVCacheCompressionMatcher", "0"); + KVCacheCompressionMatcher(); +}; + +KVCacheCompressionMatcher::KVCacheCompressionMatcher() { + using namespace ov::pass::pattern; + + bool first = true; + + int KV_CACHE_COMP = 0; + if (const auto env_var = std::getenv("KV_CACHE_COMP")) { + std::istringstream ss(env_var); + ss >> KV_CACHE_COMP; + } + + if (KV_CACHE_COMP == 0) { + if (first) { + printf("NO_KV_CACHE_COMP\n"); + } + first = false; + return; + } else { + if (first) + printf("YES_KV_CACHE_COMP\n"); + + first = false; + } + + int USE_ZP = 0; + if (const auto env_var = std::getenv("USE_ZP")) { + std::istringstream ss(env_var); + ss >> USE_ZP; + } + + std::cout << "Set USE_ZP = " << USE_ZP << "\n"; + + auto quantization_mode = ov::op::internal::QuantizationConfig::QuantizationMode::Symmetric; + if (USE_ZP) + quantization_mode = ov::op::internal::QuantizationConfig::QuantizationMode::Asymmetric; + + + bool combine_scales_and_zp = quantization_mode == ov::op::internal::QuantizationConfig::QuantizationMode::Asymmetric; + + int ZP_INPUT = 0; + if (const auto env_var = std::getenv("ZP_INPUT")) { + std::istringstream ss(env_var); + ss >> ZP_INPUT; + } + + if (ZP_INPUT && combine_scales_and_zp) { + std::cout << "Use independent ZP INPUT\n"; + combine_scales_and_zp = false; + } + + auto query = 
any_input(); + + auto key_past = wrap_type(); + auto key_new_token = any_input(); + auto key_beam_idx = any_input(); + auto key_cache = wrap_type({key_past, key_new_token, key_beam_idx}); + + auto value_past = wrap_type(); + auto value_new_token = any_input(); + auto value_beam_idx = any_input(); + auto value_cache = wrap_type({value_past, value_new_token, value_beam_idx}); + + auto input_attn_mask = any_input(); + auto input_scale = any_input(); + auto input_beam_table = any_input(); + + auto sdpa_without_attn_mask_m = wrap_type({ query, key_cache, value_cache, input_beam_table }); + auto sdpa_with_attn_mask_m = wrap_type({ query, key_cache, value_cache, input_attn_mask, input_beam_table }); + auto sdpa_with_attn_mask_and_scale_m = + wrap_type({ query, key_cache, value_cache, input_attn_mask, input_scale, input_beam_table }); + + auto sdpa = std::make_shared(OutputVector{sdpa_without_attn_mask_m, sdpa_with_attn_mask_m, sdpa_with_attn_mask_and_scale_m}); + + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + if (transformation_callback(m.get_match_root())) { + return false; + } + + const auto& pattern_map = m.get_pattern_value_map(); + + auto query_node = pattern_map.at(query).get_node_shared_ptr(); + + auto key_new_token_node = pattern_map.at(key_new_token).get_node_shared_ptr(); + auto key_cache_node = std::dynamic_pointer_cast(pattern_map.at(key_cache).get_node_shared_ptr()); + auto value_cache_node = std::dynamic_pointer_cast(pattern_map.at(value_cache).get_node_shared_ptr()); + auto sdpa_node = std::dynamic_pointer_cast(m.get_match_root()); + + auto key_past_rv_node = std::dynamic_pointer_cast(pattern_map.at(key_past).get_node_shared_ptr()); + auto value_past_rv_node = std::dynamic_pointer_cast(pattern_map.at(value_past).get_node_shared_ptr()); + + auto data_rank = key_cache_node->get_input_partial_shape(0).size(); + auto get_shape_group_sizes = [&](const std::vector& transposed_order) { + std::vector group_sizes(data_rank, 1); + std::vector order = transposed_order; + if (transposed_order.size() != data_rank) { + order.resize(data_rank); + std::iota(order.begin(), order.end(), 0); + } + + group_sizes[order[data_rank - 1]] = UINT64_MAX; + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression != 1) { // per-token compression + group_sizes[order[1]] = UINT64_MAX; + } + + return group_sizes; + }; + + // Reorder scales in static order: [batch, num_heads, seq_len, head_size] + auto get_scales_output_order = [&](const std::vector& transposed_order) { + std::vector scales_zp_output_order(data_rank); + scales_zp_output_order[0] = transposed_order[0]; + scales_zp_output_order[1] = transposed_order[1]; + scales_zp_output_order[2] = transposed_order[2]; + scales_zp_output_order[3] = transposed_order[3]; + + return scales_zp_output_order; + }; + + auto group_sizes = get_shape_group_sizes(sdpa_node->get_input1_transpose_order()); + auto scales_zp_output_order = get_scales_output_order(sdpa_node->get_input1_transpose_order()); + + ov::op::internal::QuantizationConfig config; + config.mode = quantization_mode; + config.group_sizes = group_sizes; + config.quantization_dt = element::i8; + config.scale_dt = query_node->get_output_element_type(0); + + if (config.is_asymmetric_quantization()) + config.zp_dt = query_node->get_output_element_type(0); + + auto print_arr = [&](const std::vector& vec, size_t max_len) { + std::stringstream ss; + for (size_t i = 0; i < std::min(max_len, vec.size()); i++) { + ss << vec[i] << 
", "; + } + + return ss.str(); + }; + + std::cout << "pattern matched! " << sdpa_node->get_friendly_name() << "; " + << "groups: " << print_arr(group_sizes, group_sizes.size()) << "; " + << "scales_order: " << print_arr(scales_zp_output_order, scales_zp_output_order.size()) << std::endl; + + key_past_rv_node = update_past_read_value(key_past_rv_node, config, scales_zp_output_order, combine_scales_and_zp); + value_past_rv_node = update_past_read_value(value_past_rv_node, config, scales_zp_output_order, combine_scales_and_zp); + + auto new_key_cache = update_kv_cache(key_past_rv_node, key_cache_node, config, scales_zp_output_order, combine_scales_and_zp); + auto new_value_cache = update_kv_cache(value_past_rv_node, value_cache_node, config, scales_zp_output_order, combine_scales_and_zp); + + OutputVector sdpa_inputs; + // Add Query, Key, Value, attention_mask, scale inputs + for (size_t i = 0; i < sdpa_node->get_input_size() - 1; i++) + sdpa_inputs.push_back(sdpa_node->get_input_node_shared_ptr(i)); + + // Replace Key and Value inputs with compressed ones + sdpa_inputs[1] = new_key_cache->output(0); + sdpa_inputs[2] = new_value_cache->output(0); + + // Add Key and Value compression scales + sdpa_inputs.push_back(new_key_cache->output(2)); + sdpa_inputs.push_back(new_value_cache->output(2)); + + // Add Key and Value compression zero points + if (config.is_asymmetric_quantization() && !combine_scales_and_zp) { + sdpa_inputs.push_back(new_key_cache->output(3)); + sdpa_inputs.push_back(new_value_cache->output(3)); + } + + auto input0_transpose_order = sdpa_node->get_input0_transpose_order(); + auto input1_transpose_order = sdpa_node->get_input1_transpose_order(); + auto input2_transpose_order = sdpa_node->get_input2_transpose_order(); + auto output_transpose_order = sdpa_node->get_output_transpose_order(); + + auto new_sdpa = std::make_shared(sdpa_inputs, + new_key_cache->output(1), + sdpa_node->get_causal(), + sdpa_node->get_indirect_axis(), + input0_transpose_order, + input1_transpose_order, + input2_transpose_order, + output_transpose_order, + config, + combine_scales_and_zp, + sdpa_node->get_output_type()); + + new_key_cache->set_friendly_name(key_cache_node->get_friendly_name()); + ov::copy_runtime_info(key_cache_node, new_key_cache); + + new_value_cache->set_friendly_name(value_cache_node->get_friendly_name()); + ov::copy_runtime_info(value_cache_node, new_value_cache); + + new_sdpa->set_friendly_name(sdpa_node->get_friendly_name()); + ov::copy_runtime_info(sdpa_node, new_sdpa); + + ov::replace_node(sdpa_node, new_sdpa); + return true; + }; + + auto m = std::make_shared(sdpa, "KVCacheCompressionMatcher"); + this->register_matcher(m, callback); + +} + +bool KVCacheCompression::run_on_model(const std::shared_ptr& m) { + return pass::GraphRewrite::run_on_model(m); +} + +KVCacheCompression::KVCacheCompression() { + add_matcher(); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp new file mode 100644 index 00000000000000..d6f18299bced76 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp @@ -0,0 +1,43 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + + +/// Add dynamic quantization node and fuse it with KV cache operation +/// +/// ┌───────────┐ 
┌─────────────┐ ┌───────────┐ ┌─────────────┐ +/// │ New Key │ │ New Value │ │ New Key │ │ New Value │ +/// └──────┬────┘ └──────┬──────┘ └──────┬────┘ └──────┬──────┘ +/// │ │ │ │ +/// │ f16 │ f16 │ f16 │ f16 +/// │ │ ==> │ │ +/// ┌─────────┐ ┌────────┴─────────┐ ┌────────┴───────────┐ ┌─────────┐ ┌────────┴─────────┐ ┌────────┴───────────┐ +/// │ Query │ │ KV cache │ │ KV cache │ │ Query │ │ KV cache + DQ │ │ KV cache + DQ │ +/// | | | (Key) | (Value) | | | | (Key) | | (Value) | +/// └───┬─────┘ └────────┬─────────┘ └────────┬───────────┘ └────┬────┘ └────────┬─────────┘ └────────┬───────────┘ +/// │ │ │ │ │ │ +/// │ f16 │ f16 │ f16 │ f16 i8:data │ f16:scale i8:data │ f16:scale +/// │ │ │ │ │ │ +/// │ │ │ │ │ │ +/// │ ┌────┴───┐ │ │ ┌────┴───┐ │ +/// └─────────────┤ SDPA ├─────────────────┘ └─────────────┤ SDPA ├────────────────────┘ +/// └────────┘ └────────┘ + +class KVCacheCompression : public ov::pass::GraphRewrite { +public: + OPENVINO_RTTI("KVCacheCompression", "0"); + KVCacheCompression(); + + bool run_on_model(const std::shared_ptr& m) override; +}; + + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp new file mode 100644 index 00000000000000..2024e13517e6aa --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/dynamic_quantize.cpp @@ -0,0 +1,86 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "intel_gpu/op/dynamic_quantize.hpp" + +#include "openvino/core/partial_shape.hpp" +#include "openvino/core/validation_util.hpp" + +namespace ov { +namespace intel_gpu { +namespace op { + +DynamicQuantize::DynamicQuantize(const Output& data, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp) + : ov::op::internal::DynamicQuantize(data, config, combine_scales_and_zp || config.mode == QuantizationConfig::QuantizationMode::Symmetric ? 
2 : 3), + m_combine_scales_and_zp(combine_scales_and_zp), + m_scales_zp_output_order(scales_zp_output_order) { + if (m_scales_zp_output_order.empty()) { + m_scales_zp_output_order.resize(data.get_partial_shape().size()); + std::iota(m_scales_zp_output_order.begin(), m_scales_zp_output_order.end(), 0); + } + + OPENVINO_ASSERT(data.get_partial_shape().size() == m_scales_zp_output_order.size()); + validate_and_infer_types(); +} + +void DynamicQuantize::validate_and_infer_types() { + std::vector input_shapes = {get_input_partial_shape(0)}; + + auto out_shapes = shape_infer(this, input_shapes, m_config, m_scales_zp_output_order, m_combine_scales_and_zp); + set_output_type(0, m_config.quantization_dt, out_shapes[0]); + set_output_type(1, m_config.scale_dt, out_shapes[1]); + + if (m_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) + set_output_type(2, m_config.zp_dt, out_shapes[2]); +} + +std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + return std::make_shared(new_args.at(0), m_config, m_scales_zp_output_order, m_combine_scales_and_zp); +} + +std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op, + const std::vector& input_shapes, + const QuantizationConfig& config, + const std::vector& scales_zp_output_order, + const bool combine_scales_and_zp) { + std::vector out_shapes = ov::op::internal::DynamicQuantize::shape_infer(op, input_shapes, config); + const auto is_asymmetric = config.is_asymmetric_quantization(); + if (is_asymmetric && combine_scales_and_zp) { + out_shapes.pop_back(); // drop zero_points shape + } + + auto transpose_shape = [](const ov::PartialShape& shape, const std::vector& scales_zp_output_order) { + auto transposed_shape = shape; + for (size_t i = 0; i < scales_zp_output_order.size(); i++) { + OPENVINO_ASSERT(scales_zp_output_order[i] < transposed_shape.size()); + transposed_shape[i] = shape[scales_zp_output_order[i]]; + } + + return transposed_shape; + }; + + // transpose scales and zero points + for (size_t i = 1; i < out_shapes.size(); i++) { + out_shapes[i] = transpose_shape(out_shapes[i], scales_zp_output_order); + } + + if (is_asymmetric && combine_scales_and_zp) { + // TODO: currently scales and zero points are supposed to be combined over the last dimension only + const auto combine_axis = out_shapes[1].size() - 1; + OPENVINO_ASSERT(config.group_sizes[scales_zp_output_order[combine_axis]] != 1); + // OPENVINO_ASSERT(out_shapes[1][combine_axis].is_dynamic() || out_shapes[1][combine_axis] == 1); + + out_shapes[1][combine_axis] *= 2; // (scale, zero_point) pairs + } + + return out_shapes; +} + +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp index 681c88119efd95..a900c99eb6a4af 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp @@ -25,30 +25,65 @@ IndirectSDPA::IndirectSDPA(const OutputVector& data_inputs, validate_and_infer_types(); } +IndirectSDPA::IndirectSDPA(const OutputVector& data_inputs, + const ov::Output& beam_table, + const bool is_causal, + const int64_t indirect_axis, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool combine_scales_and_zp, 
+                           const ov::element::Type output_type)
+    : ov::intel_gpu::op::SDPA(data_inputs, is_causal, order_q, order_k, order_v, order_out, quantization_config, combine_scales_and_zp, output_type)
+    , m_indirect_axis(indirect_axis) {
+    auto beam_table_idx = data_inputs.size();
+    set_argument(beam_table_idx, beam_table);
+    validate_and_infer_types();
+}
+
 std::shared_ptr<ov::Node> IndirectSDPA::clone_with_new_inputs(const ov::OutputVector& new_args) const {
     check_new_args_count(this, new_args);
     // Exclude beam_table input
     OutputVector data_inputs(new_args.begin(), new_args.end() - 1);
-    return std::make_shared<IndirectSDPA>(data_inputs,
-                                          new_args.back(),
-                                          m_is_causal,
-                                          m_indirect_axis,
-                                          m_order_q,
-                                          m_order_k,
-                                          m_order_v,
-                                          m_order_out,
-                                          m_output_type);
+    if (m_compressed) {
+        return std::make_shared<IndirectSDPA>(data_inputs,
+                                              new_args.back(),
+                                              m_is_causal,
+                                              m_indirect_axis,
+                                              m_order_q,
+                                              m_order_k,
+                                              m_order_v,
+                                              m_order_out,
+                                              m_quantization_config,
+                                              m_combine_scales_and_zp,
+                                              m_output_type);
+    } else {
+        return std::make_shared<IndirectSDPA>(data_inputs,
+                                              new_args.back(),
+                                              m_is_causal,
+                                              m_indirect_axis,
+                                              m_order_q,
+                                              m_order_k,
+                                              m_order_v,
+                                              m_order_out,
+                                              m_output_type);
+    }
 }
 
 void IndirectSDPA::validate_and_infer_types() {
     const auto input_size = get_input_size();
+
+    const auto compression_inputs = get_compression_inputs_num();
     NODE_VALIDATION_CHECK(this,
-                          input_size == 4 || input_size == 5 || input_size == 6,
+                          input_size >= 4 + compression_inputs && input_size <= 6 + compression_inputs,
                           "Number of inputs is incorrect. Current value is: ",
                           input_size,
-                          ", expected 4, 5 or 6.");
+                          ", expected 4, 5 or 6 data inputs and ", compression_inputs, " KV-cache compression related inputs");
+
     std::vector<ov::PartialShape> input_shapes;
     for (size_t i = 0; i < input_size - 1; i++) {
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp
index a598e556a8f05d..89bacbc3ce1430 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp
@@ -24,6 +24,7 @@ KVCache::KVCache(const Output<Node>& past,
     , m_concat_axis(concat_axis)
     , m_gather_axis(gather_axis)
     , m_indirect(true)
+    , m_compressed(false)
     , m_output_type(output_type) {
     m_variable = past_variable;
     if (m_indirect)
@@ -40,28 +41,86 @@ KVCache::KVCache(const Output<Node>& past,
     , m_concat_axis(concat_axis)
     , m_gather_axis(0)
     , m_indirect(false)
+    , m_compressed(false)
     , m_output_type(output_type) {
     m_variable = past_variable;
     validate_and_infer_types();
 }
 
+KVCache::KVCache(const OutputVector& inputs,
+                 const std::shared_ptr<ov::op::util::Variable>& past_variable,
+                 int64_t concat_axis,
+                 int64_t gather_axis,
+                 bool combine_scales_and_zp,
+                 const QuantizationConfig& config,
+                 const std::vector<uint64_t>& scales_zp_output_order,
+                 const ov::element::Type output_type)
+    : Op(inputs)
+    , m_concat_axis(concat_axis)
+    , m_gather_axis(gather_axis)
+    , m_indirect(true)
+    , m_compressed(true)
+    , m_combine_scales_and_zp(combine_scales_and_zp)
+    , m_quantization_config(config)
+    , m_scales_zp_output_order(scales_zp_output_order)
+    , m_output_type(output_type) {
+    OPENVINO_ASSERT(m_quantization_config.quantization_dt == ov::element::i8,
+                    "[GPU] Only I8 data type is currently supported for KV-cache compression");
+
+    m_variable = past_variable;
+    size_t output_size = 3;
+    if (config.is_asymmetric_quantization() && !combine_scales_and_zp)
+        output_size++; // add zp output
+
+    set_output_size(output_size);
+    validate_and_infer_types();
+}
+
 bool
KVCache::visit_attributes(ov::AttributeVisitor& visitor) { visitor.on_attribute("concat_axis", m_concat_axis); visitor.on_attribute("gather_axis", m_gather_axis); visitor.on_attribute("indirect", m_indirect); visitor.on_attribute("output_type", m_output_type); + visitor.on_attribute("compressed", m_compressed); return true; } void KVCache::validate_and_infer_types() { - auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; + auto output_type = m_output_type; + if (m_compressed) { + output_type = m_quantization_config.quantization_dt; + } else if (m_output_type == ov::element::undefined) { + output_type = get_input_element_type(0); + } + std::vector input_shapes = {m_variable->get_info().data_shape, get_input_partial_shape(1)}; - if (get_output_size() == 2) + if (m_indirect) { input_shapes.push_back(get_input_partial_shape(2)); - auto shapes = shape_infer(this, input_shapes); - set_output_type(0, output_type, shapes[0]); + } + + if (m_compressed) { + input_shapes.push_back(get_input_partial_shape(3)); + + if (m_quantization_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) + input_shapes.push_back(get_input_partial_shape(4)); + } + + auto shapes = m_compressed ? shape_infer(this, input_shapes, m_quantization_config, m_scales_zp_output_order, m_combine_scales_and_zp) + : shape_infer(this, input_shapes); + + size_t out_ports = 0; + set_output_type(out_ports++, output_type, shapes[0]); + if (m_indirect) { - set_output_type(1, get_input_element_type(2), shapes[1]); + set_output_type(out_ports++, get_input_element_type(2), shapes[1]); + } + + if (m_compressed) { + set_output_type(out_ports++, m_quantization_config.scale_dt, shapes[2]); + + if (m_quantization_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) { + set_output_type(out_ports++, m_quantization_config.scale_dt, shapes[3]); + } } } @@ -74,7 +133,7 @@ std::shared_ptr KVCache::clone_with_new_inputs(const ov::OutputVector& new m_concat_axis, m_output_type); - } else { + } else if (new_args.size() == 3) { return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), @@ -82,16 +141,55 @@ std::shared_ptr KVCache::clone_with_new_inputs(const ov::OutputVector& new m_concat_axis, m_gather_axis, m_output_type); + } else { + return std::make_shared(new_args, + m_variable, + m_concat_axis, + m_gather_axis, + m_combine_scales_and_zp, + m_quantization_config, + m_scales_zp_output_order, + m_output_type); + } +} + +std::vector shape_infer(const KVCache* op, + const std::vector& input_shapes, + const ov::op::internal::QuantizationConfig& config, + const std::vector& scales_zp_output_order, + bool combine_scales_and_zp) { + std::vector out_shapes = shape_infer(op, input_shapes); + + if (op->get_output_size() >= 3) { + ov::intel_gpu::op::DynamicQuantize dq_op; + auto quantized_data_shapes = ov::intel_gpu::op::DynamicQuantize::shape_infer(&dq_op, {input_shapes[1]}, config, scales_zp_output_order, combine_scales_and_zp); + + const auto scales_concat_axis = 2; + ov::PartialShape compression_scale_shape = input_shapes[3]; + compression_scale_shape[scales_concat_axis] += quantized_data_shapes[1][scales_concat_axis]; + out_shapes[2] = compression_scale_shape; + + // add zp output + if (quantized_data_shapes.size() == 3) { + ov::PartialShape compression_zp_shape = input_shapes[4]; + compression_zp_shape[scales_concat_axis] += quantized_data_shapes[2][scales_concat_axis]; + out_shapes[3] = compression_zp_shape; + } } + + return out_shapes; } -std::vector shape_infer(const KVCache* op, std::vector input_shapes) { +std::vector shape_infer(const KVCache* op, const std::vector& input_shapes) { std::vector out_shapes; out_shapes.resize(op->get_output_size()); const auto& gather_axis = op->get_gather_axis(); const auto& concat_axis = ov::util::normalize(op->get_concat_axis(), input_shapes[0].size()); - if (op->get_output_size() == 2) { + if (op->get_output_size() >= 2) { out_shapes[0] = input_shapes[0]; out_shapes[0][gather_axis] = input_shapes[2][0]; out_shapes[0][concat_axis] += input_shapes[1][concat_axis]; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp index 5438a6e2e695b5..f5af8e9d21a6bc 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp @@ -3,6 +3,7 @@ // #include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/op/read_values.hpp" #include "openvino/core/partial_shape.hpp" namespace ov { @@ -28,16 +29,14 @@ bool ReadValue::visit_attributes(ov::AttributeVisitor& visitor) { return true; } -void ReadValue::validate_and_infer_types() { - OPENVINO_ASSERT(m_variable, "Variable is not initialized."); - const auto& variable_info = m_variable->get_info(); +void ReadValue::validate_and_infer_types(size_t output_idx, const ov::op::util::VariableInfo& variable_info) { const auto& variable_type = variable_info.data_type; const auto& variable_shape = variable_info.data_shape; // If no inputs provided, it means this ReadValue doesn't have initial subgraph. This is valid. - if (get_input_size() > 0) { - const auto& initial_type = get_input_element_type(0); - const auto& initial_shape = get_input_partial_shape(0); + if (get_input_size() > output_idx) { + const auto& initial_type = get_input_element_type(output_idx); + const auto& initial_shape = get_input_partial_shape(output_idx); + // Variable's shape/type determine a permissible range of values for shape/type inferred from initial_subgraph.
// If initial_subgraph is set, then we need to check that shape/type inferred from initial_subgraph @@ -64,19 +63,25 @@ void ReadValue::validate_and_infer_types() { // dynamic rank/type can be derived from the IRs generated via the prev versions of OV, // but dynamic rank/type are not supported in plugins, // so we are trying to fix them here using the rank/type of ReadValue 1st input, if it exists - if (get_input_size() > 0 && variable_info.data_shape.rank().is_dynamic() && - variable_info.data_type.is_dynamic()) { - set_output_type(0, initial_type, initial_shape); + if (variable_info.data_shape.rank().is_dynamic() && variable_info.data_type.is_dynamic()) { + set_output_type(output_idx, initial_type, initial_shape); return; } } - set_output_type(0, variable_type, variable_shape); + set_output_type(output_idx, variable_type, variable_shape); +} + +void ReadValue::validate_and_infer_types() { + OPENVINO_ASSERT(m_variable, "Variable is not initialized."); + const auto& variable_info = m_variable->get_info(); + + validate_and_infer_types(0, variable_info); } std::shared_ptr ReadValue::clone_with_new_inputs(const ov::OutputVector& new_args) const { check_new_args_count(this, new_args); - switch (new_args.size()) { + switch (new_args.size()) { case 0: return std::make_shared(m_variable); case 1: @@ -89,6 +94,62 @@ std::shared_ptr ReadValue::clone_with_new_inputs(const ov::OutputVector& n } } +ReadValues::ReadValues(const std::shared_ptr& variable, + const std::vector& internal_states_infos) + : ReadValue(variable) + , m_internal_states_infos(internal_states_infos) { + OPENVINO_ASSERT(!internal_states_infos.empty()); + set_output_size(internal_states_infos.size()); + validate_and_infer_types(); +} + +ReadValues::ReadValues(const std::vector>& variable_initializers, + const std::shared_ptr& variable, + const std::vector& internal_states_infos) + : ReadValue(variable_initializers, variable) + , m_internal_states_infos(internal_states_infos) { + OPENVINO_ASSERT(!internal_states_infos.empty()); + set_output_size(internal_states_infos.size()); + validate_and_infer_types(); +} + +bool ReadValues::visit_attributes(ov::AttributeVisitor& visitor) { + visitor.on_attribute("variable_id", m_variable); + + auto variable_info = m_variable->get_info(); + visitor.on_attribute("variable_type", variable_info.data_type); + visitor.on_attribute("variable_shape", variable_info.data_shape); + m_variable->update(variable_info); + return true; +} + +void ReadValues::validate_and_infer_types() { + OPENVINO_ASSERT(m_variable, "Variable is not initialized."); + + for (size_t i = 0; i < get_output_size(); i++) { + ReadValue::validate_and_infer_types(i, m_internal_states_infos[i]); + } +} + +std::shared_ptr ReadValues::clone_with_new_inputs(const ov::OutputVector& new_args) const { + check_new_args_count(this, new_args); + + OPENVINO_ASSERT(new_args.empty() || new_args.size() == m_internal_states_infos.size(), + "Unable to clone ReadValues op (name=", this->get_friendly_name(), "). ", + "Incorrect number of inputs. Expected: 0 or ", m_internal_states_infos.size() ,". 
", + "Actual: ", new_args.size(), "."); + + if (new_args.size() > 0) { + return std::make_shared(new_args, m_variable, m_internal_states_infos); + } else { + return std::make_shared(m_variable, m_internal_states_infos); + } +} + +std::vector ReadValues::get_all_internal_states_info() const { + return m_internal_states_infos; +} + } // namespace op } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp index 57d2899e2b2e77..65930d5feb6d0a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp @@ -26,7 +26,31 @@ SDPA::SDPA(const OutputVector& inputs, , m_order_k(order_k) , m_order_v(order_v) , m_order_out(order_out) - , m_output_type(output_type) { + , m_output_type(output_type) + , m_compressed(false) { + set_arguments(inputs); + set_causal(is_causal); + validate_and_infer_types(); +} + +SDPA::SDPA(const OutputVector& inputs, + const bool is_causal, + const std::vector& order_q, + const std::vector& order_k, + const std::vector& order_v, + const std::vector& order_out, + const QuantizationConfig& quantization_config, + const bool combine_scales_and_zp, + const ov::element::Type output_type) + : m_is_causal(is_causal) + , m_order_q(order_q) + , m_order_k(order_k) + , m_order_v(order_v) + , m_order_out(order_out) + , m_output_type(output_type) + , m_compressed(true) + , m_combine_scales_and_zp(combine_scales_and_zp) + , m_quantization_config(quantization_config) { set_arguments(inputs); set_causal(is_causal); validate_and_infer_types(); @@ -46,11 +70,13 @@ std::shared_ptr SDPA::clone_with_new_inputs(const ov::OutputVector& ne void SDPA::validate_and_infer_types() { const auto input_size = get_input_size(); + + const auto compression_inputs = get_compression_inputs_num(); NODE_VALIDATION_CHECK(this, - input_size == 3 || input_size == 4 || input_size == 5, + input_size >= 3 + compression_inputs && input_size <= 5 + compression_inputs, "Number of inputs is incorrect. 
Current value is: ", input_size, - ", expected 3, 4 or 5."); + ", expected 3, 4 or 5 data inputs and ", compression_inputs, " KV-cache compression related inputs"); std::vector input_shapes; for (size_t i = 0; i < input_size; i++) { @@ -77,6 +103,18 @@ bool SDPA::visit_attributes(ov::AttributeVisitor &visitor) { return true; } +size_t SDPA::get_compression_inputs_num() const { + size_t compression_inputs = 0; + if (m_compressed) { + compression_inputs += 2; // 2 * scales + + if (m_quantization_config.is_asymmetric_quantization() && !m_combine_scales_and_zp) + compression_inputs += 2; // 2 * zp + } + + return compression_inputs; +} + std::vector shape_infer(const SDPA* op, std::vector input_shapes, const std::vector& order_q, diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 40c7ab48c486cb..d095e3ec317d11 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -68,6 +68,7 @@ #include "plugin/transformations/swiglu_fusion.hpp" #include "plugin/transformations/transpose_fusion.hpp" #include "plugin/transformations/indirect_kv_cache.hpp" +#include "plugin/transformations/kv_cache_compression.hpp" #include "plugin/transformations/convert_convolution.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp" #include "plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp" @@ -852,6 +853,8 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); manager.register_pass(); + manager.register_pass(); + manager.register_pass(); // This pass should be done after asymmetric quantization matching as it can move zp subtraction upper in the graph diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp index dcbabff548cc5d..804a3877ffc837 100644 --- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp +++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp @@ -198,6 +198,8 @@ static void print_help_messages() { " Currently, other layers except input-layer('parameter' type) are loading binaries for only input." " Different input or output tensors are seperated by ','. Different layers are separated by space. For example, " " \"[input_layer_name1]:[binary_dumped_file1],[binary_dump_file2] [input_layer_name2]:[binary_dump_1],[binary_dump_2]\""); + message_list.emplace_back("OV_GPU_EnableKVCacheCompression", + "Enable KV cache compression. 
1 is for per-head compression, 2 is for per-token compression"); auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(), [](std::pair& a, std::pair& b){ @@ -254,7 +256,8 @@ debug_configuration::debug_configuration() , disable_primitive_fusing(0) , disable_fake_alignment(0) , dynamic_quantize_group_size(0) - , disable_horizontal_fc_fusion(0) { + , disable_horizontal_fc_fusion(0) + , enable_kv_cache_compression(0) { #ifdef GPU_DEBUG_CONFIG get_gpu_debug_env_var("Help", help); get_common_debug_env_var("Verbose", verbose); @@ -315,6 +318,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("LoadDumpRawBinary", load_dump_raw_bin_str); std::string dynamic_quantize_layers_without_onednn_str; get_gpu_debug_env_var("DynamicQuantizeLayersWithoutOnednn", dynamic_quantize_layers_without_onednn_str); + get_gpu_debug_env_var("EnableKVCacheCompression", enable_kv_cache_compression); if (help > 0) { print_help_messages();
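---

For reference, below is a minimal standalone C++ sketch of the per-token symmetric I8 quantization that the compressed KV-cache path above assumes (the KVCache op accepts only i8 as quantization_dt, scales are concatenated along the sequence axis, and a separate zero-point output exists only for the asymmetric, non-combined case). The function and variable names are illustrative only and do not correspond to anything in this patch; the real kernels operate on device buffers, not std::vector.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Quantize one token's values to int8 and return the per-token scale that a
// compressed KV-cache would append along its scales output (illustrative only).
static float quantize_token_i8(const std::vector<float>& token, std::vector<int8_t>& out) {
    float max_abs = 0.f;
    for (float v : token)
        max_abs = std::max(max_abs, std::fabs(v));
    // Symmetric mode: zero point is 0, the scale maps [-max_abs, max_abs] onto [-127, 127].
    const float scale = max_abs > 0.f ? max_abs / 127.f : 1.f;
    out.resize(token.size());
    for (size_t i = 0; i < token.size(); ++i)
        out[i] = static_cast<int8_t>(std::lround(std::clamp(token[i] / scale, -127.f, 127.f)));
    return scale;
}

int main() {
    const std::vector<float> token = {0.25f, -1.5f, 3.0f, -0.75f};
    std::vector<int8_t> quantized;
    const float scale = quantize_token_i8(token, quantized);
    std::cout << "scale = " << scale << ", values =";
    for (int8_t v : quantized)
        std::cout << ' ' << static_cast<int>(v);
    std::cout << "\n"; // dequantize as quantized[i] * scale
    return 0;
}

Dequantization on the SDPA side is then quantized[i] * scale per token; an asymmetric variant would additionally subtract a per-token zero point, which matches the optional zp output wired through KVCache and the extra compression inputs counted in SDPA::get_compression_inputs_num().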