From fd905b34962d92198e05ba80fe039500e9c57d74 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Fri, 27 Sep 2024 12:51:31 +0400 Subject: [PATCH] WIP: KV-cache initial version --- .../include/intel_gpu/op/indirect_sdpa.hpp | 1 + .../include/intel_gpu/op/kv_cache.hpp | 11 + .../include/intel_gpu/op/read_value.hpp | 24 +- .../intel_gpu/include/intel_gpu/op/sdpa.hpp | 3 + .../plugin/multi_tensor_variable_state.hpp | 9 +- .../intel_gpu/plugin/primitives_list.hpp | 1 + .../intel_gpu/primitives/dynamic_quantize.hpp | 18 +- .../include/intel_gpu/primitives/kv_cache.hpp | 13 +- .../scaled_dot_product_attention.hpp | 14 +- .../intel_gpu/runtime/debug_configuration.hpp | 1 + .../intel_gpu/src/graph/dynamic_quantize.cpp | 18 +- .../graph_optimizer/build_implementations.cpp | 3 + .../graph_optimizer/prepare_buffer_fusing.cpp | 18 +- .../impls/ocl/kernel_selector_helper.cpp | 6 +- .../graph/impls/ocl/kernel_selector_helper.h | 2 +- .../src/graph/impls/ocl/kv_cache.cpp | 133 ++++++++- .../ocl/scaled_dot_product_attention.cpp | 51 +++- .../src/graph/include/dynamic_quantize_inst.h | 2 +- .../src/graph/include/kv_cache_inst.h | 5 + src/plugins/intel_gpu/src/graph/kv_cache.cpp | 33 ++- .../intel_gpu/src/graph/primitive_inst.cpp | 12 +- .../graph/scaled_dot_product_attention.cpp | 1 + .../kernel_selector/cl_kernels/sdpa_opt.cl | 250 ++++++++++++++++- .../kernel_selector/cl_kernels/sdpa_ref.cl | 27 ++ .../dynamic_quantize_kernel_opt.cpp | 2 +- .../dynamic_quantize_kernel_ref.cpp | 14 + .../dynamic_quantize_kernel_selector.cpp | 2 +- .../kernels/sdpa/sdpa_kernel_base.cpp | 15 ++ .../kernels/sdpa/sdpa_kernel_base.h | 3 + .../kernels/sdpa/sdpa_kernel_micro.cpp | 3 + .../kernels/sdpa/sdpa_kernel_opt.cpp | 21 +- .../kernels/sdpa/sdpa_kernel_ref.cpp | 18 +- .../kernels/sdpa/sdpa_kernel_selector.cpp | 20 +- .../plugin/multi_tensor_variable_state.cpp | 40 ++- .../src/plugin/ops/dynamic_quantize.cpp | 20 +- .../intel_gpu/src/plugin/ops/kv_cache.cpp | 5 +- .../ops/scaled_dot_product_attention.cpp | 8 +- .../intel_gpu/src/plugin/ops/variable.cpp | 16 ++ .../src/plugin/sync_infer_request.cpp | 18 +- .../transformations/indirect_kv_cache.cpp | 1 + .../transformations/kv_cache_compression.cpp | 255 ++++++++++++++++++ .../transformations/kv_cache_compression.hpp | 47 ++++ .../transformations/op/indirect_sdpa.cpp | 12 +- .../plugin/transformations/op/kv_cache.cpp | 63 ++++- .../plugin/transformations/op/read_value.cpp | 41 +++ .../src/plugin/transformations/op/sdpa.cpp | 10 +- .../transformations/transpose_fusion.cpp | 2 +- ...nsqueeze_broadcast_reshape_sdpa_fusion.cpp | 2 +- .../src/plugin/transformations_pipeline.cpp | 3 + .../src/runtime/debug_configuration.cpp | 6 +- 50 files changed, 1206 insertions(+), 97 deletions(-) create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp index b4d34a3975af6b..3b38159bd8a873 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp @@ -22,6 +22,7 @@ class IndirectSDPA : public ov::intel_gpu::op::SDPA { IndirectSDPA(const OutputVector& data_inputs, const ov::Output& beam_table, const bool is_causal, + const bool is_kv_compressed, const int64_t indirect_axis, const std::vector& order_q, const std::vector& order_k, diff --git 
a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp index 402ff6e46c1607..8ed8f525161d5a 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp @@ -34,6 +34,15 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { int64_t concat_axis, const ov::element::Type output_type = ov::element::undefined); + KVCache(const Output& past, + const Output& new_token_data, + const Output& new_token_scale, + const Output& beam_idx, + const std::shared_ptr& past_values, + int64_t concat_axis, + int64_t gather_axis, + const ov::element::Type output_type = ov::element::undefined); + bool visit_attributes(ov::AttributeVisitor& visitor) override; void validate_and_infer_types() override; @@ -52,11 +61,13 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { void set_gather_axis(int64_t axis) { m_gather_axis = axis; } bool get_indirect() const { return m_indirect; } + bool get_compressed() const { return m_compressed; } private: int64_t m_concat_axis = 0; int64_t m_gather_axis = 0; bool m_indirect = false; + bool m_compressed = false; ov::element::Type m_output_type; }; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp index a9c47f3fa39fa6..eceaab28dd6548 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/read_value.hpp @@ -12,7 +12,7 @@ namespace ov { namespace intel_gpu { namespace op { -/// \brief Similar to common v6::ReadValue, but it's not derived from ReadValueBase class to avoid ReadValue-Assign pairing check +/// \brief Similar to common v6::CompressedReadValue, but it's not derived from ReadValueBase class to avoid ReadValue-Assign pairing check /// This is needed to have ReadValue-KVCache pair instead of ReadValue-Assign class ReadValue : public ov::op::Op, public ov::op::util::VariableExtension { public: @@ -35,6 +35,28 @@ class ReadValue : public ov::op::Op, public ov::op::util::VariableExtension { } }; +/// \brief Similar to common v6::ReadValue, but it's not derived from ReadValueBase class to avoid ReadValue-Assign pairing check +/// This is needed to have ReadValue-KVCache pair instead of ReadValue-Assign +class CompressedReadValue : public ReadValue { +public: + OPENVINO_OP("CompressedReadValue", "gpu_opset"); + + CompressedReadValue() = default; + + CompressedReadValue(const Output& compressed_variable_initializer, const Output& compressed_variable_initializer_scale, const std::shared_ptr& variable); + + bool visit_attributes(ov::AttributeVisitor& visitor) override; + + void validate_and_infer_types() override; + + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + + std::string get_variable_id() const override { + OPENVINO_ASSERT(m_variable, "Variable is not initialized. 
Variable_id is unavailable"); + return m_variable->get_info().variable_id; + } +}; + } // namespace op } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp index 9f5d4dad16efd7..69de61105cba06 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp @@ -21,6 +21,7 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { SDPA(const OutputVector& inputs, const bool is_causal, + const bool is_kv_compressed, const std::vector& order_q, const std::vector& order_k, const std::vector& order_v, @@ -34,6 +35,7 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; bool get_causal() const { return m_is_causal; } + bool get_kv_compressed() const { return m_is_kv_compressed; } std::vector get_input0_transpose_order() const { return m_order_q; } std::vector get_input1_transpose_order() const { return m_order_k; } @@ -49,6 +51,7 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { protected: bool m_is_causal; + bool m_is_kv_compressed; std::vector m_order_q; std::vector m_order_k; std::vector m_order_v; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp index 0cad36f62e47b9..c716198ad04ac0 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp @@ -18,14 +18,15 @@ class MultiTensorState : public VariableStateBase { }; // This is multi-tensor state for Indirect KV-Cache + Gemm pattern -// Internally it stores KV Cache state + Beam Table state +// Internally it stores KV Cache state + Beam Table state (+ scale state for kv cache compression) class VariableStateIndirectKVCache : public MultiTensorState { public: VariableStateIndirectKVCache(const VariableStateInfo& info, std::shared_ptr context, std::shared_ptr shape_predictor, size_t beam_idx, - size_t concat_idx); + size_t concat_idx, + bool has_compression_scale = false); using Ptr = std::shared_ptr; void reset() override; @@ -41,9 +42,13 @@ class VariableStateIndirectKVCache : public MultiTensorState { VariableState::Ptr get_beam_table_state() const; ov::PartialShape get_beam_table_shape(const ov::PartialShape& kv_cache_shape); + VariableState::Ptr get_compression_scale_state() const; + ov::PartialShape get_compression_scale_shape(const ov::PartialShape& kv_cache_shape); + private: size_t m_beam_axis = 0; size_t m_concat_axis = 0; + bool m_has_compression_scale = false; }; } // namespace intel_gpu diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp index 1dbd769444b1aa..1da4bc36712d71 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/primitives_list.hpp @@ -284,6 +284,7 @@ REGISTER_FACTORY(internal, RMS); REGISTER_FACTORY(internal, GatherCompressed); REGISTER_FACTORY(internal, KVCache); REGISTER_FACTORY(internal, ReadValue); +REGISTER_FACTORY(internal, CompressedReadValue); REGISTER_FACTORY(internal, Gemm); REGISTER_FACTORY(internal, SwiGLU); REGISTER_FACTORY(internal, IndirectGemm); diff --git 
a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index d93e2f86eed144..dfacfca5cfbd36 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -12,26 +12,26 @@ namespace cldnn { struct dynamic_quantize : public primitive_base { CLDNN_DECLARE_PRIMITIVE(dynamic_quantize); - dynamic_quantize() : primitive_base("", {}), group_size(0) {} + dynamic_quantize() : primitive_base("", {}), group_sizes{} {} /// @brief Constructs dynamic_quantize primitive /// @param id This primitive id /// @param input Input primitive id - /// @param group_size Quantization group size + /// @param group_sizes Quantization group size /// @param data_type Output data type of quantized /// @param output_size Output data size of the primitive dynamic_quantize(const primitive_id& id, const input_info& input, - const uint64_t group_size, + const std::vector& group_sizes, const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) : primitive_base(id, {input}, 2, data_types), - group_size(group_size) {} + group_sizes(group_sizes) {} - uint64_t group_size = 0; + std::vector group_sizes; size_t hash() const override { size_t seed = primitive::hash(); - seed = hash_combine(seed, group_size); + seed = hash_range(seed, group_sizes.begin(), group_sizes.end()); return seed; } @@ -41,17 +41,17 @@ struct dynamic_quantize : public primitive_base { auto rhs_casted = downcast(rhs); - return group_size == rhs_casted.group_size; + return group_sizes == rhs_casted.group_sizes; } void save(BinaryOutputBuffer& ob) const override { primitive_base::save(ob); - ob << group_size; + ob << group_sizes; } void load(BinaryInputBuffer& ib) override { primitive_base::load(ib); - ib >> group_size; + ib >> group_sizes; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp index f87041dcff66d6..f551f2737676e1 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp @@ -21,23 +21,27 @@ struct kv_cache : public primitive_base { const ov::op::util::VariableInfo& variable_info, const int64_t concat_axis, const int64_t gather_axis, - const bool indirect) + const bool indirect, + const bool compressed) : primitive_base(id, inputs) , variable_info(variable_info) , concat_axis(concat_axis) , gather_axis(gather_axis) - , indirect(indirect) {} + , indirect(indirect) + , compressed(compressed) {} ov::op::util::VariableInfo variable_info; int64_t concat_axis = 0; int64_t gather_axis = 0; bool indirect = false; + bool compressed = false; size_t hash() const override { size_t seed = primitive::hash(); seed = hash_combine(seed, concat_axis); seed = hash_combine(seed, gather_axis); seed = hash_combine(seed, indirect); + seed = hash_combine(seed, compressed); return seed; } @@ -50,7 +54,8 @@ struct kv_cache : public primitive_base { return variable_info == rhs_casted.variable_info && concat_axis == rhs_casted.concat_axis && gather_axis == rhs_casted.gather_axis && - indirect == rhs_casted.indirect; + indirect == rhs_casted.indirect && + compressed == rhs_casted.compressed; } void save(BinaryOutputBuffer& ob) const override { @@ -62,6 +67,7 @@ struct kv_cache : public primitive_base { ob << concat_axis; ob << 
gather_axis; ob << indirect; + ob << compressed; } void load(BinaryInputBuffer& ib) override { @@ -76,6 +82,7 @@ struct kv_cache : public primitive_base { ib >> concat_axis; ib >> gather_axis; ib >> indirect; + ib >> compressed; } }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp index d66012bfac8889..28eaa625832f94 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp @@ -16,9 +16,11 @@ struct scaled_dot_product_attention : public primitive_base inputs, bool is_causal, + bool is_kv_compressed = false, int64_t indirect_axis = -1, const std::vector& input_q_transpose_order = {}, const std::vector& input_k_transpose_order = {}, @@ -26,6 +28,7 @@ struct scaled_dot_product_attention : public primitive_base& output_transpose_order = {}) : primitive_base(id, inputs) , is_causal(is_causal) + , is_kv_compressed(is_kv_compressed) , indirect_axis(indirect_axis) , input_q_transpose_order(input_q_transpose_order) , input_k_transpose_order(input_k_transpose_order) @@ -34,12 +37,13 @@ struct scaled_dot_product_attention : public primitive_base 3; - has_scale_input = data_inputs_num > 4; + size_t scale_value_cnt = is_kv_compressed ? 2 : 0; + has_attn_mask_input = data_inputs_num > 3 + scale_value_cnt; + has_scale_input = data_inputs_num > 4 + scale_value_cnt; } bool is_causal = false; + bool is_kv_compressed = false; bool has_attn_mask_input = false; bool has_scale_input = false; int64_t indirect_axis = -1; @@ -52,6 +56,7 @@ struct scaled_dot_product_attention : public primitive_base(rhs); return is_causal == rhs_casted.is_causal && + is_kv_compressed == rhs_casted.is_kv_compressed && has_attn_mask_input == rhs_casted.has_attn_mask_input && has_scale_input == rhs_casted.has_scale_input && indirect_axis == rhs_casted.indirect_axis && @@ -81,6 +87,7 @@ struct scaled_dot_product_attention : public primitive_base::save(ob); ob << is_causal; + ob << is_kv_compressed; ob << has_attn_mask_input; ob << has_scale_input; ob << indirect_axis; @@ -93,6 +100,7 @@ struct scaled_dot_product_attention : public primitive_base::load(ib); ib >> is_causal; + ib >> is_kv_compressed; ib >> has_attn_mask_input; ib >> has_scale_input; ib >> indirect_axis; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index fbc8ae84c36a29..7ac2cbbbef543b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -145,6 +145,7 @@ class debug_configuration { std::vector dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size int disable_horizontal_fc_fusion; // Disable fc horizontal fusion + int enable_kv_cache_compression; // Enable KV cache compression std::set dump_iteration; // Dump n-th execution of network. 
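
The dynamic_quantize primitive above now carries a per-dimension group_sizes vector instead of a single scalar group_size, and the new enable_kv_cache_compression debug option drives i8 storage of the KV cache with f16 scales. The sketch below is a minimal host-side illustration of the scale-only (symmetric) scheme that the i8 data / f16 scale output pair suggests; the function name and flat-buffer layout are illustrative assumptions, not the plugin's actual implementation.

// Hypothetical illustration: quantize one row of KV data into int8 with one
// scale per quantization group (scale = max|x| / 127). A group covering the
// whole row corresponds to UINT64_MAX in the primitive's group_sizes entry.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

struct QuantizedRow {
    std::vector<int8_t> data;   // same element count as the input
    std::vector<float>  scales; // one scale per group
};

QuantizedRow quantize_row(const std::vector<float>& src, size_t group_size) {
    QuantizedRow out;
    out.data.resize(src.size());
    for (size_t start = 0; start < src.size(); start += group_size) {
        const size_t end = std::min(start + group_size, src.size());
        float max_abs = 0.f;
        for (size_t i = start; i < end; ++i)
            max_abs = std::max(max_abs, std::fabs(src[i]));
        const float scale = max_abs > 0.f ? max_abs / 127.f : 1.f;
        out.scales.push_back(scale);
        for (size_t i = start; i < end; ++i)
            out.data[i] = static_cast<int8_t>(std::lround(src[i] / scale)); // |q| <= 127
    }
    return out;
}

Dequantization is then simply int8 value times scale, which is what the SDPA kernel changes later in this patch do on the fly.
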
std::vector load_layers_raw_dump; // List of layers to load dumped raw binary and filenames static const debug_configuration *get_instance(); diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 5c945f4c2d389c..fec3c842a1d595 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -22,7 +22,7 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no } template -std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size) { +std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector& group_sizes) { ov::op::internal::DynamicQuantize op; auto output_format = act_layout.format; @@ -30,21 +30,19 @@ std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &a act_layout.get(), }; - std::vector shape_group_size(act_layout.get().size(), 1); - shape_group_size.back() = group_size; - - auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, shape_group_size); + auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, group_sizes); + GPU_DEBUG_TRACE_DETAIL << "shape infer dynamic" << output_shapes[0] << " " << output_shapes[1] << "\n"; return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, uint64_t group_size); +template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector& group_sizes); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); const auto& input_layout = impl_param.get_input_layout(); - return __calc_output_layouts(input_layout, UINT64_MAX /* TODO: handle group_size here */); + return __calc_output_layouts(input_layout, desc->group_sizes); } template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, @@ -56,6 +54,12 @@ std::string dynamic_quantize_inst::to_string(dynamic_quantize_node const& node) std::stringstream primitive_description; + json_composite dynamic_quantize_info; + dynamic_quantize_info.add("group size", desc->group_sizes); + dynamic_quantize_info.add("activation dt", desc->get_output_data_type(0).value_or(data_types::undefined)); + dynamic_quantize_info.add("scale dt", desc->get_output_data_type(1).value_or(data_types::undefined)); + + node_info->add("dynamic_quantize info", dynamic_quantize_info); node_info->dump(primitive_description); return primitive_description.str(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp index 4c1b1008434144..84f1b26507f19e 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/build_implementations.cpp @@ -6,6 +6,7 @@ #include "program_helpers.h" #include "intel_gpu/runtime/itt.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" using namespace cldnn; @@ -19,6 +20,7 @@ void build_implementations::run(program& p) { for (auto& n : p.get_processing_order()) { if (auto impl = n->get_selected_impl()) { auto params = n->get_kernel_impl_params(); 
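
Relating to the __calc_output_layouts change above, which now forwards the primitive's group_sizes into DynamicQuantize::shape_infer: the scale output can be thought of as the activation shape with every grouped dimension shrunk by its group size, where a group spanning the whole dimension (UINT64_MAX) collapses it to 1. This is only a sketch of that relationship under those assumptions, not the actual shape_infer code.

// Hypothetical sketch: per-dimension scale shape for grouped quantization.
#include <cstdint>
#include <vector>

std::vector<int64_t> scale_shape(const std::vector<int64_t>& data_shape,
                                 const std::vector<uint64_t>& group_sizes) {
    std::vector<int64_t> out(data_shape.size());
    for (size_t i = 0; i < data_shape.size(); ++i) {
        const uint64_t gs = group_sizes[i];
        if (gs == 1) {
            out[i] = data_shape[i];   // per-element, no grouping along this dim
        } else if (gs == UINT64_MAX || gs >= static_cast<uint64_t>(data_shape[i])) {
            out[i] = 1;               // one scale for the whole dimension
        } else {
            out[i] = data_shape[i] / static_cast<int64_t>(gs);
        }
    }
    return out;
}
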
+ GPU_DEBUG_TRACE << "add_kernels_source: " << params->desc->id << std::endl; cache.add_kernels_source(*params, impl->get_kernels_source()); } } @@ -26,6 +28,7 @@ void build_implementations::run(program& p) { for (auto& n : p.get_processing_order()) { if (auto impl = n->get_selected_impl()) { auto params = n->get_kernel_impl_params(); + GPU_DEBUG_TRACE << "init_kernels: " << params->desc->id << std::endl; impl->init_kernels(cache, *params); impl->reset_kernels_source(); } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 9f2895a008cfa8..ad8ffcdaadf0a6 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -847,8 +847,24 @@ void prepare_buffer_fusing::run(program& p) { if (!rv_prim) return; - if (kv_out_layout.data_type != rv_prim->get_output_layout().data_type) + int DISABLE_KV_OPT = 0; + static bool warned = false; + if (const auto env_var = std::getenv("DISABLE_KV_OPT")) { + std::istringstream ss(env_var); + ss >> DISABLE_KV_OPT; + + if (!warned) { + std::cout << "Opt " << DISABLE_KV_OPT << " for kvcache\n"; + warned = true; + } + } + + if (kv_out_layout.data_type != rv_prim->get_output_layout().data_type || DISABLE_KV_OPT) { + GPU_DEBUG_TRACE_DETAIL << node.id() << " can't optimize because of different formats: " << kv_out_layout.to_short_string() << " vs " << rv_prim->get_output_layout().to_short_string() << "\n"; return; + } else { + GPU_DEBUG_TRACE_DETAIL << node.id() << " can optimize because of different formats: " << kv_out_layout.to_short_string() << " vs " << rv_prim->get_output_layout().to_short_string() << "\n"; + } auto concat_axis = node.get_primitive()->concat_axis; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 762118ec5b1af2..19fff28a0e611e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -828,7 +828,7 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) { } } -kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset) { +kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset, const data_types dt) { const auto& pad = l.data_padding; const auto& vals_original = l.get_partial_shape(); @@ -868,7 +868,9 @@ kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor v pitch *= (reserved_in_mem_count + lp + up); } - return kernel_selector::data_tensor(vec, to_data_type(l.data_type), ks_layout); + data_types new_dt = (dt == data_types::undefined) ? 
l.data_type : dt; + + return kernel_selector::data_tensor(vec, to_data_type(new_dt), ks_layout); } kernel_selector::weights_tensor convert_weights_tensor(const layout& l, bool is_grouped) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h index 3ddb5bf8793c29..6c36e9002640c3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h @@ -101,7 +101,7 @@ kernel_selector::data_layout to_data_layout(format f); cldnn::format from_data_layout(kernel_selector::data_layout l); kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped); cldnn::format::type from_weights_layout(kernel_selector::weights_layout l); -kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset = tensor {}); +kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset = tensor {}, const data_types dt = data_types::undefined); kernel_selector::weights_tensor convert_weights_tensor(const layout& l, bool is_grouped = false); layout from_weights_tensor(const kernel_selector::weights_tensor& t); kernel_selector::activation_function get_kernel_selector_activation_param(activation_func activation_func); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index e4e4adfbb15452..d3e58cd7c2108d 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -65,21 +65,31 @@ struct kv_cache_impl : multi_stage_primitive { const size_t concat_stage = 0; const size_t beam_table_stage = 1; + const size_t scale_stage = 2; cldnn::memory::ptr beam_table_prev = nullptr; cldnn::memory::ptr beam_table_new = nullptr; + cldnn::memory::ptr compression_scale_prev = nullptr; + cldnn::memory::ptr scale_new = nullptr; + void load(BinaryInputBuffer& ib) override { parent::load(ib); if (is_dynamic()) { auto& kernel_selector = kernel_selector_t::Instance(); auto kernel_impl = kernel_selector.GetImplementation(_kernels_data[concat_stage].kernelName); kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[concat_stage]); - if (_kernels_data.size() == 2) { + if (_kernels_data.size() >= 2) { auto& bt_kernel_selector = bt_kernel_selector_t::Instance(); auto bt_kernel_impl = bt_kernel_selector.GetImplementation(_kernels_data[beam_table_stage].kernelName); bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[beam_table_stage]); } + // FIXME: indirectness and compression are orthogonal feature. + if (_kernels_data.size() == 3) { + auto& scale_kernel_selector = kernel_selector_t::Instance(); + auto scale_kernel_impl = scale_kernel_selector.GetImplementation(_kernels_data[scale_stage].kernelName); + scale_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[scale_stage]); + } } } void set_arguments_impl(kv_cache_inst& instance) override {} @@ -93,6 +103,10 @@ struct kv_cache_impl : multi_stage_primitive { } else if (stage == beam_table_stage) { args.inputs = { beam_table_prev, instance.input_memory_ptr(2) }; args.outputs = { beam_table_new }; + } else if (stage == scale_stage) { + // FIXME: indirectness and compression are orthogonal feature. 
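
The compression-scale handling in this kv_cache implementation (the scale_stage arguments here and the prev/new swap further down in execute) mirrors the existing beam-table path: the per-token scales live in their own state buffer that is swapped each iteration and over-allocated along the concatenation axis so several tokens can be appended before the next reallocation. Below is a simplified, self-contained sketch of that grow-and-swap pattern; the names and the prealloc step are illustrative, not the plugin's memory manager.

// Hypothetical sketch of the prev/new swap with over-allocation along the
// sequence axis, as used for the compression-scale buffers in this impl.
#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

struct ScaleState {
    std::vector<float> prev, next; // double-buffered per-token scales
    size_t length = 0;             // number of valid tokens

    void append(const std::vector<float>& new_token_scales, size_t prealloc = 128) {
        std::swap(prev, next);     // last iteration's output becomes this iteration's input
        const size_t required = length + new_token_scales.size();
        if (next.size() < required) {
            // Grow with headroom so the next few appends reuse the buffer.
            next.assign(required + prealloc, 0.f);
        }
        // Concatenate: copy the past scales, then the new token's scales.
        std::copy(prev.begin(), prev.begin() + length, next.begin());
        std::copy(new_token_scales.begin(), new_token_scales.end(), next.begin() + length);
        length = required;
    }
};
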
+ args.inputs = { compression_scale_prev, instance.input_memory_ptr(3) }; + args.outputs = { scale_new }; } return args; @@ -102,6 +116,7 @@ struct kv_cache_impl : multi_stage_primitive { stream& stream = instance.get_network().get_stream(); std::vector tmp_events(events); size_t kernel_offset = 0; + // FIXME: indirectness and compression are orthogonal feature. stage execution does not happen in sequential order for (size_t s = 0; s < stage; s++) { kernel_offset += _kernels_data[s].kernels.size(); } @@ -183,14 +198,46 @@ struct kv_cache_impl : multi_stage_primitive { beam_table_state->set(); } + if (desc->compressed) { + const auto scale_alloc_type = engine.get_preferred_memory_allocation_type(false); + auto comp_scale_state = + dynamic_cast(variable).get_compression_scale_state(); + auto comp_scale_layout = instance.get_impl_params()->output_layouts[2]; + auto comp_scale_shape = comp_scale_layout.get_shape(); + std::swap(compression_scale_prev, scale_new); + + if (!scale_new || scale_new->count() < ov::shape_size(comp_scale_shape)) { + auto alloc_shape = comp_scale_shape; + alloc_shape[desc->concat_axis] += instance.get_prealloc_iter_num(); + const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format}; + GPU_DEBUG_TRACE_DETAIL << "Realloc compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl; + scale_new = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false); + + // Alloc prev mem too as it will be needed in the future + // That also simplifies arguments setting a little bit as we don't need to handle an optional past state + if (!compression_scale_prev) { + compression_scale_prev = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false); + } + } + + instance.set_output_memory(scale_new, false, 2); + comp_scale_state->set_memory(scale_new, instance.get_impl_params()->output_layouts[2]); + + auto comp_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, comp_scale_state->is_set()); + (_kernels_data[scale_stage].update_dispatch_data_func)(comp_scale_kernel_params, _kernels_data[scale_stage]); + + execute_stage(events, instance, res_events, scale_stage); + comp_scale_state->set(); + } + variable.set(); if (can_be_optimized) { GPU_DEBUG_TRACE_DETAIL << desc->id << " : Output is same as variable memory! 
Skip copying " << std::endl; // When primitive is optimized, concat kernel writes directly to variable memory return stream.aggregate_events(res_events, res_events.size() > 1); } else { - // Othwerise, we need to copy result from out buffer to state memory - GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable meomry" << std::endl; + // Otherwise, we need to copy result from out buffer to state memory + GPU_DEBUG_TRACE_DETAIL << desc->id << " : Copying output to variable memory" << std::endl; stream.enqueue_barrier(); auto out = instance.get_network().get_engine().reinterpret_buffer(instance.output_memory(0), variable.get_memory()->get_layout()); @@ -213,6 +260,21 @@ struct kv_cache_impl : multi_stage_primitive { return layout{beam_table_shape, impl_param.output_layouts[1].data_type, format::get_default_format(beam_table_shape.size())}; } + static layout get_compression_scale_layout(const kernel_impl_params& impl_param) { + // FIXME: it is implemented in multiple places + GPU_DEBUG_GET_INSTANCE(debug_config); + const auto& primitive = impl_param.typed_desc(); + auto kv_layout = impl_param.get_input_layout(0); + auto kv_shape = kv_layout.get_partial_shape(); + auto comp_scale_shape = ov::PartialShape(std::vector(kv_shape.size(), 1)); + comp_scale_shape[0] = kv_shape[0]; + comp_scale_shape[1] = kv_shape[1]; + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression + comp_scale_shape[2] = kv_shape[2]; + } + return layout{comp_scale_shape, impl_param.output_layouts[2].data_type, format::get_default_format(comp_scale_shape.size())}; + } + static kernel_params_t get_concat_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { const auto& primitive = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_shape_agnostic); @@ -229,6 +291,21 @@ struct kv_cache_impl : multi_stage_primitive { const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, beam_table_past] const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present] + + GPU_DEBUG_TRACE_DETAIL << "Concat output start offset: " << in_offsets_map.size() << " " << out_offsets_map.size() << "\n"; + + + // for (const auto& in_offset : in_offsets_map) { + // if (impl_param.input_layouts.size() > in_offset.first) + // std::cout << in_offset.first << ". " << impl_param.input_layouts[in_offset.first].to_short_string() << ", input, offset=" << in_offset.second << "\n"; + // else + // std::cout << in_offset.first << ". NOPE " << ", input, offset=" << in_offset.second << "\n"; + // } + + // for (const auto& out_offset : out_offsets_map) { + // std::cout << out_offset.first << ". 
" << impl_param.output_layouts[out_offset.first].to_short_string() << ", output, offset=" << out_offset.second << "\n"; + // } + std::map in_tensor_to_offset_map = { {0, in_offsets_map.at(0)}, {1, in_offsets_map.at(1)}, @@ -237,6 +314,8 @@ struct kv_cache_impl : multi_stage_primitive { {0, out_offsets_map.at(0)}, }; + GPU_DEBUG_TRACE_DETAIL << "Concat output start offset: " << primitive->id << " " << out_offsets_map.at(0) << " layout: " << impl_param.output_layouts[0].to_string() << "\n"; + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); return params; @@ -264,10 +343,12 @@ struct kv_cache_impl : multi_stage_primitive { params.is_state_set = is_state_set; params.indirect_axis = indirect_axis; - const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, beam_table_past]] - const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present] + const bool compressed = impl_param.typed_desc()->compressed; + const auto beam_table_past_idx = compressed ? 4 : 3; + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, compression_scale_past, beam_table_past, compression_scale_new]] + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present, compression_scale_present] std::map in_tensor_to_offset_map = { - {0, in_offsets_map.at(3)}, // beam_table_past + {0, in_offsets_map.at(beam_table_past_idx)}, // beam_table_past {1, in_offsets_map.at(2)}, // beam_idx }; std::map out_tensor_to_offset_map = { @@ -279,17 +360,55 @@ struct kv_cache_impl : multi_stage_primitive { return params; } + static kernel_params_t get_compression_scale_update_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + const auto& primitive = impl_param.typed_desc(); + auto params = get_default_params(impl_param, is_shape_agnostic); + + auto inputs_count = 2; + auto comp_scale_present_layout = impl_param.output_layouts[2]; + layout comp_scale_past_layout = get_compression_scale_layout(impl_param); + + params.inputs.resize(inputs_count); + params.inputs[0] = convert_data_tensor(comp_scale_past_layout); + params.inputs[1] = convert_data_tensor(impl_param.input_layouts[3]); + params.outputs[0] = convert_data_tensor(comp_scale_present_layout); + + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + + // FIXME: need to handle the index properly when indirect is off + std::map in_tensor_to_offset_map = { + {0, in_offsets_map.at(5)}, // compression_scale_past + {1, in_offsets_map.at(3)}, // compression_scale_new + }; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(2)}, // compression_scale_present + }; + + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); + + return params; + } + static std::unique_ptr create(const typed_program_node& arg, const kernel_impl_params& impl_param) { std::vector kernels_data; + // if (arg.id().find("kvcache:__module.model.transformer.h.0.attn/aten::cat/Concat_4") != std::string::npos) + // std::cout << "mingyuki: create " << arg.id() << std::endl; auto concat_kernel_params = get_concat_kernel_params(impl_param, impl_param.is_dynamic()); auto& concat_kernel_selector = kernel_selector_t::Instance(); kernels_data.push_back(concat_kernel_selector.get_best_kernel(concat_kernel_params)); const bool indirect = 
impl_param.typed_desc()->indirect; + const bool compressed = impl_param.typed_desc()->compressed; if (indirect) { auto bt_update_kernel_params = get_bt_update_kernel_params(impl_param, false); auto& bt_update_kernel_selector = bt_kernel_selector_t::Instance(); kernels_data.push_back(bt_update_kernel_selector.get_best_kernel(bt_update_kernel_params)); } + if (compressed) { + auto comp_scale_update_kernel_params = get_compression_scale_update_kernel_params(impl_param, false); + auto& comp_scale_update_kernel_selector = kernel_selector_t::Instance(); + kernels_data.push_back(comp_scale_update_kernel_selector.get_best_kernel(comp_scale_update_kernel_params)); + } return cldnn::make_unique(kernels_data); } @@ -313,7 +432,7 @@ struct kv_cache_impl : multi_stage_primitive { namespace detail { attach_kv_cache_impl::attach_kv_cache_impl() { - auto types = { data_types::f16, data_types::f32 }; + auto types = { data_types::i8, data_types::f16, data_types::f32 }; auto formats = { format::bfyx }; implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index b33871110ec6b4..e3da7d5f551f87 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -133,6 +133,20 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_causal; + config.is_kv_compressed = desc->is_kv_compressed; + + GPU_DEBUG_TRACE << "Set is_kv_compressed to " << config.is_kv_compressed << "\n"; return config; } @@ -229,6 +246,9 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_kv_compressed) + data_inputs_num -= 2; // key and value compression scales are handled separately + params.inputs.resize(data_inputs_num); for (size_t i = 0; i < data_inputs_num; i++) { params.inputs[i] = convert_data_tensor(impl_param.get_input_layout(i)); @@ -246,15 +266,31 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveindirect_axis; } - params.set_dynamic_shape_offsets(); + if (desc->is_kv_compressed) { + params.key_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num)); + params.value_cache_comp_scale = convert_data_tensor(impl_param.get_input_layout(data_inputs_num + 1)); + } - // Need to adjust sdpa kernel offset to consider beam table input - if (has_indirect_inputs(impl_param)) { - auto out_offset = params.outputs[0].get_dynamic_shape_offset(); - if (indirect) - params.beam_table.SetDynamicShapeOffset(out_offset); + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; + std::map in_tensor_to_offset_map; + for (size_t i = 0; i < data_inputs_num; i++) { + in_tensor_to_offset_map[i] = in_offsets_map.at(i); + } + + const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; + std::map out_tensor_to_offset_map = { + {0, out_offsets_map.at(0)}, + }; + + params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); - params.outputs[0].SetDynamicShapeOffset(out_offset + kernel_selector::DataTensor::max_rank()); + if (desc->is_kv_compressed) { + params.key_cache_comp_scale.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num)); + params.value_cache_comp_scale.SetDynamicShapeOffset(in_offsets_map.at(data_inputs_num + 1)); + } + + if (indirect && has_indirect_inputs(impl_param)) { + 
params.beam_table.SetDynamicShapeOffset(get_beam_table_id(desc)); } return params; @@ -300,6 +336,7 @@ attach_scaled_dot_product_attention_impl::attach_scaled_dot_product_attention_im auto types = { data_types::f32, data_types::f16, + data_types::i8, }; auto formats = { diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index 49dd62c6332549..a9630fa9b9b25a 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,7 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(const layout &act_layout, uint64_t group_size); + static std::vector __calc_output_layouts(const layout &act_layout, const std::vector& group_size); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h index f3aa4de5ec34e1..3d3d998399bc63 100644 --- a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h @@ -26,6 +26,7 @@ struct typed_program_node : public typed_program_node_base { std::vector get_shape_info_input_layouts() const override { std::vector res; + GPU_DEBUG_TRACE_DETAIL << "get_shape_info_input_layouts, get_dependencies.size()=" << get_dependencies().size() << "\n"; for (size_t i = 0; i < get_dependencies().size(); i++) { const auto& d = get_dependency_with_port(i); res.push_back(d.first->get_output_layout(false, d.second)); @@ -35,6 +36,10 @@ struct typed_program_node : public typed_program_node_base { res.push_back(layout(ov::PartialShape::dynamic(4), data_types::i32, format::bfyx)); } + if (get_primitive()->compressed) { // insert an additional input with compressed_scale past layout + res.push_back(layout(ov::PartialShape::dynamic(4), data_types::f16, format::bfyx)); + } + return res; } }; diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 95cdd587cdf175..1a954d11c9e543 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -10,6 +10,7 @@ #include "primitive_type_base.h" #include #include +#include "to_string_utils.h" namespace cldnn { GPU_DEFINE_PRIMITIVE_TYPE_ID(kv_cache) @@ -25,7 +26,7 @@ layout kv_cache_inst::calc_output_layout(const kv_cache_node& node, kernel_impl_ } template -std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& /*node*/, kernel_impl_params const& impl_param) { +std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& node, kernel_impl_params const& impl_param) { auto desc = impl_param.typed_desc(); ov::intel_gpu::op::KVCache op; @@ -43,8 +44,15 @@ std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& /*no static const std::map ports_map = {{0, 0}, {1, 2}}; std::vector out_layouts; + // std::cout << "node: " << node.id() << " " << desc->num_outputs << " outputs" << std::endl; for (size_t i = 0; i < desc->num_outputs; i++) { - auto out_type = desc->output_data_types[i].value_or(impl_param.get_input_layout(ports_map.at(i)).data_type); + data_types out_type; + if (i == 0 && desc->compressed) // compressed tensor + out_type = data_types::i8; + else if (i == 2 && 
desc->compressed) // scale for compressed tensor + out_type = data_types::f16; + else + out_type = desc->output_data_types[i].value_or(impl_param.get_input_layout(ports_map.at(i)).data_type); out_layouts.emplace_back(output_shapes[i], out_type, impl_param.get_output_layout(i).format); } @@ -63,6 +71,7 @@ std::string kv_cache_inst::to_string(const kv_cache_node& node) { kv_cache_info.add("concat axis", node.get_primitive()->concat_axis); kv_cache_info.add("gather axis", node.get_primitive()->gather_axis); kv_cache_info.add("indirect", node.get_primitive()->indirect); + kv_cache_info.add("compressed", node.get_primitive()->compressed); node_info->add("kv_cache info", kv_cache_info); std::stringstream primitive_description; node_info->dump(primitive_description); @@ -92,6 +101,7 @@ void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) { size_t i = 0; // [kv_state, kv_new_token, [beam_idx, bt_past]] + // FIXME: do we need to handle compression scale value? for (i = 0; i < _node->get_dependencies().size(); i++) { const auto& node_in_lay = _node->get_input_layout(i); const auto& runtime_in_lay = params.input_layouts[i]; @@ -113,10 +123,27 @@ void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) { bt_layout.set_partial_shape(bt_shape); } - GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i << "]" << std::endl; + GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i++ << "]" << std::endl; fill_shape_info_data(bt_layout, bt_state->get_initial_layout(), shape_info_ptr, offset); } + if (params.typed_desc()->compressed) { + auto& var = dynamic_cast(get_network().get_variable(variable_id())); + const auto& scale_state = var.get_compression_scale_state(); + auto scale_layout = scale_state->get_layout(); + if (scale_layout.is_dynamic()) { + auto bt_shape = scale_layout.get_partial_shape(); + for (auto& d : bt_shape) { + if (d.is_dynamic()) + d = 0; + } + scale_layout.set_partial_shape(bt_shape); + } + + GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i++ << "]" << std::endl; + fill_shape_info_data(scale_layout, scale_state->get_initial_layout(), shape_info_ptr, offset); + } + for (size_t i = 0; i < _node->get_output_layouts().size(); i++) { GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for output[" << i << "]" << std::endl; const auto& node_out_lay = _node->get_output_layout(i); diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index d69a2767c018f6..18bf624e737b90 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -296,7 +296,7 @@ void primitive_inst::update_shape() { auto new_layout = variable.get_layout(); // If variable is not set and we have an initializer - use it's shape as shape of variable - if (!variable.is_set() && _impl_params->input_layouts.size() == 1) { + if (!variable.is_set() && _impl_params->input_layouts.size() >= 1) { new_layout = _impl_params->get_input_layout(0); } @@ -311,6 +311,7 @@ void primitive_inst::update_shape() { new_layout.set_partial_shape(pshape); } + GPU_DEBUG_TRACE_DETAIL << id() << " set new layout " << new_layout.to_short_string() << "\n"; variable.set_layout(new_layout); if (!_impl_params->state_layout.has_value() || _impl_params->state_layout.value() != new_layout) { @@ -644,7 +645,8 @@ event::ptr primitive_inst::realloc_if_needed() { // dynamic quantization is only applied to activation of FC if 
(get_node().is_type()) { - auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], 0); + const auto& desc = get_node().as().get_primitive(); + auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], desc->group_sizes); GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " << dyn_quan_scale_layout[1].to_short_string() << std::endl; updated_params.output_layouts[1] = dyn_quan_scale_layout[1]; @@ -748,6 +750,7 @@ event::ptr primitive_inst::realloc_if_needed() { auto& present_layout = _impl_params->output_layouts[i]; const auto present_layout_rank = present_layout.get_partial_shape().size(); const auto sequence_axis = kv_cache_inst::get_sequence_axis(desc->concat_axis, present_layout_rank); + GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << present_layout.to_short_string() << " " << _max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(present_layout, _max_output_layout_count[i], sequence_axis, @@ -809,6 +812,7 @@ event::ptr primitive_inst::realloc_if_needed() { } if (present_layout.data_padding._dynamic_dims_mask[sequence_axis] == 1) { // Apply padding of variable to make it be optimized in the next iteration + GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << present_layout.to_short_string() << " " << _max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(present_layout, _max_output_layout_count[0], sequence_axis, @@ -1220,6 +1224,7 @@ void primitive_inst::do_runtime_in_place_kv_cache() { GPU_DEBUG_TRACE_DETAIL << "[do runtime kv_cache opt] " << id() << " initial present_layout : " << present_layout.to_string() << std::endl; GPU_DEBUG_TRACE_DETAIL << "[do runtime kv_cache opt] " << id() << " initial past_layout : " << past_layout.to_string() << std::endl; + GPU_DEBUG_TRACE_DETAIL << "get_max_pad: " << past_layout.to_short_string() << " " << _deps[0].first->_max_output_layout_count[0] << " " << sequence_axis << "\n"; auto max_pad = kv_cache_inst::get_max_pad(past_layout, _deps[0].first->_max_output_layout_count[0], sequence_axis, "past_layout"); const auto new_seq_len = static_cast(new_layout.get_shape()[sequence_axis]); // In chatbot scenario, when chat history must be stored in kvcache, new_seq_len may not be 1 even if max_pad is greater than 0 @@ -1883,6 +1888,9 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool _dynamic_impl = _impl->clone(); } } + if (_node) { + GPU_DEBUG_TRACE_DETAIL << _node->type()->to_string(*_node) << "\n"; + } _impl_params->strm = _network.get_stream_ptr(); for (size_t i = 0; i < get_node().get_output_layouts().size(); ++i) { if (_outputs.size() > i) { diff --git a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp index e8e213ad97011a..0d981d18f8c179 100644 --- a/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/scaled_dot_product_attention.cpp @@ -87,6 +87,7 @@ std::string scaled_dot_product_attention_inst::to_string(scaled_dot_product_atte json_composite scaled_dot_product_attention_info; scaled_dot_product_attention_info.add("input id", input.id()); scaled_dot_product_attention_info.add("is_causal", desc->is_causal); + scaled_dot_product_attention_info.add("is_kv_compressed", desc->is_kv_compressed); scaled_dot_product_attention_info.add("has_attn_mask_input", desc->has_attn_mask_input); 
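
The sdpa_opt.cl changes that follow keep K and V in int8 and dequantize on the fly: each loaded block is converted back to floating point and multiplied by the per-token (or, with COMPRESSED_PER_HEAD, per-token-per-head) scale before it enters the QK^T and softmax*V accumulations. The snippet below restates that arithmetic as a scalar C++ reference; it is only an illustration of the math, not the kernel's vectorized subgroup code.

// Scalar reference for the dequantize-then-accumulate pattern used by the
// compressed SDPA kernel: q is floating point, k/v are int8 with one scale per token.
#include <cstdint>
#include <vector>

float qk_dot(const std::vector<float>& q,         // query row, HEAD_SIZE elements
             const std::vector<int8_t>& k_token,  // compressed key token
             float key_scale) {                   // per-token key scale
    float acc = 0.f;
    for (size_t i = 0; i < q.size(); ++i)
        acc += q[i] * (static_cast<float>(k_token[i]) * key_scale); // dequantize inline
    return acc;
}

void accumulate_output(std::vector<float>& out,       // HEAD_SIZE accumulator
                       float softmax_weight,          // one softmax(QK^T) entry
                       const std::vector<int8_t>& v_token,
                       float value_scale) {           // per-token value scale
    for (size_t i = 0; i < out.size(); ++i)
        out[i] += softmax_weight * (static_cast<float>(v_token[i]) * value_scale);
}
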
scaled_dot_product_attention_info.add("has_scale_input", desc->has_scale_input); scaled_dot_product_attention_info.add("input_q_transpose_order", desc->input_q_transpose_order); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 948bd3c0f1a305..53329e0ec33a60 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -136,6 +136,10 @@ KERNEL(sdpa_opt)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -230,14 +234,34 @@ KERNEL(sdpa_opt)( INPUT0_TYPE acc[TARGET_SEQ_LEN_BLOCK_SIZE] = {INPUT0_VAL_ZERO}; +#if IS_KV_COMPRESSED +#ifndef INPUT1_DIMS_ORDER + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, b1_idx, 0); +#else + const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, 0, 0); +#endif + KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; +#endif + uint head_idx_index = 0; #define KEY_BLOCK_SIZE 8 for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) + +#if IS_KV_COMPRESSED + KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -258,9 +282,16 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) +#if IS_KV_COMPRESSED + KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -281,9 +312,16 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * 
KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) +#if IS_KV_COMPRESSED + KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -304,9 +342,16 @@ KERNEL(sdpa_opt)( for (; head_idx_index + (KEY_BLOCK_SIZE * SUBGROUP_SIZE) <= HEAD_SIZE; head_idx_index += SUBGROUP_SIZE * KEY_BLOCK_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, KEY_BLOCK_SIZE, ptr, offset); #define KEY_BLOCK MAKE_VECTOR_TYPE(INPUT1_TYPE, KEY_BLOCK_SIZE) + #define KEY_BLOCK_UNCOMPRESSED MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, KEY_BLOCK_SIZE) + #define TO_KEY_BLOCK_UNCOMPRESSED_TYPE(val) CAT(convert_, KEY_BLOCK_UNCOMPRESSED)(val) #define QUERY_BLOCK MAKE_VECTOR_TYPE(INPUT0_TYPE, KEY_BLOCK_SIZE) +#if IS_KV_COMPRESSED + KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); + KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#endif uint query_offset = head_idx_index + sglid; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -475,16 +520,38 @@ KERNEL(sdpa_opt)( #endif #endif +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1] + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0); +#else + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#endif + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len * SUBGROUP_SIZE + sglid]; } unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { +#if IS_KV_COMPRESSED +#ifdef BEAM_TABLE_TYPE + INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); +#else + INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, value_offset); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_val = value_val_compressed * sub_group_broadcast(value_comp_scale, i); +#else #ifdef BEAM_TABLE_TYPE INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); #else INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); +#endif #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc[seq_idx]); @@ 
-509,12 +576,30 @@ KERNEL(sdpa_opt)( const uint value_offset = INPUT2_GET_INDEX(b0_idx, b1_idx, start_partition_idx + seq_len, head_size_idx); #endif +#if IS_KV_COMPRESSED +#ifndef INPUT2_DIMS_ORDER + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1] + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, b1_idx, 0); +#else + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, 0, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#endif + OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { qk_val[seq_idx] = qk_local[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len]; } +#if IS_KV_COMPRESSED + INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, value_offset); + VALUE_COMPRESSION_SCALE_TYPE value_val = value_val_compressed * value_comp_scale; +#else INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); +#endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc[seq_idx] = mad(qk_val[seq_idx], value_val, acc[seq_idx]); @@ -675,6 +760,10 @@ KERNEL(sdpa_opt)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -862,10 +951,23 @@ KERNEL(sdpa_opt)( PA_BUFFERS); if (seq_len_calc_size >= SUBGROUP_SIZE) { +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); +#else + const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); +#endif + KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; + // printf("[0]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); +#endif + __attribute__((opencl_unroll_hint(1))) for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); - #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) QUERY_VEC queries_vec; uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; @@ -875,10 +977,19 @@ KERNEL(sdpa_opt)( } unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { +#if IS_KV_COMPRESSED +#ifdef BEAM_TABLE_TYPE + INPUT1_TYPE key_vals_compressed = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); +#else + INPUT1_TYPE key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); +#endif + KEY_COMPRESSION_SCALE_TYPE key_vals = TO_KEY_COMPRESSION_SCALE_TYPE(key_vals_compressed) * sub_group_broadcast(key_comp_scale, key_row_idx); +#else #ifdef BEAM_TABLE_TYPE INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); #else INPUT1_TYPE key_vals = 
KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); +#endif #endif unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { @@ -887,12 +998,29 @@ KERNEL(sdpa_opt)( } } } else if (seq_len_calc_size > 0) { +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); +#else + const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); +#endif + KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; + // printf("[1]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); +#endif __attribute__((opencl_unroll_hint(1))) for (uint head_idx_index = 0; head_idx_index < HEAD_SIZE; head_idx_index += SUBGROUP_SIZE) { - #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset); - #define QUERY_VEC MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) + #define KEY_BLOCK_READ(ptr, offset) BLOCK_READN(INPUT1_TYPE, 1, ptr, offset) + #define QUERY_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#if IS_KV_COMPRESSED + #define KEY_VEC_TYPE MAKE_VECTOR_TYPE(KEY_COMPRESSION_SCALE_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#else + #define KEY_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) +#endif - QUERY_VEC queries_vec; + QUERY_VEC_TYPE queries_vec; uint query_local_offset = (head_idx_index * TARGET_SEQ_LEN_BLOCK_SIZE) + sglid; unroll_for (uint q_row_idx = 0; q_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; q_row_idx++) { queries_vec[q_row_idx] = slm_query[query_local_offset]; @@ -900,18 +1028,41 @@ KERNEL(sdpa_opt)( } #ifndef LOAD_KEY_LEFTOVERS_IN_CALC_LOOP - QUERY_VEC key_vec = 0; + KEY_VEC_TYPE key_vec = 0; unroll_for (uint key_row_idx = 0; key_row_idx < seq_len_calc_size; key_row_idx++) { + #if IS_KV_COMPRESSED + #ifdef BEAM_TABLE_TYPE + key_vec[key_row_idx] = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index)); + #else + key_vec[key_row_idx] = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); + #endif + key_vec[key_row_idx] *= sub_group_broadcast(key_comp_scale, key_row_idx); + #else #ifdef BEAM_TABLE_TYPE key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); #else key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); + #endif #endif } #endif unroll_for (uint key_row_idx = 0; key_row_idx < TARGET_SEQ_LEN_BLOCK_SIZE; key_row_idx++) { #ifdef LOAD_KEY_LEFTOVERS_IN_CALC_LOOP + #if IS_KV_COMPRESSED + #ifdef BEAM_TABLE_TYPE + KEY_COMPRESSION_SCALE_TYPE key_vals = 0; + if (key_row_idx < seq_len_calc_size) + key_vals = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index)); + #else + KEY_COMPRESSION_SCALE_TYPE key_vals = 0; + if (key_row_idx < seq_len_calc_size) { + key_vals = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); + // printf("_%d %d %d. 
Loads key [%d] = %f\n", get_global_id(0), get_global_id(1), get_global_id(2), key_offset + key_row_idx * key_pitch + head_idx_index, key_vals); + } + #endif + key_vals *= sub_group_broadcast(key_comp_scale, key_row_idx); + #else #ifdef BEAM_TABLE_TYPE INPUT1_TYPE key_vals = 0; if (key_row_idx < seq_len_calc_size) @@ -921,6 +1072,7 @@ KERNEL(sdpa_opt)( if (key_row_idx < seq_len_calc_size) key_vals = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); #endif + #endif #else #define key_vals key_vec[key_row_idx] #endif @@ -947,6 +1099,8 @@ KERNEL(sdpa_opt)( qk_acc[i] = INPUT0_MIN_FUNC(INPUT0_MAX_FUNC(qk_acc[i], INPUT0_VAL_MIN), INPUT0_VAL_MAX); + // printf("%d %d %d. qk_acc=%v16f\n", get_global_id(0), get_global_id(1), get_global_id(2), qk_acc); + qk_max = SOFTMAX_ACCUMULATOR_MAX_FUNC(qk_max, TO_SOFTMAX_ACCUMULATOR_TYPE(qk_acc[i])); } } @@ -1006,6 +1160,16 @@ KERNEL(sdpa_opt)( barrier(CLK_LOCAL_MEM_FENCE); } + // if (get_global_id(0) == 0 && get_global_id(1) == 0 && get_global_id(2) == 0) { + // for (int i = 0; i < TARGET_SEQ_LEN_BLOCK_SIZE; i++) { + // printf("row %d: ", i); + // for (int j = 0; j < 16; j++) { + // printf("%f ", slm_qk_vals[SEQ_LEN_PARTITION_SIZE * i + j]); + // } + // printf("\n"); + // } + // } + { // QK*V calculation MAKE_VECTOR_TYPE(OUTPUT_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) acc_output_res = OUTPUT_VAL_ZERO; @@ -1050,15 +1214,36 @@ KERNEL(sdpa_opt)( qk_val[seq_idx] = slm_qk_vals[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len + sglid]; } +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, b1_idx, 0); +#else + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, 0, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; + // printf("[0]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); +#endif + unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { +#define UNCOMPPRESED_VALUE_TYPE half +#define TO_UNCOMPPRESED_VALUE_TYPE(val) convert_half(val) #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i))); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, value_offset)); +#endif + +#if IS_KV_COMPRESSED + value_val *= sub_group_broadcast(value_comp_scale, i); #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); } +#undef UNCOMPPRESED_VALUE_TYPE +#undef TO_UNCOMPPRESED_VALUE_TYPE #ifndef BEAM_TABLE_TYPE value_offset += value_pitch; @@ -1093,20 +1278,42 @@ KERNEL(sdpa_opt)( #endif #endif +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0); +#else + const uint value_scale_comp_offset = 
GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; + // printf("[1]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); +#endif + MAKE_VECTOR_TYPE(OUTPUT_TYPE, TARGET_SEQ_LEN_BLOCK_SIZE) qk_val; unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { qk_val[seq_idx] = slm_qk_vals[seq_idx * SEQ_LEN_PARTITION_SIZE + seq_len * SUBGROUP_SIZE + sglid]; } unroll_for (uint i = 0; i < SUBGROUP_SIZE; i++) { +#define UNCOMPPRESED_VALUE_TYPE half +#define TO_UNCOMPPRESED_VALUE_TYPE(val) convert_half(val) #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i))); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, value_offset)); +#endif + +#if IS_KV_COMPRESSED + value_val *= sub_group_broadcast(value_comp_scale, i); #endif + unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); } +#undef UNCOMPPRESED_VALUE_TYPE +#undef TO_UNCOMPPRESED_VALUE_TYPE #ifndef BEAM_TABLE_TYPE value_offset += value_pitch; @@ -1144,17 +1351,38 @@ KERNEL(sdpa_opt)( #endif #endif +#if IS_KV_COMPRESSED +#ifndef BEAM_TABLE_TYPE + const uint b_idx = b0_idx; +#endif +#ifdef COMPRESSED_PER_HEAD + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, b1_idx, 0); +#else + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, 0, 0); +#endif + VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; + // printf("[2]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); +#endif + for (uint seq_len_idx = 0; seq_len_idx < partition_seq_len - seq_len_leftovers_start; seq_len_idx++) { +#define UNCOMPPRESED_VALUE_TYPE half +#define TO_UNCOMPPRESED_VALUE_TYPE(val) convert_half(val) #ifdef BEAM_TABLE_TYPE - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, seq_len_idx)); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, seq_len_idx))); #else - INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); + UNCOMPPRESED_VALUE_TYPE value_val = TO_UNCOMPPRESED_VALUE_TYPE(VALUE_BLOCK_READ(value_input, value_offset)); +#endif + +#if IS_KV_COMPRESSED + value_val *= sub_group_broadcast(value_comp_scale, seq_len_idx); #endif for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], seq_len_idx), value_val, acc_output_res[seq_idx]); } +#undef UNCOMPPRESED_VALUE_TYPE +#undef TO_UNCOMPPRESED_VALUE_TYPE #ifndef BEAM_TABLE_TYPE value_offset += value_pitch; #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl index 
83e3c7c7e9fef1..328f2b029a5425 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl @@ -125,6 +125,10 @@ KERNEL(sdpa_ref)( const __global INPUT4_TYPE* scale, #endif __global OUTPUT_TYPE* output, +#if IS_KV_COMPRESSED + const __global KEY_COMPRESSION_SCALE_TYPE* key_scale, + const __global VALUE_COMPRESSION_SCALE_TYPE* val_scale, +#endif #ifdef BEAM_TABLE_TYPE const __global BEAM_TABLE_TYPE* beam_table, #endif @@ -162,7 +166,18 @@ KERNEL(sdpa_ref)( #else INPUT0_TYPE q_val = query_input[query_offset]; #endif +#if IS_KV_COMPRESSED + INPUT1_TYPE k_val_comp = key_input[key_offset]; + half k_val = (half)k_val_comp; +#ifdef COMPRESSED_PER_HEAD + const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, s, 0, 0, b1, 0); +#else + const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, s, 0, 0, 0, 0); +#endif + k_val *= key_scale[key_scale_comp_offset]; +#else INPUT1_TYPE k_val = key_input[key_offset]; +#endif acc += q_val * k_val; } @@ -236,7 +251,19 @@ KERNEL(sdpa_ref)( #endif uint value_offset = FUNC_CALL(get_input2_index)(OPTIONAL_SHAPE_INFO_TENSOR b_idx, b1, 0, 0, s, head_size_idx); +#if IS_KV_COMPRESSED + INPUT2_TYPE __value = value_input[value_offset]; + half value = (half)__value; + #ifdef COMPRESSED_PER_HEAD + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, s, 0, 0, b1, 0); + #else + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, s, 0, 0, 0, 0); + #endif + value *= val_scale[value_scale_comp_offset]; + acc += tmp_buf[tmp_buf_offset] * value; +#else acc += tmp_buf[tmp_buf_offset] * value_input[value_offset]; +#endif } uint output_offset = OUTPUT_GET_INDEX(b0, b1, target_seq_idx, head_size_idx); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp index d4b5268eaee4e4..7d0edc0100dff2 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt.cpp @@ -14,7 +14,7 @@ static std::pair get_input_bf_size(const dynamic_quantize_params size_t input_f = params.inputs[0].Feature().v; size_t input_batch = params.inputs[0].Batch().v; // 3D input - if (params.outputs[0].GetLayout() == DataLayout::bfyx) { + if (params.outputs[0].GetLayout() == DataLayout::bfyx && false) { input_f = params.inputs[0].Y().v * params.inputs[0].X().v; input_batch = params.inputs[0].Batch().v * params.inputs[0].Feature().v; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index 3b214848e2f8ad..b766092758c792 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -35,6 +35,9 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); dispatchData.gws = {params.outputs[0].Batch().v * params.outputs[0].Feature().v, 1, 1}; + 
GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression + dispatchData.gws[1] = params.outputs[0].Y().v; + } dispatchData.lws = {1, 1, 1}; return dispatchData; @@ -90,6 +93,17 @@ KernelsPriority DynamicQuantizeKernelRef::GetKernelsPriority(const Params& /*par return FORCE_PRIORITY_8; } +// TODO: need this func? +Datatype DynamicQuantizeKernelRef::GetAccumulatorType(const dynamic_quantize_params& params) const { + Datatype types[] = { Datatype::F32, Datatype::F16, Datatype::INT64, Datatype::INT32, Datatype::UINT32}; + + for (Datatype type : types) + for (auto& in : params.inputs) + if (in.GetDType() == type) + return type; + return Datatype::F32; +} + bool DynamicQuantizeKernelRef::Validate(const Params& params) const { if (!KernelBaseOpenCL::Validate(params)) return false; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp index 6ca9fbd2f5bd76..aba81c1827c4d7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_selector.cpp @@ -9,7 +9,7 @@ namespace kernel_selector { dynamic_quantize_kernel_selector::dynamic_quantize_kernel_selector() { Attach(); - Attach(); + // Attach(); } KernelsData dynamic_quantize_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp index 7556debd29df00..218e736cc70e14 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp @@ -4,6 +4,7 @@ #include "sdpa_kernel_base.h" #include "kernel_selector_utils.h" +#include "intel_gpu/runtime/debug_configuration.hpp" namespace kernel_selector { @@ -66,6 +67,7 @@ static std::string GetBroadcastInputStr(const size_t input_rank, const int64_t a } JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { + GPU_DEBUG_GET_INSTANCE(debug_config); auto jit = MakeBaseParamsJitConstants(params); if (params.conf.broadcast_axis != -1) { @@ -73,6 +75,8 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { jit.AddConstant(MakeJitConstant("DO_BROADCAST_KEY_VALUE", GetBroadcastInputStr(params.inputs[0].GetDims().size(), params.conf.broadcast_axis, params.conf.group_size))); + } else { + jit.AddConstant(MakeJitConstant("BROADCAST_GROUP_SIZE", 1)); } jit.AddConstant(MakeJitConstant("IS_CAUSAL", params.conf.is_causal)); @@ -81,6 +85,16 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { jit.AddConstant(MakeJitConstant("HAS_SCALE_INPUT", params.inputs.size() > 4)); } + jit.AddConstant(MakeJitConstant("IS_KV_COMPRESSED", params.conf.is_kv_compressed)); + if (params.conf.is_kv_compressed) { + jit.AddConstant(MakeJitConstant("KEY_COMPRESSION_SCALE", params.key_cache_comp_scale)); + jit.AddConstant(MakeJitConstant("VALUE_COMPRESSION_SCALE", params.value_cache_comp_scale)); + } + + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // FIXME: it should be placed in params + jit.AddConstant(MakeJitConstant("COMPRESSED_PER_HEAD", 1)); + } + auto is_default_order = [](const std::vector& order) { for (size_t i = 0; i < 
order.size(); i++) if (order[i] != static_cast(i)) @@ -140,6 +154,7 @@ bool SDPAKernelBase::Validate(const Params& p) const { if (params.outputs[0].Dimentions() != 4) return false; + // FIXME: i8 input is supported only when kv cache is compressed return true; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index 6ea8d85527d19d..e510cba6480964 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -88,6 +88,7 @@ struct sdpa_configuration { bool is_causal = false; bool has_alibi_input = false; + bool is_kv_compressed = false; // Paged Attention configuration bool is_paged_attention = false; @@ -110,6 +111,8 @@ struct sdpa_params : public base_params { int64_t indirect_axis = -1; DataTensor beam_table; + DataTensor key_cache_comp_scale; + DataTensor value_cache_comp_scale; sdpa_configuration conf; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index 04ebf2f2165973..d615b97d75e0a5 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -342,6 +342,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const { if (params.conf.head_size > 256) return false; + if (params.conf.is_kv_compressed) + return false; + return true; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp index 2f0174d0a45912..2422dad45651ff 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp @@ -4,6 +4,7 @@ #include "sdpa_kernel_opt.h" #include "kernel_selector_utils.h" +#include "common_types.h" #include #include @@ -126,6 +127,7 @@ static std::string GetKernelName(std::string base_name, KernelsTypes type, const ParamsKey SDPAKernelOpt::GetSupportedKey() const { ParamsKey k; + k.EnableInputDataType(Datatype::INT8); // For KV cache compression k.EnableInputDataType(Datatype::F16); k.EnableInputDataType(Datatype::F32); k.EnableInputDataType(Datatype::INT32); @@ -307,8 +309,21 @@ KernelsData SDPAKernelOpt::GetKernelsData(const Params& params) const { static_cast(prim_params.outputs.size()), prim_params.is_shape_agnostic); - if (prim_params.indirect_axis != -1 && kernel_idx != KernelsTypes::FINALIZATION) - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(prim_params.inputs.size())}); + auto beam_table_idx = prim_params.inputs.size(); + if (prim_params.conf.is_kv_compressed && kernel_idx != KernelsTypes::FINALIZATION) { + GPU_DEBUG_TRACE_DETAIL << "COMPRESSED???\n"; + auto key_cache_compression_scale_idx = static_cast(prim_params.inputs.size()); + auto value_cache_compression_scale_idx = static_cast(prim_params.inputs.size() + 1); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx}); + + beam_table_idx += 2; + } + + if (prim_params.indirect_axis != -1 && kernel_idx != KernelsTypes::FINALIZATION) { + 
kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(beam_table_idx)}); + } kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); @@ -391,6 +406,6 @@ void SDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { } KernelsPriority SDPAKernelOpt::GetKernelsPriority(const Params& params) const { - return params.engineInfo.supports_immad ? FORCE_PRIORITY_2 : FORCE_PRIORITY_1; + return FORCE_PRIORITY_1; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp index 579c4bc06c17e2..18bed177cdae1d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp @@ -12,10 +12,12 @@ namespace kernel_selector { ParamsKey SDPAKernelRef::GetSupportedKey() const { ParamsKey k; k.EnableInputDataType(Datatype::F16); + k.EnableInputDataType(Datatype::INT8); k.EnableInputDataType(Datatype::F32); // beam table input k.EnableInputDataType(Datatype::INT32); + // FIXME: support for compressed KV cache k.EnableOutputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F32); @@ -74,8 +76,20 @@ KernelsData SDPAKernelRef::GetKernelsData(const Params& params) const { "", false, false, static_cast(prim_params.inputs.size()), GetFusedPrimitiveInputsCount(params), 1, prim_params.is_shape_agnostic); - if (prim_params.indirect_axis != -1) - kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(prim_params.inputs.size())}); + auto beam_table_idx = prim_params.inputs.size(); + if (prim_params.conf.is_kv_compressed) { + auto key_cache_compression_scale_idx = static_cast(prim_params.inputs.size()); + auto value_cache_compression_scale_idx = static_cast(prim_params.inputs.size() + 1); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, key_cache_compression_scale_idx}); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, value_cache_compression_scale_idx}); + + beam_table_idx += 2; + } + + if (prim_params.indirect_axis != -1) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, static_cast(beam_table_idx)}); + } kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp index e65fd7fd10976b..906ad3dcb23d5e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp @@ -13,11 +13,21 @@ namespace kernel_selector { sdpa_kernel_selector::sdpa_kernel_selector() { - Attach(); - Attach(); -#ifdef ENABLE_ONEDNN_FOR_GPU - Attach(); -#endif + int USE_REF = 0; + if (const auto env_var = std::getenv("USE_REF")) { + std::istringstream ss(env_var); + ss >> USE_REF; + } + + if (!USE_REF) { + Attach(); + Attach(); + #ifdef ENABLE_ONEDNN_FOR_GPU + // Attach(); + #endif + } else { + Attach(); + } } KernelsData sdpa_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp index 
7574b664b6b4b7..029e8fd4bfbf81 100644 --- a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp @@ -34,14 +34,27 @@ VariableStateIndirectKVCache::VariableStateIndirectKVCache(const VariableStateIn RemoteContextImpl::Ptr context, std::shared_ptr shape_predictor, size_t beam_axis, - size_t concat_axis) + size_t concat_axis, + bool has_compression_scale) : MultiTensorState { {info}, context, shape_predictor} , m_beam_axis(beam_axis) - , m_concat_axis(concat_axis) { + , m_concat_axis(concat_axis) + , m_has_compression_scale(has_compression_scale) { + cldnn::layout beam_table_layout(get_beam_table_shape(info.m_layout.get_partial_shape()), ov::element::i32, cldnn::format::bfyx); VariableStateInfo beam_table_state_info(info.m_id + "/beam_table", beam_table_layout); m_hidden_states.push_back(std::make_shared(beam_table_state_info, context, shape_predictor)); - OPENVINO_ASSERT(m_hidden_states.size() == 2, "[GPU] VariableStateIndirectKVCache expects 2 internal states to be initialized"); + + if (has_compression_scale) { + cldnn::layout compression_scale_layout(get_compression_scale_shape(info.m_layout.get_partial_shape()), ov::element::f16, cldnn::format::bfyx); + VariableStateInfo compression_scale_state_info(info.m_id + "/comp_scale", compression_scale_layout); + m_hidden_states.push_back(std::make_shared(compression_scale_state_info, context, shape_predictor)); + } + + OPENVINO_ASSERT((!has_compression_scale && m_hidden_states.size() == 2) || + (has_compression_scale && m_hidden_states.size() == 3) + , "[GPU] VariableStateIndirectKVCache expects 2 or 3 internal states to be initialized: " + "has_compression_scale=", has_compression_scale, " internal_states=", m_hidden_states.size()); } void VariableStateIndirectKVCache::reset() { @@ -60,6 +73,7 @@ const cldnn::layout& VariableStateIndirectKVCache::get_layout() const { } void VariableStateIndirectKVCache::set_state(const ov::SoPtr& state) { + OPENVINO_ASSERT(!m_has_compression_scale, "[GPU] set_state API is supported only when KV cache compression is disabled"); OPENVINO_ASSERT(m_hidden_states.size() == 2, "[GPU] Corrupted VariableStateIndirectKVCache. Expected 2 internal states.
Got: ", m_hidden_states.size()); m_hidden_states[0]->set_state(state); // user can set only KV cache @@ -109,6 +123,8 @@ static void rearrange_cache(cldnn::memory::ptr kv_in_mem, cldnn::memory::ptr bt_ } ov::SoPtr VariableStateIndirectKVCache::get_state() const { + OPENVINO_ASSERT(!m_has_compression_scale, "[GPU] get_state API is supported only when KVcache compression is disabled"); + auto kv_layout = m_hidden_states[0]->get_layout(); auto bt_mem = m_hidden_states[1]->get_memory(); if (kv_layout.get_partial_shape()[m_beam_axis].get_length() > 1 && bt_mem) { @@ -152,5 +168,23 @@ VariableState::Ptr VariableStateIndirectKVCache::get_beam_table_state() const { return m_hidden_states[1]; } +ov::PartialShape VariableStateIndirectKVCache::get_compression_scale_shape(const ov::PartialShape& kv_cache_shape) { + // FIXME: add assert to confirm that it is compressed + GPU_DEBUG_GET_INSTANCE(debug_config); + auto rank = kv_cache_shape.size(); + ov::PartialShape compression_scale_shape(std::vector(rank, 1)); + compression_scale_shape[0] = kv_cache_shape[0]; + compression_scale_shape[1] = kv_cache_shape[1]; + + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression + compression_scale_shape[2] = kv_cache_shape[2]; + } + return compression_scale_shape; +} + +VariableState::Ptr VariableStateIndirectKVCache::get_compression_scale_state() const { + return m_hidden_states[2]; +} + } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index 0373251e45c051..3780b5a5d14b47 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -3,6 +3,7 @@ // #include "ov_ops/dynamic_quantize.hpp" +#include "intel_gpu/op/kv_cache.hpp" #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/primitives/dynamic_quantize.hpp" @@ -16,14 +17,21 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_group_sizes(); - for (size_t i = 0; i < group_sizes.size() - 1; i++) - OPENVINO_ASSERT(group_sizes[i] == 1, "Not supported group size at ", i, ": ", group_sizes[i]); - OPENVINO_ASSERT(group_sizes.back() == UINT64_MAX, "Not supported group size: ", group_sizes.back()); + const auto& users = op->get_users(); + if (users.size() >= 1) { + // std::cout << "KV cache user of dynamic quantization " << users[0]->get_friendly_name() << "\n"; + } else { + for (size_t i = 0; i < group_sizes.size() - 1; i++) + OPENVINO_ASSERT(group_sizes[i] == 1, "Not supported group size at ", i, ": ", group_sizes[i]); + + OPENVINO_ASSERT(group_sizes.back() == UINT64_MAX, "Not supported group size: ", group_sizes.back()); + } + auto prim = cldnn::dynamic_quantize(primitive_name, - inputs[0], - op->get_group_sizes().back(), - get_output_data_types(op)); + inputs[0], + group_sizes, + get_output_data_types(op)); p.add_primitive(*op, prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp index c2ee336e48bf06..f7afb23753a814 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp @@ -22,7 +22,7 @@ namespace intel_gpu { namespace { void CreateKVCacheOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {2, 3}); + validate_inputs_count(op, {2, 3, 4}); auto inputs = p.GetInputInfo(op); int64_t rank = 
op->get_input_partial_shape(0).size(); auto prim = cldnn::kv_cache(layer_type_name_ID(op), @@ -30,7 +30,8 @@ void CreateKVCacheOp(ProgramBuilder& p, const std::shared_ptrget_variable()->get_info(), ov::util::normalize(op->get_concat_axis(), rank), ov::util::normalize(op->get_gather_axis(), rank), - op->get_indirect()); + op->get_indirect(), + op->get_compressed()); prim.num_outputs = op->get_output_size(); prim.output_data_types = get_output_data_types(op); diff --git a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp index 0ba780183b186d..7642a411442efe 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp @@ -33,6 +33,7 @@ static void CreateScaledDotProductAttentionOp(ProgramBuilder& p, const std::shar auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, inputs, is_causal, + false, -1, order, order, @@ -48,10 +49,12 @@ static void CreateSDPAOp(ProgramBuilder& p, const std::shared_ptrget_causal(); + bool is_kv_compressed = op->get_kv_compressed(); int64_t indirect_axis = -1; auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, inputs, is_causal, + is_kv_compressed, indirect_axis, op->get_input0_transpose_order(), op->get_input1_transpose_order(), @@ -62,15 +65,18 @@ static void CreateSDPAOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {4, 5, 6}); auto inputs = p.GetInputInfo(op); auto layerName = layer_type_name_ID(op); bool is_causal = op->get_causal(); + bool is_kv_compressed = op->get_kv_compressed(); + size_t scale_input_cnt = is_kv_compressed ? 2 : 0; + validate_inputs_count(op, {4 + scale_input_cnt, 5 + scale_input_cnt, 6 + scale_input_cnt}); int64_t indirect_axis = op->get_indirect_axis(); auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, inputs, is_causal, + is_kv_compressed, indirect_axis, op->get_input0_transpose_order(), op->get_input1_transpose_order(), diff --git a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp index 9d7d6854009316..16f19fd20d76da 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp @@ -16,6 +16,7 @@ namespace ov { namespace op { namespace internal { using ReadValue = ov::intel_gpu::op::ReadValue; +using CompressedReadValue = ov::intel_gpu::op::CompressedReadValue; } // namespace internal } // namespace op } // namespace ov @@ -55,6 +56,20 @@ void CreateReadValueOp(ProgramBuilder& p, const std::shared_ptr(p, op, op->get_variable_id()); } +void CreateCompressedReadValueOp(ProgramBuilder& p, const std::shared_ptr& op) { + validate_inputs_count(op, {2}); + + static bool warned = false; + if (!warned) { + std::cerr << "******************************************************************************\n"; + std::cerr << "WARNING: CompressedReadValue uses generic read_value primitive - need to implement logic to save init scales\n"; + std::cerr << "******************************************************************************\n"; + warned = true; + } + + CreateVariableAccessPrimitive(p, op, op->get_variable_id()); +} + void CreateAssignOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {1}); CreateVariableAccessPrimitive(p, op, op->get_variable_id()); @@ -77,6 +92,7 @@ REGISTER_FACTORY_IMPL(v6, Assign); REGISTER_FACTORY_IMPL(v3, ReadValue); 
REGISTER_FACTORY_IMPL(v6, ReadValue); REGISTER_FACTORY_IMPL(internal, ReadValue); +REGISTER_FACTORY_IMPL(internal, CompressedReadValue); } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 346b4471779593..09581855fcd737 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -643,17 +643,20 @@ void SyncInferRequest::allocate_states() { bool indirect_kv_cache = false; int64_t beam_axis = 0; int64_t concat_axis = 0; + bool compressed = false; auto kv_cache_shape = vi.second.m_layout.get_partial_shape(); for (auto& p : state_prims) { if (auto kv_cache_prim = dynamic_cast(p)) { indirect_kv_cache = kv_cache_prim->indirect; beam_axis = ov::util::normalize(kv_cache_prim->gather_axis, kv_cache_shape.size()); concat_axis = ov::util::normalize(kv_cache_prim->concat_axis, kv_cache_shape.size()); + compressed = kv_cache_prim->compressed; } } + // TODO: barnch for compressed w/o indirectness if (indirect_kv_cache) { - m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor, beam_axis, concat_axis)); + m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor, beam_axis, concat_axis, compressed)); } else { m_variables.emplace(vi.first, std::make_shared(vi.second, m_context, m_shape_predictor)); } @@ -737,6 +740,19 @@ std::vector SyncInferRequest::prepare_input(const std::string auto& engine = m_graph->get_engine(); auto& stream = network->get_stream(); + auto print_arr = [&](int64_t* vec, size_t max_len, std::string name) { + std::stringstream ss; + for (size_t i = 0; i < max_len; i++) { + ss << vec[i] << ", "; + } + std::cout << "Array " << name << " (len=" << max_len << ") content: " << ss.str() << "\n"; + }; + + + if (internal_name == "parameter:input_ids") { + print_arr(user_tensor->data(), user_tensor->get_size(), "parameter:input_ids"); + } + auto need_lockable_mem = network->does_node_need_lockable_output(internal_name); OPENVINO_ASSERT(pshape.compatible(ov::PartialShape(user_tensor->get_shape())) || is_batched_input(port), diff --git a/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp index 7ecf4f28496e4d..c90bd601bb64a3 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp @@ -205,6 +205,7 @@ IndirectSDPAOpt::IndirectSDPAOpt() { auto indirect_sdpa = std::make_shared(data_inputs, indirect_kv_cache_0->output(1), // beam table is_causal, + false, // kv_compressed gather_axis_1, order_in0, order_in1, diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp new file mode 100644 index 00000000000000..4f47253c089840 --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -0,0 +1,255 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "kv_cache_compression.hpp" +#include + +#include "intel_gpu/op/kv_cache.hpp" + +#include "intel_gpu/op/read_value.hpp" +#include "intel_gpu/plugin/common_utils.hpp" +#include "openvino/core/node_vector.hpp" +#include "openvino/core/rt_info.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/constant.hpp" +#include 
"openvino/op/convert.hpp" +#include "openvino/op/gather.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/sink.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/pattern/op/label.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/or.hpp" +#include "openvino/pass/visualize_tree.hpp" +#include "transformations/utils/utils.hpp" +#include "intel_gpu/op/indirect_sdpa.hpp" + +#include "ov_ops/dynamic_quantize.hpp" + +namespace ov { +namespace intel_gpu { + +class KVCacheCompressionMatcher : public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("KVCacheCompressionMatcher", "0"); + KVCacheCompressionMatcher(); +}; + +KVCacheCompressionMatcher::KVCacheCompressionMatcher() { + using namespace ov::pass::pattern; + + bool first = true; + + int KV_CACHE_COMP = 0; + if (const auto env_var = std::getenv("KV_CACHE_COMP")) { + std::istringstream ss(env_var); + ss >> KV_CACHE_COMP; + } + + if (KV_CACHE_COMP == 0) { + if (first) { + printf("NO_KV_CACHE_COMP\n"); + } + first = false; + return; + } else { + if (first) + printf("YES_KV_CACHE_COMP\n"); + + first = false; + } + + auto query = any_input(); + + auto k_past = wrap_type(); + auto k_new_token = any_input(); + auto k_beam_idx = any_input(); + auto key = wrap_type({k_past, k_new_token, k_beam_idx}); + + auto v_past = wrap_type(); + auto v_new_token = any_input(); + auto v_beam_idx = any_input(); + auto value = wrap_type({v_past, v_new_token, v_beam_idx}); + + auto input_attn_mask = any_input(); + auto input_scale = any_input(); + + auto present = wrap_type({query, key, value, input_attn_mask, input_scale}); + + // k, v, attention_mask, scale + ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) { + if (transformation_callback(m.get_match_root())) { + return false; + } + const auto& pattern_map = m.get_pattern_value_map(); + + auto k_new_token_node = pattern_map.at(k_new_token).get_node_shared_ptr(); + auto key_node = std::dynamic_pointer_cast(pattern_map.at(key).get_node_shared_ptr()); + auto value_node = std::dynamic_pointer_cast(pattern_map.at(value).get_node_shared_ptr()); + auto org_sdpa = std::dynamic_pointer_cast(pattern_map.at(present).get_node_shared_ptr()); + + auto key_past_node = std::dynamic_pointer_cast(pattern_map.at(k_past).get_node_shared_ptr()); + auto value_past_node = std::dynamic_pointer_cast(pattern_map.at(v_past).get_node_shared_ptr()); + + if (true + // || org_sdpa->get_friendly_name().find(".h.0.") != std::string::npos + // || org_sdpa->get_friendly_name().find(".h.1.") != std::string::npos + // || org_sdpa->get_friendly_name().find(".h.2.") != std::string::npos + // || org_sdpa->get_friendly_name().find(".h.3.") != std::string::npos + // || org_sdpa->get_friendly_name().find(".h.4.") != std::string::npos + // || org_sdpa->get_friendly_name().find(".h.5.") != std::string::npos + ) { + std::cout << "pattern matched! 
" << org_sdpa->get_friendly_name() << std::endl; + auto rank = key_node->get_input_partial_shape(0).size(); + std::vector shape_group_size(rank, 1); + shape_group_size[rank - 1] = UINT64_MAX; + + + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression != 1) { // per-token compression + shape_group_size[rank - 2] = UINT64_MAX; + } + + auto key_variable = key_past_node->get_variable(); + key_variable->update_data_type(element::i8); + + auto value_variable = value_past_node->get_variable(); + value_variable->update_data_type(element::i8); + + // auto replace_read_value_node = [](const std::shared_ptr& target, + // const std::shared_ptr& replacement) { + // target->output(0).replace(replacement->output(0)); + + // replacement->add_node_control_dependents(target); + // replacement->add_node_control_dependencies(target); + // target->clear_control_dependents(); + // }; + + if (key_past_node->get_input_size() == 1) { + auto k_init_dyn_quan = std::make_shared(key_past_node->get_input_node_shared_ptr(0), shape_group_size, element::f16); + auto new_key_past_node = std::make_shared(k_init_dyn_quan->output(0), k_init_dyn_quan->output(1), key_past_node->get_variable()); + k_init_dyn_quan->set_friendly_name(key_node->get_friendly_name() + "_init_dyn_quan"); + std::cout << "Key outputs: " << key_past_node->get_output_size() << " " << new_key_past_node->get_output_size() << "\n"; + ov::copy_runtime_info(key_past_node, new_key_past_node); + + // TODO: Old ReadValue node is kept in the graph and goes to ShapeOf - this needs to be fixed + // replace_read_value_node(key_past_node, new_key_past_node); + + key_past_node = new_key_past_node; + } + + if (value_past_node->get_input_size() == 1) { + auto v_init_dyn_quan = std::make_shared(value_past_node->get_input_node_shared_ptr(0), shape_group_size, element::f16); + auto new_value_past_node = std::make_shared(v_init_dyn_quan->output(0), v_init_dyn_quan->output(1), value_past_node->get_variable()); + + std::cout << "Value outputs: " << value_past_node->get_output_size() << " " << new_value_past_node->get_output_size() << "\n"; + + v_init_dyn_quan->set_friendly_name(value_node->get_friendly_name() + "_init_dyn_quan"); + ov::copy_runtime_info(value_past_node, new_value_past_node); + // replace_read_value_node(value_past_node, new_value_past_node); + + value_past_node = new_value_past_node; + } + + auto k_dyn_quan = std::make_shared(key_node->get_input_node_shared_ptr(1), shape_group_size, element::f16); + k_dyn_quan->set_friendly_name("dyn_quan_key"); + + // FIXME: need to tell whether it is direct KV cache or indirect kv cache + auto new_kv_cache_k = std::make_shared(key_past_node, + k_dyn_quan->output(0), + key_node->get_input_node_shared_ptr(2), + k_dyn_quan->output(1), + key_node->get_variable(), + key_node->get_concat_axis(), + key_node->get_gather_axis(), + key_node->get_output_element_type(0)); + + new_kv_cache_k->set_friendly_name(key_node->get_friendly_name()); + ov::copy_runtime_info(key_node, new_kv_cache_k); + + auto v_dyn_quan = std::make_shared(value_node->get_input_node_shared_ptr(1), shape_group_size, element::f16); + v_dyn_quan->set_friendly_name("dyn_quan_value"); + // FIXME: need to tell whether it is direct KV cache or indirect kv cache + auto new_kv_cache_v = std::make_shared(value_past_node, + v_dyn_quan->output(0), + value_node->get_input_node_shared_ptr(2), + v_dyn_quan->output(1), + value_node->get_variable(), + value_node->get_concat_axis(), + value_node->get_gather_axis(), + 
value_node->get_output_element_type(0)); + + new_kv_cache_v->set_friendly_name(value_node->get_friendly_name()); + ov::copy_runtime_info(value_node, new_kv_cache_v); + + // FIXME: output port from new_kv_cache_k is fixed. compression and indirectness is orthogonal. + OutputVector sdpa_inputs; + // QKV -- attention_mask -- input_scale -- key_scale -- beam_idx + for (size_t i = 0; i < org_sdpa->get_input_size() - 1; i++) + sdpa_inputs.push_back(org_sdpa->get_input_node_shared_ptr(i)); + sdpa_inputs[1] = new_kv_cache_k->output(0); // compressed K + sdpa_inputs[2] = new_kv_cache_v->output(0); // compressed V + sdpa_inputs.push_back(new_kv_cache_k->output(2)); // scale for compressed K + sdpa_inputs.push_back(new_kv_cache_v->output(2)); // scale for compressed V + + auto new_sdpa = std::make_shared(sdpa_inputs, + new_kv_cache_k->output(1), + org_sdpa->get_causal(), + true /* kv_compressed */, + org_sdpa->get_indirect_axis(), + org_sdpa->get_input0_transpose_order(), + org_sdpa->get_input1_transpose_order(), + org_sdpa->get_input2_transpose_order(), + org_sdpa->get_output_transpose_order(), + org_sdpa->get_output_type()); + + new_kv_cache_k->set_friendly_name(key_node->get_friendly_name()); + ov::copy_runtime_info(key_node, new_kv_cache_k); + + new_kv_cache_v->set_friendly_name(value_node->get_friendly_name()); + ov::copy_runtime_info(value_node, new_kv_cache_v); + + new_sdpa->set_friendly_name(org_sdpa->get_friendly_name()); + ov::copy_runtime_info(org_sdpa, new_sdpa); + + ov::replace_node(org_sdpa, new_sdpa); + return true; + } + return false; + }; + + auto m = std::make_shared(present, "KVCacheCompressionMatcher"); + this->register_matcher(m, callback); + +} + +bool KVCacheCompression::run_on_model(const std::shared_ptr& m) { + bool res = pass::GraphRewrite::run_on_model(m); + std::cout << "KVCacheCompression res=" << res << "\n"; + + + + // TODO: seems it's not needed and copied from kvcache + if (res) { + ov::SinkVector sinks = m->get_sinks(); + std::cout << "KVCacheCompression remove sinks " << sinks.size() << "\n"; + for (auto& sink : sinks) { + if (sink && sink->get_input_node_ptr(0)->get_type_info() == op::KVCache::get_type_info_static()) { + std::cout << "Remove " << sink->get_friendly_name() << ", kvcache=" << sink->get_input_node_ptr(0)->get_friendly_name() << "\n"; + m->remove_sink(sink); + } + } + } + + return res; +} + +KVCacheCompression::KVCacheCompression() { + add_matcher(); +} + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp new file mode 100644 index 00000000000000..1ac515a11ddcba --- /dev/null +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_gpu { + + +/// Add dynamic quantization node before kv cache +/// ┌───────────┐ ┌─────────────┐ +/// │ New Key │ │ New Value │ +/// └──────┬────┘ └──────┬──────┘ +/// │ │ +/// f16 │ │ f16 +/// │ │ +/// ┌───────────┐ ┌─────────────┐ ┌───────┴─────┐ ┌──────┴──────┐ +/// │ New Key │ │ New Value │ │ Dyn Quant │ │ Dyn Quant │ +/// └──────┬────┘ └──────┬──────┘ └───────┬─────┘ └──────┬──────┘ +/// │ │ │ │ +/// │ f16 │ f16 i8:data │ f16:scale i8:data │ f16:scale +/// │ │ ==> │ │ +/// ┌─────────┐ ┌────────┴─────────┐ ┌────────┴───────────┐ ┌─────────┐ 
┌────────┴─────────┐ ┌────────┴───────────┐ +/// │ Query │ │ KV cache (Key) │ │ KV cache (Value) │ │ Query │ │ KV cache (Key) │ │ KV cache (Value) │ +/// └───┬─────┘ └────────┬─────────┘ └────────┬───────────┘ └────┬────┘ └────────┬─────────┘ └────────┬───────────┘ +/// │ │ │ │ │ │ +/// │ f16 │ f16 │ f16 │f16 i8:data │ f16:scale i8:data │ f16:scale +/// │ │ │ │ │ │ +/// │ │ │ │ │ │ +/// │ ┌────┴───┐ │ │ ┌────┴───┐ │ +/// └─────────────┤ SDPA ├─────────────────┘ └─────────────┤ SDPA ├────────────────────┘ +/// └────────┘ └────────┘ + +class KVCacheCompression : public ov::pass::GraphRewrite { +public: + OPENVINO_RTTI("KVCacheCompression", "0"); + KVCacheCompression(); + + bool run_on_model(const std::shared_ptr& m) override; +}; + + +} // namespace intel_gpu +} // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp index 681c88119efd95..179eea8151d883 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/indirect_sdpa.cpp @@ -12,13 +12,14 @@ namespace op { IndirectSDPA::IndirectSDPA(const OutputVector& data_inputs, const ov::Output& beam_table, const bool is_causal, + const bool is_kv_compressed, const int64_t indirect_axis, const std::vector& order_q, const std::vector& order_k, const std::vector& order_v, const std::vector& order_out, const ov::element::Type output_type) - : ov::intel_gpu::op::SDPA(data_inputs, is_causal, order_q, order_k, order_v, order_out, output_type) + : ov::intel_gpu::op::SDPA(data_inputs, is_causal, is_kv_compressed, order_q, order_k, order_v, order_out, output_type) , m_indirect_axis(indirect_axis) { auto beam_table_idx = data_inputs.size(); set_argument(beam_table_idx, beam_table); @@ -34,6 +35,7 @@ std::shared_ptr IndirectSDPA::clone_with_new_inputs(const ov::OutputVe return std::make_shared(data_inputs, new_args.back(), m_is_causal, + m_is_kv_compressed, m_indirect_axis, m_order_q, m_order_k, @@ -44,11 +46,15 @@ std::shared_ptr IndirectSDPA::clone_with_new_inputs(const ov::OutputVe void IndirectSDPA::validate_and_infer_types() { const auto input_size = get_input_size(); + const size_t scale_data_cnt = m_is_kv_compressed ? 2 : 0; NODE_VALIDATION_CHECK(this, - input_size == 4 || input_size == 5 || input_size == 6, + input_size == 4 + scale_data_cnt || input_size == 5 + scale_data_cnt || input_size == 6 + scale_data_cnt, "Number of inputs is incorrect. Current value is: ", input_size, - ", expected 4, 5 or 6."); + ", expected 4, 5 or 6. 
(scale_data_cnt ", + scale_data_cnt, + ")" + ); std::vector input_shapes; for (size_t i = 0; i < input_size - 1; i++) { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp index a598e556a8f05d..5ca1af8ca2ad6b 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp @@ -8,6 +8,7 @@ #include "openvino/core/partial_shape.hpp" #include "openvino/core/validation_util.hpp" #include "openvino/op/concat.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" namespace ov { namespace intel_gpu { @@ -24,6 +25,7 @@ KVCache::KVCache(const Output& past, , m_concat_axis(concat_axis) , m_gather_axis(gather_axis) , m_indirect(true) + , m_compressed(false) , m_output_type(output_type) { m_variable = past_variable; if (m_indirect) @@ -40,28 +42,58 @@ KVCache::KVCache(const Output& past, , m_concat_axis(concat_axis) , m_gather_axis(0) , m_indirect(false) + , m_compressed(false) , m_output_type(output_type) { m_variable = past_variable; validate_and_infer_types(); } +KVCache::KVCache(const Output& past, + const Output& new_token_data, + const Output& beam_idx, + const Output& new_token_scale, + const std::shared_ptr& past_variable, + int64_t concat_axis, + int64_t gather_axis, + const ov::element::Type output_type) + : Op({past, new_token_data, beam_idx, new_token_scale}) + , m_concat_axis(concat_axis) + , m_gather_axis(gather_axis) + , m_indirect(true) + , m_compressed(true) + , m_output_type(output_type) { + m_variable = past_variable; + size_t out_ports = 1; + if (m_indirect) + set_output_size(++out_ports); + if (m_compressed) + set_output_size(++out_ports); + validate_and_infer_types(); +} + bool KVCache::visit_attributes(ov::AttributeVisitor& visitor) { visitor.on_attribute("concat_axis", m_concat_axis); visitor.on_attribute("gather_axis", m_gather_axis); visitor.on_attribute("indirect", m_indirect); visitor.on_attribute("output_type", m_output_type); + visitor.on_attribute("compressed", m_compressed); return true; } void KVCache::validate_and_infer_types() { auto output_type = m_output_type == ov::element::undefined ? 
get_input_element_type(0) : m_output_type; std::vector input_shapes = {m_variable->get_info().data_shape, get_input_partial_shape(1)}; - if (get_output_size() == 2) + if (m_indirect) input_shapes.push_back(get_input_partial_shape(2)); auto shapes = shape_infer(this, input_shapes); - set_output_type(0, output_type, shapes[0]); + size_t out_ports = 0; + set_output_type(out_ports++, output_type, shapes[0]); + // TODO: kv-cache compression is not supported for indirect kv cache if (m_indirect) { - set_output_type(1, get_input_element_type(2), shapes[1]); + set_output_type(out_ports++, get_input_element_type(2), shapes[1]); + } + if (m_compressed) { + set_output_type(out_ports++, get_input_element_type(2), shapes[1]); } } @@ -74,10 +106,19 @@ std::shared_ptr KVCache::clone_with_new_inputs(const ov::OutputVector& new m_concat_axis, m_output_type); + } else if (new_args.size() == 3) { + return std::make_shared(new_args.at(0), + new_args.at(1), + new_args.at(2), + m_variable, + m_concat_axis, + m_gather_axis, + m_output_type); } else { return std::make_shared(new_args.at(0), new_args.at(1), new_args.at(2), + new_args.at(3), m_variable, m_concat_axis, m_gather_axis, @@ -89,9 +130,11 @@ std::vector shape_infer(const KVCache* op, std::vector out_shapes; out_shapes.resize(op->get_output_size()); + // std::cout << "kv_cache shape infer " << op->get_output_size() << "\n"; + const auto& gather_axis = op->get_gather_axis(); const auto& concat_axis = ov::util::normalize(op->get_concat_axis(), input_shapes[0].size()); - if (op->get_output_size() == 2) { + if (op->get_output_size() >= 2) { out_shapes[0] = input_shapes[0]; out_shapes[0][gather_axis] = input_shapes[2][0]; out_shapes[0][concat_axis] += input_shapes[1][concat_axis]; @@ -100,6 +143,18 @@ std::vector shape_infer(const KVCache* op, std::vectorget_output_size() == 3){ + ov::PartialShape compression_scale_shape(std::vector(out_shapes[0].size(), 1)); + compression_scale_shape[0] = out_shapes[0][0]; + compression_scale_shape[1] = out_shapes[0][1]; + GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->enable_kv_cache_compression == 1) { // per-head compression + compression_scale_shape[2] = out_shapes[0][2]; + } + out_shapes[2] = compression_scale_shape; + } } else { out_shapes[0] = input_shapes[0]; out_shapes[0][concat_axis] += input_shapes[1][concat_axis]; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp index 5438a6e2e695b5..1e78cf65730775 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp @@ -89,6 +89,47 @@ std::shared_ptr ReadValue::clone_with_new_inputs(const ov::OutputVector& n } } +CompressedReadValue::CompressedReadValue(const Output& variable_initializer, + const Output& compressed_variable_initializer_scale, + const std::shared_ptr& past_values) : ReadValue(variable_initializer, past_values) { + const auto scales_input_idx = get_input_size(); + set_argument(scales_input_idx, compressed_variable_initializer_scale); + validate_and_infer_types(); +} + +bool CompressedReadValue::visit_attributes(ov::AttributeVisitor& visitor) { + visitor.on_attribute("variable_id", m_variable); + + auto variable_info = m_variable->get_info(); + visitor.on_attribute("variable_type", variable_info.data_type); + visitor.on_attribute("variable_shape", variable_info.data_shape); + m_variable->update(variable_info); + return true; +} + +void 
+void CompressedReadValue::validate_and_infer_types() {
+    OPENVINO_ASSERT(m_variable, "Variable is not initialized.");
+
+    OPENVINO_ASSERT(get_input_size() == 2);
+    ReadValue::validate_and_infer_types();
+
+    const auto& scale_type = get_input_element_type(1);
+    const auto& scale_shape = get_input_partial_shape(1);
+
+    set_output_type(1, scale_type, scale_shape);
+}
+
+std::shared_ptr<ov::Node> CompressedReadValue::clone_with_new_inputs(const ov::OutputVector& new_args) const {
+    check_new_args_count(this, new_args);
+
+    OPENVINO_ASSERT(new_args.size() == 2, "Unable to clone CompressedReadValue ",
+                    this->get_friendly_name(),
+                    " Incorrect number of inputs. Expected: 2. Actual: ",
+                    new_args.size());
+
+    return std::make_shared<CompressedReadValue>(new_args[0], new_args[1], m_variable);
+}
+
 }  // namespace op
 }  // namespace intel_gpu
 }  // namespace ov
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp
index 57d2899e2b2e77..b7b544b49791c7 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp
@@ -16,12 +16,14 @@ namespace op {
 SDPA::SDPA(const OutputVector& inputs,
            const bool is_causal,
+           const bool is_kv_compressed,
            const std::vector<int64_t>& order_q,
            const std::vector<int64_t>& order_k,
            const std::vector<int64_t>& order_v,
            const std::vector<int64_t>& order_out,
            const ov::element::Type output_type)
     : m_is_causal(is_causal)
+    , m_is_kv_compressed(is_kv_compressed)
     , m_order_q(order_q)
     , m_order_k(order_k)
     , m_order_v(order_v)
@@ -37,6 +39,7 @@ std::shared_ptr<ov::Node> SDPA::clone_with_new_inputs(const ov::OutputVector& new_args) const {
     return std::make_shared<SDPA>(new_args,
                                   m_is_causal,
+                                  m_is_kv_compressed,
                                   m_order_q,
                                   m_order_k,
                                   m_order_v,
@@ -46,11 +49,14 @@ std::shared_ptr<ov::Node> SDPA::clone_with_new_inputs(const ov::OutputVector& new_args) const {
 void SDPA::validate_and_infer_types() {
     const auto input_size = get_input_size();
+
+    const size_t comp_scale_cnt = m_is_kv_compressed ? 2 : 0;
     NODE_VALIDATION_CHECK(this,
-                          input_size == 3 || input_size == 4 || input_size == 5,
+                          input_size == 3 + comp_scale_cnt || input_size == 4 + comp_scale_cnt || input_size == 5 + comp_scale_cnt,
                           "Number of inputs is incorrect. Current value is: ",
                           input_size,
-                          ", expected 3, 4 or 5.");
+                          ", expected 3, 4 or 5 data inputs plus ", comp_scale_cnt,
+                          " compression scale inputs.");
     std::vector<ov::PartialShape> input_shapes;
     for (size_t i = 0; i < input_size; i++) {
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp
index f2fc64dedc7200..78f143699aa866 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp
@@ -177,7 +177,7 @@ TransposeSDPAMatcher::TransposeSDPAMatcher() {
             inputs.push_back(sdpa->get_input_source_output(4));
         }
-        auto sdpa_new = std::make_shared<op::SDPA>(inputs, sdpa->get_causal(), order_q, order_k, order_v, order_output);
+        auto sdpa_new = std::make_shared<op::SDPA>(inputs, sdpa->get_causal(), false /*is_kv_compressed*/, order_q, order_k, order_v, order_output);
         sdpa_new->set_friendly_name(sdpa->get_friendly_name());
         ov::copy_runtime_info(m.get_matched_nodes(), sdpa_new);
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.cpp
index d525792ccd8d06..1bb6a7932ecc17 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.cpp
@@ -115,7 +115,7 @@ UnsqueezeBroadcastReshapeSDPAFusion::UnsqueezeBroadcastReshapeSDPAFusion() {
         auto order_c = sdpa->get_input2_transpose_order();
         auto order_d = sdpa->get_output_transpose_order();
-        auto sdpa_new = std::make_shared<op::SDPA>(data_inputs, sdpa->get_causal(), order_a, order_b, order_c, order_d);
+        auto sdpa_new = std::make_shared<op::SDPA>(data_inputs, sdpa->get_causal(), false /* is_kv_compressed*/, order_a, order_b, order_c, order_d);
         sdpa_new->set_friendly_name(sdpa->get_friendly_name());
         ov::copy_runtime_info(m.get_matched_nodes(), sdpa_new);
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 40c7ab48c486cb..d095e3ec317d11 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -68,6 +68,7 @@
 #include "plugin/transformations/swiglu_fusion.hpp"
 #include "plugin/transformations/transpose_fusion.hpp"
 #include "plugin/transformations/indirect_kv_cache.hpp"
+#include "plugin/transformations/kv_cache_compression.hpp"
 #include "plugin/transformations/convert_convolution.hpp"
 #include "plugin/transformations/unsqueeze_broadcast_reshape_matmul_fusion.hpp"
 #include "plugin/transformations/unsqueeze_broadcast_reshape_sdpa_fusion.hpp"
@@ -852,6 +853,8 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         manager.register_pass();
         manager.register_pass();
+        manager.register_pass<ov::intel_gpu::KVCacheCompression>();
+
         manager.register_pass();
         // This pass should be done after asymmetric quantization matching as it can move zp subtraction upper in the graph
diff --git a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
index dcbabff548cc5d..804a3877ffc837 100644
--- a/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
+++ b/src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -198,6 +198,8 @@ static void print_help_messages() {
                               " Currently, other layers except input-layer('parameter' type) are loading binaries for only input."
" Different input or output tensors are seperated by ','. Different layers are separated by space. For example, " " \"[input_layer_name1]:[binary_dumped_file1],[binary_dump_file2] [input_layer_name2]:[binary_dump_1],[binary_dump_2]\""); + message_list.emplace_back("OV_GPU_EnableKVCacheCompression", + "Enable KV cache compression. 1 is for per-head compression, 2 is for per-token compression"); auto max_name_length_item = std::max_element(message_list.begin(), message_list.end(), [](std::pair& a, std::pair& b){ @@ -254,7 +256,8 @@ debug_configuration::debug_configuration() , disable_primitive_fusing(0) , disable_fake_alignment(0) , dynamic_quantize_group_size(0) - , disable_horizontal_fc_fusion(0) { + , disable_horizontal_fc_fusion(0) + , enable_kv_cache_compression(0) { #ifdef GPU_DEBUG_CONFIG get_gpu_debug_env_var("Help", help); get_common_debug_env_var("Verbose", verbose); @@ -315,6 +318,7 @@ debug_configuration::debug_configuration() get_gpu_debug_env_var("LoadDumpRawBinary", load_dump_raw_bin_str); std::string dynamic_quantize_layers_without_onednn_str; get_gpu_debug_env_var("DynamicQuantizeLayersWithoutOnednn", dynamic_quantize_layers_without_onednn_str); + get_gpu_debug_env_var("EnableKVCacheCompression", enable_kv_cache_compression); if (help > 0) { print_help_messages();