From 8b5711ea415af199d83b4aa3c8d2692ac1e3a105 Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Tue, 1 Oct 2024 20:16:07 +0400
Subject: [PATCH] WIP: change scales layout, enable more models

---
 .../include/ov_ops/dynamic_quantize.hpp       |   9 +-
 .../src/ov_ops/dynamic_quantize.cpp           |  21 ++-
 .../intel_gpu/graph/kernel_impl_params.hpp    |   2 +-
 .../include/intel_gpu/op/kv_cache.hpp         |   7 +-
 .../plugin/multi_tensor_variable_state.hpp    |   1 +
 .../intel_gpu/primitives/dynamic_quantize.hpp |   5 +-
 .../intel_gpu/primitives/read_value.hpp       |   2 +
 .../intel_gpu/src/graph/dynamic_quantize.cpp  |  19 ++-
 .../src/graph/impls/ocl/dynamic_quantize.cpp  |   4 +
 .../src/graph/impls/ocl/kv_cache.cpp          |  52 +++---
 .../src/graph/include/dynamic_quantize_inst.h |   2 +-
 .../src/graph/include/kv_cache_inst.h         |   4 +-
 .../src/graph/include/read_value_inst.h       |  10 +-
 src/plugins/intel_gpu/src/graph/kv_cache.cpp  |  40 +++--
 .../intel_gpu/src/graph/primitive_inst.cpp    |  49 ++++--
 .../intel_gpu/src/graph/program_node.cpp      |   3 +-
 .../cl_kernels/dynamic_quantize_gpu_ref.cl    | 101 +++++++++++-
 .../kernel_selector/cl_kernels/sdpa_ref.cl    |  10 +-
 .../dynamic_quantize_kernel_ref.cpp           |  60 ++++++-
 .../dynamic_quantize_kernel_ref.h             |   3 +
 .../plugin/multi_tensor_variable_state.cpp    |  22 +--
 .../src/plugin/ops/dynamic_quantize.cpp       |   3 +
 .../intel_gpu/src/plugin/ops/kv_cache.cpp     |   2 +-
 .../intel_gpu/src/plugin/ops/variable.cpp     |  39 ++++-
 .../transformations/indirect_kv_cache.cpp     |  19 +++
 .../transformations/kv_cache_compression.cpp  | 151 ++++++++++++++----
 .../plugin/transformations/op/kv_cache.cpp    |  38 +++--
 .../plugin/transformations/op/read_value.cpp  |   7 +-
 .../src/plugin/transformations/op/sdpa.cpp    |  28 ++++
 src/plugins/intel_gpu/src/runtime/format.cpp  |   2 +
 30 files changed, 572 insertions(+), 143 deletions(-)

diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp
index 69c148305fb94f..e546a426dbc878 100644
--- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp
+++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp
@@ -22,7 +22,7 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
     /// \param data Input tensor with data
     /// \param group_sizes Group sizes for dynamic quantization
     /// \param dt_scale Data type for scale output
-    DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale);
+    DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale, std::vector<uint64_t> scales_output_order = {});

     void validate_and_infer_types() override;

@@ -30,12 +30,17 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
     const std::vector<uint64_t>& get_group_sizes() const {
         return m_group_sizes;
     };
+    const std::vector<uint64_t>& get_scales_output_order() const {
+        return m_scales_output_order;
+    };
     static std::vector<ov::PartialShape> shape_infer(const DynamicQuantize* op,
                                                      const std::vector<ov::PartialShape>& input_shapes,
-                                                     const std::vector<uint64_t>& group_sizes);
+                                                     const std::vector<uint64_t>& group_sizes,
+                                                     const std::vector<uint64_t>& scales_output_order = {});

 private:
     std::vector<uint64_t> m_group_sizes;
+    std::vector<uint64_t> m_scales_output_order;
     element::Type m_dt_scale;
 };
diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp
index 74c0498e9a4425..0ac0a570e2a002 100644
--- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp
+++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp
@@ -13,15 +13,17 @@
 namespace ov {
 namespace op {
 namespace internal {
-DynamicQuantize::DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale)
+DynamicQuantize::DynamicQuantize(const Output<Node>& data, std::vector<uint64_t> group_sizes, element::Type dt_scale, std::vector<uint64_t> scales_output_order)
     : Op({data}),
       m_group_sizes(std::move(group_sizes)),
+      m_scales_output_order(std::move(scales_output_order)),
       m_dt_scale(dt_scale) {
     OPENVINO_ASSERT(data.get_partial_shape().rank() == m_group_sizes.size(),
                     "FC input rank should be same as the rank of group_size ",
                     data.get_tensor_ptr()->get_partial_shape().rank(),
                     " / ",
                     m_group_sizes.size());
+    OPENVINO_ASSERT(data.get_partial_shape().rank() == scales_output_order.size() || scales_output_order.empty());
     set_output_size(2);
     validate_and_infer_types();
 }
@@ -29,7 +31,7 @@ DynamicQuantize::DynamicQuantize(const Output<Node>& data, std::vector
 void DynamicQuantize::validate_and_infer_types() {
     std::vector<ov::PartialShape> input_shapes = {get_input_partial_shape(0)};

-    auto out_shapes = shape_infer(this, input_shapes, m_group_sizes);
+    auto out_shapes = shape_infer(this, input_shapes, m_group_sizes, m_scales_output_order);
     set_output_type(0, element::i8, out_shapes[0]);
     set_output_type(1, m_dt_scale, out_shapes[1]);
 }
@@ -41,7 +43,8 @@ std::shared_ptr<Node> DynamicQuantize::clone_with_new_inputs(const ov::OutputVec

 std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize* op,
                                                            const std::vector<ov::PartialShape>& input_shapes,
-                                                           const std::vector<uint64_t>& group_sizes) {
+                                                           const std::vector<uint64_t>& group_sizes,
+                                                           const std::vector<uint64_t>& scales_output_order) {
     std::vector<ov::PartialShape> out_shapes;
     out_shapes.push_back(input_shapes[0]);

@@ -51,6 +54,8 @@ std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize
                     scale_shape.size(),
                     " / ",
                     group_sizes.size());
+    OPENVINO_ASSERT(scale_shape.size() == scales_output_order.size() || scales_output_order.empty());
+
     for (size_t i = 0; i < scale_shape.size(); i++) {
         if (scale_shape[i].is_dynamic())
             continue;
@@ -58,10 +63,20 @@ std::vector<ov::PartialShape> DynamicQuantize::shape_infer(const DynamicQuantize
         if (group_sizes[i] == UINT64_MAX)
             scale_shape[i] = 1;
         else {
+            if (scale_shape[i] == 0)
+                continue;
+
             scale_shape[i] /= group_sizes[i];  // if group_size is larger than shape, scale_shape will be 1
             scale_shape[i] = std::max(static_cast(scale_shape[i].get_length()), 1);
         }
     }
+    if (!scales_output_order.empty()) {
+        auto non_transposed_scale_shape = scale_shape;
+        for (size_t i = 0; i < scales_output_order.size(); i++) {
+            OPENVINO_ASSERT(scales_output_order[i] < scale_shape.size());
+            scale_shape[i] = non_transposed_scale_shape[scales_output_order[i]];
+        }
+    }
     out_shapes.push_back(scale_shape);
     return out_shapes;
 }
diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp
index 3e8887fbb2f7ee..72623f6d120955 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp
@@ -53,7 +53,7 @@ struct kernel_impl_params final {
     optional_layout weights_zero_points_layout = optional_layout();
     optional_layout activations_zero_points_layout = optional_layout();
     optional_layout compensation_layout = optional_layout();
-    optional_layout state_layout = optional_layout();
+    std::vector<layout> state_layouts;

     std::map<size_t, memory::ptr> memory_deps = {};
     size_t primary_input_idx = 0;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp
index 8ed8f525161d5a..3bb25f84dc0b14 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp
+++
b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp @@ -22,22 +22,23 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension { KVCache(const Output& past, const Output& new_token_data, - const Output& beam_idx, const std::shared_ptr& past_values, int64_t concat_axis, - int64_t gather_axis, const ov::element::Type output_type = ov::element::undefined); KVCache(const Output& past, const Output& new_token_data, + const Output& beam_idx, const std::shared_ptr& past_values, int64_t concat_axis, + int64_t gather_axis, const ov::element::Type output_type = ov::element::undefined); KVCache(const Output& past, const Output& new_token_data, - const Output& new_token_scale, const Output& beam_idx, + const Output& past_scale, + const Output& new_token_scale, const std::shared_ptr& past_values, int64_t concat_axis, int64_t gather_axis, diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp index c716198ad04ac0..4494ef9e7eb2cc 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/multi_tensor_variable_state.hpp @@ -44,6 +44,7 @@ class VariableStateIndirectKVCache : public MultiTensorState { VariableState::Ptr get_compression_scale_state() const; ov::PartialShape get_compression_scale_shape(const ov::PartialShape& kv_cache_shape); + void set_scales_layout(const cldnn::layout& new_layout); private: size_t m_beam_axis = 0; diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index dfacfca5cfbd36..f0a6ff136cbd8d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -23,11 +23,14 @@ struct dynamic_quantize : public primitive_base { dynamic_quantize(const primitive_id& id, const input_info& input, const std::vector& group_sizes, + const std::vector& scales_output_order, const std::vector data_types = {optional_data_type(data_types::f16), optional_data_type(data_types::i8)}) : primitive_base(id, {input}, 2, data_types), - group_sizes(group_sizes) {} + group_sizes(group_sizes), + scales_output_order(scales_output_order) {} std::vector group_sizes; + std::vector scales_output_order; size_t hash() const override { size_t seed = primitive::hash(); diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp index 7d9e919f56cf13..077bcdba23680e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/read_value.hpp @@ -31,10 +31,12 @@ struct read_value : public primitive_base { : primitive_base(id, inputs, 1, {optional_data_type{output_layout.data_type}}), variable_id{variable_id}, output_layout{output_layout}, + compressed(false), user_specified_type(user_specified_type) {} std::string variable_id; layout output_layout; + bool compressed; ov::element::Type user_specified_type; bool operator==(const primitive& rhs) const override { diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index fec3c842a1d595..b64a924e3408a7 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ 
b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -22,7 +22,7 @@ layout dynamic_quantize_inst::calc_output_layout(dynamic_quantize_node const& no } template -std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector& group_sizes) { +std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector& group_sizes, const std::vector& scales_output_order) { ov::op::internal::DynamicQuantize op; auto output_format = act_layout.format; @@ -30,19 +30,30 @@ std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &a act_layout.get(), }; - auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, group_sizes); + auto print_arr = [&](const std::vector& vec, size_t max_len, std::string name) { + std::stringstream ss; + for (size_t i = 0; i < std::min(max_len, vec.size()); i++) { + ss << vec[i] << ", "; + } + std::cout << "Array " << name << " for calc_shape (len=" << vec.size() << ") content: " << ss.str() << "\n"; + }; + + print_arr(scales_output_order, scales_output_order.size(), "scales_output_order"); + print_arr(group_sizes, group_sizes.size(), "group_sizes"); + + auto output_shapes = ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, group_sizes, scales_output_order); GPU_DEBUG_TRACE_DETAIL << "shape infer dynamic" << output_shapes[0] << " " << output_shapes[1] << "\n"; return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector& group_sizes); +template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector& group_sizes, const std::vector& scales_output_order); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); const auto& input_layout = impl_param.get_input_layout(); - return __calc_output_layouts(input_layout, desc->group_sizes); + return __calc_output_layouts(input_layout, desc->group_sizes, desc->scales_output_order); } template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp index 91f141ae062723..f682f8648f369e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -37,6 +37,10 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { auto params = get_default_params(impl_param, is_shape_agnostic); params.outputs.push_back(convert_data_tensor(impl_param.get_output_layout(1))); + const auto& desc = impl_param.typed_desc(); + params.group_sizes = desc->group_sizes; + params.scales_output_order = desc->scales_output_order; + return params; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index d3e58cd7c2108d..1d6ce83168aeff 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -9,6 +9,7 @@ #include "multi_stage_primitive.hpp" #include "kv_cache_inst.h" +#include "dynamic_quantize_inst.h" #include "concatenation/concatenation_kernel_selector.h" #include 
"concatenation/concatenation_kernel_base.h" #include "beam_table_update/beam_table_update_kernel_selector.hpp" @@ -69,9 +70,7 @@ struct kv_cache_impl : multi_stage_primitive { cldnn::memory::ptr beam_table_prev = nullptr; cldnn::memory::ptr beam_table_new = nullptr; - - cldnn::memory::ptr compression_scale_prev = nullptr; - cldnn::memory::ptr scale_new = nullptr; + cldnn::memory::ptr compression_scale = nullptr; void load(BinaryInputBuffer& ib) override { parent::load(ib); @@ -105,8 +104,8 @@ struct kv_cache_impl : multi_stage_primitive { args.outputs = { beam_table_new }; } else if (stage == scale_stage) { // FIXME: indirectness and compression are orthogonal feature. - args.inputs = { compression_scale_prev, instance.input_memory_ptr(3) }; - args.outputs = { scale_new }; + args.inputs = { instance.input_memory_ptr(3), instance.input_memory_ptr(4) }; // [past, new, beam_table, past_scale, new_scale] + args.outputs = { compression_scale }; } return args; @@ -204,27 +203,25 @@ struct kv_cache_impl : multi_stage_primitive { dynamic_cast(variable).get_compression_scale_state(); auto comp_scale_layout = instance.get_impl_params()->output_layouts[2]; auto comp_scale_shape = comp_scale_layout.get_shape(); - std::swap(compression_scale_prev, scale_new); - if (!scale_new || scale_new->count() < ov::shape_size(comp_scale_shape)) { + bool skip_first_kernel = true; + if (!compression_scale || compression_scale->count() < ov::shape_size(comp_scale_shape)) { + const auto concat_axis = 2; auto alloc_shape = comp_scale_shape; - alloc_shape[desc->concat_axis] += instance.get_prealloc_iter_num(); + alloc_shape[concat_axis] += instance.get_prealloc_iter_num(); const layout comp_scale_alloc_layout = {alloc_shape, comp_scale_layout.data_type, comp_scale_layout.format}; GPU_DEBUG_TRACE_DETAIL << "Realloc compression scale table to " << comp_scale_alloc_layout.to_short_string() << std::endl; - scale_new = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false); + compression_scale = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false); - // Alloc prev mem too as it will be needed in the future - // That also simplifies arguments setting a little bit as we don't need to handle an optional past state - if (!compression_scale_prev) { - compression_scale_prev = engine.allocate_memory(comp_scale_alloc_layout, scale_alloc_type, false); - } + skip_first_kernel = comp_scale_state->get_layout().count() == 0; } - instance.set_output_memory(scale_new, false, 2); - comp_scale_state->set_memory(scale_new, instance.get_impl_params()->output_layouts[2]); + instance.set_output_memory(compression_scale, false, 2); + comp_scale_state->set_memory(compression_scale, instance.get_impl_params()->output_layouts[2]); auto comp_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, comp_scale_state->is_set()); (_kernels_data[scale_stage].update_dispatch_data_func)(comp_scale_kernel_params, _kernels_data[scale_stage]); + _kernels_data[scale_stage].kernels[0].skip_execution = skip_first_kernel; execute_stage(events, instance, res_events, scale_stage); comp_scale_state->set(); @@ -344,8 +341,8 @@ struct kv_cache_impl : multi_stage_primitive { params.indirect_axis = indirect_axis; const bool compressed = impl_param.typed_desc()->compressed; - const auto beam_table_past_idx = compressed ? 
4 : 3; - const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, compression_scale_past, beam_table_past, compression_scale_new]] + const auto beam_table_past_idx = compressed ? 5 : 3; + const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; // [kv_past, kv_new_token, [beam_idx, compression_scale_past, compression_scale_new, beam_table_past]] const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; // [kv_present, beam_table_present, compression_scale_present] std::map in_tensor_to_offset_map = { {0, in_offsets_map.at(beam_table_past_idx)}, // beam_table_past @@ -364,13 +361,21 @@ struct kv_cache_impl : multi_stage_primitive { const auto& primitive = impl_param.typed_desc(); auto params = get_default_params(impl_param, is_shape_agnostic); + const auto concat_axis = 2; + params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank()); + auto inputs_count = 2; + auto comp_scale_past_layout = impl_param.input_layouts[3]; + auto comp_scale_new_layout = impl_param.input_layouts[4]; auto comp_scale_present_layout = impl_param.output_layouts[2]; - layout comp_scale_past_layout = get_compression_scale_layout(impl_param); + + GPU_DEBUG_TRACE_DETAIL << "Past scale: " << comp_scale_past_layout.to_short_string() << "\n"; + GPU_DEBUG_TRACE_DETAIL << "New scale: " << comp_scale_new_layout.to_short_string() << "\n"; + GPU_DEBUG_TRACE_DETAIL << "Present scale: " << comp_scale_present_layout.to_short_string() << "\n"; params.inputs.resize(inputs_count); params.inputs[0] = convert_data_tensor(comp_scale_past_layout); - params.inputs[1] = convert_data_tensor(impl_param.input_layouts[3]); + params.inputs[1] = convert_data_tensor(comp_scale_new_layout); params.outputs[0] = convert_data_tensor(comp_scale_present_layout); const auto& in_offsets_map = impl_param.in_port_to_shape_info_offset; @@ -378,13 +383,16 @@ struct kv_cache_impl : multi_stage_primitive { // FIXME: need to handle the index properly when indirect is off std::map in_tensor_to_offset_map = { - {0, in_offsets_map.at(5)}, // compression_scale_past - {1, in_offsets_map.at(3)}, // compression_scale_new + {0, in_offsets_map.at(3)}, // compression_scale_past + {1, in_offsets_map.at(4)}, // compression_scale_new }; std::map out_tensor_to_offset_map = { {0, out_offsets_map.at(2)}, // compression_scale_present }; + GPU_DEBUG_TRACE_DETAIL << "Dynamic shape in0 " << in_offsets_map.at(3) << "\n"; + GPU_DEBUG_TRACE_DETAIL << "Dynamic shape in1 " << in_offsets_map.at(4) << "\n"; + GPU_DEBUG_TRACE_DETAIL << "Dynamic shape offset " << out_offsets_map.at(2) << "\n"; params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); return params; diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index a9630fa9b9b25a..999c7f297552de 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,7 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(const layout &act_layout, const std::vector& group_size); + static std::vector __calc_output_layouts(const layout &act_layout, const std::vector& group_size, const std::vector& scales_output_order); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, 
dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h index 3d3d998399bc63..b8f05ac3272353 100644 --- a/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h @@ -36,9 +36,7 @@ struct typed_program_node : public typed_program_node_base { res.push_back(layout(ov::PartialShape::dynamic(4), data_types::i32, format::bfyx)); } - if (get_primitive()->compressed) { // insert an additional input with compressed_scale past layout - res.push_back(layout(ov::PartialShape::dynamic(4), data_types::f16, format::bfyx)); - } + GPU_DEBUG_TRACE_DETAIL << "Total shape info input layouts: " << res.size() << "\n"; return res; } diff --git a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h index 74f9ffff581b87..ed57b2419bcbc6 100644 --- a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h @@ -35,7 +35,15 @@ class typed_primitive_inst : public typed_primitive_inst_base(); const auto& default_layout = desc->output_layout; - return { impl_param.state_layout.value_or(default_layout) }; + std::vector output_layouts; + output_layouts.push_back(impl_param.state_layouts.size() >= 1 ? impl_param.state_layouts[0] : default_layout); + + if (desc->compressed) { + const auto default_layout = layout{ov::PartialShape::dynamic(4), data_types::f16, format::get_default_format(4)}; + output_layouts.push_back(impl_param.state_layouts.size() >= 2 ? impl_param.state_layouts[1] : default_layout); + } + + return output_layouts; } static layout calc_output_layout(const read_value_node& node, kernel_impl_params const& impl_param); diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 1a954d11c9e543..64ba5a0f6706f5 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -36,8 +36,14 @@ std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& node std::vector input_shapes = {impl_param.get_input_layout(0).get(), impl_param.get_input_layout(1).get()}; - if (desc->num_outputs > 1) + if (desc->indirect) { input_shapes.push_back(impl_param.get_input_layout(2).get()); + } + + if (desc->compressed) { + input_shapes.push_back(impl_param.get_input_layout(3).get()); + input_shapes.push_back(impl_param.get_input_layout(4).get()); + } std::vector output_shapes = shape_infer(&op, input_shapes); @@ -127,22 +133,22 @@ void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) { fill_shape_info_data(bt_layout, bt_state->get_initial_layout(), shape_info_ptr, offset); } - if (params.typed_desc()->compressed) { - auto& var = dynamic_cast(get_network().get_variable(variable_id())); - const auto& scale_state = var.get_compression_scale_state(); - auto scale_layout = scale_state->get_layout(); - if (scale_layout.is_dynamic()) { - auto bt_shape = scale_layout.get_partial_shape(); - for (auto& d : bt_shape) { - if (d.is_dynamic()) - d = 0; - } - scale_layout.set_partial_shape(bt_shape); - } - - GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i++ << "]" << std::endl; - fill_shape_info_data(scale_layout, scale_state->get_initial_layout(), shape_info_ptr, offset); - } + // if (params.typed_desc()->compressed) { + // auto& var = dynamic_cast(get_network().get_variable(variable_id())); + // const 
auto& scale_state = var.get_compression_scale_state(); + // auto scale_layout = scale_state->get_layout(); + // if (scale_layout.is_dynamic()) { + // auto bt_shape = scale_layout.get_partial_shape(); + // for (auto& d : bt_shape) { + // if (d.is_dynamic()) + // d = 0; + // } + // scale_layout.set_partial_shape(bt_shape); + // } + + // GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for input[" << i++ << "]" << std::endl; + // fill_shape_info_data(scale_layout, scale_state->get_initial_layout(), shape_info_ptr, offset); + // } for (size_t i = 0; i < _node->get_output_layouts().size(); i++) { GPU_DEBUG_TRACE_DETAIL << id() << " : update shape_info for output[" << i << "]" << std::endl; diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 18bf624e737b90..2c4dc724c40dc5 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -37,6 +37,7 @@ #include "graph_optimizer/prepare_buffer_fusing.h" #include "intel_gpu/plugin/common_utils.hpp" +#include "intel_gpu/plugin/multi_tensor_variable_state.hpp" #include "intel_gpu/graph/network.hpp" #include "intel_gpu/graph/serialization/set_serializer.hpp" #include "intel_gpu/runtime/engine.hpp" @@ -301,23 +302,51 @@ void primitive_inst::update_shape() { } // If we still have a dynamic dimension, which basiclly means that we don't have an initializer, then replace dynamic dims with 0 - if (new_layout.is_dynamic()) { - auto pshape = new_layout.get_partial_shape(); - for (auto& d : pshape) { - if (d.is_dynamic()) { - d = 0; + auto update_layout = [](layout& layout) { + if (layout.is_dynamic()) { + auto pshape = layout.get_partial_shape(); + for (auto& d : pshape) { + if (d.is_dynamic()) { + d = 0; + } } + layout.set_partial_shape(pshape); } - new_layout.set_partial_shape(pshape); - } + }; + update_layout(new_layout); GPU_DEBUG_TRACE_DETAIL << id() << " set new layout " << new_layout.to_short_string() << "\n"; variable.set_layout(new_layout); - if (!_impl_params->state_layout.has_value() || _impl_params->state_layout.value() != new_layout) { - _impl_params->state_layout = new_layout; + if (_impl_params->state_layouts.empty()) { + _impl_params->state_layouts.resize(1); + } + + if (_impl_params->state_layouts[0] != new_layout) { + _impl_params->state_layouts[0] = new_layout; input_shape_changed = true; } + + if (prim->compressed) { + _impl_params->state_layouts.resize(2); + + auto multi_tensor_variable = downcast(variable); + auto new_scales_layout = multi_tensor_variable.get_compression_scale_state()->get_layout(); + + if (!variable.is_set() && _impl_params->input_layouts.size() >= 2) { + new_scales_layout = _impl_params->get_input_layout(1); + } + + update_layout(new_scales_layout); + + GPU_DEBUG_TRACE_DETAIL << id() << " set new scales layout " << new_scales_layout.to_short_string() << "\n"; + multi_tensor_variable.set_scales_layout(new_scales_layout); + + if (_impl_params->state_layouts[1] != new_scales_layout) { + _impl_params->state_layouts[1] = new_scales_layout; + input_shape_changed = true; + } + } } if (input_shape_changed) @@ -646,7 +675,7 @@ event::ptr primitive_inst::realloc_if_needed() { // dynamic quantization is only applied to activation of FC if (get_node().is_type()) { const auto& desc = get_node().as().get_primitive(); - auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], desc->group_sizes); + auto dyn_quan_scale_layout = 
dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], desc->group_sizes, desc->scales_output_order); GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " << dyn_quan_scale_layout[1].to_short_string() << std::endl; updated_params.output_layouts[1] = dyn_quan_scale_layout[1]; diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 3c21800c66d938..0b7b0ca4ca2b1b 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -90,7 +90,8 @@ void program_node::replace_dependency(size_t idx, std::pair const program_node::get_input_layouts() const { std::vector layouts; for (size_t i = 0; i < dependencies.size(); i++) { - layouts.push_back(get_input_layout(i)); + auto input_layout = get_input_layout(i); + layouts.push_back(input_layout); } return layouts; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl index 436276a67e48c0..d771cb7110be8f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl @@ -8,6 +8,59 @@ #error "dynamic_quantize_gpu_ref.cl: Unsupported output dimension" #endif +/* +TODO: check this coniguration: +GPU_Debug: primitive_inst.cpp:1921:primitive_inst: +{ + dynamic_quantize info : + { + scale dt : f16, + activation dt : i8, + group size : 1,18446744073709551615,1,18446744073709551615, + } + implementation : dynamic_quantize_gpu_ref, + cl dump_ info : + { + kernel_entry : dynamic_quantize_gpu_ref_17256100832148678061_0_0__sa, + batch_hash : 8176231137263359740, + } + ptr : node_213002368, + id : dynamicquantize:__module.model.layers.9.self_attn/aten::cat/Concat_3_init_dyn_quan, + optimized : false, + type : dynamic_quantize, + valid output layout : true, + output layouts : + { + 1 : f16:bfyx:?x1x0x1:nopad, + 0 : i8:bfyx:?x32x0x128:nopad, + } + dependant_shape_of_nodes_ids : , + fused primitives : + { + } + constant : false, + in_shape_of_subgraph : 0, + in data flow : true, + output : false, + preferred impl : any, + dependencies : 219236192(0), + users : 207254784,207254784, +} + + */ + +inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { + return OUTPUT1_GET_INDEX(b, f, y, x); +} + +inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { +#ifdef SCALES_OUTPUT_ORDER + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR SCALES_OUTPUT_ORDER); +#else + return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#endif +} + KERNEL(dynamic_quantize_gpu_ref)( OPTIONAL_SHAPE_INFO_ARG const __global INPUT0_TYPE* input, @@ -15,14 +68,27 @@ KERNEL(dynamic_quantize_gpu_ref)( __global OUTPUT1_TYPE* output_scale) { const uint bf = (uint)get_global_id(0); - const uint b = (uint)get_global_id(0) / INPUT0_FEATURE_NUM; - const uint f = (uint)get_global_id(0) % INPUT0_FEATURE_NUM; + const uint b = bf / INPUT0_FEATURE_NUM; + const uint f = bf % INPUT0_FEATURE_NUM; const uint y = (uint)get_global_id(1); - const uint scale_idx = OUTPUT1_GET_INDEX(b, f, y, 0); + const uint x = (uint)get_global_id(2); +#ifdef SCALES_OUTPUT_ORDER + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); +#else + const uint scale_idx = 
OUTPUT1_GET_INDEX_SAFE(b, f, y, x); +#endif half max_val = 0.0001h; - for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { - const uint offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); + for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { + for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 1 : INPUT0_SIZE_Y); y_off++) { +#if GROUP_SIZE_DIM3 == 1 + const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + half val = input[offset]; + half abs_val = fabs(val); + max_val = fmax(max_val, abs_val); +#else + const uint offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + offset + x * 8)); @@ -34,21 +100,40 @@ KERNEL(dynamic_quantize_gpu_ref)( x *= 8; for (; x < INPUT0_SIZE_X; x++) max_val = fmax(max_val, fabs(input[offset + x])); +#endif + } + } } half scale = 127.0h / max_val; - for (int y_off = 0; y_off < (get_global_size(1) == 1 ? INPUT0_SIZE_Y : 1); y_off++) { - const uint in_offset = INPUT0_GET_INDEX(b, f, y + y_off, 0); - const uint out_offset = OUTPUT_GET_INDEX(b, f, y + y_off, 0); + for (int b_off = 0; b_off < (GROUP_SIZE_DIM0 == 1 ? 1 : INPUT0_BATCH_NUM); b_off++) { + for (int f_off = 0; f_off < (GROUP_SIZE_DIM1 == 1 ? 1 : INPUT0_FEATURE_NUM); f_off++) { + for (int y_off = 0; y_off < (GROUP_SIZE_DIM2 == 1 ? 1 : INPUT0_SIZE_Y); y_off++) { +#if GROUP_SIZE_DIM3 == 1 + const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, x); + + half val = input[in_offset]; + val *= scale; + output[out_offset] = convert_char(val); +#else + const uint in_offset = INPUT0_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); + const uint out_offset = OUTPUT_GET_INDEX(b + b_off, f + f_off, y + y_off, 0); int x; for (x = 0; x < INPUT0_SIZE_X / 8; x++) { half8 val = as_half8(vload8(0, (ushort*)input + in_offset + x * 8)); val *= scale; + // TODO: why it's _rtz instead of _rte? 
vstore8(convert_char8(val), 0, output + out_offset + x * 8); + // vstore8(convert_char8_rte(val), 0, output + out_offset + x * 8); } x *= 8; for (; x < INPUT0_SIZE_X; x++) output[out_offset + x] = convert_char(input[in_offset + x] * scale); + // output[out_offset + x] = convert_char_rte(input[in_offset + x] * scale); +#endif + } + } } output_scale[scale_idx] = 1.0h / scale; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl index 328f2b029a5425..e9d60e09fd7806 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl @@ -170,9 +170,10 @@ KERNEL(sdpa_ref)( INPUT1_TYPE k_val_comp = key_input[key_offset]; half k_val = (half)k_val_comp; #ifdef COMPRESSED_PER_HEAD - const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, s, 0, 0, b1, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, s, b1); #else - const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, s, 0, 0, 0, 0); + const uint key_scale_comp_offset = s; + // const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, s, 0); #endif k_val *= key_scale[key_scale_comp_offset]; #else @@ -255,9 +256,10 @@ KERNEL(sdpa_ref)( INPUT2_TYPE __value = value_input[value_offset]; half value = (half)__value; #ifdef COMPRESSED_PER_HEAD - const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, s, 0, 0, b1, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, s, b1); #else - const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, s, 0, 0, 0, 0); + const uint value_scale_comp_offset = s; + // const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, s, 0); #endif value *= val_scale[value_scale_comp_offset]; acc += tmp_buf[tmp_buf_offset] * value; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp index 809225748cffe4..d38aa766c508b6 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.cpp @@ -26,6 +26,47 @@ JitConstants DynamicQuantizeKernelRef::GetJitConstants(const dynamic_quantize_pa jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0])); + auto print_arr = [&](const std::vector& vec, size_t max_len, std::string name) { + std::stringstream ss; + for (size_t i = 0; i < std::min(max_len, vec.size()); i++) { + ss << vec[i] << ", "; + } + GPU_DEBUG_TRACE_DETAIL << "Array " << name << " (len=" << vec.size() << ") content: " << ss.str() << "\n"; + }; + + bool rearrange_scales = false; + const auto& scales_output_order = params.scales_output_order; + if (!scales_output_order.empty()) { + for (size_t i = 0; i < scales_output_order.size(); i++) { + if (i != scales_output_order[i]) { + rearrange_scales = true; + break; + } + } + } + + if (rearrange_scales) { + const std::array default_dim_order = {'b', 'f', 'y', 'x'}; + + std::stringstream ss; + for (size_t i = 0; i < scales_output_order.size(); i++) { + ss << 
default_dim_order[scales_output_order[i]]; + + if (i + 1 != scales_output_order.size()) + ss << ", "; + } + + jit.AddConstant(MakeJitConstant("SCALES_OUTPUT_ORDER", ss.str())); + std::cout << "SCALES_OUTPUT_ORDER: " << ss.str() << "\n"; + } + + print_arr(params.group_sizes, params.group_sizes.size(), "group_sizes"); + + const auto& group_sizes = params.group_sizes; + for (size_t i = 0; i < group_sizes.size(); i++) { + jit.AddConstant(MakeJitConstant("GROUP_SIZE_DIM" + std::to_string(i), group_sizes[i])); + } + return jit; } @@ -34,10 +75,17 @@ CommonDispatchData DynamicQuantizeKernelRef::SetDefault(const dynamic_quantize_p CommonDispatchData dispatchData; OPENVINO_ASSERT(params.outputs[0].GetLayout() == DataLayout::bfyx, "It supports only 4d tensor"); - dispatchData.gws = {params.outputs[0].Batch().v * params.outputs[0].Feature().v, 1, 1}; - GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression - dispatchData.gws[1] = params.outputs[0].Y().v; - } + + const auto& group_sizes = params.group_sizes; + auto batch_size = group_sizes[0] == 1 ? params.outputs[0].Batch().v : 1; + auto feature_size = group_sizes[1] == 1 ? params.outputs[0].Feature().v : 1; + auto y_size = group_sizes[2] == 1 ? params.outputs[0].Y().v : 1; + auto x_size = group_sizes[3] == 1 ? params.outputs[0].X().v : 1; + + dispatchData.gws = {batch_size * feature_size, y_size, x_size}; + // GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression + // dispatchData.gws[1] = params.outputs[0].Y().v; + // } dispatchData.lws = {1, 1, 1}; return dispatchData; @@ -97,6 +145,10 @@ bool DynamicQuantizeKernelRef::Validate(const Params& params) const { if (!KernelBaseOpenCL::Validate(params)) return false; + const auto& prim_params = static_cast(params); + if (prim_params.group_sizes.size() != 4) + return false; + return true; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h index ce52ed9fb19714..4104e2f88361d2 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h @@ -12,6 +12,9 @@ namespace kernel_selector { /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct dynamic_quantize_params : public base_params { dynamic_quantize_params() : base_params(KernelType::DYNAMIC_QUANTIZE) {} + + std::vector group_sizes; + std::vector scales_output_order; }; class DynamicQuantizeKernelRef : public KernelBaseOpenCL { diff --git a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp index 029e8fd4bfbf81..20dbb2630195a2 100644 --- a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp @@ -170,21 +170,25 @@ VariableState::Ptr VariableStateIndirectKVCache::get_beam_table_state() const { ov::PartialShape VariableStateIndirectKVCache::get_compression_scale_shape(const ov::PartialShape& kv_cache_shape) { // FIXME: add assert to confirm that it is compressed - GPU_DEBUG_GET_INSTANCE(debug_config); - auto rank = kv_cache_shape.size(); - ov::PartialShape compression_scale_shape(std::vector(rank, 1)); - 
compression_scale_shape[0] = kv_cache_shape[0]; - compression_scale_shape[1] = kv_cache_shape[1]; + // GPU_DEBUG_GET_INSTANCE(debug_config); + // auto rank = kv_cache_shape.size(); + // ov::PartialShape compression_scale_shape(std::vector(rank, 1)); + // compression_scale_shape[0] = kv_cache_shape[0]; + // compression_scale_shape[1] = kv_cache_shape[1]; - GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression - compression_scale_shape[2] = kv_cache_shape[2]; - } - return compression_scale_shape; + // GPU_DEBUG_IF(debug_config->enable_kv_cache_compression == 1) { // per-head compression + // compression_scale_shape[2] = kv_cache_shape[2]; + // } + return ov::PartialShape::dynamic(kv_cache_shape.size()); } VariableState::Ptr VariableStateIndirectKVCache::get_compression_scale_state() const { return m_hidden_states[2]; } +void VariableStateIndirectKVCache::set_scales_layout(const cldnn::layout& new_layout) { + m_hidden_states[2]->set_layout(new_layout); +} + } // namespace intel_gpu } // namespace ov diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index 3780b5a5d14b47..3781f900f1b2c1 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -28,9 +28,12 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_scales_output_order().size() << " number\n"; + auto prim = cldnn::dynamic_quantize(primitive_name, inputs[0], group_sizes, + op->get_scales_output_order(), get_output_data_types(op)); p.add_primitive(*op, prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp index f7afb23753a814..40755a74e534f6 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp @@ -22,7 +22,7 @@ namespace intel_gpu { namespace { void CreateKVCacheOp(ProgramBuilder& p, const std::shared_ptr& op) { - validate_inputs_count(op, {2, 3, 4}); + validate_inputs_count(op, {2, 3, 5}); auto inputs = p.GetInputInfo(op); int64_t rank = op->get_input_partial_shape(0).size(); auto prim = cldnn::kv_cache(layer_type_name_ID(op), diff --git a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp index 16f19fd20d76da..2fc37709217de8 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/variable.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/variable.cpp @@ -46,6 +46,27 @@ void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr +// void CreateVariableAccessPrimitive(ProgramBuilder &p, const std::shared_ptr &op, +// const std::string &variable_id) { +// const auto output_pshape = op->get_output_partial_shape(0); +// const auto output_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(0)); +// const auto output_format = cldnn::format::get_default_format(output_pshape.size()); + +// const auto variable_layout = cldnn::layout{ output_pshape, output_dtype, output_format }; + +// auto inputs = p.GetInputInfo(op); +// auto user_specified_type = get_original_precision(op); +// const auto prim = T_PRIMITIVE{layer_type_name_ID(op), +// inputs, +// variable_id, +// variable_layout, +// user_specified_type, +// true}; + +// p.add_primitive(*op, prim); +// } + void CreateReadValueOp(ProgramBuilder& p, const std::shared_ptr& op) { validate_inputs_count(op, {0, 1}); CreateVariableAccessPrimitive(p, op, 
op->get_variable_id()); @@ -67,7 +88,23 @@ void CreateCompressedReadValueOp(ProgramBuilder& p, const std::shared_ptr(p, op, op->get_variable_id()); + const auto output_pshape = op->get_output_partial_shape(0); + const auto output_dtype = cldnn::element_type_to_data_type(op->get_output_element_type(0)); + const auto output_format = cldnn::format::get_default_format(output_pshape.size()); + + const auto variable_layout = cldnn::layout{ output_pshape, output_dtype, output_format }; + + auto inputs = p.GetInputInfo(op); + auto user_specified_type = get_original_precision(op); + auto prim = cldnn::read_value{layer_type_name_ID(op), + inputs, + op->get_variable_id(), + variable_layout, + user_specified_type}; + prim.compressed = true; + prim.num_outputs = 2; + + p.add_primitive(*op, prim); } void CreateAssignOp(ProgramBuilder& p, const std::shared_ptr& op) { diff --git a/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp index c90bd601bb64a3..9c70eb02340c1c 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/indirect_kv_cache.cpp @@ -190,6 +190,25 @@ IndirectSDPAOpt::IndirectSDPAOpt() { auto order_out = sdpa->get_output_transpose_order(); auto is_causal = sdpa->get_causal(); + auto input0_transpose_order = sdpa->get_input0_transpose_order(); + auto input1_transpose_order = sdpa->get_input1_transpose_order(); + auto input2_transpose_order = sdpa->get_input2_transpose_order(); + auto output_transpose_order = sdpa->get_output_transpose_order(); + + auto print_arr2 = [&](const std::vector& vec, size_t max_len, std::string name) { + std::stringstream ss; + for (size_t i = 0; i < std::min(max_len, vec.size()); i++) { + ss << vec[i] << ", "; + } + // std::cout << "-> Orig SDPA Array " << name << " (len=" << vec.size() << ") content: " << ss.str() << "\n"; + }; + + print_arr2(input0_transpose_order, input0_transpose_order.size(), "input0_transpose_order"); + print_arr2(input1_transpose_order, input1_transpose_order.size(), "input1_transpose_order"); + print_arr2(input2_transpose_order, input2_transpose_order.size(), "input2_transpose_order"); + print_arr2(output_transpose_order, output_transpose_order.size(), "output_transpose_order"); + + OutputVector data_inputs; data_inputs.push_back(sdpa->get_input_node_shared_ptr(0)); // Q data_inputs.push_back(sdpa->get_input_node_shared_ptr(1)); // K diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp index 4f47253c089840..bff8c0ced5861a 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -104,14 +104,32 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { ) { std::cout << "pattern matched! 
" << org_sdpa->get_friendly_name() << std::endl; auto rank = key_node->get_input_partial_shape(0).size(); - std::vector shape_group_size(rank, 1); - shape_group_size[rank - 1] = UINT64_MAX; - - - GPU_DEBUG_GET_INSTANCE(debug_config); - GPU_DEBUG_IF(debug_config->enable_kv_cache_compression != 1) { // per-token compression - shape_group_size[rank - 2] = UINT64_MAX; - } + auto get_shape_group_sizes = [&](const std::vector& transposed_order) { + std::vector shape_group_size(rank, 1); + std::vector order = transposed_order; + if (transposed_order.size() != rank) { + order.resize(rank); + std::iota(order.begin(), order.end(), 0); + } + + shape_group_size[order[rank - 1]] = UINT64_MAX; + GPU_DEBUG_GET_INSTANCE(debug_config); + GPU_DEBUG_IF(debug_config->enable_kv_cache_compression != 1) { // per-token compression + shape_group_size[order[1]] = UINT64_MAX; + } + + return shape_group_size; + }; + + auto get_scales_output_order = [&](const std::vector& transposed_order) { + std::vector scales_output_order(rank, 1); + scales_output_order[0] = transposed_order[0]; + scales_output_order[1] = transposed_order[3]; + scales_output_order[2] = transposed_order[2]; + scales_output_order[3] = transposed_order[1]; + + return scales_output_order; + }; auto key_variable = key_past_node->get_variable(); key_variable->update_data_type(element::i8); @@ -119,68 +137,117 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { auto value_variable = value_past_node->get_variable(); value_variable->update_data_type(element::i8); - // auto replace_read_value_node = [](const std::shared_ptr& target, - // const std::shared_ptr& replacement) { - // target->output(0).replace(replacement->output(0)); - - // replacement->add_node_control_dependents(target); - // replacement->add_node_control_dependencies(target); - // target->clear_control_dependents(); - // }; + auto print_arr = [&](const std::vector& vec, size_t max_len, std::string name) { + std::stringstream ss; + for (size_t i = 0; i < std::min(max_len, vec.size()); i++) { + ss << vec[i] << ", "; + } + std::cout << "Array " << name << " (len=" << vec.size() << ") content: " << ss.str() << "\n"; + }; + + auto shape_group_size = get_shape_group_sizes(org_sdpa->get_input1_transpose_order()); + print_arr(shape_group_size, shape_group_size.size(), "shape_group_size"); + + auto scales_output_order = get_scales_output_order(org_sdpa->get_input1_transpose_order()); + print_arr(scales_output_order, scales_output_order.size(), "scales_output_order"); + + auto replace_read_value_node = [](const std::shared_ptr& target, + const std::shared_ptr& replacement) { + target->output(0).replace(replacement->output(0)); + + // replacement->add_node_control_dependents(target); + // replacement->add_node_control_dependencies(target); + // target->clear_control_dependents(); + }; + + // llama2-7b + // indirect : 1, + // gather axis : 0, + // compressed : 1, + // concat axis : 2, + // variable shape : [?,32,?,128], + // k_order: 0, 1, 2, 3 + // shape_group_size init: 1, 1, 1, 1 + // shape_group_size applied TOKEN: 1, MAX, 1, MAX + // shape_group_size applied HEAD: 1, 1, 1, MAX + // sizes TOKEN: [1, 1, 8, 1] + [1, 1, 1, 1] = [1, 1, 9, 1] + // sizes HEAD: [1, 32, 8, 1] + [1, 32, 1, 1] = [1, 32, 9, 1] + // GWS TOKEN: BATCH(1) * 1, Y(concat_axis), 1 + // GWS PER HEAD: BATCH(1) * HEADS_NUM(32), Y(concat_axis), 1 + + // Scales order: 0, 3, 2, 1 + + + // qwen + // indirect : 1, + // gather axis : 0, + // compressed : 1, + // concat axis : 1, + // variable shape : [?,?,32,128], + // k_oder: 0, 2, 1, 
3 + // shape_group_size init: 1, 1, 1, 1 + // shape_group_size applied TOKEN: 1, 1, MAX, MAX + // shape_group_size applied HEAD: 1, 1, 1, MAX + // GWS TOKEN: BATCH * FEATURE, 1, 1 + // GWS HEAD: BATCH * FEATURE, HEADS_NUM, 1 + + // Scales order: 0, 3, 1, 2 if (key_past_node->get_input_size() == 1) { - auto k_init_dyn_quan = std::make_shared(key_past_node->get_input_node_shared_ptr(0), shape_group_size, element::f16); + auto k_init_dyn_quan = std::make_shared(key_past_node->get_input_node_shared_ptr(0), shape_group_size, element::f16, scales_output_order); auto new_key_past_node = std::make_shared(k_init_dyn_quan->output(0), k_init_dyn_quan->output(1), key_past_node->get_variable()); k_init_dyn_quan->set_friendly_name(key_node->get_friendly_name() + "_init_dyn_quan"); std::cout << "Key outputs: " << key_past_node->get_output_size() << " " << new_key_past_node->get_output_size() << "\n"; ov::copy_runtime_info(key_past_node, new_key_past_node); // TODO: Old ReadValue node is kept in the graph and goes to ShapeOf - this needs to be fixed - // replace_read_value_node(key_past_node, new_key_past_node); + replace_read_value_node(key_past_node, new_key_past_node); + // ov::replace_node(key_past_node, new_key_past_node); key_past_node = new_key_past_node; } if (value_past_node->get_input_size() == 1) { - auto v_init_dyn_quan = std::make_shared(value_past_node->get_input_node_shared_ptr(0), shape_group_size, element::f16); + auto v_init_dyn_quan = std::make_shared(value_past_node->get_input_node_shared_ptr(0), shape_group_size, element::f16, scales_output_order); auto new_value_past_node = std::make_shared(v_init_dyn_quan->output(0), v_init_dyn_quan->output(1), value_past_node->get_variable()); std::cout << "Value outputs: " << value_past_node->get_output_size() << " " << new_value_past_node->get_output_size() << "\n"; v_init_dyn_quan->set_friendly_name(value_node->get_friendly_name() + "_init_dyn_quan"); ov::copy_runtime_info(value_past_node, new_value_past_node); - // replace_read_value_node(value_past_node, new_value_past_node); + replace_read_value_node(value_past_node, new_value_past_node); + // ov::replace_node(value_past_node, new_value_past_node); value_past_node = new_value_past_node; } - auto k_dyn_quan = std::make_shared(key_node->get_input_node_shared_ptr(1), shape_group_size, element::f16); + auto k_dyn_quan = std::make_shared(key_node->get_input_node_shared_ptr(1), shape_group_size, element::f16, scales_output_order); k_dyn_quan->set_friendly_name("dyn_quan_key"); // FIXME: need to tell whether it is direct KV cache or indirect kv cache - auto new_kv_cache_k = std::make_shared(key_past_node, + auto new_kv_cache_k = std::make_shared(key_past_node->output(0), k_dyn_quan->output(0), key_node->get_input_node_shared_ptr(2), + key_past_node->output(1), k_dyn_quan->output(1), key_node->get_variable(), key_node->get_concat_axis(), - key_node->get_gather_axis(), - key_node->get_output_element_type(0)); + key_node->get_gather_axis()); new_kv_cache_k->set_friendly_name(key_node->get_friendly_name()); ov::copy_runtime_info(key_node, new_kv_cache_k); - auto v_dyn_quan = std::make_shared(value_node->get_input_node_shared_ptr(1), shape_group_size, element::f16); + auto v_dyn_quan = std::make_shared(value_node->get_input_node_shared_ptr(1), shape_group_size, element::f16, scales_output_order); v_dyn_quan->set_friendly_name("dyn_quan_value"); // FIXME: need to tell whether it is direct KV cache or indirect kv cache - auto new_kv_cache_v = std::make_shared(value_past_node, + auto new_kv_cache_v = 
std::make_shared(value_past_node->output(0), v_dyn_quan->output(0), value_node->get_input_node_shared_ptr(2), + value_past_node->output(1), v_dyn_quan->output(1), value_node->get_variable(), value_node->get_concat_axis(), - value_node->get_gather_axis(), - value_node->get_output_element_type(0)); + value_node->get_gather_axis()); new_kv_cache_v->set_friendly_name(value_node->get_friendly_name()); ov::copy_runtime_info(value_node, new_kv_cache_v); @@ -195,17 +262,37 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { sdpa_inputs.push_back(new_kv_cache_k->output(2)); // scale for compressed K sdpa_inputs.push_back(new_kv_cache_v->output(2)); // scale for compressed V + auto input0_transpose_order = org_sdpa->get_input0_transpose_order(); + auto input1_transpose_order = org_sdpa->get_input1_transpose_order(); + auto input2_transpose_order = org_sdpa->get_input2_transpose_order(); + auto output_transpose_order = org_sdpa->get_output_transpose_order(); + + auto print_arr2 = [&](const std::vector& vec, size_t max_len, std::string name) { + std::stringstream ss; + for (size_t i = 0; i < std::min(max_len, vec.size()); i++) { + ss << vec[i] << ", "; + } + std::cout << "-> Array " << name << " (len=" << vec.size() << ") content: " << ss.str() << "\n"; + }; + + print_arr2(input0_transpose_order, input0_transpose_order.size(), "input0_transpose_order"); + print_arr2(input1_transpose_order, input1_transpose_order.size(), "input1_transpose_order"); + print_arr2(input2_transpose_order, input2_transpose_order.size(), "input2_transpose_order"); + print_arr2(output_transpose_order, output_transpose_order.size(), "output_transpose_order"); + + auto new_sdpa = std::make_shared(sdpa_inputs, new_kv_cache_k->output(1), org_sdpa->get_causal(), true /* kv_compressed */, org_sdpa->get_indirect_axis(), - org_sdpa->get_input0_transpose_order(), - org_sdpa->get_input1_transpose_order(), - org_sdpa->get_input2_transpose_order(), - org_sdpa->get_output_transpose_order(), + input0_transpose_order, + input1_transpose_order, + input2_transpose_order, + output_transpose_order, org_sdpa->get_output_type()); + new_kv_cache_k->set_friendly_name(key_node->get_friendly_name()); ov::copy_runtime_info(key_node, new_kv_cache_k); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp index 5ca1af8ca2ad6b..a0f2b07afe9dac 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/op/kv_cache.cpp @@ -51,23 +51,21 @@ KVCache::KVCache(const Output& past, KVCache::KVCache(const Output& past, const Output& new_token_data, const Output& beam_idx, + const Output& past_scale, const Output& new_token_scale, const std::shared_ptr& past_variable, int64_t concat_axis, int64_t gather_axis, const ov::element::Type output_type) - : Op({past, new_token_data, beam_idx, new_token_scale}) + : Op({past, new_token_data, beam_idx, past_scale, new_token_scale}) , m_concat_axis(concat_axis) , m_gather_axis(gather_axis) , m_indirect(true) , m_compressed(true) , m_output_type(output_type) { m_variable = past_variable; - size_t out_ports = 1; - if (m_indirect) - set_output_size(++out_ports); - if (m_compressed) - set_output_size(++out_ports); + size_t out_ports = 3; + set_output_size(out_ports); validate_and_infer_types(); } @@ -83,8 +81,15 @@ bool KVCache::visit_attributes(ov::AttributeVisitor& visitor) { void KVCache::validate_and_infer_types() { auto output_type = m_output_type == 
     std::vector<ov::PartialShape> input_shapes = {m_variable->get_info().data_shape, get_input_partial_shape(1)};
-    if (m_indirect)
+    if (m_indirect) {
         input_shapes.push_back(get_input_partial_shape(2));
+    }
+
+    if (m_compressed) {
+        input_shapes.push_back(get_input_partial_shape(3));
+        input_shapes.push_back(get_input_partial_shape(4));
+    }
+
     auto shapes = shape_infer(this, input_shapes);
     size_t out_ports = 0;
     set_output_type(out_ports++, output_type, shapes[0]);
@@ -93,7 +98,7 @@ void KVCache::validate_and_infer_types() {
         set_output_type(out_ports++, get_input_element_type(2), shapes[1]);
     }
 
     if (m_compressed) {
-        set_output_type(out_ports++, get_input_element_type(2), shapes[1]);
+        set_output_type(out_ports++, get_input_element_type(3), shapes[2]);
     }
 }
@@ -119,6 +124,7 @@ std::shared_ptr<Node> KVCache::clone_with_new_inputs(const ov::OutputVector& new_args) const {
                                          new_args.at(1),
                                          new_args.at(2),
                                          new_args.at(3),
+                                         new_args.at(4),
                                          m_variable,
                                          m_concat_axis,
                                          m_gather_axis,
@@ -147,13 +153,17 @@ std::vector<ov::PartialShape> shape_infer(const KVCache* op, std::vector<ov::PartialShape> input_shapes)
         if (op->get_output_size() == 3) {
-            ov::PartialShape compression_scale_shape(std::vector<ov::Dimension>(out_shapes[0].size(), 1));
-            compression_scale_shape[0] = out_shapes[0][0];
-            compression_scale_shape[1] = out_shapes[0][1];
-            GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->enable_kv_cache_compression == 1) { // per-head compression
-                compression_scale_shape[2] = out_shapes[0][2];
-            }
+            ov::PartialShape compression_scale_shape = input_shapes[3];
+            compression_scale_shape[concat_axis] += input_shapes[4][concat_axis];
             out_shapes[2] = compression_scale_shape;
+
+            // ov::PartialShape compression_scale_shape(std::vector<ov::Dimension>(out_shapes[0].size(), 1));
+            // compression_scale_shape[0] = out_shapes[0][0];
+            // compression_scale_shape[1] = out_shapes[0][1];
+            // GPU_DEBUG_IF(cldnn::debug_configuration::get_instance()->enable_kv_cache_compression == 1) { // per-head compression
+            //     compression_scale_shape[2] = out_shapes[0][2];
+            // }
+            // out_shapes[2] = compression_scale_shape;
         }
     } else {
         out_shapes[0] = input_shapes[0];
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp
index 1e78cf65730775..99357f6d26eeb6 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/op/read_value.cpp
@@ -46,6 +46,10 @@ void ReadValue::validate_and_infer_types() {
     const auto compatible_type = variable_type.is_dynamic() || initial_type == variable_type;
     const auto compatible_shape = variable_shape.relaxes(initial_shape);
 
+    if (get_friendly_name() == "ReadValue_179538") {
+        std::cout << "input name" << get_input_node_shared_ptr(0)->get_friendly_name() << " " << get_input_node_shared_ptr(0)->get_type_name() << "\n";
+    }
+
     OPENVINO_ASSERT(compatible_shape,
                     "The shape specified in the Variable has to extend (relax) the shape "
                     "inferred from the initializing subgraph.",
@@ -59,7 +63,8 @@ void ReadValue::validate_and_infer_types() {
                     " Variable type: ",
                     variable_type,
                     " Initialization type: ",
-                    initial_type);
+                    initial_type, " ", get_friendly_name(), " ", std::string(get_type_name()));
+
     // workaround:
     // dynamic rank/type can be derived from the IRs generated via the prev versions of OV,
     // but dynamic rank/type are not supported in plugins,
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp
index b7b544b49791c7..48535f77081202 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/op/sdpa.cpp
@@ -29,6 +29,20 @@ SDPA::SDPA(const OutputVector& inputs,
     , m_order_v(order_v)
     , m_order_out(order_out)
     , m_output_type(output_type) {
+
+    auto print_arr = [&](const std::vector<int64_t>& vec, size_t max_len, std::string name) {
+        std::stringstream ss;
+        for (size_t i = 0; i < std::min(max_len, vec.size()); i++) {
+            ss << vec[i] << ", ";
+        }
+        // std::cout << "Init Array " << name << " (len=" << vec.size() << ") content: " << ss.str() << "\n";
+    };
+
+    print_arr(m_order_q, m_order_q.size(), "m_order_q");
+    print_arr(m_order_k, m_order_k.size(), "m_order_k");
+    print_arr(m_order_v, m_order_v.size(), "m_order_v");
+    print_arr(m_order_out, m_order_out.size(), "m_order_out");
+
     set_arguments(inputs);
     set_causal(is_causal);
     validate_and_infer_types();
@@ -63,6 +77,19 @@ void SDPA::validate_and_infer_types() {
         input_shapes.push_back(get_input_partial_shape(i));
     }
 
+    auto print_arr = [&](const std::vector<int64_t>& vec, size_t max_len, std::string name) {
+        std::stringstream ss;
+        for (size_t i = 0; i < std::min(max_len, vec.size()); i++) {
+            ss << vec[i] << ", ";
+        }
+        // std::cout << "Array " << name << " (len=" << vec.size() << ") content: " << ss.str() << "\n";
+    };
+
+    print_arr(m_order_q, m_order_q.size(), "m_order_q");
+    print_arr(m_order_k, m_order_k.size(), "m_order_k");
+    print_arr(m_order_v, m_order_v.size(), "m_order_v");
+    print_arr(m_order_out, m_order_out.size(), "m_order_out");
+
     auto out_shapes = shape_infer(this,
                                   input_shapes,
                                   m_order_q,
@@ -97,6 +124,7 @@ std::vector<ov::PartialShape> shape_infer(const SDPA* op,
     auto transpose_pshape = [](const ov::PartialShape pshape, const std::vector<int64_t>& order) {
         auto transposed_pshape = ov::PartialShape::dynamic(pshape.rank());
         for (size_t i = 0; i < order.size(); i++) {
+            // std::cout << "Check order " << order[i] << "\n";
             transposed_pshape[i] = pshape[order[i]];
         }
diff --git a/src/plugins/intel_gpu/src/runtime/format.cpp b/src/plugins/intel_gpu/src/runtime/format.cpp
index 7aca55d3f0bf0f..7c1498d5156bba 100644
--- a/src/plugins/intel_gpu/src/runtime/format.cpp
+++ b/src/plugins/intel_gpu/src/runtime/format.cpp
@@ -170,6 +170,8 @@ static const std::map<format::type, format_traits> format_traits_map {
 const format_traits& format::traits(type fmt) {
     auto it = format_traits_map.find(fmt);
+    if (it == format_traits_map.end())
+        std::cout << "Error\n";
     OPENVINO_ASSERT(it != format_traits_map.end(), "[GPU] Format description is missing in fmt traits");
     return it->second;
 }
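
Note on the KVCache shape inference change above: the compressed scale output shape is now derived from the past-scale and new-token-scale inputs by accumulating them along the KV cache concat axis, instead of being rebuilt from the data shape. A minimal standalone sketch of that rule follows; it uses plain int64_t dims instead of ov::PartialShape, the helper name infer_compression_scale_shape and the example shapes are hypothetical, and it is not part of the patch.

// Standalone illustration of the scale-shape rule used in KVCache shape_infer:
// out_scale_shape = past_scale_shape with its concat-axis extent increased by
// the new-token scale's extent on the same axis.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> infer_compression_scale_shape(std::vector<int64_t> past_scale_shape,
                                                   const std::vector<int64_t>& new_token_scale_shape,
                                                   size_t concat_axis) {
    assert(past_scale_shape.size() == new_token_scale_shape.size());
    assert(concat_axis < past_scale_shape.size());
    // Mirrors: compression_scale_shape[concat_axis] += input_shapes[4][concat_axis];
    past_scale_shape[concat_axis] += new_token_scale_shape[concat_axis];
    return past_scale_shape;
}

int main() {
    // Example per-token scales: past [1, 32, 127, 1] + new token [1, 32, 1, 1]
    // concatenated along axis 2 -> [1, 32, 128, 1].
    const auto out = infer_compression_scale_shape({1, 32, 127, 1}, {1, 32, 1, 1}, 2);
    for (auto d : out)
        std::cout << d << " ";
    std::cout << "\n";
    return 0;
}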