From 775d01a0726a7f1213924a1028f44f5698b3b9bd Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Tue, 15 Oct 2024 21:19:52 +0400
Subject: [PATCH] WIP: zp support

---
 .../include/ov_ops/dynamic_quantize.hpp       |  11 +-
 .../src/ov_ops/dynamic_quantize.cpp           |   5 +-
 .../include/intel_gpu/op/indirect_sdpa.hpp    |   3 +
 .../include/intel_gpu/op/kv_cache.hpp         |   9 ++
 .../intel_gpu/include/intel_gpu/op/sdpa.hpp   |   1 +
 .../intel_gpu/primitives/dynamic_quantize.hpp |   2 +
 .../include/intel_gpu/primitives/kv_cache.hpp |   1 +
 .../scaled_dot_product_attention.hpp          |   2 +
 .../intel_gpu/src/graph/dynamic_quantize.cpp  |  11 +-
 .../src/graph/impls/ocl/dynamic_quantize.cpp  |   1 +
 .../src/graph/impls/ocl/kv_cache.cpp          |   2 +
 .../ocl/scaled_dot_product_attention.cpp      |   2 +
 .../src/graph/include/dynamic_quantize_inst.h |   2 +-
 src/plugins/intel_gpu/src/graph/kv_cache.cpp  |   8 ++
 .../intel_gpu/src/graph/primitive_inst.cpp    |   2 +-
 .../dynamic_quantize_gpu_opt_generic.cl       |  65 ++++++++---
 .../kernel_selector/cl_kernels/sdpa_opt.cl    | 104 +++++++++++++++---
 .../kernel_selector/cl_kernels/sdpa_ref.cl    |  23 +++-
 .../dynamic_quantize_kernel_opt_generic.cpp   |   2 +
 .../dynamic_quantize_kernel_ref.h             |   2 +
 .../kernels/sdpa/sdpa_kernel_base.cpp         |   1 +
 .../kernels/sdpa/sdpa_kernel_base.h           |   1 +
 .../src/plugin/ops/dynamic_quantize.cpp       |   3 +
 .../intel_gpu/src/plugin/ops/kv_cache.cpp     |   1 +
 .../ops/scaled_dot_product_attention.cpp      |   4 +
 .../dynamic_quantize_fully_connected.cpp      |   2 +-
 .../transformations/kv_cache_compression.cpp  |  22 +++-
 27 files changed, 243 insertions(+), 49 deletions(-)

diff --git a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp
index e546a426dbc878..28ffc8368e2501 100644
--- a/src/common/transformations/include/ov_ops/dynamic_quantize.hpp
+++ b/src/common/transformations/include/ov_ops/dynamic_quantize.hpp
@@ -16,13 +16,18 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
 public:
     OPENVINO_OP("DynamicQuantize", "gpu_opset");
 
+    enum class QuantizationMode {
+        Asymmetric,
+        Symmetric
+    };
+
     DynamicQuantize() = default;
     /// \brief Constructs an DynamicQuantize operation.
     ///
     /// \param data Input tensor with data
     /// \param group_sizes Group sizes for dynamic quantization
     /// \param dt_scale Data type for scale output
-    DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale, std::vector scales_output_order = {});
+    DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale, QuantizationMode mode, std::vector scales_output_order = {});
 
     void validate_and_infer_types() override;
 
@@ -33,12 +38,16 @@ class TRANSFORMATIONS_API DynamicQuantize : public ov::op::Op {
     const std::vector& get_scales_output_order() const {
         return m_scales_output_order;
     };
+    QuantizationMode get_quantization_mode() const {
+        return m_mode;
+    };
     static std::vector shape_infer(const DynamicQuantize* op,
                                    const std::vector& input_shapes,
                                    const std::vector& group_sizes,
                                    const std::vector& scales_output_order = {});
 
 private:
+    QuantizationMode m_mode;
     std::vector m_group_sizes;
     std::vector m_scales_output_order;
     element::Type m_dt_scale;
diff --git a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp
index 0ac0a570e2a002..6ee34731160c4b 100644
--- a/src/common/transformations/src/ov_ops/dynamic_quantize.cpp
+++ b/src/common/transformations/src/ov_ops/dynamic_quantize.cpp
@@ -13,8 +13,9 @@ namespace ov {
 namespace op {
 namespace internal {
 
-DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale, std::vector scales_output_order)
+DynamicQuantize::DynamicQuantize(const Output& data, std::vector group_sizes, element::Type dt_scale, QuantizationMode mode, std::vector scales_output_order)
     : Op({data}),
+      m_mode(mode),
       m_group_sizes(std::move(group_sizes)),
       m_scales_output_order(std::move(scales_output_order)),
       m_dt_scale(dt_scale) {
@@ -38,7 +39,7 @@ void DynamicQuantize::validate_and_infer_types() {
 
 std::shared_ptr DynamicQuantize::clone_with_new_inputs(const ov::OutputVector& new_args) const {
     check_new_args_count(this, new_args);
-    return std::make_shared(new_args.at(0), m_group_sizes, m_dt_scale);
+    return std::make_shared(new_args.at(0), m_group_sizes, m_dt_scale, m_mode);
 }
 
 std::vector DynamicQuantize::shape_infer(const DynamicQuantize* op,
diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp
index 3b38159bd8a873..ccee746a886568 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/op/indirect_sdpa.hpp
@@ -35,6 +35,9 @@ class IndirectSDPA : public ov::intel_gpu::op::SDPA {
 
     std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override;
 
+    void set_asym(bool val) { m_is_asym_compressed = val; }
+    bool get_asym() const { return m_is_asym_compressed; }
+
     ov::element::Type get_output_type() const { return m_output_type; }
 
     int64_t get_indirect_axis() const { return m_indirect_axis; }
diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp
index 1692abd3be9bcd..9ffa3b638d827f 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp
@@ -70,13 +70,22 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension {
     const std::vector& get_group_sizes() const { return m_group_sizes; };
     const std::vector& get_scales_output_order() const { return m_scales_output_order; };
 
+    bool get_asymmetric_quantization() const
{ + return m_use_asymmetric_quantization; + } + void set_asymmetric_quantization(bool val) { + m_use_asymmetric_quantization = val; + } + private: int64_t m_concat_axis = 0; int64_t m_gather_axis = 0; bool m_indirect = false; // KV-cache compression parameters + // TODO: move these parameters to separate structure bool m_compressed = false; + bool m_use_asymmetric_quantization = false; std::vector m_group_sizes = {}; std::vector m_scales_output_order = {}; ov::element::Type m_compression_type = ov::element::undefined; diff --git a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp index 69de61105cba06..00d408ba130c84 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/op/sdpa.hpp @@ -52,6 +52,7 @@ class SDPA : public ov::op::v13::ScaledDotProductAttention { protected: bool m_is_causal; bool m_is_kv_compressed; + bool m_is_asym_compressed; std::vector m_order_q; std::vector m_order_k; std::vector m_order_v; diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp index f0a6ff136cbd8d..187b24a3cd2447 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/dynamic_quantize.hpp @@ -29,6 +29,7 @@ struct dynamic_quantize : public primitive_base { group_sizes(group_sizes), scales_output_order(scales_output_order) {} + bool use_asymmetric_quantization = false; std::vector group_sizes; std::vector scales_output_order; @@ -50,6 +51,7 @@ struct dynamic_quantize : public primitive_base { void save(BinaryOutputBuffer& ob) const override { primitive_base::save(ob); ob << group_sizes; + // TODO: add more parameters } void load(BinaryInputBuffer& ib) override { diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp index bff666783f3640..70e4bd39f15da3 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp @@ -35,6 +35,7 @@ struct kv_cache : public primitive_base { int64_t gather_axis = 0; bool indirect = false; bool compressed = false; + bool use_asymmetric_quantization = false; std::vector group_sizes = {}; std::vector scales_output_order = {}; ov::element::Type compression_type = ov::element::undefined; diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp index 28eaa625832f94..c11d2c9dd55eb0 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/scaled_dot_product_attention.hpp @@ -44,6 +44,7 @@ struct scaled_dot_product_attention : public primitive_base -std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector& group_sizes, const std::vector& scales_output_order) { +std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector& group_sizes, const std::vector& scales_output_order, bool use_asymmetric_quantization) { ov::op::internal::DynamicQuantize op; auto output_format = act_layout.format; @@ -44,16 +44,21 @@ std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &a auto output_shapes = 
ov::op::internal::DynamicQuantize::shape_infer(&op, input_shapes, group_sizes, scales_output_order); GPU_DEBUG_TRACE_DETAIL << "shape infer dynamic" << output_shapes[0] << " " << output_shapes[1] << "\n"; + if (use_asymmetric_quantization) { + output_shapes[1][3] *= 2; + } + return { layout(output_shapes[0], data_types::i8, output_format), layout(output_shapes[1], data_types::f16, output_format) }; } -template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector& group_sizes, const std::vector& scales_output_order); +template std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &act_layout, const std::vector& group_sizes, const std::vector& scales_output_order, bool use_asymmetric_quantization); template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& /*node*/, const kernel_impl_params& impl_param) { auto desc = impl_param.typed_desc(); const auto& input_layout = impl_param.get_input_layout(); - return __calc_output_layouts(input_layout, desc->group_sizes, desc->scales_output_order); + + return __calc_output_layouts(input_layout, desc->group_sizes, desc->scales_output_order, desc->use_asymmetric_quantization); } template std::vector dynamic_quantize_inst::calc_output_layouts(dynamic_quantize_node const& node, diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp index f682f8648f369e..17c810a590d35e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/dynamic_quantize.cpp @@ -40,6 +40,7 @@ struct dynamic_quantize_impl : typed_primitive_impl_ocl { const auto& desc = impl_param.typed_desc(); params.group_sizes = desc->group_sizes; params.scales_output_order = desc->scales_output_order; + params.use_asymmetric_quantization = desc->use_asymmetric_quantization; return params; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index 6a048856c6694c..056e09c76de729 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -411,6 +411,8 @@ struct kv_cache_impl : multi_stage_primitive { params.append_axis = primitive->concat_axis; params.group_sizes = primitive->group_sizes; params.scales_output_order = primitive->scales_output_order; + params.use_asymmetric_quantization = primitive->use_asymmetric_quantization; + params.group_scales_with_zp = true; if (!is_shape_agnostic) { const auto& past_kv_cache_shape = impl_param.input_layouts[0].get_partial_shape(); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index e3da7d5f551f87..566ebc03a99a96 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -231,8 +231,10 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveis_causal; config.is_kv_compressed = desc->is_kv_compressed; + config.is_asym_compressed = desc->is_asym_compressed; GPU_DEBUG_TRACE << "Set is_kv_compressed to " << config.is_kv_compressed << "\n"; + GPU_DEBUG_TRACE << "Set is_asym_compressed to " << config.is_asym_compressed << "\n"; return config; } diff --git a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h 
b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h index 999c7f297552de..73ea583a3a2133 100644 --- a/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/dynamic_quantize_inst.h @@ -35,7 +35,7 @@ class typed_primitive_inst : public typed_primitive_inst_base< // Internal function to be used from fakealignment template - static std::vector __calc_output_layouts(const layout &act_layout, const std::vector& group_size, const std::vector& scales_output_order); + static std::vector __calc_output_layouts(const layout &act_layout, const std::vector& group_size, const std::vector& scales_output_order, bool use_asymmetric_quantization); static std::string to_string(dynamic_quantize_node const& node); typed_primitive_inst(network& network, dynamic_quantize_node const& node); diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index bf372021f390dc..cf480f499588e0 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -45,9 +45,17 @@ std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& node // input_shapes.push_back(impl_param.get_input_layout(4).get()); } + if (desc->compressed && desc->use_asymmetric_quantization) { + input_shapes[3][3] /= 2; + } + std::vector output_shapes = desc->compressed ? shape_infer(&op, input_shapes, desc->group_sizes, desc->scales_output_order) : shape_infer(&op, input_shapes); + if (desc->compressed && desc->use_asymmetric_quantization) { + output_shapes[2][3] *= 2; + } + if (desc->num_outputs == 3) GPU_DEBUG_TRACE_DETAIL << desc->id << " scales output calculated shape: " << output_shapes[2] << "\n"; diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 3992b02473fc76..c3509dfb224db8 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -688,7 +688,7 @@ event::ptr primitive_inst::realloc_if_needed() { // dynamic quantization is only applied to activation of FC if (get_node().is_type()) { const auto& desc = get_node().as().get_primitive(); - auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], desc->group_sizes, desc->scales_output_order); + auto dyn_quan_scale_layout = dynamic_quantize_inst::__calc_output_layouts(updated_layouts[dep_idx], desc->group_sizes, desc->scales_output_order, desc->use_asymmetric_quantization); GPU_DEBUG_TRACE_DETAIL << "update layout of dynamic quantize scale parameter layout " << dyn_quan_scale_layout[1].to_short_string() << std::endl; updated_params.output_layouts[1] = dyn_quan_scale_layout[1]; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl index c5dc3609133192..c27a1fc9bb937a 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_opt_generic.cl @@ -26,10 +26,7 @@ inline uint FUNC(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, u return OUTPUT1_GET_INDEX(b, f, y, x); } -inline uint FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x, uint axis_offset) { -#ifdef APPEND_MODE - APPEND_AXIS_NAME += axis_offset; -#endif +inline uint 
FUNC(get_scales_offset)(OPTIONAL_SHAPE_INFO_ARG uint b, uint f, uint y, uint x) { #ifdef SCALES_OUTPUT_ORDER return FUNC_CALL(get_scales_offset_nt)(OPTIONAL_SHAPE_INFO_TENSOR SCALES_OUTPUT_ORDER); #else @@ -49,6 +46,9 @@ KERNEL(dynamic_quantize_gpu_opt_generic)( const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, __global OUTPUT1_TYPE* output_scale +#if ASYMMETRIC_QUANTIZATION && !GROUP_SCALES_WITH_ZP + , __global OUTPUT1_TYPE* output_zp +#endif #ifdef APPEND_MODE , const uint axis_offset #endif @@ -64,18 +64,31 @@ KERNEL(dynamic_quantize_gpu_opt_generic)( // the innermost dimension is always handled in the loop inside the kernel const uint x = 0; - half max_value = 0.0001h; + half max_value = INPUT0_VAL_MIN; + half min_value = INPUT0_VAL_MAX; + half val[INNERMOST_DIM_VALUE / SUBGROUP_SIZE]; const uint input_offset = INPUT0_GET_INDEX(b, f, y, x); unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) { val[i] = INPUT_BLOCK_READ(input, input_offset + i * SUBGROUP_SIZE); +#if ASYMMETRIC_QUANTIZATION + max_value = fmax(max_value, val[i]); + min_value = fmin(min_value, val[i]); +#else max_value = fmax(max_value, fabs(val[i])); +#endif } +#if ASYMMETRIC_QUANTIZATION + min_value = work_group_reduce_min(min_value); max_value = work_group_reduce_max(max_value); - - half scale = 127.0h / max_value; + OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value)); + OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_value * scale) - CHAR_MAX; +#else + max_value = work_group_reduce_max(max_value); + OUTPUT1_TYPE scale = 127.0h / max_value; +#endif #ifdef APPEND_MODE APPEND_AXIS_NAME += axis_offset; @@ -83,23 +96,39 @@ KERNEL(dynamic_quantize_gpu_opt_generic)( const uint output_offset = OUTPUT_GET_INDEX(b, f, y, x); unroll_for (uint i = 0; i < INNERMOST_DIM_VALUE / SUBGROUP_SIZE; i++) { - OUTPUT_BLOCK_WRITE(output, output_offset + i * SUBGROUP_SIZE, convert_char(val[i] * scale)); - } - -#ifdef APPEND_MODE - // const uint scale_axis_offset = axis_offset; - const uint scale_axis_offset = 0; +#if ASYMMETRIC_QUANTIZATION + OUTPUT_TYPE res = convert_char(val[i] * scale + zp); #else - const uint scale_axis_offset = 0; + OUTPUT_TYPE res = convert_char(val[i] * scale); #endif - const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x, scale_axis_offset); + OUTPUT_BLOCK_WRITE(output, output_offset + i * SUBGROUP_SIZE, res); + } + + const uint scale_idx = FUNC_CALL(get_scales_offset)(OPTIONAL_SHAPE_INFO_TENSOR b, f, y, x); if (grouped_indexes == 0 && sglid == 0) { #ifdef APPEND_MODE - // if (axis_offset > 0) { - // printf("Save scale_idx=%d, axis_offset=%d; output=%p, scale=%p; val=%f\n", scale_idx, axis_offset, output, output_scale, 1.0h / scale); - // } +#if GROUP_SCALES_WITH_ZP + // half result0 = (convert_half(convert_char(val[0] * scale + zp)) - zp) * (1.0h / scale); + // half result1 = (convert_half(convert_char(val[1] * scale + zp)) - zp) * (1.0h / scale); + // half result2 = (convert_half(convert_char(val[2] * scale + zp)) - zp) * (1.0h / scale); + // half result3 = (convert_half(convert_char(val[3] * scale + zp)) - zp) * (1.0h / scale); + // printf("Save scale_idx=%d, axis_offset=%d; scale=%f; zp=%f, min=%f, max=%f; orig=(%f %f %f %f), compressed=(%d %d %d %d), decompressed=(%f %f)\n", scale_idx, axis_offset, scale, zp, min_value, max_value, + // val[0], val[1], val[2], val[3], + // convert_char(val[0] * scale + zp), convert_char(val[1] * scale + zp), convert_char(val[2] * scale + zp), convert_char(val[3] * scale + zp), + // result0, 
+ // result1); +#endif #endif +#if ASYMMETRIC_QUANTIZATION output_scale[scale_idx] = 1.0h / scale; +#if GROUP_SCALES_WITH_ZP + output_scale[scale_idx + 1] = zp; +#else + output_zp[scale_idx] = zp; +#endif +#else + output_scale[scale_idx] = 1.0h / scale; +#endif } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl index 46664aa329c3ab..57cd987f1f1df1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_opt.cl @@ -258,13 +258,16 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, b1_idx, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); #else // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len, 0, 0, 0, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + seq_len, 0); #endif KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; +#if ASYMMETRIC_COMPRESSED + KEY_COMPRESSION_SCALE_TYPE key_comp_zp = key_scale[key_scale_comp_offset + 1]; +#endif #endif // ulong timer_start = intel_get_cycle_counter(); @@ -288,7 +291,11 @@ KERNEL(sdpa_opt)( // KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); // KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if ASYMMETRIC_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale; +#else KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#endif #else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); #endif @@ -318,7 +325,11 @@ KERNEL(sdpa_opt)( #if IS_KV_COMPRESSED KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if ASYMMETRIC_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale; +#else KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#endif #else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); #endif @@ -348,7 +359,11 @@ KERNEL(sdpa_opt)( #if IS_KV_COMPRESSED KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); +#if ASYMMETRIC_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale; +#else KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#endif #else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); #endif @@ -378,7 +393,11 @@ KERNEL(sdpa_opt)( #if IS_KV_COMPRESSED KEY_BLOCK key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); 
+#if ASYMMETRIC_COMPRESSED + KEY_BLOCK_UNCOMPRESSED key_vals = (TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) - key_comp_zp) * key_comp_scale; +#else KEY_BLOCK_UNCOMPRESSED key_vals = TO_KEY_BLOCK_UNCOMPRESSED_TYPE(key_vals_compressed) * key_comp_scale; +#endif #else KEY_BLOCK key_vals = KEY_BLOCK_READ(key_input, key_offset + head_idx_index); #endif @@ -584,12 +603,15 @@ KERNEL(sdpa_opt)( #ifdef COMPRESSED_PER_HEAD // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1] // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if ASYMMETRIC_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif #endif OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; @@ -610,7 +632,11 @@ KERNEL(sdpa_opt)( #else INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, value_offset); #endif +#if ASYMMETRIC_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_val_compressed - sub_group_broadcast(value_comp_zp, i)) * sub_group_broadcast(value_comp_scale, i); +#else VALUE_COMPRESSION_SCALE_TYPE value_val = value_val_compressed * sub_group_broadcast(value_comp_scale, i); +#endif #else #ifdef BEAM_TABLE_TYPE INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, sub_group_broadcast(value_offset, i)); @@ -655,12 +681,15 @@ KERNEL(sdpa_opt)( #ifdef COMPRESSED_PER_HEAD // TODO: consider to change scales layout from [batch, seq_len, num_heads, 1] to [batch, num_heads, seq_len, 1] // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len, 0); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + seq_len, 0); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if ASYMMETRIC_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = 
val_scale[value_scale_comp_offset + 1]; +#endif #endif OUTPUT_TYPE qk_val[TARGET_SEQ_LEN_BLOCK_SIZE]; @@ -670,7 +699,11 @@ KERNEL(sdpa_opt)( #if IS_KV_COMPRESSED INPUT2_TYPE value_val_compressed = VALUE_BLOCK_READ(value_input, value_offset); +#if ASYMMETRIC_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_val = (value_val_compressed - value_comp_zp) * value_comp_scale; +#else VALUE_COMPRESSION_SCALE_TYPE value_val = value_val_compressed * value_comp_scale; +#endif #else INPUT2_TYPE value_val = VALUE_BLOCK_READ(value_input, value_offset); #endif @@ -1059,12 +1092,15 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); #else // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, seq_len + sglid, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, seq_len + sglid, 0); #endif KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; +#if ASYMMETRIC_COMPRESSED + KEY_COMPRESSION_SCALE_TYPE key_comp_zp = key_scale[key_scale_comp_offset + 1]; +#endif // printf("[0]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); #endif @@ -1087,7 +1123,11 @@ KERNEL(sdpa_opt)( #else INPUT1_TYPE key_vals_compressed = KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index); #endif +#if ASYMMETRIC_COMPRESSED + KEY_COMPRESSION_SCALE_TYPE key_vals = (TO_KEY_COMPRESSION_SCALE_TYPE(key_vals_compressed) - sub_group_broadcast(key_comp_zp, key_row_idx)) * sub_group_broadcast(key_comp_scale, key_row_idx); +#else KEY_COMPRESSION_SCALE_TYPE key_vals = TO_KEY_COMPRESSION_SCALE_TYPE(key_vals_compressed) * sub_group_broadcast(key_comp_scale, key_row_idx); +#endif #else #ifdef BEAM_TABLE_TYPE INPUT1_TYPE key_vals = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); @@ -1108,12 +1148,15 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, b1_idx, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx,b1_idx / BROADCAST_GROUP_SIZE, seq_len + sglid, 0); #else // const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, seq_len + sglid, 0, 0, 0, 0); - const uint key_scale_comp_offset = GET_DATA_INDEX_6D(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, seq_len + sglid, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, seq_len + sglid, 0); #endif KEY_COMPRESSION_SCALE_TYPE key_comp_scale = key_scale[key_scale_comp_offset]; +#if ASYMMETRIC_COMPRESSED + KEY_COMPRESSION_SCALE_TYPE key_comp_zp = key_scale[key_scale_comp_offset + 1]; +#endif // printf("[1]key_scale_comp_offset=%d, sglid=%d: %f\n", key_scale_comp_offset, sglid, key_comp_scale); #endif __attribute__((opencl_unroll_hint(1))) @@ -1142,7 +1185,11 
@@ KERNEL(sdpa_opt)( #else key_vec[key_row_idx] = TO_KEY_COMPRESSION_SCALE_TYPE(KEY_BLOCK_READ(key_input, key_offset + key_row_idx * key_pitch + head_idx_index)); #endif +#if ASYMMETRIC_COMPRESSED + key_vec[key_row_idx] = (key_vec[key_row_idx] - sub_group_broadcast(key_comp_zp, key_row_idx)) * sub_group_broadcast(key_comp_scale, key_row_idx); +#else key_vec[key_row_idx] *= sub_group_broadcast(key_comp_scale, key_row_idx); +#endif #else #ifdef BEAM_TABLE_TYPE key_vec[key_row_idx] = KEY_BLOCK_READ(key_input, sub_group_broadcast(key_offset, key_row_idx) + head_idx_index); @@ -1167,7 +1214,11 @@ KERNEL(sdpa_opt)( // printf("_%d %d %d. Loads key [%d] = %f\n", get_global_id(0), get_global_id(1), get_global_id(2), key_offset + key_row_idx * key_pitch + head_idx_index, key_vals); } #endif +#if ASYMMETRIC_COMPRESSED + key_vals = (key_vals - sub_group_broadcast(key_comp_zp, key_row_idx)) * sub_group_broadcast(key_comp_scale, key_row_idx); +#else key_vals *= sub_group_broadcast(key_comp_scale, key_row_idx); +#endif #else #ifdef BEAM_TABLE_TYPE INPUT1_TYPE key_vals = 0; @@ -1321,12 +1372,15 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len) + sglid, 0); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len) + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + (seq_len) + sglid, 0); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if ASYMMETRIC_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif // printf("[0]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); #endif @@ -1340,7 +1394,11 @@ KERNEL(sdpa_opt)( #endif #if IS_KV_COMPRESSED +#if ASYMMETRIC_COMPRESSED + value_val = (value_val - sub_group_broadcast(value_comp_zp, i)) * sub_group_broadcast(value_comp_scale, i); +#else value_val *= sub_group_broadcast(value_comp_scale, i); +#endif #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { acc_output_res[seq_idx] = mad(sub_group_broadcast(qk_val[seq_idx], i), value_val, acc_output_res[seq_idx]); @@ -1387,12 +1445,15 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); #else // const uint value_scale_comp_offset = 
GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + (seq_len * SUBGROUP_SIZE) + sglid, 0); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if ASYMMETRIC_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif // printf("[1]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); #endif @@ -1411,7 +1472,11 @@ KERNEL(sdpa_opt)( #endif #if IS_KV_COMPRESSED +#if ASYMMETRIC_COMPRESSED + value_val = (value_val - sub_group_broadcast(value_comp_zp, i)) * sub_group_broadcast(value_comp_scale, i); +#else value_val *= sub_group_broadcast(value_comp_scale, i); +#endif #endif unroll_for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { @@ -1462,12 +1527,15 @@ KERNEL(sdpa_opt)( #endif #ifdef COMPRESSED_PER_HEAD // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, b1_idx, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len_leftovers_start + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1_idx / BROADCAST_GROUP_SIZE, start_partition_idx + seq_len_leftovers_start + sglid, 0); #else // const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, start_partition_idx + seq_len_leftovers_start + sglid, 0, 0, 0, 0); - const uint value_scale_comp_offset = GET_DATA_INDEX_6D(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, start_partition_idx + seq_len_leftovers_start + sglid, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, start_partition_idx + seq_len_leftovers_start + sglid, 0); #endif VALUE_COMPRESSION_SCALE_TYPE value_comp_scale = val_scale[value_scale_comp_offset]; +#if ASYMMETRIC_COMPRESSED + VALUE_COMPRESSION_SCALE_TYPE value_comp_zp = val_scale[value_scale_comp_offset + 1]; +#endif // printf("[2]value_scale_comp_offset=%d, sglid=%d: %f\n", value_scale_comp_offset, sglid, value_comp_scale); #endif @@ -1481,7 +1549,11 @@ KERNEL(sdpa_opt)( #endif #if IS_KV_COMPRESSED +#if ASYMMETRIC_COMPRESSED + value_val = (value_val - sub_group_broadcast(value_comp_zp, seq_len_idx)) * sub_group_broadcast(value_comp_scale, seq_len_idx); +#else value_val *= sub_group_broadcast(value_comp_scale, seq_len_idx); +#endif #endif for (uint seq_idx = 0; seq_idx < TARGET_SEQ_LEN_BLOCK_SIZE; seq_idx++) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl index 6822000ba9e0f2..2f5bf6078e54f2 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_ref.cl @@ -170,12 +170,19 @@ KERNEL(sdpa_ref)( INPUT1_TYPE k_val_comp = key_input[key_offset]; half k_val = (half)k_val_comp; #ifdef COMPRESSED_PER_HEAD - const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, s, b1 / BROADCAST_GROUP_SIZE); + const uint key_scale_comp_offset = 
GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, b1 / BROADCAST_GROUP_SIZE, s, 0); #else // const uint key_scale_comp_offset = s; - const uint key_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(KEY_COMPRESSION_SCALE, b_idx, 0, 0, 0, s, 0); + const uint key_scale_comp_offset = GET_DATA_INDEX(KEY_COMPRESSION_SCALE, b_idx, 0, s, 0); #endif +#if ASYMMETRIC_COMPRESSED + if (b0 == 0 && b1 == 0 && target_seq_idx == 0 && head_size_idx == 0 && s == 0 && h == 0) { + // printf("k=%f, zp=%f, scale=%f, res=%f\n", k_val, key_scale[key_scale_comp_offset + 1], key_scale[key_scale_comp_offset], ((k_val - key_scale[key_scale_comp_offset + 1]) * key_scale[key_scale_comp_offset])); + } + k_val = (k_val - key_scale[key_scale_comp_offset + 1]) * key_scale[key_scale_comp_offset]; +#else k_val *= key_scale[key_scale_comp_offset]; +#endif #else INPUT1_TYPE k_val = key_input[key_offset]; #endif @@ -256,12 +263,20 @@ KERNEL(sdpa_ref)( INPUT2_TYPE __value = value_input[value_offset]; half value = (half)__value; #ifdef COMPRESSED_PER_HEAD - const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, s, b1 / BROADCAST_GROUP_SIZE); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, b1 / BROADCAST_GROUP_SIZE, s, 0); #else // const uint value_scale_comp_offset = s; - const uint value_scale_comp_offset = GET_DATA_INDEX_6D_SAFE(VALUE_COMPRESSION_SCALE, b_idx, 0, 0, 0, s, 0); + const uint value_scale_comp_offset = GET_DATA_INDEX(VALUE_COMPRESSION_SCALE, b_idx, 0, s, 0); #endif +#if ASYMMETRIC_COMPRESSED + // if (b0 == 0 && b1 == 0 && target_seq_idx == 0 && head_size_idx == 0 && s == 0) { + if (b0 == 0 && b1 == 0 && target_seq_idx == 0 && head_size_idx == 0 && s == 0) { + // printf("v=%f, zp=%f, scale=%f, res=%f\n", value, val_scale[value_scale_comp_offset + 1], val_scale[value_scale_comp_offset], ((value - val_scale[value_scale_comp_offset + 1]) * val_scale[value_scale_comp_offset])); + } + value = (value - val_scale[value_scale_comp_offset + 1]) * val_scale[value_scale_comp_offset]; +#else value *= val_scale[value_scale_comp_offset]; +#endif acc += tmp_buf[tmp_buf_offset] * value; #else acc += tmp_buf[tmp_buf_offset] * value_input[value_offset]; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp index ac6b2632aa49fb..10e101756c0cd3 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_generic.cpp @@ -177,6 +177,8 @@ JitConstants DynamicQuantizeKernelOptGeneric::GetJitConstants(const dynamic_quan const auto iterations_number = total_grouped_elements / per_iter_elements_number; jit.AddConstant(MakeJitConstant("ITERATIONS_NUMBER", iterations_number)); + jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization)); + jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.group_scales_with_zp)); bool rearrange_scales_order = false; const auto& scales_output_order = params.scales_output_order; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h index 7a1186a0e4ad40..664355e686fadc 100644 --- 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_ref.h @@ -17,6 +17,8 @@ struct dynamic_quantize_params : public base_params { int64_t axis_offset = -1; std::vector group_sizes; std::vector scales_output_order; + bool use_asymmetric_quantization = false; + bool group_scales_with_zp = false; }; class DynamicQuantizeKernelRef : public KernelBaseOpenCL { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp index 218e736cc70e14..d8ccc38fc1afba 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp @@ -86,6 +86,7 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { } jit.AddConstant(MakeJitConstant("IS_KV_COMPRESSED", params.conf.is_kv_compressed)); + jit.AddConstant(MakeJitConstant("ASYMMETRIC_COMPRESSED", params.conf.is_asym_compressed)); if (params.conf.is_kv_compressed) { jit.AddConstant(MakeJitConstant("KEY_COMPRESSION_SCALE", params.key_cache_comp_scale)); jit.AddConstant(MakeJitConstant("VALUE_COMPRESSION_SCALE", params.value_cache_comp_scale)); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index e510cba6480964..76188734532e64 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -89,6 +89,7 @@ struct sdpa_configuration { bool is_causal = false; bool has_alibi_input = false; bool is_kv_compressed = false; + bool is_asym_compressed = false; // Paged Attention configuration bool is_paged_attention = false; diff --git a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp index 3781f900f1b2c1..c344c3436d3f83 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/dynamic_quantize.cpp @@ -35,6 +35,9 @@ static void CreateDynamicQuantizeOp(ProgramBuilder& p, const std::shared_ptrget_scales_output_order(), get_output_data_types(op)); + + prim.use_asymmetric_quantization = op->get_quantization_mode() == ov::op::internal::DynamicQuantize::QuantizationMode::Asymmetric; + p.add_primitive(*op, prim); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp index 5abd22370ea599..5a8f911e2ea16e 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/kv_cache.cpp @@ -38,6 +38,7 @@ void CreateKVCacheOp(ProgramBuilder& p, const std::shared_ptrget_compressed()) { prim.compression_type = op->get_compression_type(); + prim.use_asymmetric_quantization = op->get_asymmetric_quantization(); prim.group_sizes = op->get_group_sizes(); prim.scales_output_order = op->get_scales_output_order(); } diff --git a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp index 7642a411442efe..7c6f6158e2b1fa 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp @@ -83,6 +83,10 @@ 
static void CreateIndirectSDPAOp(ProgramBuilder& p, const std::shared_ptrget_input2_transpose_order(), op->get_output_transpose_order()); + if (op->get_asym()) { + sdpa_prim.is_asym_compressed = true; + } + p.add_primitive(*op, sdpa_prim); } diff --git a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp index eb16213bcb936c..1d2a1d1b098f7f 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/dynamic_quantize_fully_connected.cpp @@ -61,7 +61,7 @@ DynamicQuantizeFullyConnected::DynamicQuantizeFullyConnected(uint64_t group_size auto rank = m_fc->get_input_partial_shape(0).size(); std::vector shape_group_size(rank, 1); shape_group_size.back() = group_size; - auto dyn_quan = std::make_shared(m_data, shape_group_size, element::f16); + auto dyn_quan = std::make_shared(m_data, shape_group_size, element::f16, ov::op::internal::DynamicQuantize::QuantizationMode::Symmetric); auto optional_w_zp = m_fc->get_input_size() > 4 ? m_fc->get_input_node_shared_ptr(4) : std::make_shared(); auto output_type = m_fc->get_output_type(); diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp index 8507da7aef8aa2..3c3621b42b932e 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -62,6 +62,18 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { first = false; } + int USE_ZP = 0; + if (const auto env_var = std::getenv("USE_ZP")) { + std::istringstream ss(env_var); + ss >> USE_ZP; + } + + std::cout << "Set USE_ZP = " << USE_ZP << "\n"; + + auto quantization_mode = ov::op::internal::DynamicQuantize::QuantizationMode::Symmetric; + if (USE_ZP) + quantization_mode = ov::op::internal::DynamicQuantize::QuantizationMode::Asymmetric; + auto query = any_input(); auto k_past = wrap_type(); @@ -199,7 +211,7 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { // Scales order: 0, 3, 1, 2 if (key_past_node->get_input_size() == 1) { - auto k_init_dyn_quan = std::make_shared(key_past_node->get_input_node_shared_ptr(0), shape_group_size, element::f16, scales_output_order); + auto k_init_dyn_quan = std::make_shared(key_past_node->get_input_node_shared_ptr(0), shape_group_size, element::f16, quantization_mode, scales_output_order); auto new_key_past_node = std::make_shared(k_init_dyn_quan->output(0), k_init_dyn_quan->output(1), key_past_node->get_variable()); k_init_dyn_quan->set_friendly_name(key_node->get_friendly_name() + "_init_dyn_quan"); // std::cout << "Key outputs: " << key_past_node->get_output_size() << " " << new_key_past_node->get_output_size() << "\n"; @@ -213,7 +225,7 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { } if (value_past_node->get_input_size() == 1) { - auto v_init_dyn_quan = std::make_shared(value_past_node->get_input_node_shared_ptr(0), shape_group_size, element::f16, scales_output_order); + auto v_init_dyn_quan = std::make_shared(value_past_node->get_input_node_shared_ptr(0), shape_group_size, element::f16, quantization_mode, scales_output_order); auto new_value_past_node = std::make_shared(v_init_dyn_quan->output(0), v_init_dyn_quan->output(1), value_past_node->get_variable()); // std::cout << "Value outputs: " << 
value_past_node->get_output_size() << " " << new_value_past_node->get_output_size() << "\n"; @@ -241,6 +253,8 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { shape_group_size, scales_output_order); + new_kv_cache_k->set_asymmetric_quantization(quantization_mode == ov::op::internal::DynamicQuantize::QuantizationMode::Asymmetric); + new_kv_cache_k->set_friendly_name(key_node->get_friendly_name()); ov::copy_runtime_info(key_node, new_kv_cache_k); @@ -258,6 +272,8 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { shape_group_size, scales_output_order); + new_kv_cache_v->set_asymmetric_quantization(quantization_mode == ov::op::internal::DynamicQuantize::QuantizationMode::Asymmetric); + new_kv_cache_v->set_friendly_name(value_node->get_friendly_name()); ov::copy_runtime_info(value_node, new_kv_cache_v); @@ -302,6 +318,8 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher() { org_sdpa->get_output_type()); + new_sdpa->set_asym(quantization_mode == ov::op::internal::DynamicQuantize::QuantizationMode::Asymmetric); + new_kv_cache_k->set_friendly_name(key_node->get_friendly_name()); ov::copy_runtime_info(key_node, new_kv_cache_k);
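
Reviewer note (not part of the patch): the asymmetric branch added to dynamic_quantize_gpu_opt_generic.cl computes, per quantization group, scale = (CHAR_MAX - CHAR_MIN) / (max - min) and zp = -min * scale - CHAR_MAX, then writes out 1/scale together with zp (interleaved in the scales buffer when GROUP_SCALES_WITH_ZP is set, which is what the *2 / /2 shape adjustments in kv_cache.cpp and dynamic_quantize.cpp account for, or to a separate output otherwise), and the SDPA kernels dequantize with (q - zp) * (1/scale). The standalone host-side C++ sketch below only mirrors those formulas for a single group; the sample values and the explicit clamp (used instead of OpenCL convert_char) are illustrative assumptions, not code from the patch.

// asym_quant_sketch.cpp - illustration of the round trip, not part of the patch.
#include <algorithm>
#include <climits>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // One quantization group of activations (sample values are made up).
    std::vector<float> group = {0.10f, -0.35f, 0.72f, 0.05f};

    const float max_value = *std::max_element(group.begin(), group.end());
    const float min_value = *std::min_element(group.begin(), group.end());

    // Same formulas as the ASYMMETRIC_QUANTIZATION branch of the kernel
    // (assumes a signed-char platform, i.e. CHAR_MIN == -128, CHAR_MAX == 127).
    const float scale = (CHAR_MAX - CHAR_MIN) / (max_value - min_value);
    const float zp = -min_value * scale - CHAR_MAX;

    for (float v : group) {
        // Quantize: the kernel uses convert_char(v * scale + zp); clamp here to keep
        // the host sketch well-defined at the edges of the int8 range.
        const float q_f = std::clamp(v * scale + zp, -128.0f, 127.0f);
        const auto q = static_cast<int8_t>(std::lround(q_f));

        // Dequantize the way sdpa_opt.cl / sdpa_ref.cl do it, using the values the
        // dynamic_quantize kernel stores: comp_scale = 1 / scale and zp.
        const float comp_scale = 1.0f / scale;
        const float restored = (static_cast<float>(q) - zp) * comp_scale;

        std::cout << v << " -> " << static_cast<int>(q) << " -> " << restored << "\n";
    }
    return 0;
}

Storing 1/scale rather than scale keeps the hot SDPA path on multiplies only, which matches how the compression scale was already consumed before this patch.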