diff --git a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp index 8e4957d5f52797..bfedab4280c52d 100644 --- a/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp +++ b/src/plugins/intel_gpu/src/graph/dynamic_quantize.cpp @@ -40,6 +40,7 @@ std::vector dynamic_quantize_inst::__calc_output_layouts(const layout &a if (attrs.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric && attrs.output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::Planar) { + GPU_DEBUG_TRACE_DETAIL << "Set 3d output: " << layout(output_shapes[2], attrs.zp_dt, output_format).to_short_string() << "\n"; output_layouts.emplace_back(layout(output_shapes[2], attrs.zp_dt, output_format)); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp index 5db452dcda26f0..7adcff0a4aaf3a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp @@ -348,6 +348,7 @@ void kernels_cache::build_batch(const batch_program& batch, compiled_kernels& co // Run compilation if (precompiled_kernels.empty()) { + GPU_DEBUG_TRACE_DETAIL << "Compiling " << batch.kernels_counter << " " << batch.has_microkernels << "\n"; cl::Program program(cl_build_device.get_context(), batch.source); { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "KernelsCache::BuildProgram::RunCompilation"); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index fef2a3c51ee821..1ffbfbbfbade37 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -230,7 +230,7 @@ struct kv_cache_impl : multi_stage_primitive { if (desc->get_compression_zp_inputs_num() > 0) { // Copy zero points to the new buffer if needed - execute_stage(events, instance, res_events, scale_concat_stage, zp_concat_stage); + execute_stage(events, instance, res_events, zp_concat_stage, zp_concat_stage); } // Perform dynamic quantization of new token data and append result to the KV-cache @@ -417,15 +417,19 @@ struct kv_cache_impl : multi_stage_primitive { return params; } - static kernel_params_t get_compression_scale_update_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { + static kernel_params_t get_compression_scale_update_kernel_params(const kernel_impl_params& impl_param, + bool is_scale = true, + bool is_shape_agnostic = false) { auto params = get_default_params(impl_param, is_shape_agnostic); const auto concat_axis = 2; params.axis = convert_axis(concat_axis, impl_param.get_output_layout().get_rank()); - auto inputs_count = 1; - auto comp_scale_past_layout = impl_param.input_layouts[3]; - auto comp_scale_present_layout = impl_param.output_layouts[2]; + const auto inputs_count = 1; + const auto input_idx = is_scale ? 3 : 4; // scale or zp + const auto output_idx = is_scale ? 
2 : 3; // scale or zp + auto comp_scale_past_layout = impl_param.input_layouts[input_idx]; + auto comp_scale_present_layout = impl_param.output_layouts[output_idx]; params.inputs.resize(inputs_count); params.inputs[0] = convert_data_tensor(comp_scale_past_layout); @@ -435,10 +439,10 @@ struct kv_cache_impl : multi_stage_primitive { const auto& out_offsets_map = impl_param.out_port_to_shape_info_offset; std::map in_tensor_to_offset_map = { - {0, in_offsets_map.at(3)}, // compression_scale_past + {0, in_offsets_map.at(input_idx)}, // compression_[scale/zp]_past }; std::map out_tensor_to_offset_map = { - {0, out_offsets_map.at(2)}, // compression_scale_present + {0, out_offsets_map.at(output_idx)}, // compression_[scale/zp]_present }; params.set_dynamic_shape_offsets(in_tensor_to_offset_map, out_tensor_to_offset_map); @@ -451,8 +455,11 @@ struct kv_cache_impl : multi_stage_primitive { auto concat_kernel_params = get_concat_kernel_params(impl_param, impl_param.is_dynamic()); auto& concat_kernel_selector = kernel_selector_t::Instance(); kernels_data.push_back(concat_kernel_selector.get_best_kernel(concat_kernel_params)); - const bool indirect = impl_param.typed_desc()->indirect; - const bool compressed = impl_param.typed_desc()->compressed; + + const auto desc = impl_param.typed_desc(); + const bool indirect = desc->indirect; + const bool compressed = desc->compressed; + const bool has_zp_input = desc->get_compression_zp_inputs_num() > 0; if (indirect) { auto bt_update_kernel_params = get_bt_update_kernel_params(impl_param, false); auto& bt_update_kernel_selector = bt_kernel_selector_t::Instance(); @@ -464,9 +471,14 @@ struct kv_cache_impl : multi_stage_primitive { auto& dq_kernel_selector = dq_kernel_selector_t::Instance(); kernels_data.push_back(dq_kernel_selector.get_best_kernel(dq_kernel_params)); - auto concat_scale_zp_kernel_params = get_compression_scale_update_kernel_params(impl_param, impl_param.is_dynamic()); auto& concat_scale_zp_kernel_selector = kernel_selector_t::Instance(); - kernels_data.push_back(concat_scale_zp_kernel_selector.get_best_kernel(concat_scale_zp_kernel_params)); + auto concat_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, true, impl_param.is_dynamic()); + kernels_data.push_back(concat_scale_zp_kernel_selector.get_best_kernel(concat_scale_kernel_params)); + + if (has_zp_input) { + auto concat_zp_kernel_params = get_compression_scale_update_kernel_params(impl_param, false, impl_param.is_dynamic()); + kernels_data.push_back(concat_scale_zp_kernel_selector.get_best_kernel(concat_zp_kernel_params)); + } } return cldnn::make_unique(kernels_data); } @@ -494,9 +506,15 @@ struct kv_cache_impl : multi_stage_primitive { _kernels_data[concat_stage].kernels[1].skip_execution = true; // Update dynamic quantization parameters - auto comp_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, impl_param.is_dynamic()); + auto comp_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, true, impl_param.is_dynamic()); (_kernels_data[scale_concat_stage].update_dispatch_data_func)(comp_scale_kernel_params, _kernels_data[scale_concat_stage]); _kernels_data[scale_concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(3).count() == 0; + + if (impl_param.typed_desc()->get_compression_zp_inputs_num() > 0) { + auto comp_scale_kernel_params = get_compression_scale_update_kernel_params(impl_param, false, impl_param.is_dynamic()); + 
(_kernels_data[zp_concat_stage].update_dispatch_data_func)(comp_scale_kernel_params, _kernels_data[zp_concat_stage]); + _kernels_data[zp_concat_stage].kernels[0].skip_execution = impl_param._can_be_optimized || impl_param.get_input_layout(4).count() == 0; + } } } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index 895fd86bb01e5f..832d7bba10ded6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -133,6 +133,20 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveget_compression_zp_inputs_num() > 0; if (desc->is_kv_compressed) { data_inputs_num -= 2; // key and value compression scales are handled separately diff --git a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h index ea0c8b82bb21fa..1dc41a1d92feda 100644 --- a/src/plugins/intel_gpu/src/graph/include/read_value_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/read_value_inst.h @@ -37,6 +37,8 @@ class typed_primitive_inst : public typed_primitive_inst_basenum_outputs; i++) { const auto& default_layout = desc->output_layouts[i]; + // if (impl_param.state_layouts.size() <= i) + // std::cout << "Use default layout\n"; output_layouts.push_back(impl_param.state_layouts.size() > i ? impl_param.state_layouts[i] : default_layout); } diff --git a/src/plugins/intel_gpu/src/graph/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/kv_cache.cpp index 808a593c601ad0..d00a87a1144a47 100644 --- a/src/plugins/intel_gpu/src/graph/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/kv_cache.cpp @@ -68,6 +68,7 @@ std::vector kv_cache_inst::calc_output_layouts(kv_cache_node const& /*no for (size_t i = 0; i < desc->num_outputs; i++) { auto out_type = desc->output_data_types[i].value_or(impl_param.get_input_layout(ports_map.at(i)).data_type); out_layouts.emplace_back(output_shapes[i], out_type, impl_param.get_output_layout(i).format); + GPU_DEBUG_TRACE_DETAIL << "NEW: kv_cache " << i << ": " << output_shapes[i] << " " << out_type << "\n"; } return out_layouts; diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index dac2c9a3403468..eddf0a117290b0 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -343,6 +343,7 @@ void primitive_inst::update_shape() { if (compressed_cache_variable->has_zp_state()) { auto scales_state = compressed_cache_variable->get_compression_zp_state(); auto new_zp_layout = compressed_cache_variable->get_compression_zp_state()->get_layout(); + GPU_DEBUG_TRACE_DETAIL << "NEW: Update state_layouts:" << new_zp_layout << "\n"; update_state_layout(*scales_state, new_zp_layout, 2); } } @@ -969,8 +970,9 @@ void primitive_inst::realloc_if_needed() { compressed_cache_variable->get_compression_scale_state()->set_memory(_outputs[2], present_scales_layout); if (compressed_cache_variable->has_zp_state()) { auto present_zp_layout = present_scales_layout; + present_zp_layout.data_type = _impl_params->output_layouts[3].data_type; - _impl_params->output_layouts[3] = present_scales_layout; + _impl_params->output_layouts[3] = present_zp_layout; compressed_cache_variable->get_compression_zp_state()->set_memory(_outputs[3], present_zp_layout); } } @@ -1360,7 +1362,7 @@ void 
primitive_inst::do_runtime_in_place_kv_cache() { GPU_DEBUG_TRACE_DETAIL << "[do runtime_in_place_kv_cache] " << id() << " Updated present_zp_layout's pad : " << present_scales_layout.to_string() << std::endl; - compressed_cache_variable->get_compression_zp_state()->set_layout(present_scales_layout); + compressed_cache_variable->get_compression_zp_state()->set_layout(present_zp_layout); } } @@ -2076,6 +2078,9 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool _outputs = allocate_outputs(); } } + if (_node) { + GPU_DEBUG_TRACE_DETAIL << _node->type()->to_string(*_node) << "\n"; + } _impls_factory = std::make_shared(_node); _impl_params->strm = _network.get_stream_ptr(); for (size_t i = 0; i < get_node().get_output_layouts().size(); ++i) { diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 2bfaac84134387..5bf5b2147f84f4 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -223,8 +223,11 @@ void program::init_program() { pm = std::unique_ptr(new pass_manager(*this)); new_shape_infer = _config.get_property(ov::intel_gpu::allow_new_shape_infer); - if (_task_executor == nullptr) - _task_executor = program::make_task_executor(_config); + if (true) { + auto config = _config; + config.set_property(ov::compilation_num_threads(1)); + _task_executor = program::make_task_executor(config); + } _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, kernel_selector::KernelBase::get_db().get_batch_headers())); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl index b0e363169e9e4d..591a2c34943669 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl @@ -84,7 +84,7 @@ KERNEL(dynamic_quantize_gpu_kv_cache)( min_value = work_group_reduce_min(min_value); max_value = work_group_reduce_max(max_value); ACCUMULATOR_TYPE scale = (ACCUMULATOR_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value)); - ACCUMULATOR_TYPE zp = (ACCUMULATOR_TYPE)(-min_value * scale) - CHAR_MAX; + ACCUMULATOR_TYPE zp = (ACCUMULATOR_TYPE)(-min_value * scale) + CHAR_MIN; #else max_value = work_group_reduce_max(max_value); ACCUMULATOR_TYPE scale = 127.0h / max_value; @@ -112,7 +112,11 @@ KERNEL(dynamic_quantize_gpu_kv_cache)( #if GROUP_SCALES_WITH_ZP output_scale[scale_idx + 1] = zp; #else + #if OUTPUT2_IS_FP output_zp[scale_idx] = zp; + #else + output_zp[scale_idx] = convert_char_rte(zp); + #endif #endif #else output_scale[scale_idx] = 1.0h / scale; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl index 62482b8b9b5047..e9ebade851e42e 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_ref.cl @@ -88,7 +88,7 @@ KERNEL(dynamic_quantize_gpu_ref)( #if ASYMMETRIC_QUANTIZATION OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_val - min_val)); - OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_val * scale) - CHAR_MAX; + OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_val * scale) + CHAR_MIN; #else max_val = work_group_reduce_max(max_val); OUTPUT1_TYPE scale = 
127.0h / max_val; @@ -145,6 +145,10 @@ KERNEL(dynamic_quantize_gpu_ref)( #if ASYMMETRIC_QUANTIZATION && GROUP_SCALES_WITH_ZP output_scale[scale_idx + 1] = zp; #elif ASYMMETRIC_QUANTIZATION - output_zp[scale_idx] = zp; + #if OUTPUT2_IS_FP + output_zp[scale_idx] = zp; + #else + output_zp[scale_idx] = convert_char_rte(zp); + #endif #endif } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl index 1584dffe95a3c3..58cf8396459b9d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl @@ -18,6 +18,12 @@ #include "include/batch_headers/sdpa_utils.cl" #include "include/batch_headers/tile_ops.cl" +/* The quantization parameter may be unique for each token/element */ +#define QUANTIZE_2D 2 + +/* The quantization parameter shares the same value across the work-group */ +#define QUANTIZE_COMMON 3 + #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define DIV_UP(x, y) (((x) + (y)-1) / (y)) @@ -133,7 +139,9 @@ DECLARE_2D_TILE_RSELECT(a_scale_tile_type, SUBGROUP_SIZE, ugemm_vs_sg_tile_n, 1, __attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG - const global half *K, const global half *Q, const global half *V, + const global KEY_DATA_T *K, + const global QRY_DATA_T *Q, + const global VAL_DATA_T *V, global half *A, #if WITH_ATTN_MASK const global half *msk, @@ -141,10 +149,18 @@ KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG #if WITH_SCALE global SCALE_DATA_T *scale_ptr, #endif - int d, int k, int q) { + int d, int k, int q +#ifdef KV_COMPRESSED + , const global KEY_ATTR_SCALES_DATA_T *K_scales + , const global KEY_ATTR_ZP_DATA_T *K_zp + , const global VAL_ATTR_SCALES_DATA_T *V_scales + , const global VAL_ATTR_ZP_DATA_T *V_zp +#endif + ) { uint sg_ij = sub_group_broadcast(get_local_id(1), 0); uint b0 = get_group_id(1); uint b1 = get_group_id(2); + uint b0_kv = b0 / KV_GROUP_SIZE; uint wg_j0 = get_group_id(0) * ugemm_kq_wg_tile_n; @@ -154,6 +170,13 @@ KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG uint ldv = VAL_S2; uint lda = DST_S2; +#if KEY_SCALES || KEY_ZERO_POINTS + uint ldkq = DIV_UP(d, KEY_GROUP_SIZE); +#endif +#if VAL_SCALES || VAL_ZERO_POINTS + uint ldvq = DIV_UP(d, VAL_GROUP_SIZE); +#endif + /* Subgroup IDs for each GEMM */ uint sg_i_kq = sg_ij % ugemm_kq_sg_per_wg_m; uint sg_j_kq = sg_ij / ugemm_kq_sg_per_wg_m; @@ -183,11 +206,35 @@ KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG const bool need_sum_barrier = (ugemm_vs_barrier_count == 0); /* Locate K/Q/V/A matrices within batch */ - K += KEY_OFF(b1, (b0 / KV_GROUP_SIZE), 0, 0) + INPUT1_OFFSET; - Q += QRY_OFF(b1, b0, 0, 0) + INPUT0_OFFSET; - V += VAL_OFF(b1, (b0 / KV_GROUP_SIZE), 0, 0) + INPUT2_OFFSET; + K += (KEY_OFF(b1, b0_kv, 0, 0) + INPUT1_OFFSET) / KEY_ELEMENTS_PER_BYTE; + Q += (QRY_OFF(b1, b0, 0, 0) + INPUT0_OFFSET); + V += (VAL_OFF(b1, b0_kv, 0, 0) + INPUT2_OFFSET) / VAL_ELEMENTS_PER_BYTE; A += DST_OFF(b1, b0, 0, 0, 0); +// if (b0 == 0 || b0 == 1) +// printf("b1=%d b0=%d: %d\n", b1, b0_kv, KEY_OFF(b1, b0_kv, 0, 0) / KEY_GROUP_SIZE); + +#if KEY_SCALES + K_scales += KEY_OFF(b1, b0_kv, 0, 0) / KEY_GROUP_SIZE; +#endif +#if KEY_SCALES == QUANTIZE_COMMON + float k_scale = convert_float(*K_scales); +#endif +#if KEY_ZERO_POINTS + K_zp += KEY_OFF(b1, b0_kv, 0, 0) / KEY_GROUP_SIZE + / KEY_ZP_ELEMENTS_PER_BYTE; +#endif +#if VAL_SCALES + V_scales += VAL_OFF(b1, b0_kv, 0, 0) / VAL_GROUP_SIZE; +#endif +#if VAL_SCALES == 
QUANTIZE_COMMON + float v_scale = convert_float(*V_scales); +#endif +#if VAL_ZERO_POINTS + V_zp += VAL_OFF(b1, b0_kv, 0, 0) / VAL_GROUP_SIZE + / VAL_ZP_ELEMENTS_PER_BYTE; +#endif + __builtin_assume_aligned(K, K_ALIGN); __builtin_assume_aligned(Q, Q_ALIGN); __builtin_assume_aligned(V, V_ALIGN); @@ -283,7 +330,25 @@ KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG /* Calculate S = (K^T) * Q */ s_tile_type S_tile = ugemm_kq(K, ldk, Q_slm, D_MAX, k, ugemm_kq_wg_tile_n, d, k0, - 0, 0, sg_i_kq, sg_j_kq, (local char *)ugemm_slm); + 0, 0, sg_i_kq, sg_j_kq, (local char *)ugemm_slm +#if KEY_SCALES == QUANTIZE_2D + , + K_scales +#endif +#if KEY_ZERO_POINTS + , + K_zp +#endif +#if (KEY_SCALES == QUANTIZE_2D) || KEY_ZERO_POINTS + , + ldkq +#endif + ); + +#if KEY_SCALES == QUANTIZE_COMMON +#define k_scale_op(x) ((x)*k_scale) + tile_elementwise(S_tile, k_scale_op); +#endif /* Apply attention mask */ #if WITH_ATTN_MASK @@ -419,10 +484,31 @@ KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG /* Accumulate A += V * S */ int k_chunk = min(k - k0, ugemm_kq_wg_tile_m); - a_tile_type A_tile1 = ugemm_vs(V, ldv, S_slm, ugemm_kq_wg_tile_m, d, - ugemm_kq_wg_tile_n, k_chunk, 0, 0, 0, sg_i_vs, sg_j_vs, - (local char *)ugemm_slm); - V += ldv * ugemm_kq_wg_tile_m; + + a_tile_type A_tile1 = ugemm_vs( + V, ldv, S_slm, ugemm_kq_wg_tile_m, d, ugemm_kq_wg_tile_n, + k_chunk, 0, 0, 0, sg_i_vs, sg_j_vs, (local char *)ugemm_slm +#if VAL_SCALES == QUANTIZE_2D + , + V_scales +#endif +#if VAL_ZERO_POINTS + , + V_zp +#endif +#if (VAL_SCALES == QUANTIZE_2D) || VAL_ZERO_POINTS + , + ldvq +#endif + ); + + V += ldv * ugemm_kq_wg_tile_m / VAL_ELEMENTS_PER_BYTE; +#if VAL_SCALES == QUANTIZE_2D + V_scales += ldvq * ugemm_kq_wg_tile_m; +#endif +#if VAL_ZERO_POINTS == QUANTIZE_2D + V_zp += ldvq * ugemm_kq_wg_tile_m / VAL_ZP_ELEMENTS_PER_BYTE; +#endif tile_binary(A_tile, A_tile1, binary_add); } @@ -440,6 +526,11 @@ KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG tile_binary(A_scale_tile, A_scale_tile_load, binary_add); } +#if VAL_SCALES == QUANTIZE_COMMON +#define v_scale_op(x) ((x)*v_scale) + tile_elementwise(A_tile, v_scale_op); +#endif + /* Rescale by 1 / (column sums) */ tile_elementwise(A_scale_tile, native_vrecip); tile_hbroadcast_mul(&A_tile, A_scale_tile); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index 8f0672da8875a1..cf549dcc0dc8f9 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -47,6 +47,8 @@ micro::Type convert_type(Datatype t) { switch (t) { case Datatype::F32: return micro::Type::f32; case Datatype::F16: return micro::Type::f16; + case Datatype::INT8: return micro::Type::s8; + case Datatype::UINT8: return micro::Type::u8; default: break; } OPENVINO_THROW("Unsupported dt: ", toString(t)); @@ -181,6 +183,11 @@ sdpa_config_t *choose_config_xehpc(int head_size, int seq, bool thin_q) { std::mutex SDPAKernelMicro::m; +const bool kq_common_scales = false; +const bool kq_common_zp = false; +const bool vs_common_scales = false; +const bool vs_common_zp = false; + void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Package& gemm_kq, micro::Package& gemm_vs, bool is_prefill) const { // TODO: Remove once micro API is thread safe std::lock_guard l(m); @@ -224,13 +231,53 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag /* Set up GEMMProblem structure for 
first GEMM: K^T * Q */ micro::GEMMProblem problem; - problem.Ta = problem.Ta_ext = convert_type(K.GetDType()); - problem.Tb = problem.Tb_ext = convert_type(Q.GetDType()); + problem.Ta_ext = convert_type(K.GetDType()); + problem.Tb_ext = convert_type(Q.GetDType()); + + // std::cout << "problem.Ta_ext=" << static_cast(problem.Ta_ext) << "\n"; + // std::cout << "problem.Tb_ext=" << static_cast(problem.Tb_ext) << "\n"; + + problem.Ta = problem.Tb = micro::Type::f16; problem.Tc = problem.Tc_ext = micro::Type::f32; problem.Ts = problem.Tc; auto problem_kq = problem; problem_kq.A.layout = micro::MatrixLayout::T; + + /* Set up microkernel options */ + micro::GEMMProtocol::Options opts_kq; + opts_kq.localB = true; + opts_kq.slmPtr = true; + + if (params.conf.is_kv_compressed && !kq_common_scales) { + auto scale_dt = micro::Type::f16; + problem_kq.Ta_scale = scale_dt; + problem_kq.A_scale.alignment = micro::data_type_alignment(scale_dt); + + // std::cout << "problem_kq.A_scale.alignment=" << static_cast(problem_kq.A_scale.alignment) << "\n"; + problem_kq.A_scale.layout = micro::MatrixLayout::T; + problem_kq.aScale2D = true; + } + + if (params.conf.is_kv_compressed && params.conf.use_asymmetric_quantization) { + auto zp_dt = micro::Type::s8; + problem_kq.Tao = zp_dt; + problem_kq.AO.alignment = micro::data_type_alignment(zp_dt); + // std::cout << "problem_kq.AO.alignment=" << static_cast(problem_kq.AO.alignment) << "\n"; + problem_kq.AO.layout = micro::MatrixLayout::T; + problem_kq.aoPtrDims = kq_common_zp ? 0 : 2; + problem_kq.aOffset = micro::ABOffset::Calc; + } + + if (params.conf.is_kv_compressed) { + problem_kq.aqGroupM = 1; + problem_kq.aqGroupK = (kq_common_scales || kq_common_zp) ? 1 : params.conf.head_size; + // problem_kq.aqGroupK = (kq_common_scales || kq_common_zp) ? 
1 : 1; + } + + opts_kq.scaleA = params.conf.is_kv_compressed && !kq_common_scales; + opts_kq.offsetA = params.conf.is_kv_compressed && params.conf.use_asymmetric_quantization; + problem_kq.B.layout = micro::MatrixLayout::Pr; problem_kq.C.layout = micro::MatrixLayout::T; problem_kq.A.setAlignment(micro::alignment_for_ld(head_size * problem.Ta)); @@ -239,6 +286,8 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag problem_kq.B.tileR = d_max; problem_kq.B.tileC = static_cast(subgroup_size(params.engineInfo.arch)); + // std::cout << problem_kq.toString() << "\n"; + /* Set up problem size information */ micro::SizeParams sizes; sizes.m = static_cast(n_keys.v); @@ -253,18 +302,53 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag reqs_kq.push_back(micro::StrategyRequirement::WGM == config->wg_m_kq); reqs_kq.push_back(micro::StrategyRequirement::WGN == config->wg_n_kq); - /* Set up microkernel options */ - micro::GEMMProtocol::Options opts_kq; - opts_kq.localB = true; - opts_kq.slmPtr = true; - /* Ask microkernel provider for microkernel */ - gemm_kq = micro::select_gemm_microkernel(opts_kq, hw_info, sizes, problem_kq, reqs_kq); + try { + gemm_kq = micro::select_gemm_microkernel(opts_kq, hw_info, sizes, problem_kq, reqs_kq); + } catch (const std::runtime_error &ex) { + std::cout << "QK: " << ex.what() << "\n"; + } + + /* Set up microkernel options */ + micro::GEMMProtocol::Options opts_vs; + opts_vs.localB = true; + opts_vs.slmPtr = true; /* Update for second GEMM: V*S */ auto problem_vs = problem; - problem_vs.Ta = problem_vs.Ta_ext = convert_type(V.GetDType()); + problem_vs.Ta_ext = convert_type(V.GetDType()); problem_vs.A.layout = micro::MatrixLayout::N; + + if (params.conf.is_kv_compressed && !vs_common_scales) { + auto scale_dt = micro::Type::f16; + problem_vs.Ta_scale = scale_dt; + problem_vs.A_scale.alignment = micro::data_type_alignment(scale_dt); + problem_vs.A_scale.layout = micro::MatrixLayout::N; + problem_vs.aScale2D = true; + } + + if (params.conf.is_kv_compressed && params.conf.use_asymmetric_quantization) { + auto zp_dt = micro::Type::s8; + problem_vs.Tao = zp_dt; + problem_vs.AO.alignment = micro::data_type_alignment(zp_dt); + problem_vs.AO.layout = micro::MatrixLayout::N; + problem_vs.aoPtrDims = vs_common_zp ? 0 : 2; + problem_vs.aOffset = micro::ABOffset::Calc; + } + + if (params.conf.is_kv_compressed) { + // problem_vs.aqGroupM = (vs_common_scales || vs_common_zp) ? 1 : 1; + problem_vs.aqGroupM = (vs_common_scales || vs_common_zp) ? 
1 : params.conf.head_size; + problem_vs.aqGroupK = 1; + } + + opts_vs.scaleA = params.conf.is_kv_compressed && !vs_common_scales; + opts_vs.offsetA = params.conf.is_kv_compressed && params.conf.use_asymmetric_quantization; + + // printf("confit %d %d %d %d %d %d %d %d\n", config->unroll_m_kq, + // config->unroll_n_kq, config->unroll_m_vs, config->unroll_n_vs, + // config->wg_m_kq, config->wg_n_kq, config->wg_m_vs, config->wg_n_vs); + problem_vs.B.layout = micro::MatrixLayout::Pr; problem_vs.C.layout = micro::MatrixLayout::N; problem_vs.A.setAlignment(micro::alignment_for_ld(head_size * problem.Ta)); @@ -281,20 +365,22 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag reqs_vs.push_back(micro::StrategyRequirement::WGM == config->wg_m_vs); reqs_vs.push_back(micro::StrategyRequirement::WGN == config->wg_n_vs); - micro::GEMMProtocol::Options opts_vs; - opts_vs.localB = true; - opts_vs.slmPtr = true; - auto adjust_vs = [](micro::GEMMStrategy &strategy) { /* Enable dpasw */ strategy.dpasw |= strategy.fused; }; /* Ask microkernel provider for microkernel */ - gemm_vs = micro::select_gemm_microkernel(opts_vs, hw_info, sizes, problem_vs, reqs_vs, adjust_vs); + try { + gemm_vs = micro::select_gemm_microkernel(opts_vs, hw_info, sizes, problem_vs, reqs_vs, adjust_vs); + } catch (const std::runtime_error &ex) { + std::cout << "VS: " << ex.what() << "\n"; + } } ParamsKey SDPAKernelMicro::GetSupportedKey() const { ParamsKey k; + k.EnableInputDataType(Datatype::INT8); + k.EnableInputDataType(Datatype::UINT8); k.EnableInputDataType(Datatype::F16); k.EnableOutputDataType(Datatype::F16); @@ -344,7 +430,16 @@ bool SDPAKernelMicro::Validate(const Params& p) const { if (params.conf.head_size > 256) return false; - if (params.conf.is_kv_compressed) + // if (params.conf.is_kv_compressed) + // return false; + + int DISABLE_MICRO = 0; + if (const auto env_var = std::getenv("DISABLE_MICRO")) { + std::istringstream ss(env_var); + ss >> DISABLE_MICRO; + } + + if (DISABLE_MICRO) return false; // Do not use sdpa_micro kernel with a scalar-value mask @@ -388,6 +483,58 @@ JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const m jit.AddConstant(MakeJitConstant("TRANSPOSE_K", false)); + jit.AddConstant(MakeJitConstant("QRY_DATA_T", toCLType(Q.GetDType()))); + jit.AddConstant(MakeJitConstant("KEY_DATA_T", toCLType(K.GetDType()))); + jit.AddConstant(MakeJitConstant("VAL_DATA_T", toCLType(V.GetDType()))); + + if (params.conf.is_kv_compressed) { + jit.AddConstant(MakeJitConstant("KV_COMPRESSED", 1)); + jit.AddConstant(MakeJitConstant("KEY_ATTR_SCALES_DATA_T", toCLType(params.key_cache_comp_scale.GetDType()))); + if (params.conf.use_asymmetric_quantization) { + // jit.AddConstant(MakeJitConstant("KEY_ATTR_ZP_DATA_T", toCLType(params.key_cache_comp_zp.GetDType()))); + jit.AddConstant(MakeJitConstant("KEY_ATTR_ZP_DATA_T", "char")); + } + + jit.AddConstant(MakeJitConstant("VAL_ATTR_SCALES_DATA_T", toCLType(params.value_cache_comp_scale.GetDType()))); + if (params.conf.use_asymmetric_quantization) { + // jit.AddConstant(MakeJitConstant("VAL_ATTR_ZP_DATA_T", toCLType(params.value_cache_comp_zp.GetDType()))); + jit.AddConstant(MakeJitConstant("VAL_ATTR_ZP_DATA_T", "char")); + } + } + + auto elems_per_byte = [](Datatype dt) { + switch (dt) { + case Datatype::UINT4: + case Datatype::INT4: + return 2; + default: + return 1; + } + }; + + jit.AddConstant(MakeJitConstant("KEY_ELEMENTS_PER_BYTE", elems_per_byte(params.inputs[1].GetDType()))); + 
jit.AddConstant(MakeJitConstant("VAL_ELEMENTS_PER_BYTE", elems_per_byte(params.inputs[2].GetDType()))); + + if (params.conf.is_kv_compressed) { + int kq_scale_mask = (static_cast(params.conf.is_kv_compressed) << 1) | static_cast(kq_common_scales); + int vs_scale_mask = (static_cast(params.conf.is_kv_compressed) << 1) | static_cast(vs_common_scales); + jit.AddConstant(MakeJitConstant("KEY_SCALES", kq_scale_mask)); + jit.AddConstant(MakeJitConstant("VAL_SCALES", vs_scale_mask)); + // jit.AddConstant(MakeJitConstant("KEY_GROUP_SIZE", params.conf.head_size)); + // jit.AddConstant(MakeJitConstant("VAL_GROUP_SIZE", params.conf.head_size)); + jit.AddConstant(MakeJitConstant("KEY_GROUP_SIZE", params.conf.head_size)); + jit.AddConstant(MakeJitConstant("VAL_GROUP_SIZE", params.conf.head_size)); + + if (params.conf.use_asymmetric_quantization) { + int kq_zp_mask = (static_cast(params.conf.use_asymmetric_quantization) << 1) | static_cast(kq_common_zp); + int vs_zp_mask = (static_cast(params.conf.use_asymmetric_quantization) << 1) | static_cast(vs_common_zp); + jit.AddConstant(MakeJitConstant("KEY_ZERO_POINTS", kq_zp_mask)); + jit.AddConstant(MakeJitConstant("VAL_ZERO_POINTS", vs_zp_mask)); + jit.AddConstant(MakeJitConstant("KEY_ZP_ELEMENTS_PER_BYTE", elems_per_byte(params.key_cache_comp_zp.GetDType()))); + jit.AddConstant(MakeJitConstant("VAL_ZP_ELEMENTS_PER_BYTE", elems_per_byte(params.value_cache_comp_zp.GetDType()))); + } + } + int tile_k = gemm_kq.getSetting("wg_tile_m"); int tile_q = gemm_kq.getSetting("wg_tile_n"); int tile_v = gemm_vs.getSetting("wg_tile_m"); @@ -521,6 +668,17 @@ clKernelData SDPAKernelMicro::get_kernel_data(const sdpa_params& params, bool is kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // K kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 2}); // Q + if (params.conf.is_kv_compressed) { + uint32_t input_idx = params.inputs.size(); + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, input_idx + 0}); // K scales + if (params.conf.use_asymmetric_quantization) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, input_idx + 2}); // K zp + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, input_idx + 1}); // V scales + if (params.conf.use_asymmetric_quantization) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, input_idx + 3}); // V zp + } + const auto& Q = params.inputs[0]; const auto& K = params.inputs[1]; diff --git a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp index c6b0e031a027e8..7ff228b4176a9b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp +++ b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp @@ -26,6 +26,7 @@ namespace micro { using Package = dnnl::impl::gpu::intel::micro::Package; using HWInformation = dnnl::impl::gpu::intel::jit::HWInformation; using GEMMProblem = dnnl::impl::gpu::intel::jit::GEMMProblem; +using ABOffset = dnnl::impl::gpu::intel::jit::ABOffset; using GEMMStrategy = dnnl::impl::gpu::intel::jit::GEMMStrategy; using GEMMProtocol = dnnl::impl::gpu::intel::micro::GEMMProtocol; using MatrixLayout = dnnl::impl::gpu::intel::jit::MatrixLayout; @@ -77,6 +78,10 @@ static inline int alignment_for_ld(int ld) { return dnnl::impl::gpu::intel::jit::alignmentForLD(ld); } +static inline uint8_t data_type_alignment(micro::Type dt) { + return uint8_t(dnnl::impl::types::data_type_size(micro::Type(dt).get_dnnl_type())); +} + } // namespace 
micro #undef UNUSED diff --git a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp index 8173a29c1b35f8..97af54a181875c 100644 --- a/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp +++ b/src/plugins/intel_gpu/src/plugin/multi_tensor_variable_state.cpp @@ -172,6 +172,7 @@ VariableStateIndirectKVCacheCompressed::VariableStateIndirectKVCacheCompressed( if (has_zp_state) { const auto compression_zp_layout = output_layouts[2]; + GPU_DEBUG_TRACE_DETAIL << "NEW: create state:" << compression_zp_layout << "\n"; VariableStateInfo compression_zp_state_info(info.m_id + "/comp_zp", compression_zp_layout); m_hidden_states.push_back(std::make_shared(compression_zp_state_info, context, shape_predictor)); } diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp index 561822f9661109..ede1eda24ab2b1 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -137,7 +137,7 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher(ov::element::Type compressi return; const auto quantization_type = ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric; - const auto output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::InterleavedScalesZP; + const auto output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::Planar; bool combine_scales_and_zp = output_storage_type == ov::op::internal::DynamicQuantize::OutputStorageType::InterleavedScalesZP; GPU_DEBUG_LOG << "KV-cache compression configuration: " @@ -210,6 +210,16 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher(ov::element::Type compressi return scales_zp_output_order; }; + auto print_arr = [&](const std::vector& vec, size_t max_len, std::string name) { + std::stringstream ss; + for (size_t i = 0; i < std::min(max_len, vec.size()); i++) { + ss << vec[i] << ", "; + } + std::cout << "Transpose order " << name << " (len=" << vec.size() << ") content: " << ss.str() << "\n"; + }; + + print_arr(sdpa_node->get_input1_transpose_order(), sdpa_node->get_input1_transpose_order().size(), ""); + ov::op::internal::DynamicQuantize::Attributes config; config.quantization_type = quantization_type; config.group_sizes = get_shape_group_sizes(sdpa_node->get_input1_transpose_order()); @@ -219,7 +229,7 @@ KVCacheCompressionMatcher::KVCacheCompressionMatcher(ov::element::Type compressi config.output_storage_type = output_storage_type; if (config.quantization_type == ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric) - config.zp_dt = query_node->get_output_element_type(0); + config.zp_dt = element::i8; key_past_rv_node = update_past_read_value(key_past_rv_node, config); value_past_rv_node = update_past_read_value(value_past_rv_node, config); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index f4ec7afb5c3d1e..a99adeea943d23 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -947,6 +947,7 @@ void TransformationsPipeline::apply(std::shared_ptr func) { manager.register_pass(); auto kv_cache_compression_dt = config.get_property(ov::hint::kv_cache_precision); + std::cout << "kv_cache_compression_dt=" << kv_cache_compression_dt 
<< "\n"; manager.register_pass(kv_cache_compression_dt); manager.register_pass(); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp index c0e317ff6ce915..551a056a325f6c 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/dynamic_quantize_gpu_test.cpp @@ -23,6 +23,7 @@ using namespace cldnn; using namespace ::tests; using QuantizationType = ov::op::internal::DynamicQuantize::QuantizationType; +using OutputStorageType = ov::op::internal::DynamicQuantize::OutputStorageType; class dynamic_quantization_gpu_tests: public ::testing::Test { public: @@ -30,7 +31,8 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { void test_dynamic_quantization(bool is_caching_test, const ov::PartialShape& input_shape, const ov::Shape& data_shape, - const QuantizationType quantization_type = QuantizationType::Symmetric, + const ov::element::Type zp_dt = data_types::undefined, + const OutputStorageType output_storage_type = OutputStorageType::Planar, const std::string& impl_name = "") { tests::random_generator rg(GET_SUITE_NAME); auto& engine = get_test_engine(); @@ -45,6 +47,10 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { auto input_data = rg.generate_random_1d(ov::shape_size(data_shape), -16.0f, 16.0f); set_values(input_mem, input_data); + // std::cout << "Input data:\n"; + // for (size_t i = 0; i < input_data.size(); i++) + // std::cout << i << ". " << input_data[i] << "\n"; + auto in_layout_f32 = input_shape.is_dynamic() ? layout{ dyn_input_ps, data_types::f32, format::bfyx } : layout{ input_ps, data_types::f32, format::bfyx }; @@ -52,23 +58,23 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { : layout{ input_ps, data_types::f16, format::bfyx }; dynamic_quantize::Attributes dq_config; - dq_config.quantization_type = quantization_type; + dq_config.quantization_type = (zp_dt == data_types::undefined) ? 
QuantizationType::Symmetric + : QuantizationType::Asymmetric; dq_config.quantization_dt = data_types::i8; dq_config.scale_dt = data_types::f16; - dq_config.zp_dt = data_types::undefined; + dq_config.zp_dt = zp_dt; dq_config.group_sizes = group_sizes; dq_config.scales_zp_output_order = { 0, 1, 2, 3 }; - dq_config.output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::Planar; + dq_config.output_storage_type = output_storage_type; - if (quantization_type == QuantizationType::Asymmetric) { - dq_config.zp_dt = data_types::f16; - dq_config.output_storage_type = ov::op::internal::DynamicQuantize::OutputStorageType::InterleavedScalesZP; - } + bool has_zp_output = dq_config.quantization_type == QuantizationType::Asymmetric && + dq_config.output_storage_type == OutputStorageType::Planar; auto reorder_1 = reorder("reorder_1", input_info("input"), layout{ input_ps, data_types::f16, format::bfyx }); auto dyn_quan_prim = dynamic_quantize("dyn_quan_prim", input_info("reorder_1"), dq_config); auto reorder_data = reorder("reorder_data", input_info("dyn_quan_prim", 0), layout{ input_ps, data_types::f16, format::bfyx }); auto reorder_scale = reorder("reorder_scale", input_info("dyn_quan_prim", 1), layout{ scales_ps, data_types::f16, format::bfyx }); + auto reorder_zp = reorder("reorder_zp", input_info("dyn_quan_prim", 2), layout{ scales_ps, data_types::f16, format::bfyx }); // Implemented dynamic quantize kernel auto get_ref_results = [&]() { @@ -80,6 +86,9 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { reorder_scale ); + if (has_zp_output) + topology.add(reorder_zp); + auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); @@ -92,19 +101,27 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { auto outputs = network.execute(); - auto output_layout = outputs.begin()->second.get_layout(); - auto output_mem = outputs.begin()->second.get_memory(); + std::vector output_buffers; + for (const auto& output : outputs) { + auto output_layout = output.second.get_layout(); + auto output_mem = output.second.get_memory(); + output_buffers.push_back(engine.reinterpret_buffer(*output_mem, output_layout)); + } - return engine.reinterpret_buffer(*output_mem, output_layout); + return output_buffers; }; topology topology( input_layout("input", in_layout_f32), reorder_1, dyn_quan_prim, - reorder_data + reorder_data, + reorder_scale ); + if (has_zp_output) + topology.add(reorder_zp); + auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); config.set_property(ov::intel_gpu::optimize_data(true)); @@ -120,23 +137,30 @@ class dynamic_quantization_gpu_tests: public ::testing::Test { auto outputs = network->execute(); - auto output_mem = outputs.begin()->second.get_memory(); - cldnn::mem_lock output_ptr (output_mem, get_test_stream()); - - auto ref_output_mem = get_ref_results(); - cldnn::mem_lock output_ptr_ref (ref_output_mem, get_test_stream()); - size_t count = 0; - float max_diff = 0.f; - float avg = 0.f; - for (size_t i = 0; i < output_ptr_ref.size(); ++i) { - auto abs_diff = std::abs(output_ptr_ref[i] - output_ptr[i]); - if (max_diff < abs_diff) - max_diff = abs_diff; - avg += abs_diff; - count++; - ASSERT_LE(abs_diff, 1); + std::vector output_buffers; + for (const auto& output : outputs) { + auto output_layout = output.second.get_layout(); + auto output_mem = output.second.get_memory(); + 
output_buffers.push_back(engine.reinterpret_buffer(*output_mem, output_layout)); + } + + auto ref_output_buffers = get_ref_results(); + + ASSERT_EQ(ref_output_buffers.size(), output_buffers.size()); + + std::cout << "Outputs number: " << ref_output_buffers.size() << "\n"; + + for (size_t i = 0; i < ref_output_buffers.size(); i++) { + cldnn::mem_lock output_ptr(output_buffers[i], get_test_stream()); + cldnn::mem_lock output_ptr_ref(ref_output_buffers[i], get_test_stream()); + + // std::cout << "Check input: " << i << "\n"; + for (size_t i = 0; i < output_ptr_ref.size(); ++i) { + // std::cout << "Res: " << output_ptr_ref[i] << " " << output_ptr[i] << "\n"; + auto abs_diff = std::abs(output_ptr_ref[i] - output_ptr[i]); + ASSERT_LE(abs_diff, 1); + } } - GPU_DEBUG_LOG << "---> count: " << count << ", max_diff:" << max_diff << ", avg_diff: " << (avg/count) << std::endl; } }; @@ -177,33 +201,65 @@ TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_unaligned_dynamic) { } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache) { - this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched) { - this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered) { - this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered) { - this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, QuantizationType::Symmetric, "dynamic_quantize_gpu_kv_cache"); + this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, data_types::undefined, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_asym_planar) { + this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, data_types::f16, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_asym_planar) { + this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, data_types::f16, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered_asym_planar) { + this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, data_types::f16, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered_asym_planar) { + this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, data_types::f16, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, 
simple_quantizing_kv_cache_asym_interleaved) { + this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, data_types::f16, OutputStorageType::InterleavedScalesZP, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_asym_interleaved) { + this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, data_types::f16, OutputStorageType::InterleavedScalesZP, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered_asym_interleaved) { + this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, data_types::f16, OutputStorageType::InterleavedScalesZP, "dynamic_quantize_gpu_kv_cache"); +} + +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered_asym_interleaved) { + this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, data_types::f16, OutputStorageType::InterleavedScalesZP, "dynamic_quantize_gpu_kv_cache"); } -TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_asym) { - this->test_dynamic_quantization(false, {-1, 8, -1, 96}, {1, 8, 1, 96}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_asym_planar_i8_zp) { + this->test_dynamic_quantization(false, {-1, 8, -1, 32}, {1, 8, 1, 32}, data_types::i8, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } -TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_asym) { - this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_asym_planar_i8_zp) { + this->test_dynamic_quantization(false, {-1, 4, -1, 64}, {1, 4, 35, 64}, data_types::i8, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } -TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered_asym) { - this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_reordered_asym_planar_i8_zp) { + this->test_dynamic_quantization(false, {-1, -1, 8, 96}, {1, 1, 8, 96}, data_types::i8, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } -TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered_asym) { - this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, QuantizationType::Asymmetric, "dynamic_quantize_gpu_kv_cache"); +TEST_F(dynamic_quantization_gpu_tests, simple_quantizing_kv_cache_batched_reordered_asym_planar_i8_zp) { + this->test_dynamic_quantization(false, {-1, -1, 4, 64}, {1, 35, 4, 64}, data_types::i8, OutputStorageType::Planar, "dynamic_quantize_gpu_kv_cache"); } diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 1722066ad4c0f1..701f081f7b5551 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 1722066ad4c0f15495f2d0fcbe9deb2bfd188c36 +Subproject commit 701f081f7b5551b9944471d22a2759a46f3a9411
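
For reference, below is a minimal standalone sketch (not part of the patch) of the per-token asymmetric INT8 quantization that the updated dynamic_quantize_gpu_ref.cl / dynamic_quantize_gpu_kv_cache.cl kernels implement. The group size is assumed to equal the head size, matching KEY_GROUP_SIZE/VAL_GROUP_SIZE being set to params.conf.head_size, so each token produces one scale and one zero-point. The struct and function names are illustrative only, and the assumption that the reciprocal scale is what gets stored mirrors the symmetric path shown in the kernels.

#include <algorithm>
#include <climits>
#include <cmath>
#include <cstdint>
#include <vector>

struct QuantizedToken {
    std::vector<int8_t> data; // per-element INT8 values
    float scale;              // stored scale (f16 in the kernels)
    int8_t zp;                // zero-point, i8 when OUTPUT2 is not FP
};

// Quantize one token (one group of head_size values) asymmetrically.
QuantizedToken quantize_token(const std::vector<float>& x) {
    const float max_val = *std::max_element(x.begin(), x.end());
    const float min_val = *std::min_element(x.begin(), x.end());

    // Matches the updated kernels:
    //   scale = (CHAR_MAX - CHAR_MIN) / (max - min)
    //   zp    = -min * scale + CHAR_MIN   (was "- CHAR_MAX" before this change)
    // so min_val maps to -128 and max_val maps to 127.
    const float scale = (CHAR_MAX - CHAR_MIN) / (max_val - min_val);
    const float zp = -min_val * scale + CHAR_MIN;

    QuantizedToken out;
    out.scale = 1.0f / scale;                      // assumed: reciprocal stored, as on the symmetric path
    out.zp = static_cast<int8_t>(std::lrint(zp));  // convert_char_rte(zp)
    out.data.reserve(x.size());
    for (float v : x) {
        const long q = std::lrint(v * scale + zp);
        out.data.push_back(static_cast<int8_t>(std::clamp<long>(q, CHAR_MIN, CHAR_MAX)));
    }
    return out;
}

// Dequantize a single element: x ~= (q - zp) * stored_scale.
float dequantize(const QuantizedToken& t, size_t i) {
    return (static_cast<float>(t.data[i]) - static_cast<float>(t.zp)) * t.scale;
}

With OutputStorageType::Planar, which the matcher now selects, the scales and zero-points are written to separate buffers (dynamic_quantize outputs 1 and 2), which is what allows the zero-points to be stored as element::i8. OutputStorageType::InterleavedScalesZP instead packs each zero-point next to its scale in the same f16 buffer (the GROUP_SCALES_WITH_ZP path in the kernels).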