From 68fb2e6a85ce04230c23b19c8410f08798c47889 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Tue, 5 Nov 2024 18:24:18 +0400 Subject: [PATCH] [GPU] Enable KV-cache compression by default for non-systolic platforms, use FP32 as accumulator type for scale/zp calculation --- .../cl_kernels/dynamic_quantize_gpu_kv_cache.cl | 6 +++--- .../dynamic_quantize_kernel_opt_kv_cache.cpp | 3 +++ src/plugins/intel_gpu/src/runtime/execution_config.cpp | 5 +++++ .../functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp | 5 ++++- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl index 22a2f03c94564a..b0e363169e9e4d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/dynamic_quantize_gpu_kv_cache.cl @@ -83,11 +83,11 @@ KERNEL(dynamic_quantize_gpu_kv_cache)( #if ASYMMETRIC_QUANTIZATION min_value = work_group_reduce_min(min_value); max_value = work_group_reduce_max(max_value); - OUTPUT1_TYPE scale = (OUTPUT1_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value)); - OUTPUT1_TYPE zp = (OUTPUT1_TYPE)(-min_value * scale) - CHAR_MAX; + ACCUMULATOR_TYPE scale = (ACCUMULATOR_TYPE)((CHAR_MAX - CHAR_MIN) / (max_value - min_value)); + ACCUMULATOR_TYPE zp = (ACCUMULATOR_TYPE)(-min_value * scale) - CHAR_MAX; #else max_value = work_group_reduce_max(max_value); - OUTPUT1_TYPE scale = 127.0h / max_value; + ACCUMULATOR_TYPE scale = 127.0h / max_value; #endif #ifdef APPEND_MODE diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp index d0c99484e3f52e..8f7537eeeb5d7d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/dynamic_quantize/dynamic_quantize_kernel_opt_kv_cache.cpp @@ -141,6 +141,9 @@ JitConstants DynamicQuantizeKernelKVCache::GetJitConstants(const dynamic_quantiz jit.AddConstant(MakeJitConstant("ASYMMETRIC_QUANTIZATION", params.use_asymmetric_quantization)); jit.AddConstant(MakeJitConstant("GROUP_SCALES_WITH_ZP", params.combine_scales_and_zp)); + // Use FP32 accumulator type for scale/zp calculation + jit.Merge(MakeTypeJitConstants(Datatype::F32, "ACCUMULATOR")); + bool rearrange_scales_order = false; const auto& scales_output_order = params.scales_output_order; if (!scales_output_order.empty()) { diff --git a/src/plugins/intel_gpu/src/runtime/execution_config.cpp b/src/plugins/intel_gpu/src/runtime/execution_config.cpp index 7c8e55cddfe593..44758f73289edb 100644 --- a/src/plugins/intel_gpu/src/runtime/execution_config.cpp +++ b/src/plugins/intel_gpu/src/runtime/execution_config.cpp @@ -246,6 +246,11 @@ void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) { set_property(ov::intel_gpu::queue_type(QueueTypes::in_order)); } + // Enable KV-cache compression by default for non-systolic platforms + if (!is_set_by_user(ov::hint::kv_cache_precision) && !info.supports_immad) { + set_property(ov::hint::kv_cache_precision(ov::element::i8)); + } + user_properties.clear(); } diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp index 16db9d89c28b4d..2563fe535a93d9 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache_sdpa.cpp @@ -47,8 +47,11 @@ class SDPAWithKVCacheTest : public ::testing::Test, public ::testing::WithParamI ov::AnyMap properties = {ov::hint::inference_precision(ov::element::f16), ov::intel_gpu::hint::enable_sdpa_optimization(true)}; - if (p.compressed) + if (p.compressed) { properties.emplace(ov::hint::kv_cache_precision(ov::element::i8)); + } else { + properties.emplace(ov::hint::kv_cache_precision(ov::element::undefined)); + } const size_t n_heads = 16; const size_t n_features = 64;