Skip to content

Commit

Permalink
[GPU] Enable KV-cache compression by default for non-systolic platforms
Browse files (browse the repository at this point in the history)
  • Loading branch information
sshlyapn committed Nov 6, 2024
1 parent 84e6837 commit f599577
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
5 changes: 5 additions & 0 deletions src/plugins/intel_gpu/src/runtime/execution_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) {
set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
}

// Enable KV-cache compression by default for non-systolic platforms
if (!is_set_by_user(ov::hint::kv_cache_precision) && !info.supports_immad) {
set_property(ov::hint::kv_cache_precision(ov::element::i8));
}

user_properties.clear();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,11 @@ class SDPAWithKVCacheTest : public ::testing::Test, public ::testing::WithParamI
ov::AnyMap properties = {ov::hint::inference_precision(ov::element::f16),
ov::intel_gpu::hint::enable_sdpa_optimization(true)};

- if (p.compressed)
+ if (p.compressed) {
properties.emplace(ov::hint::kv_cache_precision(ov::element::i8));
+ } else {
+ properties.emplace(ov::hint::kv_cache_precision(ov::element::undefined));
+ }

const size_t n_heads = 16;
const size_t n_features = 64;
Expand Down

0 comments on commit f599577

Please sign in to comment.