From d4c1aa533bab4346e1b8bb5d448a60233fc79f8c Mon Sep 17 00:00:00 2001
From: "Kim, SungEun"
Date: Thu, 21 Nov 2024 16:22:34 +0900
Subject: [PATCH 1/5] set output from variable's memory if kv-cache

---
 src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index dac2c9a3403468..e8ffb0a2097a79 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -618,6 +618,7 @@ void primitive_inst::realloc_if_needed() {
                 _max_output_layout_count[j] = 0;
             }
         } else {
+            _outputs[0] = variable.get_memory();
             GPU_DEBUG_TRACE_DETAIL << id() << " : realloc_if_needed: can_be_optimized = false and memories are not being shared" << std::endl;
         }
     } else {

From 3efd529afc6c5212d594892a7b24d7f72cd99a65 Mon Sep 17 00:00:00 2001
From: "Kim, SungEun"
Date: Mon, 25 Nov 2024 18:38:24 +0900
Subject: [PATCH 2/5] add a test-case

---
 .../subgraph_tests/dynamic/kv_cache.cpp | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp
index 77477648fd4860..a7c0b527198901 100644
--- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp
+++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp
@@ -530,4 +530,77 @@ TEST_F(KVCacheTests, smoke_multipleIterations_stateful_with_set_state) {
     this->test_smoke_multipleIterations_stateful(false, true, true, 1, 2, ov::element::f16, 5, 1, true);
 }
 
+class KVCacheIssueTests: public ::testing::Test {
+public:
+    void test_smoke_conflicted_memory_for_two_inf_req() {
+    #if defined(ANDROID)
+        GTEST_SKIP();
+    #endif
+        auto core = ov::test::utils::PluginCache::get().core();
+
+        ov::AnyMap properties = {
+            ov::hint::kv_cache_precision(ov::element::undefined)
+        };
+
+        const size_t n_batch = 1;
+        const size_t n_heads = 32;
+        const size_t n_features = 10;
+        const size_t context_size = 20;
+        ov::element::Type element_type = ov::element::f16;
+
+        const bool stateful = true;
+
+        auto model = tests::make_llm_kv_cache_pattern(n_batch,
+                                                      n_heads,
+                                                      n_features,
+                                                      element_type,
+                                                      2,
+                                                      stateful,
+                                                      false,
+                                                      stateful);
+        auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, properties);
+
+        auto input0 = model->get_parameters().at(0);
+        auto input1 = model->get_parameters().at(1);
+
+        auto infer_request1 = compiled_model.create_infer_request();
+        auto infer_request2 = compiled_model.create_infer_request();
+
+        auto tensor1_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, context_size, n_heads, n_features}, -0.5f, 0.5f, 1);
+        auto tensor1_input2 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, n_heads, context_size, context_size}, -0.5f, 0.5f, 1);
+        infer_request1.set_tensor(input0, tensor1_input1);
+        infer_request1.set_tensor(input1, tensor1_input2);
+
+        auto tensor2_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, context_size + 1, n_heads, n_features}, -0.5f, 0.5f, 555);
+        auto tensor2_input2 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, n_heads, context_size + 1, context_size + 1}, -0.5f, 0.5f, 555);
+        infer_request2.set_tensor(input0, tensor2_input1);
+        infer_request2.set_tensor(input1, tensor2_input2);
+
+        std::stringstream oss1;
+        std::stringstream oss2;
+        for (auto&& state : infer_request1.query_state()) {
+            state.reset();
+        }
+        infer_request1.infer();
+        for (auto&& state : infer_request1.query_state()) {
+            oss1.write((char*)state.get_state().data(), state.get_state().get_byte_size());
+        }
+
+        for (auto&& state : infer_request2.query_state()) {
+            state.reset();
+        }
+        infer_request2.infer();
+        for (auto&& state : infer_request1.query_state()) {
+            oss2.write((char*)state.get_state().data(), state.get_state().get_byte_size());
+        }
+
+        ASSERT_TRUE(oss1.str() == oss2.str());
+    }
+};
+
+TEST_F(KVCacheIssueTests, smoke_issue_cases) {
+    this->test_smoke_conflicted_memory_for_two_inf_req();
+}
+
+
 } // namespace

From eebe767fda9ab604b47c9974d5cf352b2555ba27 Mon Sep 17 00:00:00 2001
From: "Kim, SungEun"
Date: Thu, 28 Nov 2024 14:18:29 +0900
Subject: [PATCH 3/5] set outputs for scale/zp

---
 src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index e8ffb0a2097a79..c9cfe96538e9db 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -619,6 +619,14 @@ void primitive_inst::realloc_if_needed() {
             }
         } else {
             _outputs[0] = variable.get_memory();
+
+            if (auto compressed_cache_variable = dynamic_cast<ov::intel_gpu::VariableStateIndirectKVCacheCompressed*>(&variable)) {
+                _outputs[2] = compressed_cache_variable->get_compression_scale_state()->get_memory();
+
+                if (compressed_cache_variable->has_zp_state()) {
+                    _outputs[3] = compressed_cache_variable->get_compression_zp_state()->get_memory();
+                }
+            }
             GPU_DEBUG_TRACE_DETAIL << id() << " : realloc_if_needed: can_be_optimized = false and memories are not being shared" << std::endl;
         }
     } else {

From 1ebcb3312b3cfaf45cea84d9e6d2b1ceb2f3e115 Mon Sep 17 00:00:00 2001
From: "Kim, SungEun"
Date: Thu, 28 Nov 2024 14:25:45 +0900
Subject: [PATCH 4/5] update naming

---
 .../subgraph_tests/dynamic/kv_cache.cpp | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp
index a7c0b527198901..aebdeca95a5a95 100644
--- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp
+++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp
@@ -563,34 +563,34 @@ class KVCacheIssueTests: public ::testing::Test {
         auto input0 = model->get_parameters().at(0);
         auto input1 = model->get_parameters().at(1);
 
-        auto infer_request1 = compiled_model.create_infer_request();
-        auto infer_request2 = compiled_model.create_infer_request();
+        auto ireq1 = compiled_model.create_infer_request();
+        auto ireq2 = compiled_model.create_infer_request();
 
-        auto tensor1_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, context_size, n_heads, n_features}, -0.5f, 0.5f, 1);
-        auto tensor1_input2 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, n_heads, context_size, context_size}, -0.5f, 0.5f, 1);
-        infer_request1.set_tensor(input0, tensor1_input1);
-        infer_request1.set_tensor(input1, tensor1_input2);
+        auto ireq1_input0 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, context_size, n_heads, n_features}, -0.5f, 0.5f, 1);
+        auto ireq1_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, n_heads, context_size, context_size}, -0.5f, 0.5f, 1);
+        ireq1.set_tensor(input0, ireq1_input0);
+        ireq1.set_tensor(input1, ireq1_input1);
 
-        auto tensor2_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, context_size + 1, n_heads, n_features}, -0.5f, 0.5f, 555);
-        auto tensor2_input2 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, n_heads, context_size + 1, context_size + 1}, -0.5f, 0.5f, 555);
-        infer_request2.set_tensor(input0, tensor2_input1);
-        infer_request2.set_tensor(input1, tensor2_input2);
+        auto ireq2_input0 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, context_size + 1, n_heads, n_features}, -0.5f, 0.5f, 555);
+        auto ireq2_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, n_heads, context_size + 1, context_size + 1}, -0.5f, 0.5f, 555);
+        ireq2.set_tensor(input0, ireq2_input0);
+        ireq2.set_tensor(input1, ireq2_input1);
 
         std::stringstream oss1;
         std::stringstream oss2;
-        for (auto&& state : infer_request1.query_state()) {
+        for (auto&& state : ireq1.query_state()) {
             state.reset();
         }
-        infer_request1.infer();
-        for (auto&& state : infer_request1.query_state()) {
+        ireq1.infer();
+        for (auto&& state : ireq1.query_state()) {
             oss1.write((char*)state.get_state().data(), state.get_state().get_byte_size());
         }
 
-        for (auto&& state : infer_request2.query_state()) {
+        for (auto&& state : ireq2.query_state()) {
             state.reset();
         }
-        infer_request2.infer();
-        for (auto&& state : infer_request1.query_state()) {
+        ireq2.infer();
+        for (auto&& state : ireq1.query_state()) {
             oss2.write((char*)state.get_state().data(), state.get_state().get_byte_size());
         }
 
@@ -598,7 +598,7 @@ class KVCacheIssueTests: public ::testing::Test {
     }
 };
 
-TEST_F(KVCacheIssueTests, smoke_issue_cases) {
+TEST_F(KVCacheIssueTests, conflicted_memory_for_two_inf_req) {
     this->test_smoke_conflicted_memory_for_two_inf_req();
 }
 

From 9a046c24610901b4b9373a6ea3d91e733696952e Mon Sep 17 00:00:00 2001
From: "Kim, SungEun"
Date: Fri, 29 Nov 2024 01:28:41 +0900
Subject: [PATCH 5/5] fixed cpplint issues

---
 .../subgraph_tests/dynamic/kv_cache.cpp | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp
index aebdeca95a5a95..4945cc8d717be3 100644
--- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp
+++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp
@@ -566,13 +566,17 @@ class KVCacheIssueTests: public ::testing::Test {
         auto ireq1 = compiled_model.create_infer_request();
         auto ireq2 = compiled_model.create_infer_request();
 
-        auto ireq1_input0 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, context_size, n_heads, n_features}, -0.5f, 0.5f, 1);
-        auto ireq1_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, n_heads, context_size, context_size}, -0.5f, 0.5f, 1);
+        auto ireq1_input0 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type,
+            {n_batch, context_size, n_heads, n_features}, -0.5f, 0.5f, 1);
+        auto ireq1_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type,
+            {n_batch, n_heads, context_size, context_size}, -0.5f, 0.5f, 1);
         ireq1.set_tensor(input0, ireq1_input0);
         ireq1.set_tensor(input1, ireq1_input1);
 
-        auto ireq2_input0 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, context_size + 1, n_heads, n_features}, -0.5f, 0.5f, 555);
-        auto ireq2_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, {n_batch, n_heads, context_size + 1, context_size + 1}, -0.5f, 0.5f, 555);
+        auto ireq2_input0 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type,
+            {n_batch, context_size + 1, n_heads, n_features}, -0.5f, 0.5f, 555);
+        auto ireq2_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type,
+            {n_batch, n_heads, context_size + 1, context_size + 1}, -0.5f, 0.5f, 555);
         ireq2.set_tensor(input0, ireq2_input0);
         ireq2.set_tensor(input1, ireq2_input1);
 
@@ -583,7 +587,7 @@ class KVCacheIssueTests: public ::testing::Test {
         }
         ireq1.infer();
         for (auto&& state : ireq1.query_state()) {
-            oss1.write((char*)state.get_state().data(), state.get_state().get_byte_size());
+            oss1.write(reinterpret_cast<char*>(state.get_state().data()), state.get_state().get_byte_size());
         }
 
         for (auto&& state : ireq2.query_state()) {
@@ -591,7 +595,7 @@ class KVCacheIssueTests: public ::testing::Test {
             state.reset();
         }
         ireq2.infer();
         for (auto&& state : ireq1.query_state()) {
-            oss2.write((char*)state.get_state().data(), state.get_state().get_byte_size());
+            oss2.write(reinterpret_cast<char*>(state.get_state().data()), state.get_state().get_byte_size());
         }
 
         ASSERT_TRUE(oss1.str() == oss2.str());
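
Note (not part of the patch series): the situation the new test guards against can also be checked outside the functional test framework, using only the public OpenVINO 2.0 runtime API. The sketch below is illustrative only; the model path "stateful_llm.xml" is a placeholder for any stateful model with a KV-cache ReadValue/Assign pair, and the input tensors still have to be filled before infer() is called.

#include <openvino/openvino.hpp>

#include <iostream>
#include <string>

// Snapshot all variable states of a request as raw bytes (same idea as the
// stringstream comparison used in the new test).
static std::string snapshot_states(ov::InferRequest& request) {
    std::string bytes;
    for (auto&& state : request.query_state()) {
        auto tensor = state.get_state();
        bytes.append(reinterpret_cast<const char*>(tensor.data()), tensor.get_byte_size());
    }
    return bytes;
}

int main() {
    ov::Core core;
    auto model = core.read_model("stateful_llm.xml");  // placeholder stateful model
    auto compiled = core.compile_model(model, "GPU");

    auto ireq1 = compiled.create_infer_request();
    auto ireq2 = compiled.create_infer_request();

    // ... set input tensors on ireq1 and ireq2 here (e.g. via set_tensor) ...

    for (auto&& state : ireq1.query_state())
        state.reset();
    ireq1.infer();
    auto before = snapshot_states(ireq1);  // KV-cache of request 1 after its own inference

    for (auto&& state : ireq2.query_state())
        state.reset();
    ireq2.infer();  // running a second request must not overwrite request 1's variables
    auto after = snapshot_states(ireq1);

    std::cout << (before == after ? "KV-cache states are isolated"
                                  : "KV-cache states were corrupted")
              << std::endl;
    return 0;
}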