openvinotoolkit · isanghao · Dec 2, 2024 · Nov 21, 2024 · Nov 25, 2024 · Nov 28, 2024
@@ -618,6 +618,7 @@ void primitive_inst::realloc_if_needed() {
                     _max_output_layout_count[j] = 0;
                 }
             } else {
+                _outputs[0] = variable.get_memory();
 std::shared_ptr<cldnn::ShapePredictor> m_shape_predictor = nullptr; 
 std::shared_ptr<cldnn::ShapePredictor> m_shape_predictor = nullptr; 
                 GPU_DEBUG_TRACE_DETAIL << id() << " : realloc_if_needed: can_be_optimized = false and memories are not being shared" << std::endl;
             }
         } else {

@@ -530,4 +530,77 @@ TEST_F(KVCacheTests, smoke_multipleIterations_stateful_with_set_state) {
     this->test_smoke_multipleIterations_stateful(false, true, true, 1, 2, ov::element::f16, 5, 1, true);
 }
 
+// Fixture holding regression scenarios for KV-cache state handling on the GPU device.
+class KVCacheIssueTests: public ::testing::Test {
+public:
+    // Creates two infer requests from one compiled stateful KV-cache model, runs the
+    // first, snapshots the raw bytes of its variable states, runs the second with
+    // differently shaped inputs, and then asserts that the states of the FIRST request
+    // are byte-for-byte unchanged. Skipped on Android builds.
+    void test_smoke_conflicted_memory_for_two_inf_req() {
+    #if defined(ANDROID)
+        GTEST_SKIP();
+    #endif
+        auto core = ov::test::utils::PluginCache::get().core();
+
+        ov::AnyMap properties = {
+            ov::hint::kv_cache_precision(ov::element::undefined)
+        };
+
+        const size_t n_batch = 1;
+        const size_t n_heads = 32;
+        const size_t n_features = 10;
+        const size_t context_size = 20;
+        ov::element::Type element_type = ov::element::f16;
+
+        const bool stateful = true;
+
+        // NOTE(review): the meaning of the positional arguments `2` and `false` is defined
+        // by tests::make_llm_kv_cache_pattern, which is not visible here - confirm there.
+        auto model = tests::make_llm_kv_cache_pattern(n_batch,
+                                                      n_heads,
+                                                      n_features,
+                                                      element_type,
+                                                      2,
+                                                      stateful,
+                                                      false,
+                                                      stateful);
+        auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, properties);
+
+        auto input0 = model->get_parameters().at(0);
+        auto input1 = model->get_parameters().at(1);
+
+        auto infer_request1 = compiled_model.create_infer_request();
+        auto infer_request2 = compiled_model.create_infer_request();
+
+        // Inputs for the first request are built from `context_size` (seed 1).
+        auto tensor1_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(
+            element_type, {n_batch, context_size, n_heads, n_features}, -0.5f, 0.5f, 1);
+        auto tensor1_input2 = ov::test::utils::create_and_fill_tensor_real_distribution(
+            element_type, {n_batch, n_heads, context_size, context_size}, -0.5f, 0.5f, 1);
+        infer_request1.set_tensor(input0, tensor1_input1);
+        infer_request1.set_tensor(input1, tensor1_input2);
+
+        // Inputs for the second request use `context_size + 1` (seed 555), so its shapes
+        // differ from those of the first request.
+        auto tensor2_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(
+            element_type, {n_batch, context_size + 1, n_heads, n_features}, -0.5f, 0.5f, 555);
+        auto tensor2_input2 = ov::test::utils::create_and_fill_tensor_real_distribution(
+            element_type, {n_batch, n_heads, context_size + 1, context_size + 1}, -0.5f, 0.5f, 555);
+        infer_request2.set_tensor(input0, tensor2_input1);
+        infer_request2.set_tensor(input1, tensor2_input2);
+
+        // Resets every variable state exposed by the given request.
+        const auto reset_states = [](ov::InferRequest& request) {
+            for (auto&& state : request.query_state()) {
+                state.reset();
+            }
+        };
+
+        // Concatenates the raw bytes of every variable state of the given request.
+        const auto dump_states = [](ov::InferRequest& request) {
+            std::string bytes;
+            for (auto&& state : request.query_state()) {
+                // Fetch the tensor once so the pointer and the size come from the same object.
+                auto tensor = state.get_state();
+                bytes.append(static_cast<const char*>(tensor.data()), tensor.get_byte_size());
+            }
+            return bytes;
+        };
+
+        reset_states(infer_request1);
+        infer_request1.infer();
+        const std::string states_before = dump_states(infer_request1);
+
+        reset_states(infer_request2);
+        infer_request2.infer();
+        // Intentionally re-read the states of request 1 (not request 2): running the second
+        // request must not have altered them.
+        const std::string states_after = dump_states(infer_request1);
+
+        // ASSERT_TRUE on operator== is kept so that a failure does not print the raw bytes.
+        ASSERT_TRUE(states_before == states_after);
+    }
+};
+
+// Smoke test entry point: runs the two-infer-request scenario defined on the fixture.
+TEST_F(KVCacheIssueTests, smoke_issue_cases) {
+    test_smoke_conflicted_memory_for_two_inf_req();
+}
+
+
 } // namespace