[GPU] Make ShapePredictor instance unique for each InferRequest instead of the cldnn::network
sshlyapn committed Nov 10, 2023
1 parent b1705e8 commit 0bb61c5
Showing 6 changed files with 26 additions and 12 deletions.
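In effect, the ShapePredictor (the cldnn helper that tracks per-primitive shape history and decides how much device memory to preallocate for dynamically shaped buffers) is now owned by each infer request rather than by the cldnn::network, and the request injects it into the network before every execution. A minimal sketch of the new ownership wiring, using simplified stand-in types instead of the real cldnn/ov::intel_gpu classes:

#include <memory>
#include <utility>

// Simplified stand-in; the real ShapePredictor also records shape history
// and computes preallocation sizes.
struct ShapePredictor {};

struct Network {
    // The network now holds a shared handle instead of owning the predictor
    // exclusively through std::unique_ptr.
    std::shared_ptr<ShapePredictor> predictor;
    void set_shape_predictor(std::shared_ptr<ShapePredictor> sp) { predictor = std::move(sp); }
    std::shared_ptr<ShapePredictor> get_shape_predictor() const { return predictor; }
};

struct InferRequest {
    // One predictor per request, so concurrent requests on the same compiled
    // model no longer share (or pollute) each other's preallocation state.
    std::shared_ptr<ShapePredictor> m_shape_predictor = std::make_shared<ShapePredictor>();

    void enqueue(Network& network) {
        network.set_shape_predictor(m_shape_predictor);  // mirrors SyncInferRequest::enqueue()
        // ... execute the network ...
    }
};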
5 changes: 3 additions & 2 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -247,7 +247,8 @@ struct network {
     const variables_state_info_map& get_variables_state_info() const;
     const ExecutionConfig& get_config() const { return _config; }
 
-    ShapePredictor& get_shape_predictor() { return *_shape_predictor; }
+    std::shared_ptr<ShapePredictor> get_shape_predictor() { return _shape_predictor; }
+    void set_shape_predictor(std::shared_ptr<ShapePredictor> shape_predictor) { _shape_predictor = shape_predictor; }
 
 #ifdef GPU_DEBUG_CONFIG
     int64_t get_current_iteration_num() { return iteration; }
@@ -287,7 +288,7 @@ struct network {
     std::unordered_map<primitive_id, event::ptr> _old_events;
     output_chains_map _output_chains;
 
-    std::unique_ptr<ShapePredictor> _shape_predictor;
+    std::shared_ptr<ShapePredictor> _shape_predictor;
 
     void build_exec_order();
     void allocate_primitive_instance(program_node const& node);
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/plugin/sync_infer_request.hpp
@@ -83,6 +83,7 @@ class SyncInferRequest : public ov::ISyncInferRequest {
     std::shared_ptr<Graph> m_graph;
     RemoteContextImpl::Ptr m_context = nullptr;
     std::shared_ptr<ov::threading::IStreamsExecutor> m_stream_executor = nullptr;
+    std::shared_ptr<cldnn::ShapePredictor> m_shape_predictor = nullptr;
     bool m_enable_profiling = false;
     bool m_use_external_queue = false;
 
5 changes: 3 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/common/condition.cpp
@@ -37,8 +37,9 @@ struct condition_impl : typed_primitive_impl<condition> {
         set_node_params(instance.get_node());
 
         auto pred = condition_inst::get_pred_from_memory(instance.pred_memory_ptr(), instance.get_network().get_stream());
-        network::ptr executed_net = pred? instance.get_net_true() : instance.get_net_false();
-        auto branch = pred? instance.get_branch_true() : instance.get_branch_false();
+        network::ptr executed_net = pred ? instance.get_net_true() : instance.get_net_false();
+        auto branch = pred ? instance.get_branch_true() : instance.get_branch_false();
+        executed_net->set_shape_predictor(instance.get_network().get_shape_predictor());
         GPU_DEBUG_LOG << "predicate: " << (pred ? "True" : "False") << std::endl;
 
         // Set input memory of inner network before its execution
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/impls/common/loop.cpp
@@ -121,6 +121,7 @@ struct loop_impl : typed_primitive_impl<loop> {
 
         auto ev = stream.create_user_event(false);
 
+        body_network->set_shape_predictor(outer_network.get_shape_predictor());
         OPENVINO_ASSERT(!primitive->num_iteration_id.empty(), "loop operation should have num_iteration_id");
 
         auto num_iterations = instance.get_num_iterations();
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -427,7 +427,7 @@ event::ptr primitive_inst::realloc_if_needed() {
     }
 
     auto current_shape = actual_layout.get_shape();
-    auto& sp = get_network().get_shape_predictor();
+    auto& sp = *get_network().get_shape_predictor();
     auto dt_size = ov::element::Type(actual_layout.data_type).bitwidth();
     auto prealloc_info = sp.predict_preallocation_shape(id(), current_shape, dt_size, can_reuse_buffer);
     if (prealloc_info.first && sp.can_preallocate(ov::shape_size(prealloc_info.second) * dt_size)) {
24 changes: 17 additions & 7 deletions src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -226,13 +226,25 @@ SyncInferRequest::SyncInferRequest(const std::shared_ptr<const CompiledModel>& c
     : ov::ISyncInferRequest(compiled_model)
     , m_graph(compiled_model->get_graph(0))
     , m_context(std::static_pointer_cast<RemoteContextImpl>(compiled_model->get_context_impl()))
+    , m_shape_predictor(new cldnn::ShapePredictor(&m_graph->get_engine(), m_graph->get_config().get_property(ov::intel_gpu::buffers_preallocation_ratio)))
     , m_enable_profiling(m_graph->get_config().get_property(ov::enable_profiling))
     , m_use_external_queue(m_graph->use_external_queue()) {
     bool is_legacy_api = !compiled_model->is_new_api();
     init_mappings(is_legacy_api);
     allocate_inputs();
     allocate_outputs();
     allocate_states();
+
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->mem_preallocation_params.is_initialized) {
+        auto& mem_preallocation_params = debug_config->mem_preallocation_params;
+        m_shape_predictor.reset(
+            new cldnn::ShapePredictor(&m_graph->get_engine(),
+                                      mem_preallocation_params.next_iters_preallocation_count,
+                                      mem_preallocation_params.max_per_iter_size,
+                                      mem_preallocation_params.max_per_dim_diff,
+                                      mem_preallocation_params.buffers_preallocation_ratio));
+    }
 }
 
 void SyncInferRequest::infer() {
@@ -401,6 +413,7 @@ void SyncInferRequest::enqueue() {
 
     auto network = m_graph->get_network();
     network->assign_variables_memories();
+    network->set_shape_predictor(m_shape_predictor);
 
     m_internal_outputs.clear();
     m_internal_outputs = network->execute(dependencies);
@@ -476,8 +489,7 @@ void SyncInferRequest::wait() {
             need_reallocate = usm_host_tensor->get_impl()->get_original_memory()->size() < output_memory->size();
 
         if (need_reallocate) {
-            auto& shape_predictor = m_graph->get_network()->get_shape_predictor();
-            auto actual_memory_shape = predict_shape(name, mem_shape, output_tensor->get_element_type(), shape_predictor);
+            auto actual_memory_shape = predict_shape(name, mem_shape, output_tensor->get_element_type(), *m_shape_predictor);
             output_tensor->set_shape(actual_memory_shape);
         }
     }
@@ -585,8 +597,7 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe
 
     auto actual_memory_shape = tensor_shape;
     if (is_dynamic) {
-        auto& shape_predictor = m_graph->get_network()->get_shape_predictor();
-        actual_memory_shape = predict_shape(name, tensor_shape, element_type, shape_predictor);
+        actual_memory_shape = predict_shape(name, tensor_shape, element_type, *m_shape_predictor);
     }
 
     return { create_device_tensor(actual_memory_shape, element_type, need_lockable_mem), TensorOwner::PLUGIN };
@@ -746,7 +757,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
 
     if (is_remote) {
         m_plugin_inputs[name] = user_tensor_wrapper;
-    } else if (is_usm_host_tensor && !convert_needed) {
+    } else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine)) {
         m_plugin_inputs[name] = {usm_host_ptr->get_impl(), user_tensor_wrapper.owner};
         is_remote = true;
     }
@@ -762,8 +773,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
     auto device_tensor = std::dynamic_pointer_cast<RemoteTensorImpl>(device_tensor_wrapper.ptr);
     if (is_dynamic) {
         if (device_tensor->get_original_memory()->size() < user_tensor->get_byte_size()) {
-            auto& shape_predictor = network->get_shape_predictor();
-            auto actual_shape = predict_shape(name, user_tensor->get_shape(), device_tensor_et, shape_predictor);
+            auto actual_shape = predict_shape(name, user_tensor->get_shape(), device_tensor_et, *m_shape_predictor);
             GPU_DEBUG_TRACE_DETAIL << " actual memory shape: " << actual_shape.to_string() << std::endl;
             auto new_tensor = create_device_tensor(actual_shape, device_tensor_et, false);
             new_tensor->set_shape(user_tensor->get_shape());
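The switch from std::unique_ptr to std::shared_ptr is what allows the control-flow primitives above (condition, loop) to point their inner body networks at the very same predictor instance the outer network received from the request, so every nesting level feeds a single per-request statistics pool. A standalone sketch of that propagation, again with simplified stand-in types and a hypothetical execute_body() helper:

#include <memory>
#include <utility>

struct ShapePredictor {};

struct Network {
    std::shared_ptr<ShapePredictor> predictor;
    void set_shape_predictor(std::shared_ptr<ShapePredictor> sp) { predictor = std::move(sp); }
    std::shared_ptr<ShapePredictor> get_shape_predictor() const { return predictor; }
};

// Analogue of what loop_impl/condition_impl now do before running an inner
// body network: alias the outer network's (per-request) predictor, which a
// std::unique_ptr owner could not hand out safely.
void execute_body(Network& outer, Network& body) {
    body.set_shape_predictor(outer.get_shape_predictor());
    // ... bind inputs to `body` and execute it ...
}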
