[GPU] Do not use usm_host when network output tensor size is large (#27513)

### Details:
- For dGPUs, including A770, when the network output size is large, performance is better with an explicit copy than with writing the data directly to usm_host (a sketch of the resulting policy follows this list).
- Allow usm_host access for inputs.
- For next-gen dGPUs, write to usm_device and copy it to usm_host.
- For DG2, write to usm_device only for large outputs.
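A minimal sketch of the resulting output policy. The names below are illustrative only (the actual check is `can_use_usm_host()` in `sync_infer_request.cpp`, shown in the diff further down); PVC and newer dGPUs already avoided usm_host unconditionally, so the sketch only shows the new size-based rule for other dGPUs such as DG2:

```cpp
// Illustrative sketch, not the plugin's actual code: DeviceKind and
// outputs_can_use_usm_host() are hypothetical names introduced here.
#include <cstdint>

enum class DeviceKind { IntegratedGpu, DiscreteGpu };

// Matches LARGE_OUTPUT_BYTES_THRESHOLD in the diff below: 4 * 1048576 bytes (4 MiB).
constexpr uint64_t kLargeOutputBytes = 4ull * 1048576ull;

// Outputs: on a dGPU with a large total output size, do not write to usm_host;
// write to usm_device and copy back explicitly instead.
// Inputs are unaffected and may still be backed by usm_host.
inline bool outputs_can_use_usm_host(DeviceKind dev, uint64_t total_output_bytes) {
    if (dev == DeviceKind::DiscreteGpu && total_output_bytes > kLargeOutputBytes)
        return false;
    return true;
}
```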

### Tickets:
 - 157439
isanghao authored Nov 22, 2024
1 parent aca1bb4 commit 0f149e3
Showing 4 changed files with 22 additions and 5 deletions.
@@ -118,6 +118,7 @@ class SyncInferRequest : public ov::ISyncInferRequest {

    void init_mappings();
    bool is_batched_input(const ov::Output<const ov::Node>& port) const;
+    uint64_t total_output_bytes = 0;
};

} // namespace intel_gpu
@@ -142,6 +142,7 @@ class debug_configuration {
    int disable_runtime_skip_reorder; // Disable runtime skip reorder
    int disable_primitive_fusing; // Disable primitive fusing
    int disable_fake_alignment; // Disable fake alignment
+    int use_usm_host; // Set explicit usm_host usage for network input and output
    std::vector<std::string> dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization
    int use_kv_cache_compression; // Enable KV-cache compression
    int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size
21 changes: 16 additions & 5 deletions src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -32,12 +32,19 @@

namespace {

-inline bool can_use_usm_host(const cldnn::engine& engine) {
+inline bool can_use_usm_host(const cldnn::engine& engine, const uint64_t total_output_bytes) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->use_usm_host == 1) { return true; }
+    GPU_DEBUG_IF(debug_config->use_usm_host == 2) { return false; }
+
    auto can_use_usm = engine.use_unified_shared_memory();
+    // When output size is large, it is better not to write to usm_host directly
+    const uint64_t LARGE_OUTPUT_BYTES_THRESHOLD = 4 * 1048576;

    const auto& device_info = engine.get_device_info();
    if ((device_info.gfx_ver.major == 12 && device_info.gfx_ver.minor == 60) ||
-        (device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu)) {
+        (device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu) ||
+        (device_info.dev_type == cldnn::device_type::discrete_gpu && total_output_bytes > LARGE_OUTPUT_BYTES_THRESHOLD)) {
        // WA: Disable USM host memory for infer request`s tensors for PVC and subsequent dGPUs, as kernel access
        // to system memory is slower than using an explicit memcpy (Host <-> Device) call with the copy engine
        // Driver tickets with additional details: 6155, 10054
@@ -544,7 +551,7 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::Pa
    }

    // Create OpenCL buffer for PVC if lockable memory is needed due to performance issue with usm host
-    if (!can_use_usm_host(m_graph->get_engine()) && need_lockable_memory)
+    if (!can_use_usm_host(m_graph->get_engine(), total_output_bytes) && need_lockable_memory)
        tensor_type = TensorType::BT_BUF_INTERNAL;

    return std::make_shared<RemoteTensorImpl>(m_context,
@@ -573,7 +580,9 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe
    auto usm_host_raw_ptr = engine.get_device_info().dev_type == cldnn::device_type::integrated_gpu &&
                            user_tensor_mem_type == cldnn::allocation_type::usm_host;

-    bool can_share = !is_convert_required(user_tensor->get_element_type(), element_type) && can_use_usm_host(engine) && !generic_remote_tensor;
+    bool can_share = !is_convert_required(user_tensor->get_element_type(), element_type)
+                     && can_use_usm_host(engine, total_output_bytes)
+                     && !generic_remote_tensor;

    if (usm_host_tensor && can_share && m_context == usm_host_tensor->get_impl()->get_context()) {
        return { usm_host_tensor->get_impl(), user_tensor_wrapper.owner };
@@ -662,13 +671,15 @@ void SyncInferRequest::allocate_inputs() {
void SyncInferRequest::allocate_outputs() {
    OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::allocate_outputs");

+    total_output_bytes = 0;
    // allocate outputs
    for (const auto& it : m_output_ports_map) {
        size_t output_idx = it.first;
        const auto& port = it.second;
        GPU_DEBUG_LOG << "[init output blob with index: " << output_idx << "]" << std::endl;

        allocate_output(port, output_idx);
+        total_output_bytes += ov::ISyncInferRequest::get_tensor(port)->get_byte_size();
    }
}

@@ -817,7 +828,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
        } else {
            m_plugin_inputs[input_idx] = user_tensor_wrapper;
        }
-    } else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine)) {
+    } else if (is_usm_host_tensor && !convert_needed) {
        if (element_type != ::data_type_for_remote_tensor(element_type)) {
            m_plugin_inputs[input_idx] = { std::make_shared<RemoteTensorImpl>(m_context,
                                                                              user_tensor->get_shape(),
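For context on the threshold used above: LARGE_OUTPUT_BYTES_THRESHOLD is 4 * 1048576 = 4,194,304 bytes (4 MiB), and total_output_bytes is accumulated in allocate_outputs() as the sum of get_byte_size() over all output ports. A small standalone sketch of the same arithmetic (the output shapes below are made-up examples, not taken from this PR):

```cpp
// Hypothetical example: sum output tensor sizes and compare against the 4 MiB threshold.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const uint64_t LARGE_OUTPUT_BYTES_THRESHOLD = 4 * 1048576;  // 4,194,304 bytes, as in the diff

    // Two assumed FP32 outputs: 1x1000 scores and a 1x3x1024x1024 image-sized map.
    const std::vector<uint64_t> output_elem_counts = {1000ull, 3ull * 1024 * 1024};

    uint64_t total_output_bytes = 0;
    for (const auto n : output_elem_counts)
        total_output_bytes += n * sizeof(float);  // analogous to get_byte_size()

    std::cout << "total_output_bytes = " << total_output_bytes << "\n";  // 12586912
    std::cout << "exceeds threshold: " << std::boolalpha
              << (total_output_bytes > LARGE_OUTPUT_BYTES_THRESHOLD) << "\n";  // true -> dGPU avoids usm_host
    return 0;
}
```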
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -183,6 +183,8 @@ static void print_help_messages() {
message_list.emplace_back("OV_GPU_DisableRuntimeSkipReorder", "Disable runtime skip reorder.");
message_list.emplace_back("OV_GPU_DisablePrimitiveFusing", "Disable primitive fusing");
message_list.emplace_back("OV_GPU_DisableFakeAlignment", "Disable fake alignment");
message_list.emplace_back("OV_GPU_UseUsmHost", "Set explicit policy for usm host usage for network input/output. "
"0: default, 1: use usm_host, 2: do not use usm_host");
message_list.emplace_back("OV_GPU_KVCacheCompression", "Enable/Disable KV-cache compression");
message_list.emplace_back("OV_GPU_DynamicQuantizeLayersWithoutOnednn", "Enable Dynamic quantization for specified Fully connected layers only, "
"separated by space. Support case-insensitive and regular expression. For example .*fully_connected.*");
@@ -254,6 +256,7 @@ debug_configuration::debug_configuration()
    , disable_runtime_skip_reorder(0)
    , disable_primitive_fusing(0)
    , disable_fake_alignment(0)
+    , use_usm_host(0)
    , use_kv_cache_compression(-1)
    , dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET)
    , disable_horizontal_fc_fusion(0) {
@@ -307,6 +310,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder);
get_gpu_debug_env_var("DisablePrimitiveFusing", disable_primitive_fusing);
get_gpu_debug_env_var("DisableFakeAlignment", disable_fake_alignment);
get_gpu_debug_env_var("UseUsmHost", use_usm_host);
get_gpu_debug_env_var("KVCacheCompression", use_kv_cache_compression);
get_gpu_debug_env_var("DynamicQuantizeGroupSize", dynamic_quantize_group_size);
get_gpu_debug_env_var("DisableHorizontalFCFusion", disable_horizontal_fc_fusion);
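Per the help text above, OV_GPU_UseUsmHost accepts 0 (default heuristic), 1 (force usm_host), and 2 (never use usm_host). A hypothetical way to exercise the override from an application is shown below; it assumes POSIX setenv and the standard OpenVINO 2.0 API, the model path is a placeholder, and OV_GPU_* debug variables typically require a build with GPU debug capabilities enabled:

```cpp
// Hypothetical usage sketch: disable usm_host for network input/output tensors
// by setting the debug variable before the GPU plugin reads its configuration.
#include <cstdlib>
#include <openvino/openvino.hpp>

int main() {
    setenv("OV_GPU_UseUsmHost", "2", /*overwrite=*/1);  // 2: do not use usm_host

    ov::Core core;
    auto model = core.read_model("model.xml");               // placeholder path
    auto compiled_model = core.compile_model(model, "GPU");
    auto infer_request = compiled_model.create_infer_request();
    // ... set input tensors, then run inference:
    infer_request.infer();
    return 0;
}
```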
