[GPU] Do not use usm_host when network output tensor size is large (#27513)

### Details:
- For dGPUs, including A770, when the network output size is large, performance is better with an explicit copy than with writing the data directly to usm_host (a sketch of the resulting policy follows this list).
- Allow usm_host access for inputs.
- For next-gen dGPUs, write to usm_device and copy it to usm_host.
- For DG2, write to usm_device only for large outputs.
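A minimal sketch of the resulting output policy. The names below are illustrative only (the actual check is `can_use_usm_host()` in `sync_infer_request.cpp`, shown in the diff further down); PVC and newer dGPUs already avoided usm_host unconditionally, so the sketch only shows the new size-based rule for other dGPUs such as DG2:

```cpp
// Illustrative sketch, not the plugin's actual code: DeviceKind and
// outputs_can_use_usm_host() are hypothetical names introduced here.
#include <cstdint>

enum class DeviceKind { IntegratedGpu, DiscreteGpu };

// Matches LARGE_OUTPUT_BYTES_THRESHOLD in the diff below: 4 * 1048576 bytes (4 MiB).
constexpr uint64_t kLargeOutputBytes = 4ull * 1048576ull;

// Outputs: on a dGPU with a large total output size, do not write to usm_host;
// write to usm_device and copy back explicitly instead.
// Inputs are unaffected and may still be backed by usm_host.
inline bool outputs_can_use_usm_host(DeviceKind dev, uint64_t total_output_bytes) {
    if (dev == DeviceKind::DiscreteGpu && total_output_bytes > kLargeOutputBytes)
        return false;
    return true;
}
```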

### Tickets:
 - 157439
isanghao authored Nov 22, 2024
1 parent aca1bb4 commit 0f149e3
Showing 4 changed files with 22 additions and 5 deletions.
@@ -118,6 +118,7 @@ class SyncInferRequest : public ov::ISyncInferRequest {

    void init_mappings();
    bool is_batched_input(const ov::Output<const ov::Node>& port) const;
+    uint64_t total_output_bytes = 0;
};

} // namespace intel_gpu
@@ -142,6 +142,7 @@ class debug_configuration {
    int disable_runtime_skip_reorder; // Disable runtime skip reorder
    int disable_primitive_fusing; // Disable primitive fusing
    int disable_fake_alignment; // Disable fake alignment
+    int use_usm_host; // Set explicit usm_host usage for network input and output
    std::vector<std::string> dynamic_quantize_layers_without_onednn; // Specify Fully-connected layers which enable Dynamic quantization
    int use_kv_cache_compression; // Enable KV-cache compression
    int dynamic_quantize_group_size; // Enable Dynamic quantization for fully connected primitive by specified group size
21 changes: 16 additions & 5 deletions src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -32,12 +32,19 @@

namespace {

-inline bool can_use_usm_host(const cldnn::engine& engine) {
+inline bool can_use_usm_host(const cldnn::engine& engine, const uint64_t total_output_bytes) {
+    GPU_DEBUG_GET_INSTANCE(debug_config);
+    GPU_DEBUG_IF(debug_config->use_usm_host == 1) { return true; }
+    GPU_DEBUG_IF(debug_config->use_usm_host == 2) { return false; }
+
    auto can_use_usm = engine.use_unified_shared_memory();
+    // When output size is large, it is better not to write to usm_host directly
+    const uint64_t LARGE_OUTPUT_BYTES_THRESHOLD = 4 * 1048576;

    const auto& device_info = engine.get_device_info();
    if ((device_info.gfx_ver.major == 12 && device_info.gfx_ver.minor == 60) ||
-        (device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu)) {
+        (device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu) ||
+        (device_info.dev_type == cldnn::device_type::discrete_gpu && total_output_bytes > LARGE_OUTPUT_BYTES_THRESHOLD)) {
        // WA: Disable USM host memory for infer request`s tensors for PVC and subsequent dGPUs, as kernel access
        // to system memory is slower than using an explicit memcpy (Host <-> Device) call with the copy engine
        // Driver tickets with additional details: 6155, 10054
@@ -544,7 +551,7 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::Pa
    }

    // Create OpenCL buffer for PVC if lockable memory is needed due to performance issue with usm host
-    if (!can_use_usm_host(m_graph->get_engine()) && need_lockable_memory)
+    if (!can_use_usm_host(m_graph->get_engine(), total_output_bytes) && need_lockable_memory)
        tensor_type = TensorType::BT_BUF_INTERNAL;

    return std::make_shared<RemoteTensorImpl>(m_context,
@@ -573,7 +580,9 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe
    auto usm_host_raw_ptr = engine.get_device_info().dev_type == cldnn::device_type::integrated_gpu &&
                            user_tensor_mem_type == cldnn::allocation_type::usm_host;

-    bool can_share = !is_convert_required(user_tensor->get_element_type(), element_type) && can_use_usm_host(engine) && !generic_remote_tensor;
+    bool can_share = !is_convert_required(user_tensor->get_element_type(), element_type)
+                     && can_use_usm_host(engine, total_output_bytes)
+                     && !generic_remote_tensor;

    if (usm_host_tensor && can_share && m_context == usm_host_tensor->get_impl()->get_context()) {
        return { usm_host_tensor->get_impl(), user_tensor_wrapper.owner };
@@ -662,13 +671,15 @@ void SyncInferRequest::allocate_inputs() {
void SyncInferRequest::allocate_outputs() {
    OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "SyncInferRequest::allocate_outputs");

+    total_output_bytes = 0;
    // allocate outputs
    for (const auto& it : m_output_ports_map) {
        size_t output_idx = it.first;
        const auto& port = it.second;
        GPU_DEBUG_LOG << "[init output blob with index: " << output_idx << "]" << std::endl;

        allocate_output(port, output_idx);
+        total_output_bytes += ov::ISyncInferRequest::get_tensor(port)->get_byte_size();
    }
}

@@ -817,7 +828,7 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
        } else {
            m_plugin_inputs[input_idx] = user_tensor_wrapper;
        }
-    } else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine)) {
+    } else if (is_usm_host_tensor && !convert_needed) {
        if (element_type != ::data_type_for_remote_tensor(element_type)) {
            m_plugin_inputs[input_idx] = { std::make_shared<RemoteTensorImpl>(m_context,
                                                                              user_tensor->get_shape(),
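For context on the threshold used above: LARGE_OUTPUT_BYTES_THRESHOLD is 4 * 1048576 = 4,194,304 bytes (4 MiB), and total_output_bytes is accumulated in allocate_outputs() as the sum of get_byte_size() over all output ports. A small standalone sketch of the same arithmetic (the output shapes below are made-up examples, not taken from this PR):

```cpp
// Hypothetical example: sum output tensor sizes and compare against the 4 MiB threshold.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const uint64_t LARGE_OUTPUT_BYTES_THRESHOLD = 4 * 1048576;  // 4,194,304 bytes, as in the diff

    // Two assumed FP32 outputs: 1x1000 scores and a 1x3x1024x1024 image-sized map.
    const std::vector<uint64_t> output_elem_counts = {1000ull, 3ull * 1024 * 1024};

    uint64_t total_output_bytes = 0;
    for (const auto n : output_elem_counts)
        total_output_bytes += n * sizeof(float);  // analogous to get_byte_size()

    std::cout << "total_output_bytes = " << total_output_bytes << "\n";  // 12586912
    std::cout << "exceeds threshold: " << std::boolalpha
              << (total_output_bytes > LARGE_OUTPUT_BYTES_THRESHOLD) << "\n";  // true -> dGPU avoids usm_host
    return 0;
}
```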
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/runtime/debug_configuration.cpp
@@ -183,6 +183,8 @@ static void print_help_messages() {
message_list.emplace_back("OV_GPU_DisableRuntimeSkipReorder", "Disable runtime skip reorder.");
message_list.emplace_back("OV_GPU_DisablePrimitiveFusing", "Disable primitive fusing");
message_list.emplace_back("OV_GPU_DisableFakeAlignment", "Disable fake alignment");
message_list.emplace_back("OV_GPU_UseUsmHost", "Set explicit policy for usm host usage for network input/output. "
"0: default, 1: use usm_host, 2: do not use usm_host");
message_list.emplace_back("OV_GPU_KVCacheCompression", "Enable/Disable KV-cache compression");
message_list.emplace_back("OV_GPU_DynamicQuantizeLayersWithoutOnednn", "Enable Dynamic quantization for specified Fully connected layers only, "
"separated by space. Support case-insensitive and regular expression. For example .*fully_connected.*");
@@ -254,6 +256,7 @@ debug_configuration::debug_configuration()
    , disable_runtime_skip_reorder(0)
    , disable_primitive_fusing(0)
    , disable_fake_alignment(0)
+    , use_usm_host(0)
    , use_kv_cache_compression(-1)
    , dynamic_quantize_group_size(DYNAMIC_QUANTIZE_GROUP_SIZE_NOT_SET)
    , disable_horizontal_fc_fusion(0) {
@@ -307,6 +310,7 @@ debug_configuration::debug_configuration()
get_gpu_debug_env_var("DisableRuntimeSkipReorder", disable_runtime_skip_reorder);
get_gpu_debug_env_var("DisablePrimitiveFusing", disable_primitive_fusing);
get_gpu_debug_env_var("DisableFakeAlignment", disable_fake_alignment);
get_gpu_debug_env_var("UseUsmHost", use_usm_host);
get_gpu_debug_env_var("KVCacheCompression", use_kv_cache_compression);
get_gpu_debug_env_var("DynamicQuantizeGroupSize", dynamic_quantize_group_size);
get_gpu_debug_env_var("DisableHorizontalFCFusion", disable_horizontal_fc_fusion);
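Per the help text above, OV_GPU_UseUsmHost accepts 0 (default heuristic), 1 (force usm_host), and 2 (never use usm_host). A hypothetical way to exercise the override from an application is shown below; it assumes POSIX setenv and the standard OpenVINO 2.0 API, the model path is a placeholder, and OV_GPU_* debug variables typically require a build with GPU debug capabilities enabled:

```cpp
// Hypothetical usage sketch: disable usm_host for network input/output tensors
// by setting the debug variable before the GPU plugin reads its configuration.
#include <cstdlib>
#include <openvino/openvino.hpp>

int main() {
    setenv("OV_GPU_UseUsmHost", "2", /*overwrite=*/1);  // 2: do not use usm_host

    ov::Core core;
    auto model = core.read_model("model.xml");               // placeholder path
    auto compiled_model = core.compile_model(model, "GPU");
    auto infer_request = compiled_model.create_infer_request();
    // ... set input tensors, then run inference:
    infer_request.infer();
    return 0;
}
```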
