From 85f408ecd14080260d7521f6d2ad24f10c7e3d21 Mon Sep 17 00:00:00 2001
From: Jade Cho
Date: Tue, 19 Nov 2024 20:55:40 +0900
Subject: [PATCH] [GPU] Add support for i16, u16, and u32 element types in
 remote tensors (#27573)

### Details:
 - *Removed host-memory data conversion for user input/output tensors with
   data types i16, u16, or u32.*
 - *User tensors can now be used directly as plugin tensors, with no extra
   data-conversion overhead.*

### Tickets:
 - *156709*
---
 .../include/intel_gpu/plugin/common_utils.hpp |  3 -
 .../kernels/reorder/reorder_kernel.cpp        |  6 ++
 .../intel_gpu/src/plugin/common_utils.cpp     |  1 +
 .../intel_gpu/src/plugin/ops/parameter.cpp    |  3 +-
 .../intel_gpu/src/plugin/ops/result.cpp       |  3 +-
 .../src/plugin/sync_infer_request.cpp         | 45 ++++++++--
 .../gpu_remote_tensor_tests.cpp               | 90 +++++++++++++++++++
 7 files changed, 139 insertions(+), 12 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp
index 3c14895befb101..792745193ed550 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp
@@ -64,11 +64,8 @@ inline cldnn::layout make_layout(const ov::element::Type type, const ov::Shape&
 inline ov::element::Type convert_to_supported_device_type(ov::element::Type et) {
     switch (et) {
     case ov::element::f64:
-    case ov::element::i16:
-    case ov::element::u16:
         return ov::element::f32;
     case ov::element::u64:
-    case ov::element::u32:
         return ov::element::i32;
     default: return et;
     }
 }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_kernel.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_kernel.cpp
index 8f4fdbf1f5c992..08bf9d3fb81794 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_kernel.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_kernel.cpp
@@ -10,7 +10,10 @@ ParamsKey ReorderKernelRef::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::BF16);
     k.EnableInputDataType(Datatype::UINT8);
+    k.EnableInputDataType(Datatype::UINT16);
+    k.EnableInputDataType(Datatype::UINT32);
     k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::INT16);
     k.EnableInputDataType(Datatype::INT32);
     k.EnableInputDataType(Datatype::INT64);
     k.EnableInputDataType(Datatype::F16);
@@ -18,9 +21,12 @@ ParamsKey ReorderKernelRef::GetSupportedKey() const {
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::INT16);
     k.EnableOutputDataType(Datatype::INT32);
     k.EnableOutputDataType(Datatype::INT64);
     k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::UINT16);
+    k.EnableOutputDataType(Datatype::UINT32);
     k.EnableOutputDataType(Datatype::BF16);
     k.EnableSurfaceInputSupport();
     k.EnableDifferentTypes();
diff --git a/src/plugins/intel_gpu/src/plugin/common_utils.cpp b/src/plugins/intel_gpu/src/plugin/common_utils.cpp
index ddd6b5677adc45..857123e53bf011 100644
--- a/src/plugins/intel_gpu/src/plugin/common_utils.cpp
+++ b/src/plugins/intel_gpu/src/plugin/common_utils.cpp
@@ -235,6 +235,7 @@ void convert_and_copy(const ov::ITensor* src, ov::ITensor* dst, const cldnn::str
         tmp_tensor = ov::Tensor(dst_et, src->get_shape());
         ::convert_and_copy(src_ptr, src_et, tmp_tensor.data(), dst_et, size, cldnn::layout({}, ov::element::undefined, cldnn::format::bfyx, cldnn::padding()));
         remote->copy_from(get_tensor_impl(tmp_tensor)._ptr);
+        return;
     } else {
         dst_ptr = dst->data();
     }
diff --git a/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp b/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp
index 7f5c4b73223326..0b9874ffe694e1 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/parameter.cpp
@@ -29,7 +29,8 @@ static void CreateParameterOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v
-    auto element_type = cldnn::element_type_to_data_type(convert_to_supported_device_type(op->get_output_element_type(0)));
+    auto element_type = convert_to_supported_device_type(op->get_output_element_type(0));
+    element_type = element_type == ov::element::boolean ? ov::element::u8 : element_type;
 
     // look at the expected color format of this input
     auto input_name = layer_type_name_ID(op);
diff --git a/src/plugins/intel_gpu/src/plugin/ops/result.cpp b/src/plugins/intel_gpu/src/plugin/ops/result.cpp
index 4172f56e483af3..eb6df76c39b108 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/result.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/result.cpp
@@ -30,7 +30,8 @@ static void CreateResultOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0::
-    auto out_data_type = cldnn::element_type_to_data_type(convert_to_supported_device_type(op->get_input_element_type(0)));
+    auto out_data_type = convert_to_supported_device_type(op->get_input_element_type(0));
+    out_data_type = out_data_type == ov::element::boolean ? ov::element::u8 : out_data_type;
 
     auto reorder_primitive = cldnn::reorder(out_primitive_name,
                                             inputs[0],
diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
index 985336b801b9d3..c52c022df6424c 100644
--- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
+++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp
@@ -82,6 +82,18 @@ inline bool all_host_tensors(const std::vector<ov::SoPtr<ov::ITensor>>& tensors)
     });
 }
 
+cldnn::data_types data_type_for_remote_tensor(ov::element::Type t) {
+    switch (t) {
+    case ov::element::Type_t::f64:
+        return cldnn::data_types::f32;
+    case ov::element::Type_t::u64:
+        return cldnn::data_types::i32;
+    case ov::element::Type_t::boolean:
+        return cldnn::data_types::u8;
+    default: return t;
+    }
+}
+
 }  // namespace
 
 namespace ov {
@@ -446,6 +458,21 @@ void SyncInferRequest::wait() {
                 iremote_tensor_ptr->copy_from(plugin_tensor.ptr);
             }
         }
+    } else if (!is_dynamic && is_remote_tensor_impl && output_memory) {
+        auto& stream = m_graph->get_network()->get_stream();
+        auto user_mem = remote_tensor_impl_ptr->get_original_memory();
+        if (user_mem->get_allocation_type() == cldnn::allocation_type::cl_mem
+            && output_memory->get_allocation_type() != cldnn::allocation_type::cl_mem) {
+            auto plugin_tensor = m_plugin_outputs.at(port_idx);
+            if (is_convert_required(plugin_tensor.ptr->get_element_type(), iremote_tensor_ptr->get_element_type())) {
+                auto& stream = m_graph->get_network()->get_stream();
+                convert_and_copy(plugin_tensor.ptr.get(), iremote_tensor_ptr.get(), stream);
+            } else {
+                iremote_tensor_ptr->copy_from(plugin_tensor.ptr);
+            }
+        } else {
+            copy_events.push_back(output_memory->copy_to(stream, *user_mem, false));
+        }
     } else if (is_remote_tensor_impl && is_dynamic) {
         auto& stream = m_graph->get_network()->get_stream();
         auto user_mem = remote_tensor_impl_ptr->get_original_memory();
@@ -522,7 +549,7 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::create_device_tensor(const ov::Pa
     return std::make_shared<RemoteTensorImpl>(m_context,
                                               get_tensor_shape(port_shape),
-                                              cldnn::element_type_to_data_type(element_type),
+                                              ::data_type_for_remote_tensor(element_type),
                                               tensor_type);
 }
@@ -553,7 +580,7 @@ TensorWrapper SyncInferRequest::create_or_share_device_tensor(const TensorWrappe
     } else if (usm_host_raw_ptr && can_share) {
         return { std::make_shared<RemoteTensorImpl>(m_context,
                                                     user_tensor->get_shape(),
-                                                    cldnn::element_type_to_data_type(element_type),
+                                                    ::data_type_for_remote_tensor(element_type),
                                                     TensorType::BT_USM_SHARED,
                                                     user_tensor->data()), TensorOwner::USER };
     }
@@ -785,16 +812,16 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_input(const std::string
     if (is_remote_tensor_impl && !need_lockable_mem) {
         if (convert_needed) {
             m_plugin_inputs[input_idx] = { create_device_tensor(pshape,
-                                                                cldnn::element_type_to_data_type(element_type),
+                                                                ::data_type_for_remote_tensor(element_type),
                                                                 false), TensorOwner::PLUGIN };
         } else {
             m_plugin_inputs[input_idx] = user_tensor_wrapper;
         }
     } else if (is_usm_host_tensor && !convert_needed && can_use_usm_host(engine)) {
-        if (element_type != cldnn::element_type_to_data_type(element_type)) {
+        if (element_type != ::data_type_for_remote_tensor(element_type)) {
             m_plugin_inputs[input_idx] = { std::make_shared<RemoteTensorImpl>(m_context,
                                                                               user_tensor->get_shape(),
-                                                                              cldnn::element_type_to_data_type(element_type),
+                                                                              ::data_type_for_remote_tensor(element_type),
                                                                               TensorType::BT_USM_SHARED,
                                                                               user_tensor->data()), TensorOwner::USER };
         } else {
@@ -953,8 +980,12 @@ std::vector<cldnn::event::ptr> SyncInferRequest::prepare_output(size_t output_id
                        is_generic_remote ||
                        (m_plugin_outputs[output_idx].owner == TensorOwner::USER && !is_remote_tensor_impl);
     if (update_device_tensor) {
-        m_plugin_outputs[output_idx] =
-            create_or_share_device_tensor(user_tensor_wrapper, internal_name, pshape, device_tensor_et, need_lockable_mem || convert_needed);
+        if (!is_remote_tensor_impl) {
+            m_plugin_outputs[output_idx] =
+                create_or_share_device_tensor(user_tensor_wrapper, internal_name, pshape, device_tensor_et, need_lockable_mem || convert_needed);
+        } else {
+            m_plugin_outputs[output_idx] = { create_device_tensor(pshape, device_tensor_et, need_lockable_mem || convert_needed), TensorOwner::PLUGIN };
+        }
     }
 }
diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp
index baad7361425cca..11c2b034d20821 100644
--- a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp
+++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/gpu_remote_tensor_tests.cpp
@@ -2873,3 +2873,93 @@ TEST(RemoteTensor, smoke_CanSetRoiRemoteTensor) {
 
     compare_tensors(output_tensor_copy_0, output_tensor_copy_1);
 }
+
+
+using RemoteTensorDataTypesOptionsParams = std::tuple<ov::element::Type_t>;
+class OVRemoteTensorDataType_Test : public OVRemoteTensor_Test,
+                                    public testing::WithParamInterface<RemoteTensorDataTypesOptionsParams> {
+protected:
+    std::shared_ptr<ov::Model> fn_ptr;
+    std::string deviceName;
+    ov::AnyMap config;
+    ov::element::Type_t element_type;
+
+public:
+    void SetUp() override {
+        deviceName = ov::test::utils::DEVICE_GPU;
+        std::tie(element_type) = this->GetParam();
+        config = {ov::hint::inference_precision(ov::element::f16),
+                  ov::hint::model_priority(ov::hint::Priority::HIGH),
+                  ov::hint::execution_mode(ov::hint::ExecutionMode::PERFORMANCE),
+                  ov::hint::performance_mode(ov::hint::PerformanceMode::LATENCY)};
+
+        auto input1 = std::make_shared<ov::op::v0::Parameter>(element_type, ov::Shape{1, 2, 10, 10});
+        auto constant = ov::op::v0::Constant::create(element_type, ov::Shape{1, 2, 10, 10}, {1});
+        auto add = std::make_shared<ov::op::v1::Add>(input1, constant);
+        fn_ptr = std::make_shared<ov::Model>(ov::NodeVector{add}, ov::ParameterVector{input1});
+    }
+    static std::string getTestCaseName(const testing::TestParamInfo<RemoteTensorDataTypesOptionsParams>& obj) {
+        ov::element::Type_t elem_type;
+        std::tie(elem_type) = obj.param;
+
+        std::ostringstream result;
+        result << "OVRemoteTensorTest_" << elem_type;
+        return result.str();
+    }
+};
+
+TEST_P(OVRemoteTensorDataType_Test, smoke_RemoteTensorDataType) {
+#if defined(ANDROID)
+    GTEST_SKIP();
+#endif
+    auto ppp = ov::preprocess::PrePostProcessor(fn_ptr);
+    ppp.output(0).tensor().set_element_type(element_type);
+    auto ov_model = ppp.build();
+
+    auto core = ov::Core();
+    ov::CompiledModel compiled_model = core.compile_model(ov_model, deviceName, config);
+
+    // regular inference
+    auto inf_req = compiled_model.create_infer_request();
+    auto input_element_type = inf_req.get_input_tensor(0).get_element_type();
+    auto input_shape = inf_req.get_input_tensor(0).get_shape();
+    auto output_element_type = inf_req.get_output_tensor(0).get_element_type();
+    auto output_shape = inf_req.get_output_tensor(0).get_shape();
+
+    ASSERT_EQ(input_element_type, element_type);
+    ASSERT_EQ(output_element_type, element_type);
+
+    auto remote_context = compiled_model.get_context().as<ov::intel_gpu::ocl::ClContext>();
+    auto input_tensor = ov::test::utils::create_and_fill_tensor(input_element_type, input_shape);
+    auto output_tensor = ov::test::utils::create_and_fill_tensor(output_element_type, output_shape);
+
+    auto input_cl_tensor = remote_context.create_tensor(input_element_type, input_shape);
+    auto output_cl_tensor = remote_context.create_tensor(output_element_type, output_shape);
+
+    input_cl_tensor.copy_from(input_tensor);
+
+    inf_req.set_input_tensor(0, input_tensor);
+    inf_req.set_output_tensor(0, output_tensor);
+    inf_req.infer();
+
+    inf_req.set_input_tensor(0, input_cl_tensor);
+    inf_req.set_output_tensor(0, output_cl_tensor);
+    inf_req.infer();
+
+    auto tmp_tensor = ov::Tensor(output_element_type, output_shape);
+    output_cl_tensor.copy_to(tmp_tensor);
+
+    if (element_type == ov::element::i16) {
+        compare_data<ov::element_type_traits<ov::element::i16>::value_type>(output_tensor, tmp_tensor);
+    } else if (element_type == ov::element::u16) {
+        compare_data<ov::element_type_traits<ov::element::u16>::value_type>(output_tensor, tmp_tensor);
+    } else if (element_type == ov::element::u32) {
+        compare_data<ov::element_type_traits<ov::element::u32>::value_type>(output_tensor, tmp_tensor);
+    }
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke_RemoteTensorDataType, OVRemoteTensorDataType_Test,
+                         ::testing::Combine(::testing::Values(ov::element::Type_t::i16,
+                                                              ov::element::Type_t::u16,
+                                                              ov::element::Type_t::u32)),
+                         OVRemoteTensorDataType_Test::getTestCaseName);
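
For context, below is a minimal usage sketch (not part of the patch) of what this change enables: an application can bind i16/u16/u32 remote tensors to an infer request directly, and the plugin no longer stages an i16-to-f32 conversion through host memory. The model path "model.xml" and the assumption that the model's input/output ports are i16 with static shapes are hypothetical; the remote-context calls mirror those used in the new smoke_RemoteTensorDataType test above.

```cpp
// Minimal sketch, assuming a GPU device and a model whose I/O type is i16.
#include <openvino/openvino.hpp>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder path; assumed i16 I/O
    auto compiled = core.compile_model(model, "GPU");

    // The GPU remote context owns device-side (cl_mem) allocations.
    auto context = compiled.get_context().as<ov::intel_gpu::ocl::ClContext>();

    auto in_port = compiled.input();
    auto out_port = compiled.output();

    // Remote tensors created with the model's native i16 element type; before
    // this patch, i16/u16/u32 were remapped and converted via a host copy.
    auto in_tensor = context.create_tensor(in_port.get_element_type(), in_port.get_shape());
    auto out_tensor = context.create_tensor(out_port.get_element_type(), out_port.get_shape());

    auto request = compiled.create_infer_request();
    request.set_tensor(in_port, in_tensor);
    request.set_tensor(out_port, out_tensor);
    request.infer();  // no i16 <-> f32 staging conversion on the host
    return 0;
}
```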