From c561f61c839bf985971c6f68861578039dd44ad0 Mon Sep 17 00:00:00 2001
From: Bogdan Pereanu
Date: Tue, 8 Oct 2024 11:00:51 +0300
Subject: [PATCH] [NPU] Adding support for the set_tensors method (#26823)

### Details:
 - *Adding support for the set_tensors method*

set_tensors behaves differently depending on whether the plugin or the compiler handles the batch:
- if the compiler handles batching, a contiguous L0 tensor must be created and every user tensor copied into it, even when the tensors are part of the same L0 context
- if the plugin handles batching and the remote tensor feature is supported, no copy is performed when the tensors are part of the same L0 context.

### Tickets:
 - *EISW-116494*
---
 .../src/al/include/sync_infer_request.hpp | 27 +-
 .../src/al/src/sync_infer_request.cpp | 125 +++++-
 .../backend/include/zero_infer_request.hpp | 14 +-
 .../src/backend/include/zero_pipeline.hpp | 5 +-
 .../src/backend/src/zero_infer_request.cpp | 317 +++++++++----
 .../src/backend/src/zero_pipeline.cpp | 60 ++-
 .../batched_tensors_tests/batched_run.cpp | 21 +
 .../batched_tensors_tests/batched_run.hpp | 425 ++++++++++++++++++
 .../skip_tests_config.cpp | 13 +-
 9 files changed, 872 insertions(+), 135 deletions(-)
 create mode 100644 src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.cpp
 create mode 100644 src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.hpp

diff --git a/src/plugins/intel_npu/src/al/include/sync_infer_request.hpp b/src/plugins/intel_npu/src/al/include/sync_infer_request.hpp index bf9e0f20af3b78..ade70f9b67dc0f 100644 --- a/src/plugins/intel_npu/src/al/include/sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/al/include/sync_infer_request.hpp @@ -22,7 +22,7 @@ namespace intel_npu { */ class SyncInferRequest : public ov::IInferRequest { public: - explicit SyncInferRequest(const std::shared_ptr& compiledModel); + explicit SyncInferRequest(const std::shared_ptr& compiledModel, const Config& config); /** * @brief Gets an input/output tensor for inference. @@ -50,8 +50,8 @@ class SyncInferRequest : public ov::IInferRequest { * @brief Currently there is no support implemented for batches of tensors, thus this call is a simple redirection * to the "set_tensor" one. */ - void set_tensors(const ov::Output& port, - const std::vector>& tensors) override; + virtual void set_tensors(const ov::Output& port, + const std::vector>& tensors) override; /** * @brief Gets inputs for infer request @@ -126,6 +126,15 @@ class SyncInferRequest : public ov::IInferRequest { */ void check_tensor(const ov::Output& port, const ov::SoPtr& tensor) const; + /** + * @brief Basic checks for input tensors + * + * @param port Input port + * @param tensors Input tensors + */ + void check_batched_tensors(const ov::Output& port, + const std::vector>& tensors) const; + /** * @brief Check that all tensors are valid. Throws an exception if it's not. */ @@ -153,14 +162,22 @@ class SyncInferRequest : public ov::IInferRequest { const ov::Allocator& allocator = {}, const std::optional batchSize = std::nullopt) const; + bool is_batched_input(size_t idx) const; + + ov::SoPtr& get_user_input(size_t index) const; + std::vector>& get_user_inputs(size_t index) const; + // This is intel_npu::ICompiledModel pointer, but need to use OV base class because // ov::IInferRequest::get_compiled_model returns a reference to shared_ptr!
std::shared_ptr _compiledModel; NetworkMetadata _metadata; - mutable std::vector> _userInputTensors; - mutable std::vector> _userOutputTensors; + Logger _logger; + + // In case set_tensors is called, we receive a vector with N tensors otherwise only 1 tensor is needed + mutable std::vector>> _userInputTensors; + mutable std::vector> _userOutputTensors; mutable std::vector> _variableStates; diff --git a/src/plugins/intel_npu/src/al/src/sync_infer_request.cpp b/src/plugins/intel_npu/src/al/src/sync_infer_request.cpp index 08d5b518b98cad..04e9ce0d9bbcf8 100644 --- a/src/plugins/intel_npu/src/al/src/sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/al/src/sync_infer_request.cpp @@ -19,11 +19,12 @@ constexpr size_t BATCH_AXIS = 0; namespace intel_npu { -SyncInferRequest::SyncInferRequest(const std::shared_ptr& compiledModel) +SyncInferRequest::SyncInferRequest(const std::shared_ptr& compiledModel, const Config& config) : _compiledModel(compiledModel), _metadata(compiledModel->get_network_metadata()), - _userInputTensors(_metadata.inputs.size(), nullptr), - _userOutputTensors(_metadata.outputs.size(), nullptr) { + _logger("SyncInferRequest", config.get()), + _userInputTensors(_metadata.inputs.size(), std::vector>(1, {nullptr})), + _userOutputTensors(_metadata.outputs.size(), {nullptr}) { OPENVINO_ASSERT(_compiledModel); if (get_outputs().empty()) { @@ -121,7 +122,7 @@ ov::SoPtr SyncInferRequest::get_tensor(const ov::Output& port, const } if (foundPort.is_input()) { - _userInputTensors.at(foundPort.idx) = tensor._ptr; + get_user_input(foundPort.idx) = tensor; } else { - _userOutputTensors.at(foundPort.idx) = tensor._ptr; + _userOutputTensors.at(foundPort.idx) = tensor; } } -std::vector> SyncInferRequest::get_tensors(const ov::Output& /*port*/) const { +std::vector> SyncInferRequest::get_tensors(const ov::Output& port) const { OV_ITT_SCOPED_TASK(ov::itt::domains::Plugin, "get_tensors"); - // Using batches of tensors is currently not supported by the NPU plugin. In this scenario, the OpenVINO API demands - // returning an empty vector. + auto foundPort = find_port(port); + OPENVINO_ASSERT(foundPort.found(), "Cannot find input tensors for port ", port); + + if (foundPort.is_input() && is_batched_input(foundPort.idx)) { + return get_user_inputs(foundPort.idx); + } + return {}; } @@ -192,11 +198,89 @@ void SyncInferRequest::check_tensor(const ov::Output& port, "Tensor data equal nullptr!"); } +void SyncInferRequest::check_batched_tensors(const ov::Output& port, + const std::vector>& tensors) const { + OPENVINO_ASSERT(!tensors.empty(), "set_input_tensors/set_tensors can't be called with empty tensors"); + OPENVINO_ASSERT( + tensors.size() != 1, + "Internal error (plugin): check_batched_tensors is not allowed to have only one tensor inside batch"); + + auto layout = ov::layout::get_layout(port); + + int64_t batch_idx; + + if (layout.empty()) { + _logger.warning("set_input_tensors/set_tensors layout is not set, assuming batch dimension is found on 0 axis"); + batch_idx = BATCH_AXIS; + } else { + OPENVINO_ASSERT(ov::layout::has_batch(layout), + "set_input_tensors/set_tensors can be used only for inputs with N(batch) dimension" + " 'layout' defined. 
Current layout is ", + layout.to_string()); + batch_idx = ov::layout::batch_idx(layout); + } + + if (batch_idx < 0) { + batch_idx += static_cast(tensors[BATCH_AXIS]->get_shape().size()); + } + OPENVINO_ASSERT(batch_idx == BATCH_AXIS, + "set_input_tensors/set_tensors is not currently supported for batch dimension index ", + batch_idx, + " != 0"); + std::for_each(tensors.begin(), tensors.end(), [&batch_idx](const ov::SoPtr& item) { + OPENVINO_ASSERT(item, "Unintialized tensor is provided!"); + OPENVINO_ASSERT(item->get_shape()[batch_idx] == 1, + "set_input_tensors/set_tensors. Tensors shall represent one item in a batch, ", + item->get_shape()[batch_idx], + " provided"); + }); + auto tensors_size = static_cast(tensors.size()); + if (port.get_partial_shape().rank().is_static()) { + OPENVINO_ASSERT(batch_idx >= 0 && batch_idx < port.get_partial_shape().rank().get_length(), + "set_input_tensors/set_tensors error. Layout ", + layout.to_string(), + " is incorrect for operation with shape ", + port.get_partial_shape()); + auto batch = port.get_partial_shape()[batch_idx]; + + OPENVINO_ASSERT(batch.is_dynamic() || batch.get_length() == tensors_size, + "set_input_tensors/set_tensors error. Input shape ", + port.get_partial_shape(), + "batch ", + batch, + "doesn't match with total blobs count: ", + tensors_size); + } + + auto batched_shape = tensors[BATCH_AXIS]->get_shape(); + auto element_type = tensors[BATCH_AXIS]->get_element_type(); + batched_shape[batch_idx] = tensors_size; + for (const auto& item : tensors) { + OPENVINO_ASSERT(item, "Unintialized tensor is provided!"); + auto item_shape = item->get_shape(); + item_shape[batch_idx] = batched_shape[batch_idx]; + OPENVINO_ASSERT(item_shape == batched_shape && item->get_element_type() == element_type && + "set_input_tensors/set_tensors error. 
Tensor with element type ", + item->get_element_type(), + " and shape ", + item_shape, + " is not compatible with batched tensor with element type ", + element_type, + " and shape ", + batched_shape); + OPENVINO_ASSERT(item->is_continuous(), "Strides for batched tensors should be default."); + } +} + void SyncInferRequest::check_tensors() const { const auto& inputs = _compiledModel->inputs(); for (size_t i = 0; i < inputs.size(); i++) { - if (_userInputTensors.at(i)) { - check_tensor(inputs[i], _userInputTensors.at(i)); + if (is_batched_input(i)) { + check_batched_tensors(inputs[i], get_user_inputs(i)); + continue; + } + if (get_user_input(i)) { + check_tensor(inputs[i], get_user_input(i)); } } @@ -229,7 +313,7 @@ std::shared_ptr SyncInferRequest::allocate_tensor(const IODescripto OPENVINO_ASSERT(descriptor.relatedDescriptorIndex.has_value(), "The link between state descriptors is missing, state name: ", descriptor.nameFromCompiler); - tensor = _userInputTensors.at(*descriptor.relatedDescriptorIndex); + tensor = get_user_input(*descriptor.relatedDescriptorIndex)._ptr; } else if (allocator) { tensor = ov::make_tensor(descriptor.precision, allocatedTensorShape, allocator); } else { @@ -237,8 +321,8 @@ std::shared_ptr SyncInferRequest::allocate_tensor(const IODescripto } if (isInput) { - if (_userInputTensors.at(index) == nullptr) { - _userInputTensors.at(index) = tensor; + if (get_user_input(index) == nullptr) { + get_user_input(index) = tensor; } if (descriptor.isStateInput) { @@ -250,4 +334,17 @@ std::shared_ptr SyncInferRequest::allocate_tensor(const IODescripto return tensor; } + +bool SyncInferRequest::is_batched_input(size_t idx) const { + return _userInputTensors.at(idx).size() > 1; +} + +ov::SoPtr& SyncInferRequest::get_user_input(size_t index) const { + return _userInputTensors.at(index).at(0); +} + +std::vector>& SyncInferRequest::get_user_inputs(size_t index) const { + return _userInputTensors.at(index); +} + } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp index f6d15d2c2aed5e..6d0b343bf8d7b7 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp @@ -27,6 +27,8 @@ class ZeroInferRequest final : public SyncInferRequest { ov::SoPtr get_tensor(const ov::Output& port) const override; void set_tensor(const ov::Output& port, const ov::SoPtr& tensor) override; + void set_tensors(const ov::Output& port, + const std::vector>& tensors) override; void infer() override; void infer_async() override; @@ -54,7 +56,7 @@ class ZeroInferRequest final : public SyncInferRequest { * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside * the plugin. 
*/ - std::optional getBatchSize(const NetworkMetadata& metadata); + std::optional get_batch_size(const NetworkMetadata& metadata); /** * @brief Check the received tensor and set the Level Zero tensor accordingly @@ -75,6 +77,12 @@ class ZeroInferRequest final : public SyncInferRequest { void check_network_precision(const ov::element::Type_t precision) const override; void create_pipeline(); + std::shared_ptr& get_level_zero_input(size_t index, size_t tensorNo = 0) const; + std::vector>& get_level_zero_inputs(size_t index) const; + + std::optional& get_input_tensor_data(size_t index, size_t tensorNo = 0) const; + std::vector>& get_input_tensors_data(size_t index) const; + const std::shared_ptr _initStructs; const std::shared_ptr _executorPtr; const ZeroExecutor* _executor; @@ -83,10 +91,10 @@ class ZeroInferRequest final : public SyncInferRequest { // A copy of each tensor is needed to maintain the original L0 memory allocation in case the user provides another // memory area for the tensor. - mutable std::vector> _levelZeroInputTensors; + mutable std::vector>> _levelZeroInputTensors; mutable std::vector> _levelZeroOutputTensors; - mutable std::vector> _inputTensorsData; + mutable std::vector>> _inputTensorsData; mutable std::vector> _outputTensorsData; ze_device_properties_t _properties = {}; diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp index 6a5cc79ed7a7fc..4160a2ca979290 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp @@ -25,7 +25,7 @@ struct Pipeline { zeroProfiling::ProfilingPool& profiling_pool, zeroProfiling::ProfilingQuery& profiling_query, std::shared_ptr npu_profiling, - const std::vector>& inputTensorsData, + const std::vector>>& inputTensorsData, const std::vector>& outputTensorsData, const size_t numberOfCommandLists); @@ -37,7 +37,8 @@ struct Pipeline { void pull(); void reset() const; - void updateCommandList(const TensorData& tensorsData, const uint32_t index); + void updateCommandList(const TensorData& tensorsData, uint32_t index); + void updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex); protected: const Config _config; diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index 0a8d8dded5e97d..2c954151a4f652 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -19,6 +19,7 @@ using namespace intel_npu; namespace { +constexpr std::size_t SINGLE_TENSOR = 0; constexpr std::size_t BATCH_AXIS = 0; constexpr std::size_t DEFAULT_BATCH_SIZE = 1; constexpr bool INPUT = true; @@ -30,8 +31,8 @@ constexpr bool OUTPUT = false; * @param ioDescriptor The OpenVINO API specific I/O descriptor which shall be compared. * @param zeDescriptor The Level Zero specific structure used for comparison. 
*/ -void checkLevelZeroAttributesMatch(const IODescriptor& ioDescriptor, - const ZeroExecutor::ArgumentDescriptor& zeDescriptor) { +void check_level_zero_attributes_match(const IODescriptor& ioDescriptor, + const ZeroExecutor::ArgumentDescriptor& zeDescriptor) { std::string zeDescriptorName = zeDescriptor.info.name; if (isStateInputName(zeDescriptorName)) { @@ -78,9 +79,25 @@ Type extract_object(const ov::AnyMap& params, const ov::Property& p) { return res.as(); } +bool memory_was_allocated_in_the_same_l0_context(ze_context_handle_t hContext, const void* ptr) { + ze_memory_allocation_properties_t desc = {}; + desc.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES; + auto res = intel_npu::zeMemGetAllocProperties(hContext, ptr, &desc, nullptr); + if (res == ZE_RESULT_SUCCESS) { + if (desc.id) { + if ((desc.type & ZE_MEMORY_TYPE_HOST) || (desc.type & ZE_MEMORY_TYPE_DEVICE) || + (desc.type & ZE_MEMORY_TYPE_SHARED)) { + return true; + } + } + } + + return false; +} + } // namespace -std::optional ZeroInferRequest::getBatchSize(const NetworkMetadata& metadata) { +std::optional ZeroInferRequest::get_batch_size(const NetworkMetadata& metadata) { if (!metadata.outputs.at(0).shapeFromIRModel.has_value()) { _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); return std::nullopt; @@ -143,15 +160,15 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& const std::shared_ptr& compiledModel, const std::shared_ptr& executor, const Config& config) - : SyncInferRequest(compiledModel), + : SyncInferRequest(compiledModel, config), _initStructs(initStructs), _executorPtr(executor), _executor(static_cast(_executorPtr.get())), _config(config), _logger("ZeroInferRequest", config.get()), - _levelZeroInputTensors(_metadata.inputs.size(), nullptr), + _levelZeroInputTensors(_metadata.inputs.size(), std::vector>(1, nullptr)), _levelZeroOutputTensors(_metadata.outputs.size(), nullptr), - _inputTensorsData(_metadata.inputs.size(), std::nullopt), + _inputTensorsData(_metadata.inputs.size(), std::vector>(1, std::nullopt)), _outputTensorsData(_metadata.outputs.size(), std::nullopt), _profilingPool(_executor->graph(), zeroProfiling::POOL_SIZE, _executor->getInitStructs()->getProfilingDdiTable()), _profilingQuery(0, @@ -179,7 +196,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& std::make_shared(_initStructs, ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED); if (config.get() != ov::intel_npu::BatchMode::COMPILER) { - _batchSize = getBatchSize(_metadata); + _batchSize = get_batch_size(_metadata); } if (_batchSize.has_value()) { _numberOfCommandLists = *_batchSize; @@ -189,24 +206,23 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& size_t ioIndex = 0; for (const IODescriptor& inputDescriptor : _metadata.inputs) { - checkLevelZeroAttributesMatch(inputDescriptor, executorInputDescriptors.at(ioIndex)); + check_level_zero_attributes_match(inputDescriptor, executorInputDescriptors.at(ioIndex)); if (!(inputDescriptor.isStateInput || inputDescriptor.isShapeTensor)) { ++ioIndex; continue; } - _levelZeroInputTensors.at(ioIndex) = - allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _batchSize); - _inputTensorsData.at(ioIndex) = - TensorData{_levelZeroInputTensors.at(ioIndex)->data(), _levelZeroInputTensors.at(ioIndex)->get_byte_size()}; + get_level_zero_input(ioIndex) = allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _batchSize); + get_input_tensor_data(ioIndex) = + TensorData{get_level_zero_input(ioIndex)->data(), 
get_level_zero_input(ioIndex)->get_byte_size()}; ++ioIndex; } ioIndex = 0; for (const IODescriptor& outputDescriptor : _metadata.outputs) { - checkLevelZeroAttributesMatch(outputDescriptor, executorOutputDescriptors.at(ioIndex)); + check_level_zero_attributes_match(outputDescriptor, executorOutputDescriptors.at(ioIndex)); if (!(outputDescriptor.isStateOutput || outputDescriptor.isShapeTensor)) { ++ioIndex; @@ -227,18 +243,25 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& void ZeroInferRequest::create_pipeline() { for (size_t inputIndex = 0; inputIndex < _metadata.inputs.size(); ++inputIndex) { - if (_levelZeroInputTensors.at(inputIndex)) { + if (is_batched_input(inputIndex)) { + if (_batchSize.has_value()) { + _logger.debug("ZeroInferRequest::create_pipeline - tensors %s were already allocated", + _metadata.inputs.at(inputIndex).nodeFriendlyName.c_str()); + continue; + } + } + + if (get_level_zero_input(inputIndex)) { _logger.debug("ZeroInferRequest::create_pipeline - tensor %s was already allocated", _metadata.inputs.at(inputIndex).nodeFriendlyName.c_str()); continue; } - _logger.debug("ZeroInferRequest::create_pipeline - Allocate new tensor"); - _levelZeroInputTensors.at(inputIndex) = + _logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor"); + get_level_zero_input(inputIndex) = allocate_tensor(_metadata.inputs.at(inputIndex), inputIndex, INPUT, *_inputAllocator, _batchSize); - _inputTensorsData.at(inputIndex) = - std::optional(TensorData{_levelZeroInputTensors.at(inputIndex)->data(), - _levelZeroInputTensors.at(inputIndex)->get_byte_size()}); + get_input_tensor_data(inputIndex) = std::optional( + TensorData{get_level_zero_input(inputIndex)->data(), get_level_zero_input(inputIndex)->get_byte_size()}); } for (size_t outputIndex = 0; outputIndex < _metadata.outputs.size(); ++outputIndex) { @@ -274,47 +297,32 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr tensor const size_t index, const bool isInput) { OV_ITT_TASK_CHAIN(ZERO_SET_TENSOR, itt::domains::LevelZeroBackend, "set_tensor", "set_tensor_data"); - auto& levelZeroTensors = isInput ? _levelZeroInputTensors : _levelZeroOutputTensors; - auto& tensorsData = isInput ? _inputTensorsData : _outputTensorsData; + auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index); + auto& tensorsData = isInput ? 
get_input_tensor_data(index) : _outputTensorsData.at(index); bool setTensorData = false; bool levelZeroTensorCreatedLocally = true; OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "check_data_allocation"); - ze_memory_allocation_properties_t desc = {}; - desc.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES; - auto res = zeMemGetAllocProperties(_initStructs->getContext(), tensor->data(), &desc, nullptr); - if (res == ZE_RESULT_SUCCESS) { - if (desc.id) { - switch (desc.type) { - case ZE_MEMORY_TYPE_HOST: - case ZE_MEMORY_TYPE_DEVICE: - case ZE_MEMORY_TYPE_SHARED: - _logger.debug("ZeroInferRequest::set_tensor_data - tensor was created in the same L0 context"); - levelZeroTensors.at(index) = tensor; - levelZeroTensorCreatedLocally = false; - setTensorData = true; - break; - case ZE_MEMORY_TYPE_UNKNOWN: - case ZE_MEMORY_TYPE_FORCE_UINT32: - break; - } - } + if (memory_was_allocated_in_the_same_l0_context(_initStructs->getContext(), tensor->data())) { + _logger.debug("ZeroInferRequest::set_tensor_data - tensor was created in the same L0 context"); + levelZeroTensors = tensor; + levelZeroTensorCreatedLocally = false; + setTensorData = true; } if (!setTensorData) { // make sure that the L0 tensor was allocated locally and is not received from the user when receiving // random tensor - if (tensorsData.at(index).has_value() && !tensorsData.at(index)->levelZeroTensorCreatedLocally) { + if (tensorsData.has_value() && !tensorsData->levelZeroTensorCreatedLocally) { _logger.debug("ZeroInferRequest::set_tensor_data - create locally L0 tensor"); OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "allocate tensor"); - levelZeroTensors.at(index) = - allocate_tensor(isInput ? _metadata.inputs.at(index) : _metadata.outputs.at(index), - index, - isInput, - isInput ? *_inputAllocator : *_outputAllocator, - _batchSize); + levelZeroTensors = allocate_tensor(isInput ? _metadata.inputs.at(index) : _metadata.outputs.at(index), + index, + isInput, + isInput ? *_inputAllocator : *_outputAllocator, + _batchSize); setTensorData = true; levelZeroTensorCreatedLocally = true; @@ -322,15 +330,14 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr tensor } if (setTensorData) { - tensorsData.at(index) = std::optional(TensorData{levelZeroTensors.at(index)->data(), - levelZeroTensors.at(index)->get_byte_size(), - levelZeroTensorCreatedLocally}); + tensorsData = std::optional( + TensorData{levelZeroTensors->data(), levelZeroTensors->get_byte_size(), levelZeroTensorCreatedLocally}); if (_pipelineIsCreated) { _logger.debug("ZeroInferRequest::infer_async - update command list"); OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList"); - _pipeline->updateCommandList(*tensorsData.at(index), + _pipeline->updateCommandList(*tensorsData, isInput ? _executor->get_input_descriptors().at(index).idx : _executor->get_output_descriptors().at(index).idx); } @@ -353,17 +360,17 @@ void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptrget_byte_size(), false}); + levelZeroTensors = tensor; + tensorsData = std::optional(TensorData{data, tensor->get_byte_size(), false}); if (_pipelineIsCreated) { _logger.debug("ZeroInferRequest::infer_async - update command list"); OV_ITT_TASK_NEXT(ZERO_SET_REMOTE_TENSOR, "updateCommandList"); - _pipeline->updateCommandList(*tensorsData.at(index), + _pipeline->updateCommandList(*tensorsData, isInput ? 
_executor->get_input_descriptors().at(index).idx : _executor->get_output_descriptors().at(index).idx); } @@ -381,9 +388,16 @@ void ZeroInferRequest::set_tensor(const ov::Output& port, const } if (foundPort.is_input()) { - _userInputTensors.at(foundPort.idx) = tensor._ptr; + if (is_batched_input(foundPort.idx)) { + // resize vector size to 1 if set_tensor is called after set_tensors + get_input_tensors_data(foundPort.idx).resize(1); + get_level_zero_inputs(foundPort.idx).resize(1); + get_user_inputs(foundPort.idx).resize(1); + } + + get_user_input(foundPort.idx) = tensor; } else { - _userOutputTensors.at(foundPort.idx) = tensor._ptr; + _userOutputTensors.at(foundPort.idx) = tensor; } if (_initStructs->getMutableCommandListVersion()) { @@ -399,6 +413,78 @@ void ZeroInferRequest::set_tensor(const ov::Output& port, const } } +void ZeroInferRequest::set_tensors(const ov::Output& port, + const std::vector>& tensors) { + OV_ITT_TASK_CHAIN(SET_TENSORS, itt::domains::LevelZeroBackend, "set_tensors", "set_tensors"); + if (tensors.size() == 1) { + set_tensor(port, tensors[0]); + return; + } + + auto foundPort = find_port(port); + OPENVINO_ASSERT(foundPort.found(), "Cannot find input tensor for port ", port); + if (!foundPort.is_input()) { + OPENVINO_THROW("set_input_tensors/set_tensors is not supported for output port."); + } + + check_batched_tensors(port, tensors); + + get_user_inputs(foundPort.idx).resize(tensors.size()); + get_user_inputs(foundPort.idx) = tensors; + + if (_initStructs->getMutableCommandListVersion()) { + if (_batchSize.has_value()) { + for (size_t i = 0; i < tensors.size(); i++) { + auto remoteTensor = std::dynamic_pointer_cast(tensors[i]._ptr); + + get_level_zero_inputs(foundPort.idx).resize(tensors.size()); + get_input_tensors_data(foundPort.idx).resize(tensors.size()); + + if (remoteTensor == nullptr) { + bool tensorHasSameL0Context = false; + + OV_ITT_TASK_NEXT(SET_TENSORS, "check_data_allocation"); + if (memory_was_allocated_in_the_same_l0_context(_initStructs->getContext(), tensors[i]->data())) { + _logger.debug("ZeroInferRequest::set_tensors - tensor was created in the same L0 context"); + + get_level_zero_input(foundPort.idx, i) = tensors.at(i)._ptr; + tensorHasSameL0Context = true; + } + + if (!tensorHasSameL0Context) { + _logger.debug("ZeroInferRequest::set_tensors - tensor wasn't created in the same L0 context, " + "create a L0 tensor"); + + get_level_zero_input(foundPort.idx, i) = + allocate_tensor(_metadata.inputs.at(foundPort.idx), foundPort.idx, true, *_inputAllocator); + } + + get_input_tensor_data(foundPort.idx, i) = + std::optional(TensorData{get_level_zero_input(foundPort.idx, i)->data(), + get_level_zero_input(foundPort.idx, i)->get_byte_size(), + false}); + } else { + _logger.debug("ZeroInferRequest::set_tensors - remote tensor is used"); + + get_input_tensor_data(foundPort.idx, i) = std::optional( + TensorData{extract_object(remoteTensor->get_properties(), ov::intel_npu::mem_handle), + remoteTensor->get_byte_size(), + false}); + + get_level_zero_input(foundPort.idx, i) = tensors.at(i)._ptr; + } + + if (_pipelineIsCreated) { + OV_ITT_TASK_NEXT(SET_TENSORS, "updateCommandList"); + _pipeline->updateCommandList(*get_input_tensor_data(foundPort.idx, i), + _executor->get_input_descriptors().at(foundPort.idx).idx, + i); + } + } + } + } +} + ov::SoPtr ZeroInferRequest::get_tensor(const ov::Output& port) const { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "get_tensor"); @@ -407,28 +493,31 @@ ov::SoPtr ZeroInferRequest::get_tensor(const ov::Outputdata(), 
levelZeroTensors.at(ioIndex)->get_byte_size()}); + levelZeroTensors = allocate_tensor(isInput ? _metadata.inputs.at(ioIndex) : _metadata.outputs.at(ioIndex), + ioIndex, + isInput, + isInput ? *_inputAllocator : *_outputAllocator, + _batchSize); + tensorsData = std::optional(TensorData{levelZeroTensors->data(), levelZeroTensors->get_byte_size()}); - return levelZeroTensors.at(ioIndex); + return levelZeroTensors; } void ZeroInferRequest::infer() { @@ -450,26 +539,75 @@ void ZeroInferRequest::infer_async() { _executor->mutexUnlock(); size_t inputIndex = 0; - for (const std::shared_ptr& userTensor : _userInputTensors) { + for (const auto& userTensor : _userInputTensors) { const IODescriptor inputDescriptor = _metadata.inputs.at(inputIndex); if (inputDescriptor.isShapeTensor) { OPENVINO_ASSERT(inputDescriptor.relatedDescriptorIndex.has_value(), "The link between the dynamic tensor and its shape tensor is missing, entry name: ", inputDescriptor.nameFromCompiler); - const auto& inputDims = _userInputTensors.at(*inputDescriptor.relatedDescriptorIndex)->get_shape(); + const auto& inputDims = get_user_input(*inputDescriptor.relatedDescriptorIndex)->get_shape(); - for (size_t i = 0; i < userTensor->get_size(); ++i) { + for (size_t i = 0; i < userTensor.at(SINGLE_TENSOR)->get_size(); ++i) { const auto reverseIdx = inputDims.size() - 1 - i; - userTensor->data()[i] = static_cast(inputDims[reverseIdx]); + userTensor.at(SINGLE_TENSOR)->data()[i] = static_cast(inputDims[reverseIdx]); } } - auto userRemoteTensor = std::dynamic_pointer_cast(userTensor); + if (is_batched_input(inputIndex)) { + if (_batchSize.has_value()) { + for (size_t i = 0; i < userTensor.size(); i++) { + auto levelZeroBatchRemoteTensor = + std::dynamic_pointer_cast(get_level_zero_input(inputIndex, i)); + if (levelZeroBatchRemoteTensor == nullptr) { + void* levelZeroBuffer = get_level_zero_input(inputIndex, i)->data(); + + auto userBatchRemoteTensor = std::dynamic_pointer_cast(userTensor.at(i)._ptr); + + void* userBuffer = + !userBatchRemoteTensor + ? userTensor.at(i)->data() + : extract_object(userBatchRemoteTensor->get_properties(), ov::intel_npu::mem_handle); + + if (userBuffer != levelZeroBuffer) { + if (userBuffer == nullptr || levelZeroBuffer == nullptr) { + OPENVINO_THROW("Empty buffer"); + } + + _logger.info("Batched Tensors - Tensor is not allocated in the current Level Zero context"); + OV_ITT_TASK_NEXT(ZERO_INFER, "memcpy"); + std::memcpy(levelZeroBuffer, userBuffer, userTensor.at(i)->get_byte_size()); + } + } + } + } else { + void* levelZeroBuffer = get_level_zero_input(inputIndex)->data(); + + _logger.info("Batched Tensors - Tensor is not allocated in the current Level Zero context or must be " + "in a continued memory space"); + + for (size_t i = 0; i < userTensor.size(); i++) { + auto userBatchRemoteTensor = std::dynamic_pointer_cast(userTensor.at(i)._ptr); + + void* userBuffer = !userBatchRemoteTensor ? userTensor.at(i)->data() + : extract_object(userBatchRemoteTensor->get_properties(), + ov::intel_npu::mem_handle); + + std::memcpy(static_cast(levelZeroBuffer) + (i * userTensor.at(i)->get_byte_size()), + userBuffer, + userTensor.at(i)->get_byte_size()); + } + } + + ++inputIndex; + continue; + } + + auto userRemoteTensor = std::dynamic_pointer_cast(userTensor.at(SINGLE_TENSOR)._ptr); void* userBuffer = !userRemoteTensor - ? userTensor->data() + ? 
userTensor.at(SINGLE_TENSOR)->data() : extract_object(userRemoteTensor->get_properties(), ov::intel_npu::mem_handle); - const std::shared_ptr& levelZeroTensor = _levelZeroInputTensors.at(inputIndex); + const std::shared_ptr& levelZeroTensor = get_level_zero_input(inputIndex); auto levelZeroRemoteTensor = std::dynamic_pointer_cast(levelZeroTensor); if (levelZeroRemoteTensor == nullptr) { void* levelZeroBuffer = levelZeroTensor->data(); @@ -481,7 +619,7 @@ void ZeroInferRequest::infer_async() { _logger.info("Tensor is not allocated in the current Level Zero context"); OV_ITT_TASK_NEXT(ZERO_INFER, "memcpy"); - std::memcpy(levelZeroBuffer, userBuffer, userTensor->get_byte_size()); + std::memcpy(levelZeroBuffer, userBuffer, userTensor.at(SINGLE_TENSOR)->get_byte_size()); } } @@ -498,7 +636,7 @@ void ZeroInferRequest::get_result() { _pipeline->pull(); size_t outputIndex = 0; - for (const std::shared_ptr& userTensor : _userOutputTensors) { + for (const auto& userTensor : _userOutputTensors) { const IODescriptor outputDescriptor = _metadata.outputs.at(outputIndex); if (outputDescriptor.isShapeTensor) { OPENVINO_ASSERT(outputDescriptor.relatedDescriptorIndex.has_value(), @@ -516,7 +654,7 @@ void ZeroInferRequest::get_result() { tensorToBeReshaped->set_shape(actualDims); } - auto userRemoteTensor = std::dynamic_pointer_cast(userTensor); + auto userRemoteTensor = std::dynamic_pointer_cast(userTensor._ptr); void* userBuffer = !userRemoteTensor ? userTensor->data() : extract_object(userRemoteTensor->get_properties(), ov::intel_npu::mem_handle); @@ -615,3 +753,18 @@ std::vector ZeroInferRequest::get_profiling_info() const { std::vector ZeroInferRequest::get_raw_profiling_data() const { return _profilingQuery.getData(); } + +std::shared_ptr& ZeroInferRequest::get_level_zero_input(size_t index, size_t tensorNo) const { + return _levelZeroInputTensors.at(index).at(tensorNo); +} + +std::vector>& ZeroInferRequest::get_level_zero_inputs(size_t index) const { + return _levelZeroInputTensors.at(index); +} + +std::optional& ZeroInferRequest::get_input_tensor_data(size_t index, size_t tensorNo) const { + return _inputTensorsData.at(index).at(tensorNo); +} +std::vector>& ZeroInferRequest::get_input_tensors_data(size_t index) const { + return _inputTensorsData.at(index); +} diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index ff24536a52d9b6..cfc80d48c50707 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -20,7 +20,7 @@ Pipeline::Pipeline(const Config& config, zeroProfiling::ProfilingPool& profiling_pool, zeroProfiling::ProfilingQuery& profiling_query, std::shared_ptr npu_profiling, - const std::vector>& inputTensorsData, + const std::vector>>& inputTensorsData, const std::vector>& outputTensorsData, const size_t numberOfCommandLists) : _config(config), @@ -31,9 +31,9 @@ Pipeline::Pipeline(const Config& config, numberOfCommandLists ? 
static_cast(numberOfCommandLists) : 1, _config}, _npu_profiling(std::move(npu_profiling)), - _logger("IntegratedPipeline", _config.get()) { - OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::IntegratedPipeline::IntegratedPipeline"); - _logger.debug("IntegratedPipeline - initialize started"); + _logger("Pipeline", _config.get()) { + OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline"); + _logger.debug("Pipeline - initialize started"); if (profiling_pool.create()) { profiling_query.create(profiling_pool._handle); @@ -42,7 +42,7 @@ Pipeline::Pipeline(const Config& config, _command_lists.reserve(numberOfCommandLists); _events.reserve(numberOfCommandLists); _fences.reserve(numberOfCommandLists); - _logger.debug("IntegratedPipeline - emplace_back _event_pool and _command_queue"); + _logger.debug("Pipeline - emplace_back _event_pool and _command_queue"); for (size_t i = 0; i < numberOfCommandLists; i++) { _command_lists.emplace_back( std::make_unique(_executor->getInitStructs()->getDevice(), @@ -58,9 +58,17 @@ Pipeline::Pipeline(const Config& config, for (size_t i = 0; i < numberOfCommandLists; i++) { size_t ioIndex = 0; for (const auto& desc : _executor->get_input_descriptors()) { + if (inputTensorsData.at(ioIndex).size() > 1) { + _executor->setArgumentValue(desc.idx, inputTensorsData.at(ioIndex).at(i)->mem); + + ++ioIndex; + continue; + } + _executor->setArgumentValue(desc.idx, - static_cast(inputTensorsData.at(ioIndex)->mem) + - (i * inputTensorsData.at(ioIndex)->size) / numberOfCommandLists); + static_cast(inputTensorsData.at(ioIndex).at(0)->mem) + + (i * inputTensorsData.at(ioIndex).at(0)->size) / numberOfCommandLists); + ++ioIndex; } @@ -93,14 +101,14 @@ Pipeline::Pipeline(const Config& config, } _command_lists.at(i)->close(); } - _logger.debug("IntegratedPipeline - initialize completed"); + _logger.debug("Pipeline - initialize completed"); } void Pipeline::push() { - _logger.debug("IntegratedPipeline - push() started"); + _logger.debug("Pipeline - push() started"); for (size_t i = 0; i < _command_lists.size(); ++i) { - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PUSH, itt::domains::LevelZeroBackend, "IntegratedPipeline", "push"); + OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PUSH, itt::domains::LevelZeroBackend, "Pipeline", "push"); if (sync_output_with_fences_) { _command_queue.executeCommandList(*_command_lists.at(i), *_fences.at(i)); } else { @@ -108,12 +116,12 @@ void Pipeline::push() { } } - _logger.debug("IntegratedPipeline - push() completed"); + _logger.debug("Pipeline - push() completed"); }; void Pipeline::pull() { - _logger.debug("IntegratedPipeline - pull() started"); - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PULL, itt::domains::LevelZeroBackend, "IntegratedPipeline", "pull"); + _logger.debug("Pipeline - pull() started"); + OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PULL, itt::domains::LevelZeroBackend, "Pipeline", "pull"); for (size_t i = 0; i < _command_lists.size(); ++i) { if (sync_output_with_fences_) { @@ -127,11 +135,11 @@ void Pipeline::pull() { } } - _logger.debug("IntegratedPipeline - pull() completed"); + _logger.debug("Pipeline - pull() completed"); }; void Pipeline::reset() const { - _logger.debug("IntegratedPipeline - rest() started"); + _logger.debug("Pipeline - rest() started"); for (size_t i = 0; i < _command_lists.size(); ++i) { if (sync_output_with_fences_) { @@ -141,11 +149,13 @@ void Pipeline::reset() const { } } - _logger.debug("IntegratedPipeline - rest() completed"); + _logger.debug("Pipeline - rest() completed"); }; -void 
Pipeline::updateCommandList(const TensorData& tensorsData, const uint32_t index) { - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "IntegratedPipeline", "updateCommandList"); +void Pipeline::updateCommandList(const TensorData& tensorsData, uint32_t index) { + OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList"); + _logger.debug("Pipeline - updateCommandList"); + const size_t numberOfCommandLists = _command_lists.size(); for (size_t i = 0; i < numberOfCommandLists; i++) { @@ -156,4 +166,18 @@ void Pipeline::updateCommandList(const TensorData& tensorsData, const uint32_t i } }; +void Pipeline::updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex) { + OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList"); + _logger.debug("Pipeline - updateCommandList"); + + const size_t numberOfCommandLists = _command_lists.size(); + + OPENVINO_ASSERT(commandListIndex < numberOfCommandLists, + "Command list index is higgher than the number of Command lists ", + commandListIndex); + + _command_lists.at(commandListIndex)->updateMutableCommandList(index, tensorsData.mem); + _command_lists.at(commandListIndex)->close(); +}; + } // namespace intel_npu diff --git a/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.cpp b/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.cpp new file mode 100644 index 00000000000000..3d7d4eb89eff4c --- /dev/null +++ b/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "behavior/batched_tensors_tests/batched_run.hpp" + +#include "common/npu_test_env_cfg.hpp" +#include "common/utils.hpp" +#include "intel_npu/al/config/common.hpp" + +using namespace ov::test::behavior; + +const std::vector batchedConfigs = {{ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::PLUGIN)}, + {ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::COMPILER)}, + {ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::AUTO)}}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, + BatchedTensorsRunTests, + ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), + ::testing::ValuesIn(batchedConfigs)), + BatchedTensorsRunTests::getTestCaseName); diff --git a/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.hpp new file mode 100644 index 00000000000000..05e580ab99664c --- /dev/null +++ b/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.hpp @@ -0,0 +1,425 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "base/ov_behavior_test_utils.hpp" +#include "common/npu_test_env_cfg.hpp" +#include "common/utils.hpp" +#include "functional_test_utils/ov_plugin_cache.hpp" +#include "npu_private_properties.hpp" +#include "openvino/core/any.hpp" +#include "openvino/core/node_vector.hpp" +#include "openvino/core/type/element_iterator.hpp" +#include "openvino/op/op.hpp" +#include "openvino/opsets/opset8.hpp" +#include "openvino/runtime/compiled_model.hpp" +#include "openvino/runtime/core.hpp" +#include "openvino/runtime/intel_npu/level_zero/level_zero.hpp" +#include 
"overload/overload_test_utils_npu.hpp" + +using CompilationParams = std::tuple; + +using ::testing::AllOf; +using ::testing::HasSubstr; + +namespace ov { +namespace test { +namespace behavior { +class BatchedTensorsRunTests : public ov::test::behavior::OVPluginTestBase, + public testing::WithParamInterface { +protected: + std::shared_ptr core = utils::PluginCache::get().core(); + ov::AnyMap configuration; + std::shared_ptr ov_model; + ov::CompiledModel compiled_model; + ov::Output input; + ov::Output output; + std::string m_cache_dir; + +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + std::string targetDevice; + ov::AnyMap configuration; + std::tie(targetDevice, configuration) = obj.param; + std::replace(targetDevice.begin(), targetDevice.end(), ':', '_'); + targetDevice = ov::test::utils::getTestsPlatformFromEnvironmentOr(ov::test::utils::DEVICE_NPU); + + std::ostringstream result; + result << "targetDevice=" << targetDevice << "_"; + result << "targetPlatform=" << ov::test::utils::getTestsPlatformFromEnvironmentOr(targetDevice) << "_"; + if (!configuration.empty()) { + for (auto& configItem : configuration) { + result << "configItem=" << configItem.first << "_"; + configItem.second.print(result); + } + } + + return result.str(); + } + + void SetUp() override { + std::tie(target_device, configuration) = this->GetParam(); + + SKIP_IF_CURRENT_TEST_IS_DISABLED() + OVPluginTestBase::SetUp(); + ov_model = getDefaultNGraphFunctionForTheDeviceNPU(); // FIXME: E#80555 + } + + std::string generateCacheDirName(const std::string& test_name) { + using namespace std::chrono; + // Generate unique file names based on test name, thread id and timestamp + // This allows execution of tests in parallel (stress mode) + auto hash = std::to_string(std::hash()(test_name)); + std::stringstream ss; + auto ts = duration_cast(high_resolution_clock::now().time_since_epoch()); + ss << hash << "_" + << "_" << ts.count(); + return ss.str(); + } + + void TearDown() override { + if (!m_cache_dir.empty()) { + core->set_property({ov::cache_dir()}); + core.reset(); + ov::test::utils::PluginCache::get().reset(); + ov::test::utils::removeFilesWithExt(m_cache_dir, "blob"); + ov::test::utils::removeDir(m_cache_dir); + } + + if (!configuration.empty()) { + utils::PluginCache::get().reset(); + } + + APIBaseTest::TearDown(); + } + + std::shared_ptr create_n_inputs(size_t n, + element::Type type, + const PartialShape& shape, + const ov::Layout& layout) { + ResultVector res; + ParameterVector params; + + for (size_t i = 0; i < n; i++) { + auto index_str = std::to_string(i); + auto data1 = std::make_shared(type, shape); + data1->set_friendly_name("input" + index_str); + data1->get_output_tensor(0).set_names({"tensor_input" + index_str}); + data1->set_layout(layout); + auto constant = opset8::Constant::create(type, {1}, {1}); + auto op1 = std::make_shared(data1, constant); + op1->set_friendly_name("Add" + index_str); + auto res1 = std::make_shared(op1); + res1->set_friendly_name("Result" + index_str); + res1->get_output_tensor(0).set_names({"tensor_output" + index_str}); + params.push_back(data1); + res.push_back(res1); + } + + return std::make_shared(res, params); + } +}; + +TEST_P(BatchedTensorsRunTests, SetInputRemoteTensorsMultipleInfer) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + size_t batch = 4; + auto one_shape = Shape{1, 2, 2, 2}; + auto batch_shape = Shape{batch, 2, 2, 2}; + auto one_shape_size = ov::shape_size(one_shape); 
+ auto model = BatchedTensorsRunTests::create_n_inputs(2, element::f32, batch_shape, "N..."); + auto execNet = core->compile_model(model, target_device, configuration); + auto context = core->get_default_context(target_device); + // Create InferRequest + ov::InferRequest req; + req = execNet.create_infer_request(); + std::vector tensors; + for (size_t i = 0; i < batch; ++i) { + // non contiguous memory + auto tensor = context.create_host_tensor(ov::element::f32, one_shape); + tensors.push_back(std::move(tensor)); + } + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + for (size_t i = 0; i < batch; ++i) { + auto* f = tensors[i].data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 21) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } +} + +TEST_P(BatchedTensorsRunTests, SetInputDifferentTensorsMultipleInfer) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + size_t batch = 4; + auto one_shape = Shape{1, 2, 2, 2}; + auto batch_shape = Shape{batch, 2, 2, 2}; + auto one_shape_size = ov::shape_size(one_shape); + auto model = BatchedTensorsRunTests::create_n_inputs(2, element::f32, batch_shape, "N..."); + auto execNet = core->compile_model(model, target_device, configuration); + auto context = core->get_default_context(target_device); + // Create InferRequest + ov::InferRequest req; + req = execNet.create_infer_request(); + std::vector tensors; + + std::vector buffer(one_shape_size * 2 * 2, 0); + + auto tensor0 = ov::Tensor(element::f32, one_shape, &buffer[(0 * 2) * one_shape_size]); + auto tensor1 = context.create_host_tensor(ov::element::f32, one_shape); + auto tensor2 = ov::Tensor(element::f32, one_shape, &buffer[(1 * 2) * one_shape_size]); + auto tensor3 = context.create_host_tensor(ov::element::f32, one_shape); + + tensors.push_back(std::move(tensor0)); + tensors.push_back(std::move(tensor1)); + tensors.push_back(std::move(tensor2)); + tensors.push_back(std::move(tensor3)); + + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + for (size_t i = 0; i < batch; ++i) { + auto* f = tensors[i].data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 21) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } +} + +TEST_P(BatchedTensorsRunTests, SetInputDifferentTensorsMultipleInferMCL) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + size_t batch = 4; + auto one_shape = Shape{1, 2, 2, 2}; + auto batch_shape = Shape{batch, 2, 2, 2}; + auto one_shape_size = ov::shape_size(one_shape); + auto model = BatchedTensorsRunTests::create_n_inputs(2, element::f32, batch_shape, "N..."); + auto execNet = core->compile_model(model, target_device, configuration); + auto context = core->get_default_context(target_device); + 
// Create InferRequest + ov::InferRequest req; + req = execNet.create_infer_request(); + + std::vector buffer(one_shape_size * batch * 2, 0); + + { + std::vector tensors; + + auto tensor0 = ov::Tensor(element::f32, one_shape, &buffer[(0 * 2) * one_shape_size]); + auto tensor1 = context.create_host_tensor(ov::element::f32, one_shape); + auto tensor2 = ov::Tensor(element::f32, one_shape, &buffer[(1 * 2) * one_shape_size]); + auto tensor3 = context.create_host_tensor(ov::element::f32, one_shape); + + tensors.push_back(std::move(tensor0)); + tensors.push_back(std::move(tensor1)); + tensors.push_back(std::move(tensor2)); + tensors.push_back(std::move(tensor3)); + + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + for (size_t i = 0; i < batch; ++i) { + auto* f = tensors[i].data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 21) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } + } + + { + std::vector tensors; + + auto tensor0 = context.create_host_tensor(ov::element::f32, one_shape); + auto tensor1 = ov::Tensor(element::f32, one_shape, &buffer[(2 * 2) * one_shape_size]); + auto tensor2 = ov::Tensor(element::f32, one_shape, &buffer[(3 * 2) * one_shape_size]); + auto tensor3 = context.create_host_tensor(ov::element::f32, one_shape); + + tensors.push_back(std::move(tensor0)); + tensors.push_back(std::move(tensor1)); + tensors.push_back(std::move(tensor2)); + tensors.push_back(std::move(tensor3)); + + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + for (size_t i = 0; i < batch; ++i) { + auto* f = tensors[i].data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 200); + } + } + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 201) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } + } +} + +TEST_P(BatchedTensorsRunTests, SetInputDifferentRemoteTensorsMultipleInferMCL) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + size_t batch = 4; + auto one_shape = Shape{1, 2, 2, 2}; + auto batch_shape = Shape{batch, 2, 2, 2}; + auto one_shape_size = ov::shape_size(one_shape); + auto model = BatchedTensorsRunTests::create_n_inputs(2, element::f32, batch_shape, "N..."); + auto execNet = core->compile_model(model, target_device, configuration); + auto context = core->get_default_context(target_device).as(); + // Create InferRequest + ov::InferRequest req; + req = execNet.create_infer_request(); + + std::vector buffer(one_shape_size * 2 * 2, 0); + + { + std::vector tensors; + + auto tensor0 = ov::Tensor(element::f32, one_shape, &buffer[(0 * 2) * one_shape_size]); + auto tensor1 = context.create_l0_host_tensor(ov::element::f32, one_shape); + auto tensor2 = context.create_l0_host_tensor(ov::element::f32, one_shape); + auto tensor3 = context.create_host_tensor(ov::element::f32, one_shape); + + tensors.push_back(tensor0); + 
tensors.push_back(tensor1); + tensors.push_back(tensor2); + tensors.push_back(tensor3); + + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + { + auto* f = tensor0.data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* data = tensor1.get(); + float* f = static_cast(data); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* data = tensor2.get(); + float* f = static_cast(data); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* f = tensor3.data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 21) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } + } + + { + std::vector tensors; + + auto tensor0 = context.create_l0_host_tensor(ov::element::f32, one_shape); + auto tensor1 = context.create_host_tensor(ov::element::f32, one_shape); + auto tensor2 = ov::Tensor(element::f32, one_shape, &buffer[(1 * 2) * one_shape_size]); + auto tensor3 = context.create_l0_host_tensor(ov::element::f32, one_shape); + + tensors.push_back(tensor0); + tensors.push_back(tensor1); + tensors.push_back(tensor2); + tensors.push_back(tensor3); + + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + { + auto* data = tensor0.get(); + float* f = static_cast(data); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* f = tensor1.data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* f = tensor2.data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* data = tensor3.get(); + float* f = static_cast(data); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 21) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } + } +} + +} // namespace behavior +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_npu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_npu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 4eb829045c964a..aa61afdcacc1bc 100644 --- a/src/plugins/intel_npu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_npu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -702,18 +702,9 @@ std::vector disabledTestPatterns() { ".*OVCompiledModelPropertiesDefaultSupportedTests.CanCompileWithDefaultValueFromPlugin.*" }); - // [Tracking number: E#116494] - _skipRegistry.addPatterns( - "NPU plugin doesn't implement `set_tensors` function", { - ".*OVInferRequestBatchedTests.SetInputTensorsBase.*", - ".*OVInferRequestBatchedTests.SetInputTensorsAsync.*", - ".*OVInferRequestBatchedTests.SetInputTensors_override_with_set.*", - 
".*OVInferRequestBatchedTests.SetInputTensorsBase_Caching.*", - ".*OVInferRequestBatchedTests.SetInputTensors_Multiple_Infer.*", + _skipRegistry.addPatterns( + "NPU plugin doesn't support infer dynamic", { ".*OVInferRequestBatchedTests.SetInputTensors_Can_Infer_Dynamic.*", - ".*OVInferRequestBatchedTests.SetInputTensors_Get_Tensor_Not_Allowed.*", - ".*OVInferRequestBatchedTests.SetInputTensors_Correct_all.*", - ".*OVInferRequestBatchedTests.SetInputTensors_Cache_CheckDeepCopy.*" }); // [Tracking number: E#118381]