From c561f61c839bf985971c6f68861578039dd44ad0 Mon Sep 17 00:00:00 2001
From: Bogdan Pereanu
Date: Tue, 8 Oct 2024 11:00:51 +0300
Subject: [PATCH] [NPU] Adding support for the set_tensors method (#26823)

### Details:
 - *Adding support for the set_tensors method*

set_tensors behaves differently depending on whether the plugin or the compiler handles the batch:
- if the compiler handles batching, a contiguous L0 tensor must be created and every user tensor copied into it, even when the tensors are part of the same L0 context
- if the plugin handles batching and the remote tensor feature is supported, no copy is performed when the tensors are part of the same L0 context.

### Tickets:
 - *EISW-116494*
---
 .../src/al/include/sync_infer_request.hpp | 27 +-
 .../src/al/src/sync_infer_request.cpp | 125 +++++-
 .../backend/include/zero_infer_request.hpp | 14 +-
 .../src/backend/include/zero_pipeline.hpp | 5 +-
 .../src/backend/src/zero_infer_request.cpp | 317 +++++++++----
 .../src/backend/src/zero_pipeline.cpp | 60 ++-
 .../batched_tensors_tests/batched_run.cpp | 21 +
 .../batched_tensors_tests/batched_run.hpp | 425 ++++++++++++++++++
 .../skip_tests_config.cpp | 13 +-
 9 files changed, 872 insertions(+), 135 deletions(-)
 create mode 100644 src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.cpp
 create mode 100644 src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.hpp

diff --git a/src/plugins/intel_npu/src/al/include/sync_infer_request.hpp b/src/plugins/intel_npu/src/al/include/sync_infer_request.hpp index bf9e0f20af3b78..ade70f9b67dc0f 100644 --- a/src/plugins/intel_npu/src/al/include/sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/al/include/sync_infer_request.hpp @@ -22,7 +22,7 @@ namespace intel_npu { */ class SyncInferRequest : public ov::IInferRequest { public: - explicit SyncInferRequest(const std::shared_ptr& compiledModel); + explicit SyncInferRequest(const std::shared_ptr& compiledModel, const Config& config); /** * @brief Gets an input/output tensor for inference. @@ -50,8 +50,8 @@ class SyncInferRequest : public ov::IInferRequest { * @brief Currently there is no support implemented for batches of tensors, thus this call is a simple redirection * to the "set_tensor" one. */ - void set_tensors(const ov::Output& port, - const std::vector>& tensors) override; + virtual void set_tensors(const ov::Output& port, + const std::vector>& tensors) override; /** * @brief Gets inputs for infer request @@ -126,6 +126,15 @@ class SyncInferRequest : public ov::IInferRequest { */ void check_tensor(const ov::Output& port, const ov::SoPtr& tensor) const; + /** + * @brief Basic checks for input tensors + * + * @param port Input port + * @param tensors Input tensors + */ + void check_batched_tensors(const ov::Output& port, + const std::vector>& tensors) const; + /** * @brief Check that all tensors are valid. Throws an exception if it's not. */ @@ -153,14 +162,22 @@ class SyncInferRequest : public ov::IInferRequest { const ov::Allocator& allocator = {}, const std::optional batchSize = std::nullopt) const; + bool is_batched_input(size_t idx) const; + + ov::SoPtr& get_user_input(size_t index) const; + std::vector>& get_user_inputs(size_t index) const; + // This is intel_npu::ICompiledModel pointer, but need to use OV base class because // ov::IInferRequest::get_compiled_model returns a reference to shared_ptr!
std::shared_ptr _compiledModel; NetworkMetadata _metadata; - mutable std::vector> _userInputTensors; - mutable std::vector> _userOutputTensors; + Logger _logger; + + // In case set_tensors is called, we receive a vector with N tensors otherwise only 1 tensor is needed + mutable std::vector>> _userInputTensors; + mutable std::vector> _userOutputTensors; mutable std::vector> _variableStates; diff --git a/src/plugins/intel_npu/src/al/src/sync_infer_request.cpp b/src/plugins/intel_npu/src/al/src/sync_infer_request.cpp index 08d5b518b98cad..04e9ce0d9bbcf8 100644 --- a/src/plugins/intel_npu/src/al/src/sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/al/src/sync_infer_request.cpp @@ -19,11 +19,12 @@ constexpr size_t BATCH_AXIS = 0; namespace intel_npu { -SyncInferRequest::SyncInferRequest(const std::shared_ptr& compiledModel) +SyncInferRequest::SyncInferRequest(const std::shared_ptr& compiledModel, const Config& config) : _compiledModel(compiledModel), _metadata(compiledModel->get_network_metadata()), - _userInputTensors(_metadata.inputs.size(), nullptr), - _userOutputTensors(_metadata.outputs.size(), nullptr) { + _logger("SyncInferRequest", config.get()), + _userInputTensors(_metadata.inputs.size(), std::vector>(1, {nullptr})), + _userOutputTensors(_metadata.outputs.size(), {nullptr}) { OPENVINO_ASSERT(_compiledModel); if (get_outputs().empty()) { @@ -121,7 +122,7 @@ ov::SoPtr SyncInferRequest::get_tensor(const ov::Output& port, const } if (foundPort.is_input()) { - _userInputTensors.at(foundPort.idx) = tensor._ptr; + get_user_input(foundPort.idx) = tensor; } else { - _userOutputTensors.at(foundPort.idx) = tensor._ptr; + _userOutputTensors.at(foundPort.idx) = tensor; } } -std::vector> SyncInferRequest::get_tensors(const ov::Output& /*port*/) const { +std::vector> SyncInferRequest::get_tensors(const ov::Output& port) const { OV_ITT_SCOPED_TASK(ov::itt::domains::Plugin, "get_tensors"); - // Using batches of tensors is currently not supported by the NPU plugin. In this scenario, the OpenVINO API demands - // returning an empty vector. + auto foundPort = find_port(port); + OPENVINO_ASSERT(foundPort.found(), "Cannot find input tensors for port ", port); + + if (foundPort.is_input() && is_batched_input(foundPort.idx)) { + return get_user_inputs(foundPort.idx); + } + return {}; } @@ -192,11 +198,89 @@ void SyncInferRequest::check_tensor(const ov::Output& port, "Tensor data equal nullptr!"); } +void SyncInferRequest::check_batched_tensors(const ov::Output& port, + const std::vector>& tensors) const { + OPENVINO_ASSERT(!tensors.empty(), "set_input_tensors/set_tensors can't be called with empty tensors"); + OPENVINO_ASSERT( + tensors.size() != 1, + "Internal error (plugin): check_batched_tensors is not allowed to have only one tensor inside batch"); + + auto layout = ov::layout::get_layout(port); + + int64_t batch_idx; + + if (layout.empty()) { + _logger.warning("set_input_tensors/set_tensors layout is not set, assuming batch dimension is found on 0 axis"); + batch_idx = BATCH_AXIS; + } else { + OPENVINO_ASSERT(ov::layout::has_batch(layout), + "set_input_tensors/set_tensors can be used only for inputs with N(batch) dimension" + " 'layout' defined. 
Current layout is ", + layout.to_string()); + batch_idx = ov::layout::batch_idx(layout); + } + + if (batch_idx < 0) { + batch_idx += static_cast(tensors[BATCH_AXIS]->get_shape().size()); + } + OPENVINO_ASSERT(batch_idx == BATCH_AXIS, + "set_input_tensors/set_tensors is not currently supported for batch dimension index ", + batch_idx, + " != 0"); + std::for_each(tensors.begin(), tensors.end(), [&batch_idx](const ov::SoPtr& item) { + OPENVINO_ASSERT(item, "Unintialized tensor is provided!"); + OPENVINO_ASSERT(item->get_shape()[batch_idx] == 1, + "set_input_tensors/set_tensors. Tensors shall represent one item in a batch, ", + item->get_shape()[batch_idx], + " provided"); + }); + auto tensors_size = static_cast(tensors.size()); + if (port.get_partial_shape().rank().is_static()) { + OPENVINO_ASSERT(batch_idx >= 0 && batch_idx < port.get_partial_shape().rank().get_length(), + "set_input_tensors/set_tensors error. Layout ", + layout.to_string(), + " is incorrect for operation with shape ", + port.get_partial_shape()); + auto batch = port.get_partial_shape()[batch_idx]; + + OPENVINO_ASSERT(batch.is_dynamic() || batch.get_length() == tensors_size, + "set_input_tensors/set_tensors error. Input shape ", + port.get_partial_shape(), + "batch ", + batch, + "doesn't match with total blobs count: ", + tensors_size); + } + + auto batched_shape = tensors[BATCH_AXIS]->get_shape(); + auto element_type = tensors[BATCH_AXIS]->get_element_type(); + batched_shape[batch_idx] = tensors_size; + for (const auto& item : tensors) { + OPENVINO_ASSERT(item, "Unintialized tensor is provided!"); + auto item_shape = item->get_shape(); + item_shape[batch_idx] = batched_shape[batch_idx]; + OPENVINO_ASSERT(item_shape == batched_shape && item->get_element_type() == element_type && + "set_input_tensors/set_tensors error. 
Tensor with element type ", + item->get_element_type(), + " and shape ", + item_shape, + " is not compatible with batched tensor with element type ", + element_type, + " and shape ", + batched_shape); + OPENVINO_ASSERT(item->is_continuous(), "Strides for batched tensors should be default."); + } +} + void SyncInferRequest::check_tensors() const { const auto& inputs = _compiledModel->inputs(); for (size_t i = 0; i < inputs.size(); i++) { - if (_userInputTensors.at(i)) { - check_tensor(inputs[i], _userInputTensors.at(i)); + if (is_batched_input(i)) { + check_batched_tensors(inputs[i], get_user_inputs(i)); + continue; + } + if (get_user_input(i)) { + check_tensor(inputs[i], get_user_input(i)); } } @@ -229,7 +313,7 @@ std::shared_ptr SyncInferRequest::allocate_tensor(const IODescripto OPENVINO_ASSERT(descriptor.relatedDescriptorIndex.has_value(), "The link between state descriptors is missing, state name: ", descriptor.nameFromCompiler); - tensor = _userInputTensors.at(*descriptor.relatedDescriptorIndex); + tensor = get_user_input(*descriptor.relatedDescriptorIndex)._ptr; } else if (allocator) { tensor = ov::make_tensor(descriptor.precision, allocatedTensorShape, allocator); } else { @@ -237,8 +321,8 @@ std::shared_ptr SyncInferRequest::allocate_tensor(const IODescripto } if (isInput) { - if (_userInputTensors.at(index) == nullptr) { - _userInputTensors.at(index) = tensor; + if (get_user_input(index) == nullptr) { + get_user_input(index) = tensor; } if (descriptor.isStateInput) { @@ -250,4 +334,17 @@ std::shared_ptr SyncInferRequest::allocate_tensor(const IODescripto return tensor; } + +bool SyncInferRequest::is_batched_input(size_t idx) const { + return _userInputTensors.at(idx).size() > 1; +} + +ov::SoPtr& SyncInferRequest::get_user_input(size_t index) const { + return _userInputTensors.at(index).at(0); +} + +std::vector>& SyncInferRequest::get_user_inputs(size_t index) const { + return _userInputTensors.at(index); +} + } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp index f6d15d2c2aed5e..6d0b343bf8d7b7 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp @@ -27,6 +27,8 @@ class ZeroInferRequest final : public SyncInferRequest { ov::SoPtr get_tensor(const ov::Output& port) const override; void set_tensor(const ov::Output& port, const ov::SoPtr& tensor) override; + void set_tensors(const ov::Output& port, + const std::vector>& tensors) override; void infer() override; void infer_async() override; @@ -54,7 +56,7 @@ class ZeroInferRequest final : public SyncInferRequest { * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside * the plugin. 
*/ - std::optional getBatchSize(const NetworkMetadata& metadata); + std::optional get_batch_size(const NetworkMetadata& metadata); /** * @brief Check the received tensor and set the Level Zero tensor accordingly @@ -75,6 +77,12 @@ class ZeroInferRequest final : public SyncInferRequest { void check_network_precision(const ov::element::Type_t precision) const override; void create_pipeline(); + std::shared_ptr& get_level_zero_input(size_t index, size_t tensorNo = 0) const; + std::vector>& get_level_zero_inputs(size_t index) const; + + std::optional& get_input_tensor_data(size_t index, size_t tensorNo = 0) const; + std::vector>& get_input_tensors_data(size_t index) const; + const std::shared_ptr _initStructs; const std::shared_ptr _executorPtr; const ZeroExecutor* _executor; @@ -83,10 +91,10 @@ class ZeroInferRequest final : public SyncInferRequest { // A copy of each tensor is needed to maintain the original L0 memory allocation in case the user provides another // memory area for the tensor. - mutable std::vector> _levelZeroInputTensors; + mutable std::vector>> _levelZeroInputTensors; mutable std::vector> _levelZeroOutputTensors; - mutable std::vector> _inputTensorsData; + mutable std::vector>> _inputTensorsData; mutable std::vector> _outputTensorsData; ze_device_properties_t _properties = {}; diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp index 6a5cc79ed7a7fc..4160a2ca979290 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp @@ -25,7 +25,7 @@ struct Pipeline { zeroProfiling::ProfilingPool& profiling_pool, zeroProfiling::ProfilingQuery& profiling_query, std::shared_ptr npu_profiling, - const std::vector>& inputTensorsData, + const std::vector>>& inputTensorsData, const std::vector>& outputTensorsData, const size_t numberOfCommandLists); @@ -37,7 +37,8 @@ struct Pipeline { void pull(); void reset() const; - void updateCommandList(const TensorData& tensorsData, const uint32_t index); + void updateCommandList(const TensorData& tensorsData, uint32_t index); + void updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex); protected: const Config _config; diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index 0a8d8dded5e97d..2c954151a4f652 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -19,6 +19,7 @@ using namespace intel_npu; namespace { +constexpr std::size_t SINGLE_TENSOR = 0; constexpr std::size_t BATCH_AXIS = 0; constexpr std::size_t DEFAULT_BATCH_SIZE = 1; constexpr bool INPUT = true; @@ -30,8 +31,8 @@ constexpr bool OUTPUT = false; * @param ioDescriptor The OpenVINO API specific I/O descriptor which shall be compared. * @param zeDescriptor The Level Zero specific structure used for comparison. 
*/ -void checkLevelZeroAttributesMatch(const IODescriptor& ioDescriptor, - const ZeroExecutor::ArgumentDescriptor& zeDescriptor) { +void check_level_zero_attributes_match(const IODescriptor& ioDescriptor, + const ZeroExecutor::ArgumentDescriptor& zeDescriptor) { std::string zeDescriptorName = zeDescriptor.info.name; if (isStateInputName(zeDescriptorName)) { @@ -78,9 +79,25 @@ Type extract_object(const ov::AnyMap& params, const ov::Property& p) { return res.as(); } +bool memory_was_allocated_in_the_same_l0_context(ze_context_handle_t hContext, const void* ptr) { + ze_memory_allocation_properties_t desc = {}; + desc.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES; + auto res = intel_npu::zeMemGetAllocProperties(hContext, ptr, &desc, nullptr); + if (res == ZE_RESULT_SUCCESS) { + if (desc.id) { + if ((desc.type & ZE_MEMORY_TYPE_HOST) || (desc.type & ZE_MEMORY_TYPE_DEVICE) || + (desc.type & ZE_MEMORY_TYPE_SHARED)) { + return true; + } + } + } + + return false; +} + } // namespace -std::optional ZeroInferRequest::getBatchSize(const NetworkMetadata& metadata) { +std::optional ZeroInferRequest::get_batch_size(const NetworkMetadata& metadata) { if (!metadata.outputs.at(0).shapeFromIRModel.has_value()) { _logger.debug("Batching on the plugin is not used, batching is handled by the compiler"); return std::nullopt; @@ -143,15 +160,15 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& const std::shared_ptr& compiledModel, const std::shared_ptr& executor, const Config& config) - : SyncInferRequest(compiledModel), + : SyncInferRequest(compiledModel, config), _initStructs(initStructs), _executorPtr(executor), _executor(static_cast(_executorPtr.get())), _config(config), _logger("ZeroInferRequest", config.get()), - _levelZeroInputTensors(_metadata.inputs.size(), nullptr), + _levelZeroInputTensors(_metadata.inputs.size(), std::vector>(1, nullptr)), _levelZeroOutputTensors(_metadata.outputs.size(), nullptr), - _inputTensorsData(_metadata.inputs.size(), std::nullopt), + _inputTensorsData(_metadata.inputs.size(), std::vector>(1, std::nullopt)), _outputTensorsData(_metadata.outputs.size(), std::nullopt), _profilingPool(_executor->graph(), zeroProfiling::POOL_SIZE, _executor->getInitStructs()->getProfilingDdiTable()), _profilingQuery(0, @@ -179,7 +196,7 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& std::make_shared(_initStructs, ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED); if (config.get() != ov::intel_npu::BatchMode::COMPILER) { - _batchSize = getBatchSize(_metadata); + _batchSize = get_batch_size(_metadata); } if (_batchSize.has_value()) { _numberOfCommandLists = *_batchSize; @@ -189,24 +206,23 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& size_t ioIndex = 0; for (const IODescriptor& inputDescriptor : _metadata.inputs) { - checkLevelZeroAttributesMatch(inputDescriptor, executorInputDescriptors.at(ioIndex)); + check_level_zero_attributes_match(inputDescriptor, executorInputDescriptors.at(ioIndex)); if (!(inputDescriptor.isStateInput || inputDescriptor.isShapeTensor)) { ++ioIndex; continue; } - _levelZeroInputTensors.at(ioIndex) = - allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _batchSize); - _inputTensorsData.at(ioIndex) = - TensorData{_levelZeroInputTensors.at(ioIndex)->data(), _levelZeroInputTensors.at(ioIndex)->get_byte_size()}; + get_level_zero_input(ioIndex) = allocate_tensor(inputDescriptor, ioIndex, INPUT, *_inputAllocator, _batchSize); + get_input_tensor_data(ioIndex) = + TensorData{get_level_zero_input(ioIndex)->data(), 
get_level_zero_input(ioIndex)->get_byte_size()}; ++ioIndex; } ioIndex = 0; for (const IODescriptor& outputDescriptor : _metadata.outputs) { - checkLevelZeroAttributesMatch(outputDescriptor, executorOutputDescriptors.at(ioIndex)); + check_level_zero_attributes_match(outputDescriptor, executorOutputDescriptors.at(ioIndex)); if (!(outputDescriptor.isStateOutput || outputDescriptor.isShapeTensor)) { ++ioIndex; @@ -227,18 +243,25 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr& void ZeroInferRequest::create_pipeline() { for (size_t inputIndex = 0; inputIndex < _metadata.inputs.size(); ++inputIndex) { - if (_levelZeroInputTensors.at(inputIndex)) { + if (is_batched_input(inputIndex)) { + if (_batchSize.has_value()) { + _logger.debug("ZeroInferRequest::create_pipeline - tensors %s were already allocated", + _metadata.inputs.at(inputIndex).nodeFriendlyName.c_str()); + continue; + } + } + + if (get_level_zero_input(inputIndex)) { _logger.debug("ZeroInferRequest::create_pipeline - tensor %s was already allocated", _metadata.inputs.at(inputIndex).nodeFriendlyName.c_str()); continue; } - _logger.debug("ZeroInferRequest::create_pipeline - Allocate new tensor"); - _levelZeroInputTensors.at(inputIndex) = + _logger.debug("ZeroInferRequest::create_pipeline - allocate new tensor"); + get_level_zero_input(inputIndex) = allocate_tensor(_metadata.inputs.at(inputIndex), inputIndex, INPUT, *_inputAllocator, _batchSize); - _inputTensorsData.at(inputIndex) = - std::optional(TensorData{_levelZeroInputTensors.at(inputIndex)->data(), - _levelZeroInputTensors.at(inputIndex)->get_byte_size()}); + get_input_tensor_data(inputIndex) = std::optional( + TensorData{get_level_zero_input(inputIndex)->data(), get_level_zero_input(inputIndex)->get_byte_size()}); } for (size_t outputIndex = 0; outputIndex < _metadata.outputs.size(); ++outputIndex) { @@ -274,47 +297,32 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr tensor const size_t index, const bool isInput) { OV_ITT_TASK_CHAIN(ZERO_SET_TENSOR, itt::domains::LevelZeroBackend, "set_tensor", "set_tensor_data"); - auto& levelZeroTensors = isInput ? _levelZeroInputTensors : _levelZeroOutputTensors; - auto& tensorsData = isInput ? _inputTensorsData : _outputTensorsData; + auto& levelZeroTensors = isInput ? get_level_zero_input(index) : _levelZeroOutputTensors.at(index); + auto& tensorsData = isInput ? 
get_input_tensor_data(index) : _outputTensorsData.at(index); bool setTensorData = false; bool levelZeroTensorCreatedLocally = true; OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "check_data_allocation"); - ze_memory_allocation_properties_t desc = {}; - desc.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES; - auto res = zeMemGetAllocProperties(_initStructs->getContext(), tensor->data(), &desc, nullptr); - if (res == ZE_RESULT_SUCCESS) { - if (desc.id) { - switch (desc.type) { - case ZE_MEMORY_TYPE_HOST: - case ZE_MEMORY_TYPE_DEVICE: - case ZE_MEMORY_TYPE_SHARED: - _logger.debug("ZeroInferRequest::set_tensor_data - tensor was created in the same L0 context"); - levelZeroTensors.at(index) = tensor; - levelZeroTensorCreatedLocally = false; - setTensorData = true; - break; - case ZE_MEMORY_TYPE_UNKNOWN: - case ZE_MEMORY_TYPE_FORCE_UINT32: - break; - } - } + if (memory_was_allocated_in_the_same_l0_context(_initStructs->getContext(), tensor->data())) { + _logger.debug("ZeroInferRequest::set_tensor_data - tensor was created in the same L0 context"); + levelZeroTensors = tensor; + levelZeroTensorCreatedLocally = false; + setTensorData = true; } if (!setTensorData) { // make sure that the L0 tensor was allocated locally and is not received from the user when receiving // random tensor - if (tensorsData.at(index).has_value() && !tensorsData.at(index)->levelZeroTensorCreatedLocally) { + if (tensorsData.has_value() && !tensorsData->levelZeroTensorCreatedLocally) { _logger.debug("ZeroInferRequest::set_tensor_data - create locally L0 tensor"); OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "allocate tensor"); - levelZeroTensors.at(index) = - allocate_tensor(isInput ? _metadata.inputs.at(index) : _metadata.outputs.at(index), - index, - isInput, - isInput ? *_inputAllocator : *_outputAllocator, - _batchSize); + levelZeroTensors = allocate_tensor(isInput ? _metadata.inputs.at(index) : _metadata.outputs.at(index), + index, + isInput, + isInput ? *_inputAllocator : *_outputAllocator, + _batchSize); setTensorData = true; levelZeroTensorCreatedLocally = true; @@ -322,15 +330,14 @@ void ZeroInferRequest::set_tensor_data(const std::shared_ptr tensor } if (setTensorData) { - tensorsData.at(index) = std::optional(TensorData{levelZeroTensors.at(index)->data(), - levelZeroTensors.at(index)->get_byte_size(), - levelZeroTensorCreatedLocally}); + tensorsData = std::optional( + TensorData{levelZeroTensors->data(), levelZeroTensors->get_byte_size(), levelZeroTensorCreatedLocally}); if (_pipelineIsCreated) { _logger.debug("ZeroInferRequest::infer_async - update command list"); OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "updateCommandList"); - _pipeline->updateCommandList(*tensorsData.at(index), + _pipeline->updateCommandList(*tensorsData, isInput ? _executor->get_input_descriptors().at(index).idx : _executor->get_output_descriptors().at(index).idx); } @@ -353,17 +360,17 @@ void ZeroInferRequest::set_remote_tensor_data(const std::shared_ptrget_byte_size(), false}); + levelZeroTensors = tensor; + tensorsData = std::optional(TensorData{data, tensor->get_byte_size(), false}); if (_pipelineIsCreated) { _logger.debug("ZeroInferRequest::infer_async - update command list"); OV_ITT_TASK_NEXT(ZERO_SET_REMOTE_TENSOR, "updateCommandList"); - _pipeline->updateCommandList(*tensorsData.at(index), + _pipeline->updateCommandList(*tensorsData, isInput ? 
_executor->get_input_descriptors().at(index).idx : _executor->get_output_descriptors().at(index).idx); } @@ -381,9 +388,16 @@ void ZeroInferRequest::set_tensor(const ov::Output& port, const } if (foundPort.is_input()) { - _userInputTensors.at(foundPort.idx) = tensor._ptr; + if (is_batched_input(foundPort.idx)) { + // resize vector size to 1 if set_tensor is called after set_tensors + get_input_tensors_data(foundPort.idx).resize(1); + get_level_zero_inputs(foundPort.idx).resize(1); + get_user_inputs(foundPort.idx).resize(1); + } + + get_user_input(foundPort.idx) = tensor; } else { - _userOutputTensors.at(foundPort.idx) = tensor._ptr; + _userOutputTensors.at(foundPort.idx) = tensor; } if (_initStructs->getMutableCommandListVersion()) { @@ -399,6 +413,78 @@ void ZeroInferRequest::set_tensor(const ov::Output& port, const } } +void ZeroInferRequest::set_tensors(const ov::Output& port, + const std::vector>& tensors) { + OV_ITT_TASK_CHAIN(SET_TENSORS, itt::domains::LevelZeroBackend, "set_tensors", "set_tensors"); + if (tensors.size() == 1) { + set_tensor(port, tensors[0]); + return; + } + + auto foundPort = find_port(port); + OPENVINO_ASSERT(foundPort.found(), "Cannot find input tensor for port ", port); + if (!foundPort.is_input()) { + OPENVINO_THROW("set_input_tensors/set_tensors is not supported for output port."); + } + + check_batched_tensors(port, tensors); + + get_user_inputs(foundPort.idx).resize(tensors.size()); + get_user_inputs(foundPort.idx) = tensors; + + if (_initStructs->getMutableCommandListVersion()) { + if (_batchSize.has_value()) { + for (size_t i = 0; i < tensors.size(); i++) { + auto remoteTensor = std::dynamic_pointer_cast(tensors[i]._ptr); + + get_level_zero_inputs(foundPort.idx).resize(tensors.size()); + get_input_tensors_data(foundPort.idx).resize(tensors.size()); + + if (remoteTensor == nullptr) { + bool tensorHasSameL0Context = false; + + OV_ITT_TASK_NEXT(SET_TENSORS, "check_data_allocation"); + if (memory_was_allocated_in_the_same_l0_context(_initStructs->getContext(), tensors[i]->data())) { + _logger.debug("ZeroInferRequest::set_tensors - tensor was created in the same L0 context"); + + get_level_zero_input(foundPort.idx, i) = tensors.at(i)._ptr; + tensorHasSameL0Context = true; + } + + if (!tensorHasSameL0Context) { + _logger.debug("ZeroInferRequest::set_tensors - tensor wasn't created in the same L0 context, " + "create a L0 tensor"); + + get_level_zero_input(foundPort.idx, i) = + allocate_tensor(_metadata.inputs.at(foundPort.idx), foundPort.idx, true, *_inputAllocator); + } + + get_input_tensor_data(foundPort.idx, i) = + std::optional(TensorData{get_level_zero_input(foundPort.idx, i)->data(), + get_level_zero_input(foundPort.idx, i)->get_byte_size(), + false}); + } else { + _logger.debug("ZeroInferRequest::set_tensors - remote tensor is used"); + + get_input_tensor_data(foundPort.idx, i) = std::optional( + TensorData{extract_object(remoteTensor->get_properties(), ov::intel_npu::mem_handle), + remoteTensor->get_byte_size(), + false}); + + get_level_zero_input(foundPort.idx, i) = tensors.at(i)._ptr; + } + + if (_pipelineIsCreated) { + OV_ITT_TASK_NEXT(SET_TENSORS, "updateCommandList"); + _pipeline->updateCommandList(*get_input_tensor_data(foundPort.idx, i), + _executor->get_input_descriptors().at(foundPort.idx).idx, + i); + } + } + } + } +} + ov::SoPtr ZeroInferRequest::get_tensor(const ov::Output& port) const { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "get_tensor"); @@ -407,28 +493,31 @@ ov::SoPtr ZeroInferRequest::get_tensor(const ov::Outputdata(), 
levelZeroTensors.at(ioIndex)->get_byte_size()}); + levelZeroTensors = allocate_tensor(isInput ? _metadata.inputs.at(ioIndex) : _metadata.outputs.at(ioIndex), + ioIndex, + isInput, + isInput ? *_inputAllocator : *_outputAllocator, + _batchSize); + tensorsData = std::optional(TensorData{levelZeroTensors->data(), levelZeroTensors->get_byte_size()}); - return levelZeroTensors.at(ioIndex); + return levelZeroTensors; } void ZeroInferRequest::infer() { @@ -450,26 +539,75 @@ void ZeroInferRequest::infer_async() { _executor->mutexUnlock(); size_t inputIndex = 0; - for (const std::shared_ptr& userTensor : _userInputTensors) { + for (const auto& userTensor : _userInputTensors) { const IODescriptor inputDescriptor = _metadata.inputs.at(inputIndex); if (inputDescriptor.isShapeTensor) { OPENVINO_ASSERT(inputDescriptor.relatedDescriptorIndex.has_value(), "The link between the dynamic tensor and its shape tensor is missing, entry name: ", inputDescriptor.nameFromCompiler); - const auto& inputDims = _userInputTensors.at(*inputDescriptor.relatedDescriptorIndex)->get_shape(); + const auto& inputDims = get_user_input(*inputDescriptor.relatedDescriptorIndex)->get_shape(); - for (size_t i = 0; i < userTensor->get_size(); ++i) { + for (size_t i = 0; i < userTensor.at(SINGLE_TENSOR)->get_size(); ++i) { const auto reverseIdx = inputDims.size() - 1 - i; - userTensor->data()[i] = static_cast(inputDims[reverseIdx]); + userTensor.at(SINGLE_TENSOR)->data()[i] = static_cast(inputDims[reverseIdx]); } } - auto userRemoteTensor = std::dynamic_pointer_cast(userTensor); + if (is_batched_input(inputIndex)) { + if (_batchSize.has_value()) { + for (size_t i = 0; i < userTensor.size(); i++) { + auto levelZeroBatchRemoteTensor = + std::dynamic_pointer_cast(get_level_zero_input(inputIndex, i)); + if (levelZeroBatchRemoteTensor == nullptr) { + void* levelZeroBuffer = get_level_zero_input(inputIndex, i)->data(); + + auto userBatchRemoteTensor = std::dynamic_pointer_cast(userTensor.at(i)._ptr); + + void* userBuffer = + !userBatchRemoteTensor + ? userTensor.at(i)->data() + : extract_object(userBatchRemoteTensor->get_properties(), ov::intel_npu::mem_handle); + + if (userBuffer != levelZeroBuffer) { + if (userBuffer == nullptr || levelZeroBuffer == nullptr) { + OPENVINO_THROW("Empty buffer"); + } + + _logger.info("Batched Tensors - Tensor is not allocated in the current Level Zero context"); + OV_ITT_TASK_NEXT(ZERO_INFER, "memcpy"); + std::memcpy(levelZeroBuffer, userBuffer, userTensor.at(i)->get_byte_size()); + } + } + } + } else { + void* levelZeroBuffer = get_level_zero_input(inputIndex)->data(); + + _logger.info("Batched Tensors - Tensor is not allocated in the current Level Zero context or must be " + "in a continued memory space"); + + for (size_t i = 0; i < userTensor.size(); i++) { + auto userBatchRemoteTensor = std::dynamic_pointer_cast(userTensor.at(i)._ptr); + + void* userBuffer = !userBatchRemoteTensor ? userTensor.at(i)->data() + : extract_object(userBatchRemoteTensor->get_properties(), + ov::intel_npu::mem_handle); + + std::memcpy(static_cast(levelZeroBuffer) + (i * userTensor.at(i)->get_byte_size()), + userBuffer, + userTensor.at(i)->get_byte_size()); + } + } + + ++inputIndex; + continue; + } + + auto userRemoteTensor = std::dynamic_pointer_cast(userTensor.at(SINGLE_TENSOR)._ptr); void* userBuffer = !userRemoteTensor - ? userTensor->data() + ? 
userTensor.at(SINGLE_TENSOR)->data() : extract_object(userRemoteTensor->get_properties(), ov::intel_npu::mem_handle); - const std::shared_ptr& levelZeroTensor = _levelZeroInputTensors.at(inputIndex); + const std::shared_ptr& levelZeroTensor = get_level_zero_input(inputIndex); auto levelZeroRemoteTensor = std::dynamic_pointer_cast(levelZeroTensor); if (levelZeroRemoteTensor == nullptr) { void* levelZeroBuffer = levelZeroTensor->data(); @@ -481,7 +619,7 @@ void ZeroInferRequest::infer_async() { _logger.info("Tensor is not allocated in the current Level Zero context"); OV_ITT_TASK_NEXT(ZERO_INFER, "memcpy"); - std::memcpy(levelZeroBuffer, userBuffer, userTensor->get_byte_size()); + std::memcpy(levelZeroBuffer, userBuffer, userTensor.at(SINGLE_TENSOR)->get_byte_size()); } } @@ -498,7 +636,7 @@ void ZeroInferRequest::get_result() { _pipeline->pull(); size_t outputIndex = 0; - for (const std::shared_ptr& userTensor : _userOutputTensors) { + for (const auto& userTensor : _userOutputTensors) { const IODescriptor outputDescriptor = _metadata.outputs.at(outputIndex); if (outputDescriptor.isShapeTensor) { OPENVINO_ASSERT(outputDescriptor.relatedDescriptorIndex.has_value(), @@ -516,7 +654,7 @@ void ZeroInferRequest::get_result() { tensorToBeReshaped->set_shape(actualDims); } - auto userRemoteTensor = std::dynamic_pointer_cast(userTensor); + auto userRemoteTensor = std::dynamic_pointer_cast(userTensor._ptr); void* userBuffer = !userRemoteTensor ? userTensor->data() : extract_object(userRemoteTensor->get_properties(), ov::intel_npu::mem_handle); @@ -615,3 +753,18 @@ std::vector ZeroInferRequest::get_profiling_info() const { std::vector ZeroInferRequest::get_raw_profiling_data() const { return _profilingQuery.getData(); } + +std::shared_ptr& ZeroInferRequest::get_level_zero_input(size_t index, size_t tensorNo) const { + return _levelZeroInputTensors.at(index).at(tensorNo); +} + +std::vector>& ZeroInferRequest::get_level_zero_inputs(size_t index) const { + return _levelZeroInputTensors.at(index); +} + +std::optional& ZeroInferRequest::get_input_tensor_data(size_t index, size_t tensorNo) const { + return _inputTensorsData.at(index).at(tensorNo); +} +std::vector>& ZeroInferRequest::get_input_tensors_data(size_t index) const { + return _inputTensorsData.at(index); +} diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index ff24536a52d9b6..cfc80d48c50707 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -20,7 +20,7 @@ Pipeline::Pipeline(const Config& config, zeroProfiling::ProfilingPool& profiling_pool, zeroProfiling::ProfilingQuery& profiling_query, std::shared_ptr npu_profiling, - const std::vector>& inputTensorsData, + const std::vector>>& inputTensorsData, const std::vector>& outputTensorsData, const size_t numberOfCommandLists) : _config(config), @@ -31,9 +31,9 @@ Pipeline::Pipeline(const Config& config, numberOfCommandLists ? 
static_cast(numberOfCommandLists) : 1, _config}, _npu_profiling(std::move(npu_profiling)), - _logger("IntegratedPipeline", _config.get()) { - OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::IntegratedPipeline::IntegratedPipeline"); - _logger.debug("IntegratedPipeline - initialize started"); + _logger("Pipeline", _config.get()) { + OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline"); + _logger.debug("Pipeline - initialize started"); if (profiling_pool.create()) { profiling_query.create(profiling_pool._handle); @@ -42,7 +42,7 @@ Pipeline::Pipeline(const Config& config, _command_lists.reserve(numberOfCommandLists); _events.reserve(numberOfCommandLists); _fences.reserve(numberOfCommandLists); - _logger.debug("IntegratedPipeline - emplace_back _event_pool and _command_queue"); + _logger.debug("Pipeline - emplace_back _event_pool and _command_queue"); for (size_t i = 0; i < numberOfCommandLists; i++) { _command_lists.emplace_back( std::make_unique(_executor->getInitStructs()->getDevice(), @@ -58,9 +58,17 @@ Pipeline::Pipeline(const Config& config, for (size_t i = 0; i < numberOfCommandLists; i++) { size_t ioIndex = 0; for (const auto& desc : _executor->get_input_descriptors()) { + if (inputTensorsData.at(ioIndex).size() > 1) { + _executor->setArgumentValue(desc.idx, inputTensorsData.at(ioIndex).at(i)->mem); + + ++ioIndex; + continue; + } + _executor->setArgumentValue(desc.idx, - static_cast(inputTensorsData.at(ioIndex)->mem) + - (i * inputTensorsData.at(ioIndex)->size) / numberOfCommandLists); + static_cast(inputTensorsData.at(ioIndex).at(0)->mem) + + (i * inputTensorsData.at(ioIndex).at(0)->size) / numberOfCommandLists); + ++ioIndex; } @@ -93,14 +101,14 @@ Pipeline::Pipeline(const Config& config, } _command_lists.at(i)->close(); } - _logger.debug("IntegratedPipeline - initialize completed"); + _logger.debug("Pipeline - initialize completed"); } void Pipeline::push() { - _logger.debug("IntegratedPipeline - push() started"); + _logger.debug("Pipeline - push() started"); for (size_t i = 0; i < _command_lists.size(); ++i) { - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PUSH, itt::domains::LevelZeroBackend, "IntegratedPipeline", "push"); + OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PUSH, itt::domains::LevelZeroBackend, "Pipeline", "push"); if (sync_output_with_fences_) { _command_queue.executeCommandList(*_command_lists.at(i), *_fences.at(i)); } else { @@ -108,12 +116,12 @@ void Pipeline::push() { } } - _logger.debug("IntegratedPipeline - push() completed"); + _logger.debug("Pipeline - push() completed"); }; void Pipeline::pull() { - _logger.debug("IntegratedPipeline - pull() started"); - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PULL, itt::domains::LevelZeroBackend, "IntegratedPipeline", "pull"); + _logger.debug("Pipeline - pull() started"); + OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_PULL, itt::domains::LevelZeroBackend, "Pipeline", "pull"); for (size_t i = 0; i < _command_lists.size(); ++i) { if (sync_output_with_fences_) { @@ -127,11 +135,11 @@ void Pipeline::pull() { } } - _logger.debug("IntegratedPipeline - pull() completed"); + _logger.debug("Pipeline - pull() completed"); }; void Pipeline::reset() const { - _logger.debug("IntegratedPipeline - rest() started"); + _logger.debug("Pipeline - rest() started"); for (size_t i = 0; i < _command_lists.size(); ++i) { if (sync_output_with_fences_) { @@ -141,11 +149,13 @@ void Pipeline::reset() const { } } - _logger.debug("IntegratedPipeline - rest() completed"); + _logger.debug("Pipeline - rest() completed"); }; -void 
Pipeline::updateCommandList(const TensorData& tensorsData, const uint32_t index) { - OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "IntegratedPipeline", "updateCommandList"); +void Pipeline::updateCommandList(const TensorData& tensorsData, uint32_t index) { + OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList"); + _logger.debug("Pipeline - updateCommandList"); + const size_t numberOfCommandLists = _command_lists.size(); for (size_t i = 0; i < numberOfCommandLists; i++) { @@ -156,4 +166,18 @@ void Pipeline::updateCommandList(const TensorData& tensorsData, const uint32_t i } }; +void Pipeline::updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex) { + OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList"); + _logger.debug("Pipeline - updateCommandList"); + + const size_t numberOfCommandLists = _command_lists.size(); + + OPENVINO_ASSERT(commandListIndex < numberOfCommandLists, + "Command list index is higgher than the number of Command lists ", + commandListIndex); + + _command_lists.at(commandListIndex)->updateMutableCommandList(index, tensorsData.mem); + _command_lists.at(commandListIndex)->close(); +}; + } // namespace intel_npu diff --git a/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.cpp b/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.cpp new file mode 100644 index 00000000000000..3d7d4eb89eff4c --- /dev/null +++ b/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.cpp @@ -0,0 +1,21 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "behavior/batched_tensors_tests/batched_run.hpp" + +#include "common/npu_test_env_cfg.hpp" +#include "common/utils.hpp" +#include "intel_npu/al/config/common.hpp" + +using namespace ov::test::behavior; + +const std::vector batchedConfigs = {{ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::PLUGIN)}, + {ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::COMPILER)}, + {ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::AUTO)}}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, + BatchedTensorsRunTests, + ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), + ::testing::ValuesIn(batchedConfigs)), + BatchedTensorsRunTests::getTestCaseName); diff --git a/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.hpp new file mode 100644 index 00000000000000..05e580ab99664c --- /dev/null +++ b/src/plugins/intel_npu/tests/functional/behavior/batched_tensors_tests/batched_run.hpp @@ -0,0 +1,425 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "base/ov_behavior_test_utils.hpp" +#include "common/npu_test_env_cfg.hpp" +#include "common/utils.hpp" +#include "functional_test_utils/ov_plugin_cache.hpp" +#include "npu_private_properties.hpp" +#include "openvino/core/any.hpp" +#include "openvino/core/node_vector.hpp" +#include "openvino/core/type/element_iterator.hpp" +#include "openvino/op/op.hpp" +#include "openvino/opsets/opset8.hpp" +#include "openvino/runtime/compiled_model.hpp" +#include "openvino/runtime/core.hpp" +#include "openvino/runtime/intel_npu/level_zero/level_zero.hpp" +#include 
"overload/overload_test_utils_npu.hpp" + +using CompilationParams = std::tuple; + +using ::testing::AllOf; +using ::testing::HasSubstr; + +namespace ov { +namespace test { +namespace behavior { +class BatchedTensorsRunTests : public ov::test::behavior::OVPluginTestBase, + public testing::WithParamInterface { +protected: + std::shared_ptr core = utils::PluginCache::get().core(); + ov::AnyMap configuration; + std::shared_ptr ov_model; + ov::CompiledModel compiled_model; + ov::Output input; + ov::Output output; + std::string m_cache_dir; + +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + std::string targetDevice; + ov::AnyMap configuration; + std::tie(targetDevice, configuration) = obj.param; + std::replace(targetDevice.begin(), targetDevice.end(), ':', '_'); + targetDevice = ov::test::utils::getTestsPlatformFromEnvironmentOr(ov::test::utils::DEVICE_NPU); + + std::ostringstream result; + result << "targetDevice=" << targetDevice << "_"; + result << "targetPlatform=" << ov::test::utils::getTestsPlatformFromEnvironmentOr(targetDevice) << "_"; + if (!configuration.empty()) { + for (auto& configItem : configuration) { + result << "configItem=" << configItem.first << "_"; + configItem.second.print(result); + } + } + + return result.str(); + } + + void SetUp() override { + std::tie(target_device, configuration) = this->GetParam(); + + SKIP_IF_CURRENT_TEST_IS_DISABLED() + OVPluginTestBase::SetUp(); + ov_model = getDefaultNGraphFunctionForTheDeviceNPU(); // FIXME: E#80555 + } + + std::string generateCacheDirName(const std::string& test_name) { + using namespace std::chrono; + // Generate unique file names based on test name, thread id and timestamp + // This allows execution of tests in parallel (stress mode) + auto hash = std::to_string(std::hash()(test_name)); + std::stringstream ss; + auto ts = duration_cast(high_resolution_clock::now().time_since_epoch()); + ss << hash << "_" + << "_" << ts.count(); + return ss.str(); + } + + void TearDown() override { + if (!m_cache_dir.empty()) { + core->set_property({ov::cache_dir()}); + core.reset(); + ov::test::utils::PluginCache::get().reset(); + ov::test::utils::removeFilesWithExt(m_cache_dir, "blob"); + ov::test::utils::removeDir(m_cache_dir); + } + + if (!configuration.empty()) { + utils::PluginCache::get().reset(); + } + + APIBaseTest::TearDown(); + } + + std::shared_ptr create_n_inputs(size_t n, + element::Type type, + const PartialShape& shape, + const ov::Layout& layout) { + ResultVector res; + ParameterVector params; + + for (size_t i = 0; i < n; i++) { + auto index_str = std::to_string(i); + auto data1 = std::make_shared(type, shape); + data1->set_friendly_name("input" + index_str); + data1->get_output_tensor(0).set_names({"tensor_input" + index_str}); + data1->set_layout(layout); + auto constant = opset8::Constant::create(type, {1}, {1}); + auto op1 = std::make_shared(data1, constant); + op1->set_friendly_name("Add" + index_str); + auto res1 = std::make_shared(op1); + res1->set_friendly_name("Result" + index_str); + res1->get_output_tensor(0).set_names({"tensor_output" + index_str}); + params.push_back(data1); + res.push_back(res1); + } + + return std::make_shared(res, params); + } +}; + +TEST_P(BatchedTensorsRunTests, SetInputRemoteTensorsMultipleInfer) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + size_t batch = 4; + auto one_shape = Shape{1, 2, 2, 2}; + auto batch_shape = Shape{batch, 2, 2, 2}; + auto one_shape_size = ov::shape_size(one_shape); 
+ auto model = BatchedTensorsRunTests::create_n_inputs(2, element::f32, batch_shape, "N..."); + auto execNet = core->compile_model(model, target_device, configuration); + auto context = core->get_default_context(target_device); + // Create InferRequest + ov::InferRequest req; + req = execNet.create_infer_request(); + std::vector tensors; + for (size_t i = 0; i < batch; ++i) { + // non contiguous memory + auto tensor = context.create_host_tensor(ov::element::f32, one_shape); + tensors.push_back(std::move(tensor)); + } + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + for (size_t i = 0; i < batch; ++i) { + auto* f = tensors[i].data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 21) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } +} + +TEST_P(BatchedTensorsRunTests, SetInputDifferentTensorsMultipleInfer) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + size_t batch = 4; + auto one_shape = Shape{1, 2, 2, 2}; + auto batch_shape = Shape{batch, 2, 2, 2}; + auto one_shape_size = ov::shape_size(one_shape); + auto model = BatchedTensorsRunTests::create_n_inputs(2, element::f32, batch_shape, "N..."); + auto execNet = core->compile_model(model, target_device, configuration); + auto context = core->get_default_context(target_device); + // Create InferRequest + ov::InferRequest req; + req = execNet.create_infer_request(); + std::vector tensors; + + std::vector buffer(one_shape_size * 2 * 2, 0); + + auto tensor0 = ov::Tensor(element::f32, one_shape, &buffer[(0 * 2) * one_shape_size]); + auto tensor1 = context.create_host_tensor(ov::element::f32, one_shape); + auto tensor2 = ov::Tensor(element::f32, one_shape, &buffer[(1 * 2) * one_shape_size]); + auto tensor3 = context.create_host_tensor(ov::element::f32, one_shape); + + tensors.push_back(std::move(tensor0)); + tensors.push_back(std::move(tensor1)); + tensors.push_back(std::move(tensor2)); + tensors.push_back(std::move(tensor3)); + + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + for (size_t i = 0; i < batch; ++i) { + auto* f = tensors[i].data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 21) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } +} + +TEST_P(BatchedTensorsRunTests, SetInputDifferentTensorsMultipleInferMCL) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + size_t batch = 4; + auto one_shape = Shape{1, 2, 2, 2}; + auto batch_shape = Shape{batch, 2, 2, 2}; + auto one_shape_size = ov::shape_size(one_shape); + auto model = BatchedTensorsRunTests::create_n_inputs(2, element::f32, batch_shape, "N..."); + auto execNet = core->compile_model(model, target_device, configuration); + auto context = core->get_default_context(target_device); + 
// Create InferRequest + ov::InferRequest req; + req = execNet.create_infer_request(); + + std::vector buffer(one_shape_size * batch * 2, 0); + + { + std::vector tensors; + + auto tensor0 = ov::Tensor(element::f32, one_shape, &buffer[(0 * 2) * one_shape_size]); + auto tensor1 = context.create_host_tensor(ov::element::f32, one_shape); + auto tensor2 = ov::Tensor(element::f32, one_shape, &buffer[(1 * 2) * one_shape_size]); + auto tensor3 = context.create_host_tensor(ov::element::f32, one_shape); + + tensors.push_back(std::move(tensor0)); + tensors.push_back(std::move(tensor1)); + tensors.push_back(std::move(tensor2)); + tensors.push_back(std::move(tensor3)); + + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + for (size_t i = 0; i < batch; ++i) { + auto* f = tensors[i].data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 21) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } + } + + { + std::vector tensors; + + auto tensor0 = context.create_host_tensor(ov::element::f32, one_shape); + auto tensor1 = ov::Tensor(element::f32, one_shape, &buffer[(2 * 2) * one_shape_size]); + auto tensor2 = ov::Tensor(element::f32, one_shape, &buffer[(3 * 2) * one_shape_size]); + auto tensor3 = context.create_host_tensor(ov::element::f32, one_shape); + + tensors.push_back(std::move(tensor0)); + tensors.push_back(std::move(tensor1)); + tensors.push_back(std::move(tensor2)); + tensors.push_back(std::move(tensor3)); + + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + for (size_t i = 0; i < batch; ++i) { + auto* f = tensors[i].data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 200); + } + } + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 201) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } + } +} + +TEST_P(BatchedTensorsRunTests, SetInputDifferentRemoteTensorsMultipleInferMCL) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + size_t batch = 4; + auto one_shape = Shape{1, 2, 2, 2}; + auto batch_shape = Shape{batch, 2, 2, 2}; + auto one_shape_size = ov::shape_size(one_shape); + auto model = BatchedTensorsRunTests::create_n_inputs(2, element::f32, batch_shape, "N..."); + auto execNet = core->compile_model(model, target_device, configuration); + auto context = core->get_default_context(target_device).as(); + // Create InferRequest + ov::InferRequest req; + req = execNet.create_infer_request(); + + std::vector buffer(one_shape_size * 2 * 2, 0); + + { + std::vector tensors; + + auto tensor0 = ov::Tensor(element::f32, one_shape, &buffer[(0 * 2) * one_shape_size]); + auto tensor1 = context.create_l0_host_tensor(ov::element::f32, one_shape); + auto tensor2 = context.create_l0_host_tensor(ov::element::f32, one_shape); + auto tensor3 = context.create_host_tensor(ov::element::f32, one_shape); + + tensors.push_back(tensor0); + 
tensors.push_back(tensor1); + tensors.push_back(tensor2); + tensors.push_back(tensor3); + + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + { + auto* f = tensor0.data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* data = tensor1.get(); + float* f = static_cast(data); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* data = tensor2.get(); + float* f = static_cast(data); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* f = tensor3.data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 21) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } + } + + { + std::vector tensors; + + auto tensor0 = context.create_l0_host_tensor(ov::element::f32, one_shape); + auto tensor1 = context.create_host_tensor(ov::element::f32, one_shape); + auto tensor2 = ov::Tensor(element::f32, one_shape, &buffer[(1 * 2) * one_shape_size]); + auto tensor3 = context.create_l0_host_tensor(ov::element::f32, one_shape); + + tensors.push_back(tensor0); + tensors.push_back(tensor1); + tensors.push_back(tensor2); + tensors.push_back(tensor3); + + req.set_tensors("tensor_input0", tensors); + + auto actual_tensor = req.get_tensor("tensor_output0"); + auto* actual = actual_tensor.data(); + for (auto testNum = 0; testNum < 5; testNum++) { + { + auto* data = tensor0.get(); + float* f = static_cast(data); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* f = tensor1.data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* f = tensor2.data(); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + { + auto* data = tensor3.get(); + float* f = static_cast(data); + for (size_t j = 0; j < one_shape_size; ++j) { + f[j] = static_cast(testNum + 20); + } + } + + req.infer(); // Adds '1' to each element + for (size_t j = 0; j < one_shape_size * batch; ++j) { + EXPECT_EQ(actual[j], testNum + 21) << "Infer " << testNum << ": Expected=" << testNum + 21 + << ", actual=" << actual[j] << " for index " << j; + } + } + } +} + +} // namespace behavior +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_npu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_npu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 4eb829045c964a..aa61afdcacc1bc 100644 --- a/src/plugins/intel_npu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_npu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -702,18 +702,9 @@ std::vector disabledTestPatterns() { ".*OVCompiledModelPropertiesDefaultSupportedTests.CanCompileWithDefaultValueFromPlugin.*" }); - // [Tracking number: E#116494] - _skipRegistry.addPatterns( - "NPU plugin doesn't implement `set_tensors` function", { - ".*OVInferRequestBatchedTests.SetInputTensorsBase.*", - ".*OVInferRequestBatchedTests.SetInputTensorsAsync.*", - ".*OVInferRequestBatchedTests.SetInputTensors_override_with_set.*", - 
".*OVInferRequestBatchedTests.SetInputTensorsBase_Caching.*", - ".*OVInferRequestBatchedTests.SetInputTensors_Multiple_Infer.*", + _skipRegistry.addPatterns( + "NPU plugin doesn't support infer dynamic", { ".*OVInferRequestBatchedTests.SetInputTensors_Can_Infer_Dynamic.*", - ".*OVInferRequestBatchedTests.SetInputTensors_Get_Tensor_Not_Allowed.*", - ".*OVInferRequestBatchedTests.SetInputTensors_Correct_all.*", - ".*OVInferRequestBatchedTests.SetInputTensors_Cache_CheckDeepCopy.*" }); // [Tracking number: E#118381]