NPUW: Unfold infer requests #27319

Merged
merged 15 commits into from
Nov 21, 2024
Changes from 8 commits
@@ -56,6 +56,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime);
DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime);
@@ -279,6 +279,14 @@ static constexpr ov::Property<bool> parallel_compilation{"NPUW_PARALLEL_COMPILE"
*/
static constexpr ov::Property<bool> funcall_async{"NPUW_FUNCALL_ASYNC"};

/**
* @brief
* Type: boolean
* Create individual infer requests for partitions, even repeating ones.
* Default value: false.
*/
static constexpr ov::Property<bool> unfold_ireqs{"NPUW_UNFOLD_IREQS"};

namespace accuracy {
/**
* @brief
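For reference, a minimal sketch of how the new property above could be enabled from application code. Everything here except the "NPUW_UNFOLD_IREQS" key itself is an assumption for illustration (the "NPU" device name, the "NPU_USE_NPUW" enabling key, and the model path are not part of this change):

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // hypothetical model path
    // Pass the NPUW options at compile time; NPUW_UNFOLD_IREQS requests one
    // infer request per partition, even for repeated (function) partitions.
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       ov::AnyMap{{"NPU_USE_NPUW", "YES"},
                                                  {"NPUW_UNFOLD_IREQS", "YES"}});
    auto request = compiled.create_infer_request();
    request.infer();
    return 0;
}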
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -38,6 +38,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_FUNCALL_FOR_ALL>();
desc.add<NPUW_PARALLEL_COMPILE>();
desc.add<NPUW_FUNCALL_ASYNC>();
desc.add<NPUW_UNFOLD_IREQS>();
desc.add<NPUW_WEIGHTS_BANK>();
desc.add<NPUW_WEIGHTS_BANK_ALLOC>();
desc.add<NPUW_CACHE_DIR>();
199 changes: 199 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
@@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/core/parallel.hpp"

#include "base_sync_infer_request.hpp"

#include "compiled_model.hpp"
@@ -178,6 +180,33 @@ void ov::npuw::IBaseInferRequest::check_tensors() const {
return;
}

std::vector<ov::SoPtr<ov::IVariableState>> ov::npuw::IBaseInferRequest::query_state() const {
std::vector<ov::SoPtr<ov::IVariableState>> variable_states = {};
for (const auto& request : m_subrequests) {
if (!request) // optimized out
continue;
for (auto&& state : request->query_state()) {
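// Tie the returned state to the subrequest's shared object so the backing
// plugin library stays loaded while the state is in use (standard SoPtr idiom).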
if (!state._so)
state._so = request._so;
variable_states.emplace_back(state);
}
}
return variable_states;
}

std::vector<ov::ProfilingInfo> ov::npuw::IBaseInferRequest::get_profiling_info() const {
std::vector<ov::ProfilingInfo> info;
for (size_t i = 0; i < m_subrequests.size(); ++i) {
if (!m_subrequests[i]) // optimized out
continue;
auto&& subreq_info = m_subrequests[i]->get_profiling_info();
for (auto&& rec : subreq_info)
rec.node_name = std::string("subgraph") + std::to_string(i) + ": " + rec.node_name;
info.insert(info.end(), subreq_info.begin(), subreq_info.end());
}
return info;
}

void ov::npuw::IBaseInferRequest::infer() {
m_now_idx.reset();
prepare_for_infer();
@@ -209,6 +238,176 @@ void ov::npuw::IBaseInferRequest::infer() {
m_now_idx.reset();
}


std::size_t ov::npuw::IBaseInferRequest::total_subrequests() const {
return m_subrequests.size();
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocMem(const ov::element::Type type,
const ov::Shape& shape,
const std::string& device) {
if (device == "CPU" || ov::shape_size(shape) == 0) {
return ov::get_tensor_impl(ov::Tensor(type, shape));
}

// Protect access to shared context(s) - at least among infer requests
auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
auto remote_tensor = remote_ctx->create_host_tensor(type, shape);
return ov::get_tensor_impl(ov::make_tensor(remote_tensor));
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocOut(const ov::Output<const ov::Node>& node,
const std::string& device) {
return allocMem(node.get_element_type(), node.get_shape(), device);
}

void ov::npuw::IBaseInferRequest::alloc_io() {
// Preallocate input tensors
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
m_input_tensors.push_back(allocated);
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
} // for(inputs)

// Preallocate output tensors
LOG_INFO("Preallocating output tensors...");
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
LOG_BLOCK();
const auto& port = m_npuw_model->outputs()[i];
LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port);

// FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);

auto tensor = alloc_global_out(i);
m_output_tensors.push_back(tensor);
m_port_to_tensor[port] = TensorStorage{tensor, true};
}
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) {
const auto& port = m_npuw_model->outputs().at(out_idx);
return allocOut(port, m_npuw_model->global_mem_device());
}

void ov::npuw::IBaseInferRequest::init_gio() {
// Build the parameter/result mapping
m_subrequests_gio.resize(m_subrequests.size());

// Parameters: stage 1...
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i);
if (to_submodel != CompiledModel::NO_LINK) {
std::size_t sub_idx{}, in_idx{};
std::tie(sub_idx, in_idx) = to_submodel;
m_subrequests_gio.at(sub_idx).global_params[i] = in_idx;
}
} // for(inputs)

// Parameters: stage 2...
for (auto&& it : m_npuw_model->m_param_subscribers) {
const auto param_idx = it.first;
for (auto&& to_submodel : it.second) {
std::size_t sub_idx{}, in_idx{};
std::tie(sub_idx, in_idx) = to_submodel;
m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx;
}
}

// Results
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
std::size_t sub_idx{}, out_idx{};
std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
m_subrequests_gio.at(sub_idx).global_results[i] = out_idx;
}
}

void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request) {
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];

NPUW_ASSERT(comp_model_desc.replaced_by);
const auto real_idx = comp_model_desc.replaced_by.value();
auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx];

// Bind extra parameters from the function's closure
// First, do easy things & delay heavy stuff
std::vector<std::size_t> closure_unpack_required;
std::vector<std::size_t> closure_copy_required;

for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) {
auto& closure = comp_model_desc.closure[cidx];

const auto closure_param_id = comp_model_desc.param_base + cidx;

if (func_desc.host_gather.dst_idx != -1 &&
static_cast<uint64_t>(func_desc.host_gather.dst_idx) == closure_param_id) {
// No need to set/copy the host_gather's closure tensor into
// the subrequest - it is just a dummy. host_gather writes
// to the right buffer directly.
continue;
}

auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
if (closure.get_element_type() != iport.get_element_type()) {
// Remember where the unpack is required
closure_unpack_required.push_back(cidx);
} else {
if (needs_copy(idx, cidx)) {
// Remember where copy is required
closure_copy_required.push_back(cidx);
} else {
// Easy case, just set one to another
request->set_tensor(iport, ov::get_tensor_impl(closure));
}
}
} // for(closure)

// m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) {
auto cidx = closure_copy_required[j];
auto& closure = comp_model_desc.closure[cidx];
const auto closure_param_id = comp_model_desc.param_base + cidx;
auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
auto clparam = request->get_tensor(iport);
ov::get_tensor_impl(closure)->copy_to(clparam._ptr);
});
// }); // ms_to_run

for (std::size_t j = 0; j != closure_unpack_required.size(); j++) {
// NB: No need to protect anything here as containers are all
// preallocated and we only access elements under particular (thread
// -local) indices.
auto cidx = closure_unpack_required[j];

// FIXME: zerops are stored with absolute indexing, this needs to be aligned
auto& closure = comp_model_desc.closure[cidx];

const auto closure_param_id = comp_model_desc.param_base + cidx;
auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
auto clparam = request->get_tensor(iport);

if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) {
// Unpacking this weight requires scaling with zero points...
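// (Conceptually the usual dequantization, roughly dst = (closure - zerop) * scale
// element-wise; the exact type/layout handling lives in ov::npuw::util::unpack.)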
ov::npuw::util::unpack(ov::get_tensor_impl(closure),
ov::get_tensor_impl(comp_model_desc.zerops[cidx]),
ov::get_tensor_impl(comp_model_desc.scales[cidx]),
clparam);
} else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) {
// Unpacking this weight requires scaling
ov::npuw::util::unpack(ov::get_tensor_impl(closure),
ov::get_tensor_impl(comp_model_desc.scales[cidx]),
clparam);
} else {
// Unpacking this weight doesn't require scaling
ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam);
}
}
}

void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) {
const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size();
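The query_state() and get_profiling_info() defaults added above aggregate results over all non-optimized-out subrequests, with every profiling record prefixed by the producing subgraph index. A usage sketch, assuming the model was compiled with ov::enable_profiling(true) (the helper name is illustrative):

#include <iostream>
#include <openvino/openvino.hpp>

void dump_npuw_profiling(ov::InferRequest& request) {
    for (const auto& rec : request.get_profiling_info()) {
        // node_name arrives as "subgraph<N>: <original node name>"
        std::cout << rec.node_name << " took " << rec.real_time.count() << " us\n";
    }
}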
@@ -19,8 +19,15 @@
namespace ov {
namespace npuw {

using TensorPtr = ov::SoPtr<ov::ITensor>;

class CompiledModel;

using LinkFrom = std::pair<std::size_t /* Subrequest index */
,
std::size_t /* Subrequest output index */
>; // FIXME: This is a third, if not fourth, definition of such a structure

// This interface is provided to npuw::AsyncInferRequest to manage the
// individual subrequests' execution
class IBaseInferRequest : public ov::ISyncInferRequest {
@@ -40,6 +47,10 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

void check_tensors() const override;

// Query APIs - some default implementations here
std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
std::vector<ov::ProfilingInfo> get_profiling_info() const override;

using sptr = std::shared_ptr<IBaseInferRequest>;
using Completed = std::function<void(std::exception_ptr)>;

@@ -50,7 +61,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
virtual void run_subrequest_for_success(std::size_t idx, bool& failover) = 0;
virtual void complete_subrequest(std::size_t idx) = 0;
virtual void cancel_subrequest(std::size_t idx) = 0;
virtual std::size_t total_subrequests() const = 0;
virtual std::size_t total_subrequests() const;
virtual bool supports_async_pipeline() const = 0;

protected:
@@ -107,8 +118,30 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
};
std::vector<SpatialIO> m_spatial_io;

// This structure tracks how every individual subrequest
// accesses the model's top-level (global, public, etc.) parameters
// and results. Again, it is managed by subclasses
struct GlobalIO {
using map_t = std::map<std::size_t, std::size_t>;
map_t global_params; // param idx -> input idx
map_t global_results; // result idx -> output idx
};
std::vector<GlobalIO> m_subrequests_gio;
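// Example (illustrative): if the model's global Parameter #0 feeds input #1 of
// subrequest #2, init_gio() records m_subrequests_gio[2].global_params[0] == 1.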

// Tracks tensors we allocated on our own - to recognize and avoid copies
std::unordered_set<void*> m_input_allocated;

// Common functionality - shared for subclasses
const std::size_t m_num_submodels;

TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device);
virtual void alloc_io();
virtual TensorPtr alloc_global_out(std::size_t out_idx);

virtual void init_gio();
void unpack_closure(std::size_t idx, RqPtr request);

void dump_input_tensors(std::size_t idx);
void dump_output_tensors(std::size_t idx);

18 changes: 12 additions & 6 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -21,6 +21,7 @@
#include "openvino/util/common_util.hpp"
#include "partitioning/patterns/opt.hpp"
#include "plugin.hpp"
#include "unfold_sync_infer_request.hpp"
#include "util.hpp"

// required for get_properties_per_device()
@@ -708,16 +709,20 @@ void ov::npuw::CompiledModel::dump_on_fail(std::size_t id, const std::string& de
}
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_just_sync_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr);
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infer_request() const {
// Synchronous infer request implementation may vary based on the
// selected strategy
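// (With this change, NPUW_UNFOLD_IREQS selects UnfoldInferRequest, which keeps an
// individual infer request per partition even for repeated function bodies;
// otherwise the existing JustInferRequest path is taken.)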
auto* non_const_this = const_cast<ov::npuw::CompiledModel*>(this); // because of const in API
return non_const_this->create_just_sync_infer_request();
auto non_const_this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(non_const_this->shared_from_this());

std::shared_ptr<ov::ISyncInferRequest> result;
if (m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>()) {
result.reset(new ov::npuw::UnfoldInferRequest(non_const_this_sptr));
} else {
result.reset(new ov::npuw::JustInferRequest(non_const_this_sptr));
}
NPUW_ASSERT(result);
return result;
}

std::shared_ptr<ov::IAsyncInferRequest> ov::npuw::CompiledModel::create_infer_request() const {
@@ -934,6 +939,7 @@ void ov::npuw::CompiledModel::implement_properties() {
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
BIND(npuw::cache_dir, NPUW_CACHE_DIR),
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -47,6 +47,7 @@ class CompiledModel : public ov::ICompiledModel {
// FIXME: This class has many friends..
friend class IBaseInferRequest;
friend class JustInferRequest;
friend class UnfoldInferRequest;
friend class MemAccessSim;
friend class FuncMemMgr;

@@ -66,7 +67,6 @@

std::shared_ptr<const ::intel_npu::Plugin> get_npuw_plugin() const;

std::shared_ptr<ov::ISyncInferRequest> create_just_sync_infer_request();
std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;

std::string submodel_device(const std::size_t idx) const;