NPUW: Unfold infer requests #27319

Merged
merged 15 commits into from
Nov 21, 2024
Changes from 8 commits
@@ -56,6 +56,7 @@ DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
DEFINE_OPT(NPUW_CACHE_DIR, std::string, "", npuw::cache_dir, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime);
DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime);
@@ -279,6 +279,14 @@ static constexpr ov::Property<bool> parallel_compilation{"NPUW_PARALLEL_COMPILE"
*/
static constexpr ov::Property<bool> funcall_async{"NPUW_FUNCALL_ASYNC"};

/**
* @brief
* Type: boolean
* Create individual infer requests for partitions, even repeating ones.
* Default value: false.
*/
static constexpr ov::Property<bool> unfold_ireqs{"NPUW_UNFOLD_IREQS"};

namespace accuracy {
/**
* @brief
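For reference, a minimal sketch of how the new property above could be enabled from application code. Everything here except the "NPUW_UNFOLD_IREQS" key itself is an assumption for illustration (the "NPU" device name, the "NPU_USE_NPUW" enabling key, and the model path are not part of this change):

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // hypothetical model path
    // Pass the NPUW options at compile time; NPUW_UNFOLD_IREQS requests one
    // infer request per partition, even for repeated (function) partitions.
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       ov::AnyMap{{"NPU_USE_NPUW", "YES"},
                                                  {"NPUW_UNFOLD_IREQS", "YES"}});
    auto request = compiled.create_infer_request();
    request.infer();
    return 0;
}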
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -38,6 +38,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_FUNCALL_FOR_ALL>();
desc.add<NPUW_PARALLEL_COMPILE>();
desc.add<NPUW_FUNCALL_ASYNC>();
desc.add<NPUW_UNFOLD_IREQS>();
desc.add<NPUW_WEIGHTS_BANK>();
desc.add<NPUW_WEIGHTS_BANK_ALLOC>();
desc.add<NPUW_CACHE_DIR>();
199 changes: 199 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp
@@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/core/parallel.hpp"

#include "base_sync_infer_request.hpp"

#include "compiled_model.hpp"
@@ -178,6 +180,33 @@ void ov::npuw::IBaseInferRequest::check_tensors() const {
return;
}

std::vector<ov::SoPtr<ov::IVariableState>> ov::npuw::IBaseInferRequest::query_state() const {
std::vector<ov::SoPtr<ov::IVariableState>> variable_states = {};
for (const auto& request : m_subrequests) {
if (!request) // optimized out
continue;
for (auto&& state : request->query_state()) {
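// Tie the returned state to the subrequest's shared object so the backing
// plugin library stays loaded while the state is in use (standard SoPtr idiom).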
if (!state._so)
state._so = request._so;
variable_states.emplace_back(state);
}
}
return variable_states;
}

std::vector<ov::ProfilingInfo> ov::npuw::IBaseInferRequest::get_profiling_info() const {
std::vector<ov::ProfilingInfo> info;
for (size_t i = 0; i < m_subrequests.size(); ++i) {
if (!m_subrequests[i]) // optimized out
continue;
auto&& subreq_info = m_subrequests[i]->get_profiling_info();
for (auto&& rec : subreq_info)
rec.node_name = std::string("subgraph") + std::to_string(i) + ": " + rec.node_name;
info.insert(info.end(), subreq_info.begin(), subreq_info.end());
}
return info;
}

void ov::npuw::IBaseInferRequest::infer() {
m_now_idx.reset();
prepare_for_infer();
@@ -209,6 +238,176 @@ void ov::npuw::IBaseInferRequest::infer() {
m_now_idx.reset();
}


std::size_t ov::npuw::IBaseInferRequest::total_subrequests() const {
return m_subrequests.size();
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocMem(const ov::element::Type type,
const ov::Shape& shape,
const std::string& device) {
if (device == "CPU" || ov::shape_size(shape) == 0) {
return ov::get_tensor_impl(ov::Tensor(type, shape));
}

// Protect access to shared context(s) - at least among infer requests
auto remote_ctx = m_npuw_model->get_plugin()->get_core()->get_default_context(device)._ptr;
auto remote_tensor = remote_ctx->create_host_tensor(type, shape);
return ov::get_tensor_impl(ov::make_tensor(remote_tensor));
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::allocOut(const ov::Output<const ov::Node>& node,
const std::string& device) {
return allocMem(node.get_element_type(), node.get_shape(), device);
}

void ov::npuw::IBaseInferRequest::alloc_io() {
// Preallocate input tensors
LOG_INFO("Preallocating input tensors...");
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& port = m_npuw_model->inputs()[i];
ov::SoPtr<ov::ITensor> allocated = allocOut(port, m_npuw_model->global_mem_device());
m_input_tensors.push_back(allocated);
m_input_allocated.insert(allocated->data());
m_port_to_tensor[port] = TensorStorage{m_input_tensors.back(), true};
} // for(inputs)

// Preallocate output tensors
LOG_INFO("Preallocating output tensors...");
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
LOG_BLOCK();
const auto& port = m_npuw_model->outputs()[i];
LOG_INFO("Output " << i << " of " << m_npuw_model->outputs().size() << ": " << port);

// FIXME: Yes, the CompiledModel::ToSubmodel == JustInferRequest::LinkFrom
const auto& from_submodel = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
LOG_INFO("Produced by Subgraph[" << from_submodel.first << "] / " << from_submodel.second);

auto tensor = alloc_global_out(i);
m_output_tensors.push_back(tensor);
m_port_to_tensor[port] = TensorStorage{tensor, true};
}
}

ov::npuw::TensorPtr ov::npuw::IBaseInferRequest::alloc_global_out(std::size_t out_idx) {
const auto& port = m_npuw_model->outputs().at(out_idx);
return allocOut(port, m_npuw_model->global_mem_device());
}

void ov::npuw::IBaseInferRequest::init_gio() {
// Build the parameter/result mapping
m_subrequests_gio.resize(m_subrequests.size());

// Parameters: stage 1...
for (size_t i = 0; i < m_npuw_model->inputs().size(); i++) {
const auto& to_submodel = m_npuw_model->m_inputs_to_submodels_inputs.at(i);
if (to_submodel != CompiledModel::NO_LINK) {
std::size_t sub_idx{}, in_idx{};
std::tie(sub_idx, in_idx) = to_submodel;
m_subrequests_gio.at(sub_idx).global_params[i] = in_idx;
}
} // for(inputs)

// Parameters: stage 2...
for (auto&& it : m_npuw_model->m_param_subscribers) {
const auto param_idx = it.first;
for (auto&& to_submodel : it.second) {
std::size_t sub_idx{}, in_idx{};
std::tie(sub_idx, in_idx) = to_submodel;
m_subrequests_gio.at(sub_idx).global_params[param_idx] = in_idx;
}
}

// Results
for (size_t i = 0; i < m_npuw_model->outputs().size(); i++) {
std::size_t sub_idx{}, out_idx{};
std::tie(sub_idx, out_idx) = m_npuw_model->m_outputs_to_submodels_outputs.at(i);
m_subrequests_gio.at(sub_idx).global_results[i] = out_idx;
}
}

void ov::npuw::IBaseInferRequest::unpack_closure(std::size_t idx, RqPtr request) {
auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx];

NPUW_ASSERT(comp_model_desc.replaced_by);
const auto real_idx = comp_model_desc.replaced_by.value();
auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx];

// Bind extra parameters from the function's closure
// First, do easy things & delay heavy stuff
std::vector<std::size_t> closure_unpack_required;
std::vector<std::size_t> closure_copy_required;

for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) {
auto& closure = comp_model_desc.closure[cidx];

const auto closure_param_id = comp_model_desc.param_base + cidx;

if (func_desc.host_gather.dst_idx != -1 &&
static_cast<uint64_t>(func_desc.host_gather.dst_idx) == closure_param_id) {
// No need to set/copy the host_gather's closure tensor into
// the subrequest - it is just a dummy. host_gather writes
// to the right buffer directly.
continue;
}

auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
if (closure.get_element_type() != iport.get_element_type()) {
// Remember where the unpack is required
closure_unpack_required.push_back(cidx);
} else {
if (needs_copy(idx, cidx)) {
// Remember where copy is required
closure_copy_required.push_back(cidx);
} else {
// Easy case, just set one to another
request->set_tensor(iport, ov::get_tensor_impl(closure));
}
}
} // for(closure)

// m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) {
auto cidx = closure_copy_required[j];
auto& closure = comp_model_desc.closure[cidx];
const auto closure_param_id = comp_model_desc.param_base + cidx;
auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
auto clparam = request->get_tensor(iport);
ov::get_tensor_impl(closure)->copy_to(clparam._ptr);
});
// }); // ms_to_run

for (std::size_t j = 0; j != closure_unpack_required.size(); j++) {
// NB: No need to protect anything here as containers are all
// preallocated and we only access elements under particular (thread
// -local) indices.
auto cidx = closure_unpack_required[j];

// FIXME: zerops are stored with absolute indexing, this needs to be aligned
auto& closure = comp_model_desc.closure[cidx];

const auto closure_param_id = comp_model_desc.param_base + cidx;
auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
auto clparam = request->get_tensor(iport);

if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx] && comp_model_desc.zerops[cidx]) {
// Unpacking this weight requires scaling with zero points...
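// (Conceptually the usual dequantization, roughly dst = (closure - zerop) * scale
// element-wise; the exact type/layout handling lives in ov::npuw::util::unpack.)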
ov::npuw::util::unpack(ov::get_tensor_impl(closure),
ov::get_tensor_impl(comp_model_desc.zerops[cidx]),
ov::get_tensor_impl(comp_model_desc.scales[cidx]),
clparam);
} else if (!comp_model_desc.scales.empty() && comp_model_desc.scales[cidx]) {
// Unpacking this weight requires scaling
ov::npuw::util::unpack(ov::get_tensor_impl(closure),
ov::get_tensor_impl(comp_model_desc.scales[cidx]),
clparam);
} else {
// Unpacking this weight doesn't require scaling
ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam);
}
}
}

void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) {
const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size();
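The query_state() and get_profiling_info() defaults added above aggregate results over all non-optimized-out subrequests, with every profiling record prefixed by the producing subgraph index. A usage sketch, assuming the model was compiled with ov::enable_profiling(true) (the helper name is illustrative):

#include <iostream>
#include <openvino/openvino.hpp>

void dump_npuw_profiling(ov::InferRequest& request) {
    for (const auto& rec : request.get_profiling_info()) {
        // node_name arrives as "subgraph<N>: <original node name>"
        std::cout << rec.node_name << " took " << rec.real_time.count() << " us\n";
    }
}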
@@ -19,8 +19,15 @@
namespace ov {
namespace npuw {

using TensorPtr = ov::SoPtr<ov::ITensor>;

class CompiledModel;

using LinkFrom = std::pair<std::size_t /* Subrequest index */
,
std::size_t /* Subrequest output index */
>; // FIXME: This is a third, if not fourth, definition of such a structure

// This interface is provided to npuw::AsyncInferRequest to manage the
// individual subrequests' execution
class IBaseInferRequest : public ov::ISyncInferRequest {
@@ -40,6 +47,10 @@ class IBaseInferRequest : public ov::ISyncInferRequest {

void check_tensors() const override;

// Query APIs - some default implementations here
std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
std::vector<ov::ProfilingInfo> get_profiling_info() const override;

using sptr = std::shared_ptr<IBaseInferRequest>;
using Completed = std::function<void(std::exception_ptr)>;

@@ -50,7 +61,7 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
virtual void run_subrequest_for_success(std::size_t idx, bool& failover) = 0;
virtual void complete_subrequest(std::size_t idx) = 0;
virtual void cancel_subrequest(std::size_t idx) = 0;
virtual std::size_t total_subrequests() const = 0;
virtual std::size_t total_subrequests() const;
virtual bool supports_async_pipeline() const = 0;

protected:
@@ -107,8 +118,30 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
};
std::vector<SpatialIO> m_spatial_io;

// This structure tracks how every individual subrequest
// accesses the model's top-level (global, public, etc.) parameters
// and results. Again, it is managed by subclasses
struct GlobalIO {
using map_t = std::map<std::size_t, std::size_t>;
map_t global_params; // param idx -> input idx
map_t global_results; // result idx -> output idx
};
std::vector<GlobalIO> m_subrequests_gio;
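// Example (illustrative): if the model's global Parameter #0 feeds input #1 of
// subrequest #2, init_gio() records m_subrequests_gio[2].global_params[0] == 1.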

// Tracks tensors we allocated on our own - to recognize and avoid copies
std::unordered_set<void*> m_input_allocated;

// Common functionality - shared for subclasses
const std::size_t m_num_submodels;

TensorPtr allocMem(const ov::element::Type type, const ov::Shape& shape, const std::string& device);
TensorPtr allocOut(const ov::Output<const ov::Node>& node, const std::string& device);
virtual void alloc_io();
virtual TensorPtr alloc_global_out(std::size_t out_idx);

virtual void init_gio();
void unpack_closure(std::size_t idx, RqPtr request);

void dump_input_tensors(std::size_t idx);
void dump_output_tensors(std::size_t idx);

18 changes: 12 additions & 6 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -21,6 +21,7 @@
#include "openvino/util/common_util.hpp"
#include "partitioning/patterns/opt.hpp"
#include "plugin.hpp"
#include "unfold_sync_infer_request.hpp"
#include "util.hpp"

// required for get_properties_per_device()
@@ -708,16 +709,20 @@ void ov::npuw::CompiledModel::dump_on_fail(std::size_t id, const std::string& de
}
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_just_sync_infer_request() {
auto this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(shared_from_this());
return std::make_shared<ov::npuw::JustInferRequest>(this_sptr);
}

std::shared_ptr<ov::ISyncInferRequest> ov::npuw::CompiledModel::create_sync_infer_request() const {
// Synchronous infer request implementation may vary based on the
// selected strategy
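// (With this change, NPUW_UNFOLD_IREQS selects UnfoldInferRequest, which keeps an
// individual infer request per partition even for repeated function bodies;
// otherwise the existing JustInferRequest path is taken.)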
auto* non_const_this = const_cast<ov::npuw::CompiledModel*>(this); // because of const in API
return non_const_this->create_just_sync_infer_request();
auto non_const_this_sptr = std::static_pointer_cast<ov::npuw::CompiledModel>(non_const_this->shared_from_this());

std::shared_ptr<ov::ISyncInferRequest> result;
if (m_cfg.get<::intel_npu::NPUW_UNFOLD_IREQS>()) {
result.reset(new ov::npuw::UnfoldInferRequest(non_const_this_sptr));
} else {
result.reset(new ov::npuw::JustInferRequest(non_const_this_sptr));
}
NPUW_ASSERT(result);
return result;
}

std::shared_ptr<ov::IAsyncInferRequest> ov::npuw::CompiledModel::create_infer_request() const {
@@ -934,6 +939,7 @@ void ov::npuw::CompiledModel::implement_properties() {
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
BIND(npuw::cache_dir, NPUW_CACHE_DIR),
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -47,6 +47,7 @@ class CompiledModel : public ov::ICompiledModel {
// FIXME: This class has many friends..
friend class IBaseInferRequest;
friend class JustInferRequest;
friend class UnfoldInferRequest;
friend class MemAccessSim;
friend class FuncMemMgr;

@@ -66,7 +67,6 @@

std::shared_ptr<const ::intel_npu::Plugin> get_npuw_plugin() const;

std::shared_ptr<ov::ISyncInferRequest> create_just_sync_infer_request();
std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;

std::string submodel_device(const std::size_t idx) const;