Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NPUW: Spatial execution #26880

Merged
merged 10 commits into from
Oct 9, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime);
DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime);
DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime);
DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime);
DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 64, npuw::partitioning::spatial_nway, CompileTime);
DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);
DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime);
DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,22 @@ static constexpr ov::Property<bool> dyn_quant{"NPUW_DQ"};
*/
static constexpr ov::Property<std::string> par_matmul_merge_dims{"NPUW_PMM"};

/**
 * @brief Enable spatial execution for selected subgraphs.
 * Type: boolean.
 * Enable spatial execution for selected subgraphs. Requires COMPUTE isolation.
 * Default value: false
 */
static constexpr ov::Property<bool> spatial{"NPUW_SPATIAL"};

/**
 * @brief Submission (tile) size for spatial execution.
 * Type: std::size_t.
 * Submission size for the spatial execution: the number of elements
 * processed over the spatial dimension in a single submission (tile).
 * Only takes effect when NPUW_SPATIAL is set.
 * Default value: 64
 */
static constexpr ov::Property<std::size_t> spatial_nway{"NPUW_SPATIAL_NWAY"};

/**
* @brief
* Type: boolean
Expand Down
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_CWAI>();
desc.add<NPUW_DQ>();
desc.add<NPUW_PMM>();
desc.add<NPUW_SPATIAL>();
desc.add<NPUW_SPATIAL_NWAY>();
desc.add<NPUW_HOST_GATHER>();
desc.add<NPUW_DCOFF_TYPE>();
desc.add<NPUW_DCOFF_SCALE>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,33 +210,91 @@ void ov::npuw::IBaseInferRequest::infer() {
}

// Dump this subgraph's input tensors to disk, if requested via NPUW_DUMP_IO.
// For spatial subgraphs, spatial inputs are dumped tile-by-tile from the
// dedicated spatial_io storage; non-spatial inputs are dumped once.
void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) {
    // Bail out early if dumping wasn't requested for this subgraph
    const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
    if (!ov::npuw::util::is_set(idx, dump_ios_opt)) {
        return;
    }

    auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx);
    const auto& comp_submodel_desc = m_npuw_model->m_compiled_submodels[real_idx];
    const auto& comp_submodel = comp_submodel_desc.compiled_model;

    // Note: keep using the absolute `idx` for identification and printing
    // Note:
    // - _name is used for the user option (no leading 00s for indices)
    // - _path is used for disk dump (will have leading 00s for indices)
    const auto comp_submodel_name = subgr_name(idx);
    const auto comp_submodel_path = m_npuw_model->m_name + subgr_path_suffix(idx) + iter_path_suffix(idx);
    const auto num_inputs = comp_submodel->inputs().size();

    // There's different approaches to dumping normal and spatial subgraphs.
    if (!comp_submodel_desc.spatial) {
        // In the normal, non-spatial mode, we just dump the current subrequest's
        // pre-set tensors and that's it
        std::vector<std::string> in_base_names;
        for (std::size_t i = 0u; i < num_inputs; i++) {
            const auto& port = comp_submodel->inputs()[i];
            const auto& tnsr = m_subrequests[real_idx]->get_tensor(port);
            std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(i, num_inputs);
            ov::npuw::dump_tensor(tnsr, in_base_name);
            in_base_names.push_back(std::move(in_base_name));
        }
        ov::npuw::dump_input_list(comp_submodel_path, in_base_names);
    } else {
        const auto& s = comp_submodel_desc.spatial.value();

        std::set<std::size_t> spatial_param_idx;
        std::vector<std::string> in_base_names_nonspat;

        // First, dump the non-spatial input tensors just once - and remember their names
        for (auto&& p : s.params) {
            spatial_param_idx.insert(p.idx);
        }
        for (std::size_t i = 0u; i < num_inputs; i++) {
            if (spatial_param_idx.count(i)) {
                continue;
            }
            const auto& port = comp_submodel->inputs()[i];
            const auto& tnsr = m_subrequests[real_idx]->get_tensor(port);
            std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(i, num_inputs);
            ov::npuw::dump_tensor(tnsr, in_base_name);
            in_base_names_nonspat.push_back(std::move(in_base_name));
        }

        // Now iterate over the spatial range and dump the individual tiles
        // For the spatial case, these tiles should've been taken from the special
        // spatial_io tensors
        for (std::size_t offset = 0u; offset < s.range; offset += s.nway) {
            const std::size_t this_len = (offset + s.nway <= s.range) ? s.nway  // the full tile
                                                                     : (s.range - offset);  // the last tile
            // Copy the base file list to start with it
            std::vector<std::string> tile_ilist(in_base_names_nonspat);
            for (auto&& p : s.params) {
                std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(p.idx, num_inputs) +
                                           "_d" + ov::npuw::util::fmt(p.dim, 10) + "_" +
                                           ov::npuw::util::fmt(offset, s.range);

                const auto& tnsr = m_spatial_io[real_idx].inputs.at(p.idx);
                const auto& view = ov::npuw::util::view(tnsr, p.dim, offset, this_len);

                ov::npuw::dump_tensor(view, in_base_name);
                tile_ilist.push_back(std::move(in_base_name));
            }
            // Dump ilist per tile
            ov::npuw::dump_input_list(comp_submodel_path, tile_ilist);
        }  // for(offset)
    }
}

void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) {
const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
if (!ov::npuw::util::is_set(idx, dump_ios_opt)) {
return;
}

auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx);
const auto& comp_submodel = m_npuw_model->m_compiled_submodels[real_idx].compiled_model;
const auto& comp_submodel_desc = m_npuw_model->m_compiled_submodels[real_idx];
const auto& comp_submodel = comp_submodel_desc.compiled_model;

// Note: keep using the absolute `idx` for identififaction and printing
// Note:
Expand All @@ -245,18 +303,39 @@ void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) {
// FIXME: Duplication is evil
const auto comp_submodel_name = subgr_name(idx);
const auto comp_submodel_path = m_npuw_model->m_name + subgr_path_suffix(idx) + iter_path_suffix(idx);
const std::size_t num_outputs = comp_submodel->outputs().size();

const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
if (ov::npuw::util::is_set(idx, dump_ios_opt)) {
// Same approach as in above. Spatial tensors require special handling
if (!comp_submodel_desc.spatial) {
std::vector<std::string> out_base_names;
for (std::size_t i = 0u, num_outputs = comp_submodel->outputs().size(); i < num_outputs; i++) {
for (std::size_t i = 0u; i < num_outputs; i++) {
const auto& port = comp_submodel->outputs()[i];
const auto& tnsr = m_subrequests[real_idx]->get_tensor(port);
std::string out_base_name = comp_submodel_path + "_output_" + ov::npuw::util::fmt(i, num_outputs);
ov::npuw::dump_tensor(tnsr, out_base_name);
out_base_names.push_back(std::move(out_base_name));
}
ov::npuw::dump_output_list(comp_submodel_path, out_base_names);
} else {
// All outputs are considered spatial now so it should be easier
const auto& s = comp_submodel_desc.spatial.value();
for (std::size_t offset = 0u; offset < s.range; offset += s.nway) {
const std::size_t this_len = (offset + s.nway <= s.range) ? s.nway // the full tile
: (s.range - offset); // the last tile
std::vector<std::string> tile_olist;
for (std::size_t i = 0u; i < num_outputs; i++) {
std::string out_base_name = comp_submodel_path + "_output_" + ov::npuw::util::fmt(i, num_outputs) +
"_d" + ov::npuw::util::fmt(s.out_dim, 10) + "_" +
ov::npuw::util::fmt(offset, s.range);
const auto& tnsr = m_spatial_io[real_idx].outputs.at(i);
const auto& view = ov::npuw::util::view(tnsr, s.out_dim, offset, this_len);

ov::npuw::dump_tensor(view, out_base_name);
tile_olist.push_back(std::move(out_base_name));
}
// Dump olist per tile
ov::npuw::dump_output_list(comp_submodel_path, tile_olist);
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,22 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
// FROM(Every subrequests' output port) TO(Its output tensor)
std::map<ov::Output<const ov::Node>, TensorStorage> m_port_to_tensor;

// FIXME: Currently is initialized/managed by subclass as well.
// Moved here for dumping purposes only.
// Another sparse vector. Represents populated spatial I/O parameters
// which should be read/written by parts in multiple submissions.
// An ugly structure, cries for refactoring.
// See function_prologue for details.
// Also it contains pre-allocated tensors for tails handling.
struct SpatialIO {
    // Note: `inputs` is sized by the graph-side inputs only (the connections
    // in the graph we care about), while `outputs` covers all subgraph
    // outputs - subgraph inputs formally include closures as well, which
    // are not tracked here.
    std::vector<ov::SoPtr<ov::ITensor>> inputs;   // # of elements - # of graph-side inputs
    std::vector<ov::SoPtr<ov::ITensor>> outputs;  // # of elements - # of subgraph outputs

    std::vector<ov::SoPtr<ov::ITensor>> input_tails;   // temporary buffers for input tails
    std::vector<ov::SoPtr<ov::ITensor>> output_tails;  // temporary buffers for output tails
};
std::vector<SpatialIO> m_spatial_io;  // sparse: populated only for spatial subgraphs

const std::size_t m_num_submodels;

void dump_input_tensors(std::size_t idx);
Expand Down
18 changes: 18 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,22 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
compiledFunctions.insert({subgraph._funcall, id});
m_compiled_submodels[id].model = fcn_template._model;
m_compiled_submodels[id].replaced_by = id; // FIXME: UGLY

// Fill in the spatial information, if it is present
if (fcn_template._spatial) {
using S = CompiledModelDesc::Spatial;
S s;
s.range = fcn_template._spatial->_range;
s.nway = fcn_template._spatial->_slice;
s.out_dim = fcn_template._spatial->_out_dim;
s.nway_iters = s.range / s.nway;
s.tail_size = s.range % s.nway;
for (auto&& input : fcn_template._spatial->_inputs) {
std::size_t p_idx = fcn_template._model->get_parameter_index(input.param);
s.params.push_back(S::Param{p_idx, input.dim});
}
m_compiled_submodels[id].spatial = std::move(s);
}
LOG_INFO("Subgraph[" << id << "] is a function body for " << subgraph._funcall);
} else {
// ...and refer to it in other calls
Expand Down Expand Up @@ -824,6 +840,8 @@ void ov::npuw::CompiledModel::implement_properties() {
BIND(npuw::partitioning::cwai, NPUW_CWAI),
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
// FIX: was bound to `spatial` (copy-paste bug), which broke property
// queries for NPUW_SPATIAL_NWAY and shadowed the NPUW_SPATIAL binding
BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
Expand Down
18 changes: 16 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,24 @@ class CompiledModel : public ov::ICompiledModel {

std::optional<std::size_t> replaced_by;

// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
// w.r.t. function calls
Subgraph::Gather host_gather;
// Spatial execution parameters for a function body; populated only when
// the function template carries spatial information (see CompiledModel ctor).
struct Spatial {
    // A graph parameter processed by parts (tiles) over some dimension
    struct Param {
        std::size_t idx;  // parameter index in the function body's model
        std::size_t dim;  // the dimension this parameter is sliced over
    };
    std::vector<Param> params;  // inputs which are processed spatially
    std::size_t range = 0u;     // total extent of the spatial dimension
    std::size_t nway = 0u;      // tile (submission) size, from NPUW_SPATIAL_NWAY
    std::size_t out_dim = 0u;   // the spatial dimension in the outputs

    // Derived values, precomputed at compile time for convenience:
    std::size_t nway_iters = 0u;  // number of full tiles (range / nway)
    std::size_t tail_size = 0u;   // size of the last partial tile (range % nway)
};
std::optional<Spatial> spatial;

// FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure
// w.r.t. function calls
std::size_t param_base = 0;
std::vector<ov::Tensor> closure;
std::vector<ov::Tensor> scales;
Expand Down
Loading
Loading