diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp index 76a3b23259f1f5..53a12fb58d333c 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp @@ -44,6 +44,8 @@ DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime); DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime); DEFINE_OPT(NPUW_PMM, std::string, "2", npuw::partitioning::par_matmul_merge_dims, CompileTime); DEFINE_OPT(NPUW_HOST_GATHER, bool, true, npuw::partitioning::host_gather, CompileTime); +DEFINE_OPT(NPUW_SPATIAL, bool, false, npuw::partitioning::spatial, CompileTime); +DEFINE_OPT(NPUW_SPATIAL_NWAY, std::size_t, 64, npuw::partitioning::spatial_nway, CompileTime); DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime); DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime); DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime); diff --git a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp index 4ccb29469e98ab..4de9ee5ab15080 100644 --- a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp @@ -178,6 +178,22 @@ static constexpr ov::Property dyn_quant{"NPUW_DQ"}; */ static constexpr ov::Property par_matmul_merge_dims{"NPUW_PMM"}; +/** + * @brief + * Type: boolean. + * Enable spatial execution for selected subgraphs. Requires COMPUTE isolation. + * Default value: false + */ +static constexpr ov::Property spatial{"NPUW_SPATIAL"}; + +/** + * @brief + * Type: std::size_t. + * Submission size for the spatial execution. 
+ * Default value: 64 + */ +static constexpr ov::Property spatial_nway{"NPUW_SPATIAL_NWAY"}; + /** * @brief * Type: boolean diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 798b5344c4ea62..ac5a2623020c04 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -28,6 +28,8 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); + desc.add(); desc.add(); desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp index 58036d299b3c1b..3bbbbe6aca7c06 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp @@ -210,8 +210,14 @@ void ov::npuw::IBaseInferRequest::infer() { } void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { + const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); + if (!ov::npuw::util::is_set(idx, dump_ios_opt)) { + return; + } + auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx); - const auto& comp_submodel = m_npuw_model->m_compiled_submodels[real_idx].compiled_model; + const auto& comp_submodel_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const auto& comp_submodel = comp_submodel_desc.compiled_model; // Note: keep using the absolute `idx` for identififaction and printing // Note: @@ -219,11 +225,14 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { // - _path is used for disk dump (will have leading 00s for indices) const auto comp_submodel_name = subgr_name(idx); const auto comp_submodel_path = m_npuw_model->m_name + subgr_path_suffix(idx) + iter_path_suffix(idx); + const auto num_inputs = comp_submodel->inputs().size(); - const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); - if (ov::npuw::util::is_set(idx, dump_ios_opt)) { + // There's different approaches to dumping normal and spatial subgraphs. 
+ if (!comp_submodel_desc.spatial) { + // In the normal, non-spatial mode, we just dump the current subgrequests + // pre-set tensors and that's it std::vector in_base_names; - for (std::size_t i = 0u, num_inputs = comp_submodel->inputs().size(); i < num_inputs; i++) { + for (std::size_t i = 0u; i < num_inputs; i++) { const auto& port = comp_submodel->inputs()[i]; const auto& tnsr = m_subrequests[real_idx]->get_tensor(port); std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(i, num_inputs); @@ -231,12 +240,61 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { in_base_names.push_back(std::move(in_base_name)); } ov::npuw::dump_input_list(comp_submodel_path, in_base_names); + } else { + const auto& s = comp_submodel_desc.spatial.value(); + + std::set spatial_param_idx; + std::vector in_base_names_nonspat; + + // First, dump the non-spatial input tensors just once - and remember its names + for (auto&& p : s.params) { + spatial_param_idx.insert(p.idx); + } + for (std::size_t i = 0u; i < num_inputs; i++) { + if (spatial_param_idx.count(i)) { + continue; + } + const auto& port = comp_submodel->inputs()[i]; + const auto& tnsr = m_subrequests[real_idx]->get_tensor(port); + std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(i, num_inputs); + ov::npuw::dump_tensor(tnsr, in_base_name); + in_base_names_nonspat.push_back(std::move(in_base_name)); + } + + // Now iterate over the spatial range and dump the individual tiles + // For the spatial case, these tiles should've been taken from the special + // spatial_io tensors + for (std::size_t offset = 0u; offset < s.range; offset += s.nway) { + const std::size_t this_len = (offset + s.nway <= s.range) ? s.nway // the full tile + : (s.range - offset); // the last tile + // Copy the base file list to start with it + std::vector tile_ilist(in_base_names_nonspat); + for (auto&& p : s.params) { + std::string in_base_name = comp_submodel_path + "_input_" + ov::npuw::util::fmt(p.idx, num_inputs) + + "_d" + ov::npuw::util::fmt(p.dim, 10) + "_" + + ov::npuw::util::fmt(offset, s.range); + + const auto& tnsr = m_spatial_io[real_idx].inputs.at(p.idx); + const auto& view = ov::npuw::util::view(tnsr, p.dim, offset, this_len); + + ov::npuw::dump_tensor(view, in_base_name); + tile_ilist.push_back(std::move(in_base_name)); + } + // Dump ilist per tile + ov::npuw::dump_input_list(comp_submodel_path, tile_ilist); + } // for(offset) } } void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) { + const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); + if (!ov::npuw::util::is_set(idx, dump_ios_opt)) { + return; + } + auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx); - const auto& comp_submodel = m_npuw_model->m_compiled_submodels[real_idx].compiled_model; + const auto& comp_submodel_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const auto& comp_submodel = comp_submodel_desc.compiled_model; // Note: keep using the absolute `idx` for identififaction and printing // Note: @@ -245,11 +303,12 @@ void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) { // FIXME: Duplication is evil const auto comp_submodel_name = subgr_name(idx); const auto comp_submodel_path = m_npuw_model->m_name + subgr_path_suffix(idx) + iter_path_suffix(idx); + const std::size_t num_outputs = comp_submodel->outputs().size(); - const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); - if 
(ov::npuw::util::is_set(idx, dump_ios_opt)) { + // Same approach as above. Spatial tensors require special handling + if (!comp_submodel_desc.spatial) { std::vector out_base_names; - for (std::size_t i = 0u, num_outputs = comp_submodel->outputs().size(); i < num_outputs; i++) { + for (std::size_t i = 0u; i < num_outputs; i++) { const auto& port = comp_submodel->outputs()[i]; const auto& tnsr = m_subrequests[real_idx]->get_tensor(port); std::string out_base_name = comp_submodel_path + "_output_" + ov::npuw::util::fmt(i, num_outputs); @@ -257,6 +316,26 @@ void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) { out_base_names.push_back(std::move(out_base_name)); } ov::npuw::dump_output_list(comp_submodel_path, out_base_names); + } else { + // All outputs are considered spatial now so it should be easier + const auto& s = comp_submodel_desc.spatial.value(); + for (std::size_t offset = 0u; offset < s.range; offset += s.nway) { + const std::size_t this_len = (offset + s.nway <= s.range) ? s.nway // the full tile + : (s.range - offset); // the last tile + std::vector tile_olist; + for (std::size_t i = 0u; i < num_outputs; i++) { + std::string out_base_name = comp_submodel_path + "_output_" + ov::npuw::util::fmt(i, num_outputs) + + "_d" + ov::npuw::util::fmt(s.out_dim, 10) + "_" + + ov::npuw::util::fmt(offset, s.range); + const auto& tnsr = m_spatial_io[real_idx].outputs.at(i); + const auto& view = ov::npuw::util::view(tnsr, s.out_dim, offset, this_len); + + ov::npuw::dump_tensor(view, out_base_name); + tile_olist.push_back(std::move(out_base_name)); + } + // Dump olist per tile + ov::npuw::dump_output_list(comp_submodel_path, tile_olist); + } } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp index 986ea78c378c32..6d4b4c71ef3cab 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp @@ -91,6 +91,22 @@ class IBaseInferRequest : public ov::ISyncInferRequest { // FROM(Every subrequests' output port) TO(Its output tensor) std::map, TensorStorage> m_port_to_tensor; + // FIXME: Currently initialized/managed by the subclass as well. + // Moved here for dumping purposes only. + // Another sparse vector. Represents populated spatial I/O parameters + // which should be read/written in parts over multiple submissions. + // An ugly structure, cries for refactoring + // See function_prologue for details. 
+ // Also it contains pre-allocated tensors for tails handling + struct SpatialIO { + std::vector> inputs; // # of elements - # of graph-side inputs + std::vector> outputs; // # of elements - # of subgraph outputs + + std::vector> input_tails; // temporary buffers for input tails + std::vector> output_tails; // temporary buffers for output tails + }; + std::vector m_spatial_io; + const std::size_t m_num_submodels; void dump_input_tensors(std::size_t idx); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 3213be04ec3a33..1d2217f1114d0c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -277,6 +277,22 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, compiledFunctions.insert({subgraph._funcall, id}); m_compiled_submodels[id].model = fcn_template._model; m_compiled_submodels[id].replaced_by = id; // FIXME: UGLY + + // Fill in the spatial information, if it is present + if (fcn_template._spatial) { + using S = CompiledModelDesc::Spatial; + S s; + s.range = fcn_template._spatial->_range; + s.nway = fcn_template._spatial->_slice; + s.out_dim = fcn_template._spatial->_out_dim; + s.nway_iters = s.range / s.nway; + s.tail_size = s.range % s.nway; + for (auto&& input : fcn_template._spatial->_inputs) { + std::size_t p_idx = fcn_template._model->get_parameter_index(input.param); + s.params.push_back(S::Param{p_idx, input.dim}); + } + m_compiled_submodels[id].spatial = std::move(s); + } LOG_INFO("Subgraph[" << id << "] is a function body for " << subgraph._funcall); } else { // ...and refer to it in other calls @@ -824,6 +840,8 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::partitioning::cwai, NPUW_CWAI), BIND(npuw::partitioning::dyn_quant, NPUW_DQ), BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM), + BIND(npuw::partitioning::spatial, NPUW_SPATIAL), + BIND(npuw::partitioning::spatial, NPUW_SPATIAL_NWAY), BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER), BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL), BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 1ddaf3f543eaa8..ab517d6adc75ef 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -116,10 +116,24 @@ class CompiledModel : public ov::ICompiledModel { std::optional replaced_by; - // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure - // w.r.t. function calls Subgraph::Gather host_gather; + struct Spatial { + struct Param { + std::size_t idx; + std::size_t dim; + }; + std::vector params; + std::size_t range = 0u; + std::size_t nway = 0u; + std::size_t out_dim = 0u; + + std::size_t nway_iters = 0u; + std::size_t tail_size = 0u; + }; + std::optional spatial; + // FIXME: This is a 1:1 copy of the ov::npuw::Subgraph structure + // w.r.t. 
function calls std::size_t param_base = 0; std::vector closure; std::vector scales; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 6638fbcbe12a57..9ad7016f3efb69 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -5,6 +5,7 @@ #include "just_sync_infer_request.hpp" #include +#include #include #include #include @@ -29,6 +30,8 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_compiled_submodels[real_idx].compiled_model; - for (size_t out_idx = 0; out_idx < proto_comp_model->outputs().size(); out_idx++) { + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + auto& proto_comp_model = proto_comp_model_desc.compiled_model; + + const auto num_outputs = proto_comp_model->outputs().size(); + + // Initialize the spatial IO placeholders, if required + if (proto_comp_model_desc.spatial) { + m_spatial_io[real_idx].inputs.resize(proto_comp_model_desc.param_base); + m_spatial_io[real_idx].input_tails.resize(proto_comp_model_desc.param_base); + m_spatial_io[real_idx].outputs.resize(num_outputs); + m_spatial_io[real_idx].output_tails.resize(num_outputs); + + if (proto_comp_model_desc.spatial->tail_size) { + // Preallocate extra buffers for tail processing + // Note: these buffers are allocated to the entire NWAY (> tail_size) + for (auto&& p : proto_comp_model_desc.spatial->params) { + const auto& iport = proto_comp_model_desc.compiled_model->inputs()[p.idx]; + m_spatial_io[real_idx].input_tails[p.idx] = + ov::get_tensor_impl(ov::Tensor(iport.get_element_type(), iport.get_shape())); + } + const auto num_outs = proto_comp_model_desc.compiled_model->outputs().size(); + for (std::size_t out_idx = 0u; out_idx < num_outs; out_idx++) { + const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; + m_spatial_io[real_idx].output_tails[out_idx] = + ov::get_tensor_impl(ov::Tensor(oport.get_element_type(), oport.get_shape())); + } + } + } // if(spatial) + + for (size_t out_idx = 0; out_idx < num_outputs; out_idx++) { const auto& port = proto_comp_model->outputs()[out_idx]; + ov::Shape shape = port.get_shape(); + + // If the subgraph is spatial, promote the output size to the full vector size + if (proto_comp_model_desc.spatial) { + shape[proto_comp_model_desc.spatial->out_dim] = proto_comp_model_desc.spatial->range; + } m_funcall_result[LinkFrom{i, out_idx}] = - ov::get_tensor_impl(ov::Tensor(port.get_element_type(), port.get_shape())); + ov::get_tensor_impl(ov::Tensor(port.get_element_type(), shape)); } if (real_idx != i) { // If this function call is NOT the function body, do nothing here - the original @@ -147,7 +184,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrm_inputs_to_submodels_inputs.at(i); if (to_submodel != CompiledModel::NO_LINK) { std::size_t sub_idx{}, in_idx{}; - std::tie(sub_idx, in_idx) = m_npuw_model->m_inputs_to_submodels_inputs.at(i); + std::tie(sub_idx, in_idx) = to_submodel; m_subrequests_gio.at(sub_idx).global_params[i] = in_idx; } } // for(inputs) @@ -336,6 +373,9 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) { const bool do_copy = needs_copy(idx); const auto& iodesc = m_subrequests_gio.at(idx); + const auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const bool is_spatial = proto_comp_model_desc.spatial.has_value(); + // a list of ports to 
copy tensors, if needed: FROM -> TO std::vector, ov::Output>> copy_list; @@ -356,21 +396,39 @@ void ov::npuw::JustInferRequest::bind_global_parameters(std::size_t idx) { return m_subrequests[real_idx]; }(); + // Check if the given subgraph's input is spatial + auto is_spatial_param = [&](std::size_t sub_in_idx) -> bool { + if (!is_spatial) { + return false; // Early return + } + auto& spatial = proto_comp_model_desc.spatial.value(); + return std::any_of(spatial.params.begin(), spatial.params.end(), [&](const auto& p) -> bool { + return p.idx == sub_in_idx; + }); + }; + for (auto&& it : iodesc.global_params) { std::size_t param_idx{}, sub_in_idx{}; std::tie(param_idx, sub_in_idx) = it; LOG_DEBUG("Processing " << param_idx << " -> " << sub_in_idx << std::endl); + const auto& g_port = m_npuw_model->inputs()[param_idx]; const auto& g_tnsr = m_port_to_tensor.at(g_port).tensor; const auto& s_port = subr->get_inputs()[sub_in_idx]; LOG_DEBUG("Processing " << g_port << " -> " << s_port << "..."); LOG_BLOCK(); - if (do_copy) { - LOG_DEBUG("Will be copied"); - copy_list.emplace_back(g_tnsr, s_port); + if (!is_spatial_param(sub_in_idx)) { + // Input parameter is non-spatial, do normal handling + if (do_copy) { + LOG_DEBUG("Will be copied"); + copy_list.emplace_back(g_tnsr, s_port); + } else { + LOG_DEBUG("Will be set"); + subr->set_tensor(s_port, g_tnsr); + } } else { - LOG_DEBUG("Will be set"); - subr->set_tensor(s_port, g_tnsr); + // Register for future use + m_spatial_io[real_idx].inputs.at(sub_in_idx) = g_tnsr; } } @@ -398,11 +456,10 @@ void ov::npuw::JustInferRequest::bind_global_results(std::size_t idx) { LOG_BLOCK(); auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - const auto real_idx = comp_model_desc.replaced_by.value_or(idx); - if (real_idx != idx) { - // Don't do here - function call will take parameter - // itself. Note it may be implemented more efficently - // than now (and in some cases, parameter can be pre-set) + if (comp_model_desc.replaced_by) { + // Don't do here - function call will take the right tensor + // itself. Note it may be implemented more efficently than now + // (and in some cases, the tensor can be pre-set) LOG_DEBUG("Skipping this too now - function will do it for itself"); return; } @@ -429,6 +486,8 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { const auto real_idx = comp_model_desc.replaced_by.value(); auto& func_desc = m_npuw_model->m_compiled_submodels[real_idx]; + const bool is_spatial = func_desc.spatial.has_value(); + // Function call prologue: // 1. Walk through function dependencies and set the respective tensors // as parameters @@ -446,11 +505,25 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { if (!m_npuw_model->m_compiled_submodels[prod_idx].replaced_by) { // Producer is a normal model -> take its tensor directly const auto& oport = m_npuw_model->m_compiled_submodels[prod_idx].compiled_model->outputs()[prod_port]; - m_subrequests[real_idx]->set_tensor(iport, m_subrequests[prod_idx]->get_tensor(oport)); + auto i_tensor = m_subrequests[prod_idx]->get_tensor(oport); + if (!is_spatial) { + // Non-spatial case - set immediately + m_subrequests[real_idx]->set_tensor(iport, i_tensor); + } else { + // Spatial case - defer + m_spatial_io[real_idx].inputs.at(i) = i_tensor; + } } else { // Producer is a function - maybe the same as we're calling now. 
// Take its tensor from the storage - m_subrequests[real_idx]->set_tensor(iport, m_funcall_result.at({prod_idx, prod_port})); + auto i_tensor = m_funcall_result.at({prod_idx, prod_port}); + if (!is_spatial) { + // Non-spatial case - again, set immediately + m_subrequests[real_idx]->set_tensor(iport, m_funcall_result.at({prod_idx, prod_port})); + } else { + // Spatial case - defer + m_spatial_io[real_idx].inputs.at(i) = i_tensor; + } } } } // for(param_base) @@ -472,7 +545,14 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { for (std::size_t i = 0; i < func_desc.compiled_model->outputs().size(); i++) { LOG_DEBUG("Binding result[" << i << "]..."); auto& oport = func_desc.compiled_model->outputs()[i]; - m_subrequests[real_idx]->set_tensor(oport, m_funcall_result.at({idx, i})); + auto o_tensor = m_funcall_result.at({idx, i}); + if (!is_spatial) { + // Non-spatial case - set immediately + m_subrequests[real_idx]->set_tensor(oport, o_tensor); + } else { + // Spatial case - defer + m_spatial_io[real_idx].outputs.at(i) = o_tensor; + } } LOG_DEBUG("Done"); } @@ -645,19 +725,133 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo } } -namespace { -template -void during(R&& r, F&& f) { - r->start_async(); - f(); // expect noexcept - r->wait(); +void ov::npuw::JustInferRequest::unsafe_during(std::size_t real_idx, const std::function& f) { + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (!comp_model_desc.spatial) { + // Non-spatial execution: trigger request asynchronously, run `f` in this context + auto& r = m_subrequests[real_idx]; + r->start_async(); + f(); // expect noexcept + r->wait(); + } else { + // Spatial execution... Do the opposite - run f asynchronously, and meanwhile run the + // spatial inference + auto future = std::async(std::launch::async, f); + unsafe_infer(real_idx); + future.wait(); + } +} + +void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + auto& r = m_subrequests[real_idx]; + if (!comp_model_desc.spatial) { + // Run normally + r->infer(); + } else { + // Run over the specified range... Note: the full inputs/outputs + // must be prepared in the m_spatial_io at this point + const auto& spatial = comp_model_desc.spatial.value(); + const auto num_outputs = comp_model_desc.compiled_model->outputs().size(); + + // Create a sparse vector with full input sizes. + // For the access simplicity, its size is aligned with function's + // number of input parameters (activations) so some slots may be + // not used here. + // FIXME: All these preparations could be done statically (just once) + std::vector full_in_shapes(comp_model_desc.param_base); + for (auto&& param : spatial.params) { + full_in_shapes[param.idx] = m_spatial_io[real_idx].inputs.at(param.idx)->get_shape(); + } + + // Now handle the range, even if it is not a multiply of nway (slice): + // + // |<- - - - full range - - - ->| + // +------+------+------+------+-+ + // | nway | nway | nway | nway | | + // +------+------+------+------+-+ + // ^tail + // The block is always compiled to produce nway. If we need a smaller tensor + // on the last iteration, the sub-nway will be copied from the input range to + // a temporary tensor, and then the sub-nwway range will be copied from the + // request's output range. 
+ + std::size_t offset = 0u; + for (std::size_t i = 0u; i < spatial.nway_iters; i++, offset += spatial.nway) { + // Collect spatial inputs for this offset + for (auto&& param : spatial.params) { + const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; + r->set_tensor( + iport, + ov::npuw::util::view(m_spatial_io[real_idx].inputs.at(param.idx), param.dim, offset, spatial.nway)); + } // for(params) + + // Now set the spatial outputs + for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { + const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + r->set_tensor(oport, + ov::npuw::util::view(m_spatial_io[real_idx].outputs.at(out_idx), + spatial.out_dim, + offset, + spatial.nway)); + } // for(outputs) + + // Now run the part + r->infer(); + } // for(full_nway_times) + + // Now process the tail, if required + if (spatial.tail_size) { + // Copy the sub-ranges to spatial inputs + // NOTE: tails buffers are read from/written to at 0th offset! + for (auto&& param : spatial.params) { + auto in_view = ov::npuw::util::view(m_spatial_io[real_idx].inputs.at(param.idx), + param.dim, + offset, + spatial.tail_size); + + const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; + auto out_view = ov::npuw::util::view(m_spatial_io[real_idx].input_tails.at(param.idx), + param.dim, + 0, + spatial.tail_size); + + in_view->copy_to(out_view._ptr); + r->set_tensor(iport, m_spatial_io[real_idx].input_tails.at(param.idx)); + } // for(params) + + // Now set the tail tensors + for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { + const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + r->set_tensor(oport, m_spatial_io[real_idx].output_tails.at(out_idx)); + } // for(outputs) + + // Now run the tail infer + r->infer(); + + // Now copy the views from the output full-nway tensor to the output tensors + for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { + const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + auto spatial_tensor_shape = oport.get_shape(); + + auto in_view = ov::npuw::util::view(m_spatial_io[real_idx].output_tails.at(out_idx), + spatial.out_dim, + 0, + spatial.tail_size); + + auto out_view = ov::npuw::util::view(m_spatial_io[real_idx].outputs.at(out_idx), + spatial.out_dim, + offset, + spatial.tail_size); + in_view->copy_to(out_view._ptr); + } // for(outputs) + } + } } -} // namespace void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared) { auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; auto real_idx = comp_model_desc.replaced_by.value_or(idx); - auto& this_subr = m_subrequests[real_idx]; const std::size_t next_idx = next(idx + 1); if (comp_model_desc.replaced_by) { @@ -669,7 +863,7 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool if (m_use_function_pipelining) { // function pipelining is here! and the next rq is ours. NPUW_ASSERT(m_funcall_pipeline[idx].next.value() == next_idx); - during(this_subr, [&]() { + unsafe_during(real_idx, [&]() { LOG_DEBUG("Unpacking closures for the NEXT subrequest[" << next_idx << "]..."); LOG_BLOCK(); // Note: do it here unconditionally - if this request fails, @@ -680,7 +874,7 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool } else { // Function pipelining is not used. THIS infer request // is also the NEXT one. 
Nothing much to do here - this_subr->infer(); + unsafe_infer(real_idx); bind_global_parameters(next_idx); } } else { @@ -690,9 +884,9 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool if (next_idx == 0) { // Note: even if m_function_pipelining is ON, // SWAP won't happen here - see the below check for .next - this_subr->infer(); + unsafe_infer(real_idx); } else { - during(this_subr, [&]() { + unsafe_during(real_idx, [&]() { if (!next_prepared) { bind_global_parameters(next_idx); next_prepared = true; @@ -710,9 +904,9 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool // This is a regular subgraph. Start it async to prepare the next // parameters if (next_idx == 0) { - this_subr->infer(); + unsafe_infer(real_idx); } else { - during(this_subr, [&]() { + unsafe_during(real_idx, [&]() { if (!next_prepared) { bind_global_parameters(next_idx); next_prepared = true; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index e63f2f18b85ece..bb75eb69d0eb0a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -52,6 +52,8 @@ class JustInferRequest final : public IBaseInferRequest { void function_prologue(std::size_t idx); void unpack_closure(std::size_t idx, RqPtr request); + void unsafe_during(std::size_t real_idx, const std::function& f); + void unsafe_infer(std::size_t real_idx); void unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared_p); void connect_subrequests(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/logging.cpp b/src/plugins/intel_npu/src/plugin/npuw/logging.cpp index 15f0e8cb504c21..3c591e3154d8fd 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/logging.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/logging.cpp @@ -9,6 +9,8 @@ #include #include +#include "openvino/runtime/make_tensor.hpp" // get_tensor_impl + namespace { #ifdef NPU_PLUGIN_DEVELOPER_BUILD const char* get_env(const std::vector& list_to_try) { @@ -61,11 +63,17 @@ int ov::npuw::__logging_indent__::__level__() { return this_indent; } -void ov::npuw::dump_tensor(const ov::SoPtr& tensor, const std::string& base_path) { - if (!tensor->is_continuous()) { - LOG_ERROR("Failed to dump blob " << base_path << ": it is not continuous"); - return; +void ov::npuw::dump_tensor(const ov::SoPtr& input, const std::string& base_path) { + ov::SoPtr tensor; + + if (input->is_continuous()) { + tensor = input; + } else { + // Create temporary tensor and copy data in. 
Dumping is never fast, anyway + tensor = ov::get_tensor_impl(ov::Tensor(input->get_element_type(), input->get_shape())); + input->copy_to(tensor._ptr); } + NPUW_ASSERT(tensor); const auto bin_path = base_path + ".bin"; { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp index 6a9cf017fded81..4b8973b5bb94ae 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/compiler.cpp @@ -20,13 +20,23 @@ namespace npuw { namespace online { namespace detail { + +namespace { +static const std::map ISOL_PRESETS = {{"COMPUTE", + "P:DQMatMulGQu4/compute,P:DQMatMulCWu4/compute," + "P:DQMatMulGQi4/compute,P:DQMatMulCWi4/compute," + "P:VocabMatMul/compute," + "P:RMSNorm/compute"}}; +} + // For missing declaration warning +// FIXME: Instead, one should use namespace{} size_t getMinGraphSize(::intel_npu::Config& cfg); size_t getMinRepBlocks(::intel_npu::Config& cfg); size_t getMinRepBlockSize(::intel_npu::Config& cfg); std::vector getAvoids(::intel_npu::Config& cfg); std::vector getIsolates(::intel_npu::Config& cfg); -std::vector getIsolates(const std::string isolates_unparsed); +std::vector getIsolates(const std::string& isolates_unparsed); std::vector getNoFolds(::intel_npu::Config& cfg); std::vector getNoFolds(const std::string& nofolds_unparsed); // Set default predefined values for COMPUTE pipeline @@ -108,13 +118,18 @@ std::vector getIsolates(::intel_npu::Config& cfg) { return getIsolates(cfg.getString<::intel_npu::NPUW_ONLINE_ISOLATE>()); } -std::vector getIsolates(const std::string isolates_unparsed) { +std::vector getIsolates(const std::string& isolates_unparsed) { if (isolates_unparsed.empty()) { return {}; } std::vector isolates; - std::string s = std::move(isolates_unparsed); + std::string s = isolates_unparsed; + + auto preset_iter = ISOL_PRESETS.find(s); + if (preset_iter != ISOL_PRESETS.end()) { + s = preset_iter->second; + } size_t pos = 0; size_t start = 0; @@ -191,8 +206,7 @@ std::vector getNoFolds(const std::string& nofolds_unparsed) { void setComputeConfig(PassContext& ctx) { // FIXME: initialize via a dedicated function instead of parsing - ctx.isolates = detail::getIsolates("P:DQMatMulGQu4/compute,P:DQMatMulCWu4/compute,P:DQMatMulGQi4/" - "compute,P:DQMatMulCWi4/compute,P:RMSNorm/compute"); + ctx.isolates = detail::getIsolates(ISOL_PRESETS.at("COMPUTE")); ctx.nofolds = detail::getNoFolds("compute"); } @@ -219,6 +233,9 @@ void dump_partitioning(const ov::npuw::Ensemble& ens, const std::string& to) { if (!group.avoid_list.empty()) { gr.append_attribute("avoid") = group.avoid_list.data(); } + if (!group.tag.empty()) { + gr.append_attribute("tag") = group.tag.data(); + } // Note: Ensemble also add "id" attribute but it's not used by the plugin for (const auto& input : group.input_layers) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp index d09f0b8a7100d3..cfcce1725433db 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/group.cpp @@ -112,6 +112,8 @@ ov::npuw::Group Group::toGroup() const { } } + g.tag = m_isol_tag; + return g; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp index 
c621dbafd3dadb..82856cece3de40 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp @@ -418,10 +418,11 @@ void Snapshot::earlyRegroup() { } else if (isolate.pattern == "DQMatMulGQi4") { rewr.add_matcher(shared_from_this(), isolate.tag); handle_patterns = true; + } else if (isolate.pattern == "VocabMatMul") { + rewr.add_matcher(shared_from_this(), isolate.tag); + handle_patterns = true; } else { - LOG_WARN("OPENVINO_NPUW_ISOLATE only supports RMSNorm, DQMatMulCWu4, DQMatMulGQu4, DQMatMulCWi4, " - "DQMatMulGQi4 " - << "as patterns. Isolate pattern " << isolate.pattern << " is skipped!"); + LOG_WARN("OPENVINO_NPUW_ISOLATE: unsupported pattern " << isolate.pattern << " is skipped!"); } } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 192d975509ce5e..4ebfcc1809219c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -140,6 +140,7 @@ ov::npuw::Ensemble load_groups(const std::shared_ptr& model, const st this_group.gflops = get_float_attr(group, "gflops"); this_group.repeated_id = get_str_attr(group, "repeated", ""); this_group.avoid_list = get_str_attr(group, "avoid", ""); + this_group.tag = get_str_attr(group, "tag", ""); FOREACH_CHILD(input, group, "input") { this_group.input_layers.push_back(get_str_attr(input, "name")); } @@ -226,6 +227,10 @@ class Partitioner { void createFunction(FunctionPipeline& func_ggg); + // NB(dm): This method should get a better place, it is here only because + // it is tied to the Function structure (but, in fact, not so much) + void identifySpatialRange(ov::npuw::Function& f); + template void rearrange_to_function_protocol(ov::npuw::Subgraph::Ref func_ref, const std::vector& protocol, @@ -308,6 +313,7 @@ class Partitioner { void matchResults(const std::string& func_name); void createFunction(const std::string& func_name); void matchRepeatedSubgraphs(const std::string& func_name); + void spatial(const std::string& func_name); void optimize(const std::string& func_name); void decompressionCutOff(const std::string& func_name); @@ -360,6 +366,7 @@ void Partitioner::identifySubgraphs() { P.total_ops += group.sg._ops; group.sg._avoid_list = group.avoid_list; + group.sg._tag = group.tag; // Note inputs and outputs are included in the above set, so if // we are here, those nodes should be present in the model. @@ -1455,6 +1462,7 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) { ov::npuw::Function function; function._model = func_ggg.mdls.front(); function._param_offset = body_sg._parameters.size(); + function._tag = body_sg._tag; std::size_t new_param_idx = function._param_offset; for (auto&& node_ptr : function._model->get_ordered_ops()) { @@ -1516,6 +1524,76 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) { LOG_VERB("Done: " << func_name); } +void Partitioner::identifySpatialRange(ov::npuw::Function& f) { + NPUW_ASSERT(f._tag == "compute"); + + // NB: The current logic must be changed. Here we assume we only + // apply this change to "compute" subgraphs which we identify + // based on well-known patterns. This won't work in the generic case. 
+ + // The current logic is the following: + // - Assume the function results are ALL SPATIAL (and this alone + // is a very strong assumption) + // - Identify their SPATIAL dimension (which is dim[1] because + // we know how COMPUTE subgraphs are organized) + // - Walk over the parameters (up to _param_offset), find + // spatial Parameters based on the dim we're looking at + // - Report the findings. + // Hence, the logic is not robust enough and should be generalized + // in the future. + + // First, check our assumption on the function results + const auto& f_results = f._model->get_results(); + NPUW_ASSERT(f_results.size() > 0); + + const auto& f_result_0 = f_results.front(); + const auto& f_result_0_shape = f_result_0->get_shape(); + + if (f_result_0_shape.size() != 3) { + return; // NB: this is the only case we enable now + } + + if (f_result_0_shape[1] <= 1) { + return; // NB: this is the only spatial dim we enable now + } + + for (auto&& f_result_i : f_results) { + // Yes, it will also compare r[0] vs r[0] + const auto& f_result_i_shape = f_result_i->get_shape(); + if (f_result_0_shape.size() != f_result_i_shape.size()) { + return; // Do nothing + } + + if (f_result_0_shape[1] != f_result_i_shape[1]) { + return; // Do nothing + } + } + + // Now, find the parameters with the same spatial dim + // NB: again, this is a very weak feature to look for + const auto& f_params = f._model->get_parameters(); + NPUW_ASSERT(f_params.size() > 0); + + using S = ov::npuw::Function::Spatial; + S spatial; + spatial._range = f_result_0_shape[1]; + spatial._out_dim = 1; // the only case we're looking into now + + for (std::size_t i = 0u; i < f._param_offset; i++) { + const auto& f_param = f_params[i]; + const auto& f_param_dims = f_param->get_shape(); + + auto spatial_dim_iter = std::find(f_param_dims.begin(), f_param_dims.end(), spatial._range); + if (spatial_dim_iter != f_param_dims.end()) { + std::size_t spatial_dim_idx = std::distance(f_param_dims.begin(), spatial_dim_iter); + spatial._inputs.push_back(S::Param{f_param, spatial_dim_idx}); + } + } + + // Apply the spatial change + f._spatial = std::move(spatial); +} + void Partitioner::createFunction(const std::string& func_name) { createFunction(all_functions.at(func_name)); } @@ -1594,6 +1672,50 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) { LOG_VERB("Done"); } +void Partitioner::spatial(const std::string& func_name) { + ov::npuw::Function& f = P.functions.at(func_name); + + // Identify the spatial dimension for this function + // Works only for Compute case. + // FIXME: Replace this string identification with smt better + if (!cfg.get<::intel_npu::NPUW_SPATIAL>() || f._tag != "compute") { + LOG_VERB("No spatial optimizations will be done to " << func_name << " in model " << model->get_friendly_name() + << "..."); + return; + } + + LOG_VERB("Turn " << func_name << " into spatial execution in model " << model->get_friendly_name() << "..."); + LOG_BLOCK(); + + identifySpatialRange(f); + if (!f._spatial) { + LOG_WARN("No spatial ranges identified in the COMPUTE block, expect a higher compile time"); + return; + } + + LOG_VERB("Spatial range: " << f._spatial->_range); + + // Final check before transformations + f._spatial->_slice = cfg.get<::intel_npu::NPUW_SPATIAL_NWAY>(); + if (f._spatial->_slice == 0) { + LOG_WARN("NWAY is set to 0, disabling it (but better disable SPATIAL setting itself)"); + f._spatial.reset(); // Erase spatial information to avoid conflicts + return; + } + + // Apply transformation to the model. 
Note: only function body is modified + // Accumulate the reshape map + std::map, ov::PartialShape> new_shapes; + for (auto&& p : f._spatial->_inputs) { + ov::Shape shape = p.param->get_shape(); + shape[p.dim] = f._spatial->_slice; + new_shapes[p.param->output(0)] = shape; + } + f._model->reshape(new_shapes); + + LOG_VERB("Done"); +} + void Partitioner::optimize(const std::string& func_name) { ov::npuw::Function& f = P.functions.at(func_name); auto& func_group = all_functions.at(func_name); @@ -1622,6 +1744,7 @@ void Partitioner::optimize(const std::string& func_name) { // Regardless of DQ setting, run this first { ov::npuw::patterns::opt::Context ctx; + ctx.is_spatial = f._spatial.has_value(); ctx.pmm_dims = cfg.get<::intel_npu::NPUW_PMM>(); // Run Head/Tail passes @@ -1768,6 +1891,8 @@ void Partitioner::optimize(const std::string& func_name) { // Run "dynamic quantization" ov::npuw::patterns::opt::Context ctx; + ctx.is_spatial = f._spatial.has_value(); + ov::pass::GraphRewrite rewr; rewr.add_matcher(); rewr.add_matcher(std::ref(ctx)); @@ -2052,6 +2177,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr, std::size_t> _param_mapping; + + // Spatial information. So far assume spatial execution in 1 dimension only + struct Spatial { + using PPtr = std::shared_ptr; + struct Param { + PPtr param; + std::size_t dim; + }; + std::size_t _range = 0u; // Range over which spatial execution is organized, e.g. 1024 + std::size_t _slice = 0u; // A submission size for a single execution, e.g. 128 + std::size_t _out_dim = 0u; // Assume it is the same dim for all Results + std::vector _inputs; + }; + using SpatialOpt = std::optional; + SpatialOpt _spatial; }; struct Group { @@ -71,6 +89,7 @@ struct Group { float gflops; std::string avoid_list; + std::string tag; ov::npuw::Subgraph sg; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp index e7f09b00cde2a2..b082d67037db7d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp @@ -9,6 +9,7 @@ #include "../online/snapshot.hpp" // online::Snapshot #include "openvino/op/ops.hpp" #include "openvino/pass/pattern/op/label.hpp" // any_input +#include "openvino/pass/pattern/op/optional.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" #include "openvino/util/common_util.hpp" @@ -106,8 +107,10 @@ DQMatMulCWu4::DQMatMulCWu4(const std::shared_ptr& sn auto matched_qzerop = std::static_pointer_cast(matched_node_qzerop); auto matched_qcoeff = std::static_pointer_cast(matched_node_qcoeff); - if (ov::element::u4 == matched_qweight->get_element_type() && - ov::element::u4 == matched_qzerop->get_element_type() && + if ((ov::element::u4 == matched_qweight->get_element_type() || + ov::element::u8 == matched_qweight->get_element_type()) && + (ov::element::u4 == matched_qzerop->get_element_type() || + ov::element::u8 == matched_qzerop->get_element_type()) && ov::element::f16 == matched_qcoeff->get_element_type()) { // Partitioning ignores Const->Convert nodes, so qcvtw and qcvtz are not used auto matched_qsubz = node_to_output.at(qsubz).get_node_shared_ptr(); @@ -135,7 +138,7 @@ DQMatMulGQi4::DQMatMulGQi4(const std::shared_ptr& sn auto qmuls = opp::wrap_type({qcvtw, qcoeff}); auto qreshp = opp::wrap_type({qmuls, opp::any_input()}); - auto qcvtr = opp::wrap_type({qreshp}); + auto qcvtr = opp::optional({qreshp->output(0)}); auto qmm = 
opp::wrap_type({opp::any_input(), qcvtr}); auto node_to_gptr = snapshot->getNodeToGroupMap(); @@ -155,17 +158,22 @@ DQMatMulGQi4::DQMatMulGQi4(const std::shared_ptr& sn if ((ov::element::i4 == matched_qweight->get_element_type() || ov::element::i8 == matched_qweight->get_element_type()) && - ov::element::f16 == matched_qcoeff->get_element_type()) { + (ov::element::f16 == matched_qcoeff->get_element_type() || + ov::element::f32 == matched_qcoeff->get_element_type())) { // Partitioning ignores Const->Convert nodes, so qcvtw is not used auto matched_qmuls = node_to_output.at(qmuls).get_node_shared_ptr(); auto matched_qreshp = node_to_output.at(qreshp).get_node_shared_ptr(); - auto matched_qcvtr = node_to_output.at(qcvtr).get_node_shared_ptr(); auto matched_qmm = node_to_output.at(qmm).get_node_shared_ptr(); node_to_gptr->at(matched_qmuls)->isolate(isol_tag); node_to_gptr->at(matched_qreshp)->isolate(isol_tag); - node_to_gptr->at(matched_qcvtr)->isolate(isol_tag); node_to_gptr->at(matched_qmm)->isolate(isol_tag); + + auto qcvtr_iter = node_to_output.find(qcvtr); + if (qcvtr_iter != node_to_output.end()) { + auto matched_qcvtr = qcvtr_iter->second.get_node_shared_ptr(); + node_to_gptr->at(matched_qcvtr)->isolate(isol_tag); + } } return false; // root hasn't changed @@ -218,6 +226,67 @@ DQMatMulCWi4::DQMatMulCWi4(const std::shared_ptr& sn register_matcher(std::make_shared(qmm, "TagDQMatMulCWi4"), std::move(callback)); } +// This is a case for Raw (f16/f32) MatMul connected directly to the Result. +// +// The following combinations are covered: +// +// act(f32) -> MatMul(f32) -> Result +// weight(f32) -> +// +// act(f16) -> MatMul(f16) -> to_f32 -> Result +// weight(f16) -> +// +// act(f32) -> to_f16 -> MatMul -> to_f32 -> Result +// weight(f16) -----------> +// +// act(f32) -----------> MatMul -> Result +// weight(f16) -- to_f32--> + +VocabMatMul::VocabMatMul(const std::shared_ptr& snapshot, const std::string& isol_tag) { + auto act_in = opp::any_input(); + auto weight = opp::wrap_type(); + + auto ocvta = opp::optional({act_in->output(0)}); + auto ocvtw = opp::optional({weight->output(0)}); + + auto mm = opp::wrap_type({ocvta, ocvtw}); + auto ocvtm = opp::optional({mm->output(0)}); + + auto res = opp::wrap_type({ocvtm}); + + auto node_to_gptr = snapshot->getNodeToGroupMap(); + + // Note: Use [=] to make sure the above objects stay alive in the callback + auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + auto matched_out_a = node_to_output.at(act_in).get_node_shared_ptr(); + auto matched_out_w = node_to_output.at(weight).get_node_shared_ptr(); + + auto a_type = matched_out_a->get_element_type(); + auto w_type = matched_out_w->get_element_type(); + + if ((a_type == ov::element::f16 || a_type == ov::element::f32) && + (w_type == ov::element::f16 || w_type == ov::element::f32)) { + node_to_gptr->at(node_to_output.at(mm).get_node_shared_ptr())->isolate(isol_tag); + + auto isol_if = [=, &node_to_gptr, &node_to_output](std::shared_ptr n) { + auto iter = node_to_output.find(n); + if (iter != node_to_output.end()) { + auto group_iter = node_to_gptr->find(iter->second.get_node_shared_ptr()); + if (group_iter != node_to_gptr->end()) { + group_iter->second->isolate(isol_tag); + } + } + }; + isol_if(ocvta); + isol_if(ocvtw); + isol_if(ocvtm); + } + return false; + }; + register_matcher(std::make_shared(res, "TagVocabMatMul"), std::move(callback)); +} + // TODO: visualize RMSNorm::RMSNorm(const std::shared_ptr& snapshot, const std::string& isol_tag) { 
auto hadd = opp::wrap_type({opp::any_input(), opp::any_input()}); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.hpp index 92e60cb95fbdbe..faa2fe3f0f9578 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.hpp @@ -41,6 +41,11 @@ class DQMatMulCWi4 : public ov::pass::MatcherPass { DQMatMulCWi4(const std::shared_ptr& snapshot, const std::string& isol_tag); }; +class VocabMatMul : public ov::pass::MatcherPass { +public: + VocabMatMul(const std::shared_ptr& snapshot, const std::string& isol_tag); +}; + class RMSNorm : public ov::pass::MatcherPass { public: RMSNorm(const std::shared_ptr& snapshot, const std::string& isol_tag); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 4ec72e02260884..d987023d0040e8 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -687,8 +687,13 @@ DQParMMGQ::DQParMMGQ(Context::Ref ctx) { auto qmmi_shape = node_to_output.at(qmm).get_shape(); - if (qmmi_shape.size() != 3 || qmmi_shape[0] != 1 || qmmi_shape[1] != 1) { - // Limit token to 1-token shapes only (prefill requires its own tranformation) + if (qmmi_shape.size() != 3 || qmmi_shape[0] != 1) { + // Not handling such cases + return false; + } + + if (qmmi_shape[1] != 1 && !ctx.get().is_spatial) { + // For non 1-token cases, do transformation if and only if and only if the block is spatial return false; } @@ -709,9 +714,12 @@ void mergeParallelMatMuls(const std::shared_ptr& m, Context& ctx) { continue; } ov::Output orig_multiply; + std::size_t axis_to_concat = -1; std::tie(orig_multiply, axis_to_concat) = mul_to_mms.first; + const ov::Shape orig_act_shape = orig_multiply.get_shape(); + if (!util::is_set(axis_to_concat, ctx.pmm_dims)) { LOG_VERB("Parallel MatMuls found, but fusion over dim " << axis_to_concat << " is not enabled"); continue; @@ -773,10 +781,10 @@ void mergeParallelMatMuls(const std::shared_ptr& m, Context& ctx) { auto this_orig_wshape = parallel_matmuls[i].w->get_shape(); auto this_slice_start = std::make_shared(ov::element::i32, ov::Shape{3}, S{0, 0, offset}); - auto this_slice_end = - std::make_shared(ov::element::i32, - ov::Shape{3}, - S{1, 1, offset + this_orig_wshape[axis_to_concat]}); + auto this_slice_end = std::make_shared( + ov::element::i32, + ov::Shape{3}, + S{1, orig_act_shape[1], offset + this_orig_wshape[axis_to_concat]}); auto this_slice_step = std::make_shared(ov::element::i32, ov::Shape{3}, S{1, 1, 1}); auto this_slice = std::make_shared(new_mm, this_slice_start, this_slice_end, this_slice_step); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp index 530d0a52cc515f..b649f6a136c2e7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp @@ -26,6 +26,7 @@ class DQMatMulCWi : public ov::pass::MatcherPass { struct Context { std::string pmm_dims; + bool is_spatial = false; using PPtr = std::shared_ptr; using NPtr = std::shared_ptr; diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index 
d83a521fb29496..ebbed29893583c 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -17,6 +17,7 @@ #include "openvino/op/constant.hpp" #include "openvino/op/transpose.hpp" #include "openvino/op/util/op_types.hpp" +#include "openvino/runtime/make_tensor.hpp" // get_tensor_impl #ifdef UNPACK_PROFILING # include "tbb/concurrent_unordered_map.h" @@ -1562,6 +1563,45 @@ void ov::npuw::util::gather(const ov::SoPtr& src, } } +ov::SoPtr ov::npuw::util::view(const ov::SoPtr& src, + const ov::npuw::util::View& from, + const ov::npuw::util::View& to) { + const auto type = src->get_element_type(); + NPUW_ASSERT(from.size() == to.size()); + + // Sub-byte views are not supported here + NPUW_ASSERT(type != ov::element::u4 && type != ov::element::i4); + + const auto num_dims = from.size(); + ov::Shape view_shape; + for (auto d = 0u; d < num_dims; d++) { + view_shape.push_back(to[d] - from[d]); + } + + const auto strides = src->get_strides(); + uint8_t* ptr = static_cast(src->data()); + + // Shift PTR according to the strides + for (auto d = 0u; d < num_dims; d++) { + ptr += strides[d] * from[d]; + } + + ov::Tensor viewt(type, view_shape, ptr, strides); + return ov::get_tensor_impl(viewt); +} + +ov::SoPtr ov::npuw::util::view(const ov::SoPtr& src, + std::size_t dim, + std::size_t offset, + std::size_t len) { + const auto shape = src->get_shape(); + View view_start = View(shape.size(), 0u); + View view_end = shape; + view_start[dim] = offset; + view_end[dim] = offset + len; + return ov::npuw::util::view(src, view_start, view_end); +} + template void to_f32(const ov::Tensor& in, ov::Tensor& out) { NPUW_ASSERT(in.is_continuous()); diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp index 6012ce0e587352..689bf8571ddb8d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp @@ -52,6 +52,11 @@ void unpack(const ov::SoPtr& from, void gather(const ov::SoPtr& src, const ov::SoPtr& idx, const ov::SoPtr& dst); +using View = std::vector; +ov::SoPtr view(const ov::SoPtr& src, const View& from, const View& to); + +ov::SoPtr view(const ov::SoPtr& src, std::size_t dim, std::size_t offset, std::size_t len); + void to_f32(const ov::Tensor& in, ov::Tensor& out); void to_f16(ov::Tensor& t); void transpose(ov::Tensor& t);
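
Note (reviewer illustration, not part of the patch): below is a minimal standalone sketch of the tiling arithmetic the spatial execution path above relies on — `range / nway` full submissions plus a `range % nway` tail, which the patch routes through the preallocated input/output tail buffers. The helper name `for_each_tile` and its callback are assumptions made purely for illustration.

#include <cstddef>
#include <functional>
#include <iostream>

// Walk a spatial range in fixed-size submissions, mirroring the
// nway_iters / tail_size split computed for CompiledModelDesc::Spatial:
// the range is covered by full NWAY tiles, and the remainder (if any)
// is handled as a separate, shorter "tail" submission.
static void for_each_tile(std::size_t range,
                          std::size_t nway,
                          const std::function<void(std::size_t, std::size_t)>& run) {
    const std::size_t nway_iters = range / nway;  // number of full tiles
    const std::size_t tail_size = range % nway;   // leftover elements

    std::size_t offset = 0u;
    for (std::size_t i = 0u; i < nway_iters; i++, offset += nway) {
        run(offset, nway);  // full NWAY submission
    }
    if (tail_size) {
        run(offset, tail_size);  // tail submission (padded to NWAY via the
                                 // preallocated tail buffers in the patch)
    }
}

int main() {
    // E.g. a spatial range of 1000 with NPUW_SPATIAL_NWAY=64 gives
    // 15 full tiles plus a 40-element tail.
    for_each_tile(1000, 64, [](std::size_t offset, std::size_t len) {
        std::cout << "submit [" << offset << ", " << offset + len << ")\n";
    });
    return 0;
}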