diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
index e73874cd4bd57a..f462dffd82bc23 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
@@ -49,7 +49,7 @@ DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime);
 DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
 DEFINE_OPT(NPUW_PARALLEL_COMPILE, bool, false, npuw::parallel_compilation, CompileTime);
 DEFINE_OPT(NPUW_WEIGHTS_BANK, std::string, "", npuw::weights_bank, CompileTime);
-DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "", npuw::weights_bank_alloc, CompileTime);
+DEFINE_OPT(NPUW_WEIGHTS_BANK_ALLOC, std::string, "CPU", npuw::weights_bank_alloc, CompileTime);
 DEFINE_OPT(NPUW_FUNCALL_ASYNC, bool, false, npuw::funcall_async, RunTime);
 DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
 DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
diff --git a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
index b83c7518df389e..b3dcc97f378e8f 100644
--- a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
@@ -49,7 +49,7 @@ static constexpr ov::Property<std::string> weights_bank{"NPUW_WEIGHTS_BANK"};
  * @brief
  * Type: std::string.
  * Specify device name for weights bank which is used to allocate memory.
- * Default value: false.
+ * Default value: "CPU".
 */
static constexpr ov::Property<std::string> weights_bank_alloc{"NPUW_WEIGHTS_BANK_ALLOC"};
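Note: `NPUW_WEIGHTS_BANK_ALLOC` now defaults to "CPU" instead of an empty string, and the doc comment is fixed to match (the old "Default value: false." text was likely a copy-paste leftover from a boolean option). A minimal sketch of passing these options at compile time; the bank name "shared" and the `NPU_USE_NPUW` enable flag are illustrative assumptions, not part of this patch:

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder model path

    // Both bank options are plain strings; with this patch,
    // NPUW_WEIGHTS_BANK_ALLOC now defaults to "CPU" when left unset.
    auto compiled = core.compile_model(model,
                                       "NPU",
                                       ov::AnyMap{{"NPU_USE_NPUW", "YES"},          // assumed enable flag
                                                  {"NPUW_WEIGHTS_BANK", "shared"},  // arbitrary bank name
                                                  {"NPUW_WEIGHTS_BANK_ALLOC", "CPU"}});
    return 0;
}
```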
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 9cad96154112c6..9cc299cf2ee329 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -147,7 +147,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
         rewr.run_on_model(model);
     }
 
-    auto partitioning = getPartitioning(model, m_cfg, m_weights_bank);
+    auto partitioning = getPartitioning(model, m_cfg);
     m_total_stat.gflops = partitioning.total_gflops;
     m_total_stat.ops = partitioning.total_ops;
     const std::vector<ov::npuw::Subgraph>& orderedSubgraphs = partitioning.subgraphs;
@@ -236,7 +236,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
     }  // for(ordered_subgraphs)
     // NOTE(dm): there's a better way to do it, like we do in G-API backends.
 
-    m_update_required = m_cfg.get<::intel_npu::NPUW_FOLD>() ? true : false;
+    m_update_required = m_cfg.get<::intel_npu::NPUW_FOLD>();
 
     // Store mapping between manually splitted inputs/outputs
     // to connect tensors between compiled submodels
@@ -438,6 +438,7 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
 
     for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
         auto& comp_model_desc = m_compiled_submodels[idx];
+        // FIXME: Head and tail don't have their closures set !!!
        if (!comp_model_desc.replaced_by) {
            continue;
        }
@@ -453,6 +454,11 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
             const auto& lt = m_compiled_submodels[idx].lazy_closure[tidx];
             const auto& evaled = evaluated_tensors[idx][tidx];
             m_compiled_submodels[idx].closure.push_back(m_weights_bank->get(lt, *func_desc.device_it, evaled));
+
+            // Sanity check
+            const auto& tensor = m_compiled_submodels[idx].closure.back();
+            NPUW_ASSERT(tensor && tensor.data() && (tensor.get_size() > 0));
+
             // FIXME: should is_remote be set unconditionally?
             m_compiled_submodels[idx].is_remote.push_back(true);
         }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
index 53083e062747ad..99d58cd9dd2645 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
@@ -7,7 +7,7 @@
 using ov::npuw::weights::ConcatMeta;
 using ov::npuw::weights::ConstPtr;
 using ov::npuw::weights::LazyTensor;
-using ov::npuw::weights::LTData;
+using ov::npuw::weights::OrigData;
 using ov::npuw::weights::Transform;
 using ov::npuw::weights::TransformType;
 
@@ -17,8 +17,8 @@ namespace weights {
 
 struct LazyTensorImpl {
 public:
-    explicit LazyTensorImpl() = default;
-    explicit LazyTensorImpl(const TransformType& type, const Transform& transform);
+    LazyTensorImpl() = default;
+    LazyTensorImpl(const TransformType& type, const Transform& transform);
 
     bool operator==(const LazyTensorImpl& other) const;
 
@@ -51,8 +51,10 @@ std::size_t LazyTensorImpl::get_hash() const {
         seed = m_parent->get_hash();
     } else {
         seed = std::hash<void*>()(m_orig_data) + 0x9e3779b9;
-        seed ^= std::hash<std::string>()(m_orig_shape.to_string()) + 0x9e3779b9;
-        seed ^= std::hash<std::string>()(m_orig_type.to_string()) + 0x9e3779b9;
+        for (const auto& dim : m_orig_shape) {
+            seed ^= std::hash<std::size_t>()(dim) + 0x9e3779b9;
+        }
+        seed ^= m_orig_type.hash() + 0x9e3779b9;
     }
 
     // Combine with this hash
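The `get_hash()` change above drops the two temporary strings built by `to_string()` and instead hashes the shape dimensions and the element type directly (via `ov::element::Type::hash()`, which the patch itself uses). The scheme in isolation, as a free-standing sketch over plain standard types (the name `hash_orig` and its signature are illustrative only):

```cpp
#include <cstddef>
#include <functional>
#include <vector>

// 0x9e3779b9 is the 32-bit golden-ratio constant familiar from
// boost::hash_combine; adding it before each xor decorrelates the terms.
std::size_t hash_orig(void* data, const std::vector<std::size_t>& shape, std::size_t type_hash) {
    std::size_t seed = std::hash<void*>()(data) + 0x9e3779b9;
    for (const auto& dim : shape) {
        seed ^= std::hash<std::size_t>()(dim) + 0x9e3779b9;  // one term per dimension
    }
    seed ^= type_hash + 0x9e3779b9;  // mirrors m_orig_type.hash()
    return seed;
}
```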
@@ -79,13 +81,13 @@ std::size_t LazyTensorImpl::get_hash() const {
 using ov::npuw::weights::LazyTensorImpl;
 
 LazyTensorImpl::LazyTensorImpl(const TransformType& type, const Transform& transform) {
-    if (type == TransformType::TENSOR && std::holds_alternative<LTData>(transform)) {
+    if (type == TransformType::THIS && std::holds_alternative<OrigData>(transform)) {
         m_transform = std::make_pair(type, transform);
         ov::Tensor tensor;
-        if (std::holds_alternative<ConstPtr>(std::get<LTData>(transform))) {
-            tensor = ov::npuw::util::tensor_from_const(std::get<ConstPtr>(std::get<LTData>(transform)));
+        if (std::holds_alternative<ConstPtr>(std::get<OrigData>(transform))) {
+            tensor = ov::npuw::util::tensor_from_const(std::get<ConstPtr>(std::get<OrigData>(transform)));
         } else {
-            tensor = std::get<ov::Tensor>(std::get<LTData>(transform));
+            tensor = std::get<ov::Tensor>(std::get<OrigData>(transform));
         }
         m_orig_data = tensor.data();
         m_orig_shape = tensor.get_shape();
@@ -108,7 +110,7 @@ bool LazyTensorImpl::operator==(const LazyTensorImpl& other) const {
     ConcatMeta m1, m2;
 
     switch (m_transform.first) {
-    case TransformType::TENSOR:
+    case TransformType::THIS:
         // everything is already compared above - skip
         break;
     case TransformType::CONVERT:
@@ -164,7 +166,7 @@ ov::Tensor LazyTensorImpl::eval() const {
 
     // Process the initial tensor - either from Const or from Concat
     if (!m_parent) {
-        if (m_transform.first == TransformType::TENSOR) {
+        if (m_transform.first == TransformType::THIS) {
             return get_orig_tensor();
         } else if (m_transform.first == TransformType::CONCAT) {
             std::vector<ov::Tensor> to_concat;
@@ -196,17 +198,17 @@ ov::Tensor LazyTensorImpl::eval() const {
 ov::Tensor LazyTensorImpl::get_orig_tensor() const {
     // Sanity check
     NPUW_ASSERT(!has_transformations());
-    if (std::holds_alternative<ConstPtr>(std::get<LTData>(m_transform.second))) {
-        return ov::npuw::util::tensor_from_const(std::get<ConstPtr>(std::get<LTData>(m_transform.second)));
+    if (std::holds_alternative<ConstPtr>(std::get<OrigData>(m_transform.second))) {
+        return ov::npuw::util::tensor_from_const(std::get<ConstPtr>(std::get<OrigData>(m_transform.second)));
     }
-    return std::get<ov::Tensor>(std::get<LTData>(m_transform.second));
+    return std::get<ov::Tensor>(std::get<OrigData>(m_transform.second));
 }
 
 bool LazyTensorImpl::has_transformations() const {
-    if (m_parent == nullptr) {
-        return false;
+    if (m_parent) {
+        return true;
     }
-    return true;
+    return false;
 }
 
 LazyTensor::LazyTensor(const TransformType& type, const Transform& transform)
diff --git a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
index ef5abdcd7842f3..69cee29ab6d853 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
@@ -19,7 +19,7 @@ namespace ov {
 namespace npuw {
 namespace weights {
 
-enum class TransformType : int { TENSOR, PERMUTE, CONVERT, CONCAT };
+enum class TransformType : int { THIS, PERMUTE, CONVERT, CONCAT };
 
 // Forward declaration
 class LazyTensor;
@@ -27,10 +27,9 @@ struct LazyTensorImpl;
 
 using ConcatMeta = std::pair<std::vector<LazyTensor>, std::size_t>;
 using ConstPtr = std::shared_ptr<ov::op::v0::Constant>;
-using LTData = std::variant<ConstPtr, ov::Tensor>;
+using OrigData = std::variant<ConstPtr, ov::Tensor>;
 
-// LazyTensor owns Constant's memory
-using Transform = std::variant<LTData, std::vector<std::size_t>, std::monostate, ConcatMeta>;
+using Transform = std::variant<OrigData, std::vector<std::size_t>, std::monostate, ConcatMeta>;
 
 class LazyTensor {
 public:
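`TENSOR` becomes `THIS` and `LTData` becomes `OrigData`, but the shape of the data stays the same: a `(TransformType, payload)` pair where the active `std::variant` alternative is implied by the tag (`OrigData` for `THIS`, axes for `PERMUTE`, `std::monostate` for `CONVERT`, `ConcatMeta` for `CONCAT`). A self-contained miniature of that tagged-union pattern; every name below is an illustrative stand-in, not the plugin's:

```cpp
#include <cassert>
#include <cstddef>
#include <utility>
#include <variant>
#include <vector>

enum class Tag : int { THIS, PERMUTE, CONVERT };

struct Blob {};                                            // stand-in for OrigData
using Axes = std::vector<std::size_t>;                     // PERMUTE payload
using Payload = std::variant<Blob, Axes, std::monostate>;  // CONVERT carries nothing

int main() {
    std::pair<Tag, Payload> t{Tag::PERMUTE, Axes{1, 0}};
    // The tag tells the consumer which alternative to expect:
    if (t.first == Tag::PERMUTE && std::holds_alternative<Axes>(t.second)) {
        assert(std::get<Axes>(t.second).size() == 2);
    }
}
```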
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 8c466e4760e2fd..ccc9ab5b37f7c3 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -276,14 +276,12 @@ class Partitioner {
     Partitioner(const std::shared_ptr<ov::Model>& _model,
                 ov::npuw::Ensemble& _ens,
                 ov::npuw::Partitioning& _P,
-                ::intel_npu::Config& _cfg,
-                const std::shared_ptr<ov::npuw::weights::Bank>& _bank)
+                ::intel_npu::Config& _cfg)
         : model(_model),
           ens(_ens),
           P(_P),
           func_pipeline_type(FunctionPipelineType::FOLD),
-          cfg(_cfg),
-          bank(_bank) {}
+          cfg(_cfg) {}
 
     ////////////////////////////////////////////////////////
     // Partitioning execution pipeline
@@ -319,7 +317,6 @@ class Partitioner {
 private:
     FunctionPipelineType func_pipeline_type;
     ::intel_npu::Config& cfg;
-    const std::shared_ptr<ov::npuw::weights::Bank>& bank;
 };
 
 void Partitioner::identifySubgraphs() {
@@ -1493,7 +1490,7 @@ void Partitioner::createFunction(FunctionPipeline& func_ggg) {
                 LOG_DEBUG("Register " << prod_output << " in the function closure");
                 funcall._lazy_closure.push_back(
-                    LazyTensor(TransformType::TENSOR,
+                    LazyTensor(TransformType::THIS,
                                std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node)));  // (n)/1/i/c
             } else if (ov::op::util::is_parameter(input_node)) {
                 LOG_DEBUG("Handling a Parameter input " << prod_output);
@@ -1591,7 +1588,7 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
                     LOG_DEBUG("Register " << prod_output << " in the function closure[" << param_idx
                                           << "] (via prototype " << proto_layer_name << ")");
                     funcall._lazy_closure[param_idx - function._param_offset] =
-                        LazyTensor(TransformType::TENSOR,
+                        LazyTensor(TransformType::THIS,
                                    std::dynamic_pointer_cast<ov::op::v0::Constant>(input_node));  // (t)/1/c
                 }
             }  // for (inputs)
@@ -1716,11 +1713,6 @@ void Partitioner::optimize(const std::string& func_name) {
             auto& funcall = func_group.refs[f_idx].get();
             // FIXME: assuming no transformations were applied to the tensor - since we are utilizing the original
             // ov::Tensor below
-            NPUW_ASSERT(!funcall._lazy_closure[w_idx - f._param_offset].has_transformations());
-            if (z_idx != -1) {
-                NPUW_ASSERT(!funcall._lazy_closure[z_idx - f._param_offset].has_transformations());
-            }
-            NPUW_ASSERT(!funcall._lazy_closure[s_idx - f._param_offset].has_transformations());
             ov::Tensor cw = funcall._lazy_closure[w_idx - f._param_offset].get_orig_tensor();
             ov::Tensor cz =
                 z_idx != -1 ? funcall._lazy_closure[z_idx - f._param_offset].get_orig_tensor() : ov::Tensor{};
@@ -1735,7 +1727,7 @@ void Partitioner::optimize(const std::string& func_name) {
             } else {
                 NPUW_ASSERT(false && "Unsupported combination");
             }
-            funcall._lazy_closure.push_back(LazyTensor(TransformType::TENSOR, std::move(dst)));
+            funcall._lazy_closure.push_back(LazyTensor(TransformType::THIS, std::move(dst)));
         });
     }
 
@@ -1750,7 +1742,7 @@ void Partitioner::optimize(const std::string& func_name) {
             auto new_elem_type = params_to_gather.pnew->get_element_type();
             auto new_shape = params_to_gather.pnew->get_shape();
             funcall.get()._lazy_closure.push_back(
-                LazyTensor(TransformType::TENSOR, ov::Tensor(new_elem_type, new_shape)));
+                LazyTensor(TransformType::THIS, ov::Tensor(new_elem_type, new_shape)));
         }
     }
 
@@ -2003,9 +1995,7 @@ void Partitioner::finalizeLinks() {
 
 }  // namespace
 
-ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model>& model,
-                                                 ::intel_npu::Config& cfg,
-                                                 const std::shared_ptr<ov::npuw::weights::Bank>& bank) {
+ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model>& model, ::intel_npu::Config& cfg) {
     LOG_INFO("Building partitioning for model " << model->get_friendly_name() << "...");
     LOG_BLOCK();
 
@@ -2064,7 +2054,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model>& model, ::intel_npu::Config& cfg) {
-    Partitioner p(model, ens, P, cfg, bank);
+    Partitioner p(model, ens, P, cfg);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.hpp
@@ -11,4 +11,3 @@
 #include "../lazy_tensor.hpp"
-#include "../weights_bank.hpp"
 #include "intel_npu/al/config/config.hpp"
 #include "openvino/openvino.hpp"
@@ -118,9 +117,7 @@ struct Partitioning {
     float total_gflops = 0.f;
 };
 
-Partitioning getPartitioning(const std::shared_ptr<ov::Model>& model,
-                             ::intel_npu::Config& config,
-                             const std::shared_ptr<ov::npuw::weights::Bank>& bank);
+Partitioning getPartitioning(const std::shared_ptr<ov::Model>& model, ::intel_npu::Config& config);
 
 }  // namespace npuw
 }  // namespace ov
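With the bank parameter gone, `getPartitioning()` depends only on the model and the config; the weights bank is consulted later, once, in `CompiledModel::finalize_weights_bank()`. A compilable stub of the new call shape (`compile_flow()` and the stubbed types below are illustrative, not part of the patch):

```cpp
#include <memory>

namespace ov {
class Model;  // opaque here
}
namespace intel_npu {
class Config;  // opaque here
}

namespace ov {
namespace npuw {
struct Partitioning {};  // stub; the real struct carries subgraphs and stats

// New shape: no weights-bank argument anymore.
inline Partitioning getPartitioning(const std::shared_ptr<ov::Model>&, ::intel_npu::Config&) {
    return {};
}
}  // namespace npuw
}  // namespace ov

void compile_flow(const std::shared_ptr<ov::Model>& model, ::intel_npu::Config& cfg) {
    // Previously: getPartitioning(model, cfg, m_weights_bank);
    auto partitioning = ov::npuw::getPartitioning(model, cfg);
    (void)partitioning;
}
```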
#include "openvino/openvino.hpp" #include "openvino/pass/graph_rewrite.hpp" diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index 57bd352a3a4ed0..9675b21256b284 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -38,8 +38,12 @@ bool ov::npuw::util::is_set(const std::size_t sub_idx, const std::string& opt) { return false; } -ov::Tensor ov::npuw::util::tensor_from_const(const std::shared_ptr& node) { - return ov::Tensor(node->get_element_type(), node->get_shape(), const_cast(node->get_data_ptr())); +ov::Tensor ov::npuw::util::tensor_from_const(const std::shared_ptr& node) { + NPUW_ASSERT(ov::op::util::is_constant(node)); + NPUW_ASSERT(node->outputs().size() == 1); + const auto port = node->output(0); + auto cnst_node = std::dynamic_pointer_cast(node); + return ov::Tensor(port.get_element_type(), port.get_shape(), const_cast(cnst_node->get_data_ptr())); } bool ov::npuw::util::starts_with(const std::string& str, const std::string& prefix) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp index 9199a4739751c2..312d227bd5396e 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp @@ -19,7 +19,7 @@ bool is_set(const std::size_t sub_idx, const std::string& opt); // Every great project has its own string class... // NB: Newer C++ standards would allow to use string views or smt -ov::Tensor tensor_from_const(const std::shared_ptr& node); +ov::Tensor tensor_from_const(const std::shared_ptr& node); bool starts_with(const std::string& str, const std::string& prefix);