diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp
index 613c7ff8c496e2..7ed21aee7525e4 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp
@@ -172,24 +172,6 @@ struct OptionPrinter final {
     }
 };
 
-template <typename T>
-struct OptionPrinter<std::vector<T>> final {
-    static std::string toString(const std::vector<T>& val) {
-        std::stringstream ss;
-        std::size_t counter = 0;
-        std::size_t size = val.size();
-        for (auto el : val) {
-            std::string el_str = OptionPrinter<T>::toString(el);
-            ss << el_str;
-            if (counter < size - 1) {
-                ss << ",";
-            }
-            ++counter;
-        }
-        return ss.str();
-    }
-};
-
 template <typename K, typename V>
 struct OptionPrinter<std::map<K, V>> final {
     static std::string toString(const std::map<K, V>& val) {
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
index 3fd0dcea4f83a5..8a92e5c4824400 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -66,39 +66,36 @@ DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, CompileTime);
 DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, CompileTime);
 DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime);
 DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
-DEFINE_OPT(NPUW_LLM, bool, false, npuw::dynamic_llm::enabled, CompileTime);
-DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::dynamic_llm::max_prompt_len, CompileTime);
-DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::dynamic_llm::min_response_len, CompileTime);
+DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
+DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
+DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
 
 namespace npuw {
-namespace dynamic_llm {
+namespace llm {
 struct ModelDesc {
     std::string type;
     std::string name_or_path;
     int num_key_value_heads;
 };
-enum class GenerateHint {
-    FAST_COMPILE,
-    BEST_PERF
-};
-}  // namespace dynamic_llm
-}  // namespace npuw
+enum class GenerateHint { FAST_COMPILE, BEST_PERF };
+}  // namespace llm
+}  // namespace npuw
 
-struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::dynamic_llm::ModelDesc> {
+struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::llm::ModelDesc> {
     static std::string_view key() {
-        return ov::intel_npu::npuw::dynamic_llm::model_desc.name();
+        return ov::intel_npu::npuw::llm::model_desc.name();
     }
 
     static constexpr std::string_view getTypeName() {
-        return "::intel_npu::npuw::dynamic_llm::ModelDesc";
+        return "::intel_npu::npuw::llm::ModelDesc";
    }
 
-    static ::intel_npu::npuw::dynamic_llm::ModelDesc defaultValue() {
+    static ::intel_npu::npuw::llm::ModelDesc defaultValue() {
         return {};
     }
 
-    static ::intel_npu::npuw::dynamic_llm::ModelDesc parse(std::string_view val) {
-        ::intel_npu::npuw::dynamic_llm::ModelDesc res;
+    static ::intel_npu::npuw::llm::ModelDesc parse(std::string_view val) {
+        ::intel_npu::npuw::llm::ModelDesc res;
         std::map<std::string, std::string> res_map = OptionParser<std::map<std::string, std::string>>::parse(val);
         res.type = res_map["type"];
         res.name_or_path = res_map["name_or_path"];
@@ -106,7 +103,7 @@ struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu:
         return res;
     }
 
-    static std::string toString(const ::intel_npu::npuw::dynamic_llm::ModelDesc& val) {
+    static std::string toString(const ::intel_npu::npuw::llm::ModelDesc& val) {
         std::map<std::string, std::string> res_map;
         res_map["type"] = val.type;
@@ -124,46 +121,45 @@ struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu:
     }
 };
 
-struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::dynamic_llm::GenerateHint> {
+struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
     static std::string_view key() {
-        return ov::intel_npu::npuw::dynamic_llm::generate_hint.name();
+        return ov::intel_npu::npuw::llm::generate_hint.name();
     }
 
     static constexpr std::string_view getTypeName() {
-        return "::intel_npu::npuw::dynamic_llm::GenerateHint";
+        return "::intel_npu::npuw::llm::GenerateHint";
     }
 
-    static ::intel_npu::npuw::dynamic_llm::GenerateHint defaultValue() {
-        return ::intel_npu::npuw::dynamic_llm::GenerateHint::FAST_COMPILE;
+    static ::intel_npu::npuw::llm::GenerateHint defaultValue() {
+        return ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE;
     }
 
-    static ::intel_npu::npuw::dynamic_llm::GenerateHint parse(std::string_view val) {
-        ::intel_npu::npuw::dynamic_llm::GenerateHint res;
+    static ::intel_npu::npuw::llm::GenerateHint parse(std::string_view val) {
+        ::intel_npu::npuw::llm::GenerateHint res;
         if (val == "FAST_COMPILE") {
-            res = ::intel_npu::npuw::dynamic_llm::GenerateHint::FAST_COMPILE;
-        }
-        else if (val == "BEST_PERF") {
-            res = ::intel_npu::npuw::dynamic_llm::GenerateHint::BEST_PERF;
+            res = ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE;
+        } else if (val == "BEST_PERF") {
+            res = ::intel_npu::npuw::llm::GenerateHint::BEST_PERF;
         } else {
             OPENVINO_THROW("Unsupported \"GENERATE_HINT\" provided: ",
-                           val, ". Please select either \"FAST_COMPILE\" or \"BEST_PERF\".");
+                           val,
+                           ". Please select either \"FAST_COMPILE\" or \"BEST_PERF\".");
         }
         return res;
     }
 
-    static std::string toString(const ::intel_npu::npuw::dynamic_llm::GenerateHint& val) {
+    static std::string toString(const ::intel_npu::npuw::llm::GenerateHint& val) {
         std::string res;
         switch (val) {
-        case ::intel_npu::npuw::dynamic_llm::GenerateHint::FAST_COMPILE:
-            res = "FAST_COMPILE";
-            break;
-        case ::intel_npu::npuw::dynamic_llm::GenerateHint::BEST_PERF:
-            res = "BEST_PERF";
-            break;
-        default:
-            OPENVINO_THROW("Can't convert provided \"GENERATE_HINT\" : ",
-                           int(val), " to string.");
+        case ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE:
+            res = "FAST_COMPILE";
+            break;
+        case ::intel_npu::npuw::llm::GenerateHint::BEST_PERF:
+            res = "BEST_PERF";
+            break;
+        default:
+            OPENVINO_THROW("Can't convert provided \"GENERATE_HINT\" : ", int(val), " to string.");
         }
         return res;
     }
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
index e9d782a6d8e46c..79fd8409f3be7c 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -370,14 +370,14 @@ static constexpr ov::Property<std::string> inputs_outputs{"NPUW_DUMP_IO"};
 static constexpr ov::Property<bool> io_iters{"NPUW_DUMP_IO_ITERS"};
 }  // namespace dump
 
-namespace dynamic_llm {
+namespace llm {
 /**
 * @brief
 * Type: bool.
 * Tell NPUW that you want to pass dynamic stateful LLM model
 * Default value: false.
 */
-static constexpr ov::Property<bool> enabled {"NPUW_LLM"};
+static constexpr ov::Property<bool> enabled{"NPUW_LLM"};
 
 /**
 * @brief
@@ -385,15 +385,15 @@ static constexpr ov::Property<bool> enabled {"NPUW_LLM"};
 * Tell NPUW about your LLM model.
 * Default value: empty map.
 */
-static constexpr ov::Property<std::map<std::string, std::string>> model_desc {"NPUW_LLM_MODEL_DESC"};
+static constexpr ov::Property<std::map<std::string, std::string>> model_desc{"NPUW_LLM_MODEL_DESC"};
 
- /**
+/**
 * @brief
 * Type: uint32_t.
 * Tell NPUW your desirable max prompt length.
 * Default value: 1024.
 */
-static constexpr ov::Property<uint32_t> max_prompt_len {"NPUW_LLM_MAX_PROMPT_LEN"};
+static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"};
 
 /**
 * @brief
@@ -401,7 +401,7 @@ static constexpr ov::Property<uint32_t> max_prompt_len {"NPUW_LLM_MAX_PROMPT_LEN
 * Tell NPUW your desirable min response length.
 * Default value: 128.
 */
-static constexpr ov::Property<uint32_t> min_response_len {"NPUW_LLM_MIN_RESPONSE_LEN"};
+static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};
 
 /**
 * @brief
@@ -410,9 +410,9 @@ static constexpr ov::Property<uint32_t> min_response_len {"NPUW_LLM_MIN_RESPONSE
 * Possible values: "FAST_COMPILE", "BEST_PERF".
 * Default value: "FAST_COMPILE".
 */
-static constexpr ov::Property<std::string> generate_hint {"NPUW_LLM_GENERATE_HINT"};
+static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};
 
-} // namespace llm_dynamic
+}  // namespace llm
 
 }  // namespace npuw
 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
index 5b2501bd33a66d..7eba967776827b 100644
--- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -52,11 +52,6 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_DUMP_IO>();
     desc.add<NPUW_DUMP_IO_ITERS>();
 #endif
-    desc.add<NPUW_LLM>();
-    desc.add<NPUW_LLM_MODEL_DESC>();
-    desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
-    desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
-    desc.add<NPUW_LLM_GENERATE_HINT>();
 }
 
 void intel_npu::registerNpuwLlmOptions(OptionsDesc& desc) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index b723d6d095d025..ced6d6f3840d2e 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -28,11 +28,12 @@
 #include "intel_npu/config/config.hpp"
 #include "intel_npu/config/npuw.hpp"
 #include "intel_npu/npuw_private_properties.hpp"
+#include "llm_compiled_model.hpp"
 #include "openvino/runtime/device_id_parser.hpp"
 #include "openvino/runtime/internal_properties.hpp"
 #include "openvino/runtime/properties.hpp"
 #include "transformations/convert_precision.hpp"
 
 namespace {
 void split_properties(const ov::AnyMap& properties,
                       ov::AnyMap& npu_plugin_properties,
@@ -85,10 +86,33 @@ ov::npuw::DeviceProperties get_properties_per_device(const std::shared_ptr<
 }
 }  // namespace
 
+std::shared_ptr<ov::npuw::ICompiledModel> ov::npuw::ICompiledModel::create(
+    const std::shared_ptr<ov::Model>& model,
+    const std::shared_ptr<const ov::IPlugin>& plugin,
+    const ov::AnyMap& properties) {
+    LOG_VERB(__PRETTY_FUNCTION__);
+    LOG_BLOCK();
+    std::shared_ptr<ov::npuw::ICompiledModel> compiled_model;
+    auto use_llm_key = ov::intel_npu::npuw::llm::enabled.name();
+    if (properties.count(use_llm_key) && properties.at(use_llm_key).as<bool>() == true) {
+        LOG_DEBUG("ov::npuw::LLMCompiledModel will be created.");
+        compiled_model = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, properties);
+    } else {
+        LOG_DEBUG("ov::npuw::CompiledModel will be created.");
+        compiled_model = std::make_shared<ov::npuw::CompiledModel>(model, plugin, properties);
+    }
+    LOG_DEBUG("Done");
+    return compiled_model;
+}
+
+ov::npuw::ICompiledModel::ICompiledModel(const std::shared_ptr<ov::Model>& model,
+                                         const std::shared_ptr<const ov::IPlugin>& plugin)
+    : ov::ICompiledModel(model, plugin) {}
+
 ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
                                        const std::shared_ptr<const ov::IPlugin>& plugin,
                                        const ov::AnyMap& properties)
-    : ov::ICompiledModel(model, plugin),
+    : ov::npuw::ICompiledModel(model, plugin),
       m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
       m_cfg(m_options_desc),
       m_name(model->get_friendly_name()),
@@ -875,8 +899,6 @@ void ov::npuw::CompiledModel::implement_properties() {
     //    request. So the vector will define public properties.
     // 3. Create mappings for all remaining (private) NPUW-specific properties
     //    to getters of their values from config, related to ov::npuw::CompiledModel.
-    // 4. Fill default values for (private) NPUW-specific, dynamic stateful
-    //    model-specific properties.
 
 #define GET_PLUGIN_PROP(property) return get_plugin()->get_property(property.name(), ov::AnyMap());
@@ -963,52 +985,47 @@ void ov::npuw::CompiledModel::implement_properties() {
         }                                                                  \
     }
 
-    m_prop_to_opt.insert({BIND(use_npuw, NPU_USE_NPUW),
-                          BIND(npuw::devices, NPUW_DEVICES),
-                          BIND(npuw::submodel_device, NPUW_SUBMODEL_DEVICE),
-                          BIND(npuw::partitioning::online::pipeline, NPUW_ONLINE_PIPELINE),
-                          BIND(npuw::partitioning::online::min_size, NPUW_ONLINE_MIN_SIZE),
-                          BIND(npuw::partitioning::online::keep_blocks, NPUW_ONLINE_KEEP_BLOCKS),
-                          BIND(npuw::partitioning::online::keep_block_size, NPUW_ONLINE_KEEP_BLOCK_SIZE),
-                          BIND(npuw::partitioning::online::avoid, NPUW_ONLINE_AVOID),
-                          BIND(npuw::partitioning::online::isolate, NPUW_ONLINE_ISOLATE),
-                          BIND(npuw::partitioning::online::nofold, NPUW_ONLINE_NO_FOLD),
-                          BIND(npuw::partitioning::online::dump_plan, NPUW_ONLINE_DUMP_PLAN),
-                          BIND(npuw::partitioning::plan, NPUW_PLAN),
-                          BIND(npuw::partitioning::fold, NPUW_FOLD),
-                          BIND(npuw::partitioning::cwai, NPUW_CWAI),
-                          BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
-                          BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
-                          BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT),
-                          BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
-                          BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
-                          BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
-                          BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
-                          BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
-                          BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
-                          BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
-                          BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
-                          BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
-                          BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
-                          BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
-                          BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
-                          BIND(npuw::cache_dir, NPUW_CACHE_DIR),
-                          BIND(npuw::accuracy::check, NPUW_ACC_CHECK),
-                          BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH),
-                          BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE),
+    m_prop_to_opt.insert({
+        BIND(use_npuw, NPU_USE_NPUW),
+        BIND(npuw::devices, NPUW_DEVICES),
+        BIND(npuw::submodel_device, NPUW_SUBMODEL_DEVICE),
+        BIND(npuw::partitioning::online::pipeline, NPUW_ONLINE_PIPELINE),
+        BIND(npuw::partitioning::online::min_size, NPUW_ONLINE_MIN_SIZE),
+        BIND(npuw::partitioning::online::keep_blocks, NPUW_ONLINE_KEEP_BLOCKS),
+        BIND(npuw::partitioning::online::keep_block_size, NPUW_ONLINE_KEEP_BLOCK_SIZE),
+        BIND(npuw::partitioning::online::avoid, NPUW_ONLINE_AVOID),
+        BIND(npuw::partitioning::online::isolate, NPUW_ONLINE_ISOLATE),
+        BIND(npuw::partitioning::online::nofold, NPUW_ONLINE_NO_FOLD),
+        BIND(npuw::partitioning::online::dump_plan, NPUW_ONLINE_DUMP_PLAN),
+        BIND(npuw::partitioning::plan, NPUW_PLAN),
+        BIND(npuw::partitioning::fold, NPUW_FOLD),
+        BIND(npuw::partitioning::cwai, NPUW_CWAI),
+        BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
+        BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
+        BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT),
+        BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
+        BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
+        BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
+        BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
+        BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
+        BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
+        BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
+        BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
+        BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
+        BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
+        BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
+        BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
+        BIND(npuw::cache_dir, NPUW_CACHE_DIR),
+        BIND(npuw::accuracy::check, NPUW_ACC_CHECK),
+        BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH),
+        BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE),
 #ifdef NPU_PLUGIN_DEVELOPER_BUILD
-                          BIND(npuw::dump::full, NPUW_DUMP_FULL),
-                          BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS),
-                          BIND(npuw::dump::subgraphs_on_fail, NPUW_DUMP_SUBS_ON_FAIL),
-                          BIND(npuw::dump::inputs_outputs, NPUW_DUMP_IO),
-                          BIND(npuw::dump::io_iters, NPUW_DUMP_IO_ITERS),
+        BIND(npuw::dump::full, NPUW_DUMP_FULL),
+        BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS),
+        BIND(npuw::dump::subgraphs_on_fail, NPUW_DUMP_SUBS_ON_FAIL),
+        BIND(npuw::dump::inputs_outputs, NPUW_DUMP_IO),
+        BIND(npuw::dump::io_iters, NPUW_DUMP_IO_ITERS),
 #endif
-                          // 4.
-                          BIND(npuw::dynamic_llm::enabled, NPUW_LLM),
-                          BIND(npuw::dynamic_llm::model_desc, NPUW_LLM_MODEL_DESC),
-                          BIND(npuw::dynamic_llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN),
-                          BIND(npuw::dynamic_llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN),
-                          BIND(npuw::dynamic_llm::generate_hint, NPUW_LLM_GENERATE_HINT)});
+    });
 
 #undef BIND
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
index e82a175f03042d..0d0a18f1e0df3b 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
@@ -22,9 +22,16 @@ class Plugin;
 
 namespace ov {
 namespace npuw {
 
+class ICompiledModel : public ov::ICompiledModel {
+public:
+    static std::shared_ptr<ov::npuw::ICompiledModel> create(const std::shared_ptr<ov::Model>& model,
+                                                            const std::shared_ptr<const ov::IPlugin>& plugin,
+                                                            const ov::AnyMap& properties);
+    ICompiledModel(const std::shared_ptr<ov::Model>& model, const std::shared_ptr<const ov::IPlugin>& plugin);
+};
 class InferRequest;
 
-class CompiledModel : public ov::ICompiledModel {
+class CompiledModel : public ov::npuw::ICompiledModel {
     using DevList = std::vector<std::string>;
     using GetPropertiesMap =
         std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>;
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model_factory.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model_factory.cpp
deleted file mode 100644
index 246f27bf4a0aee..00000000000000
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model_factory.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (C) 2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include "compiled_model_factory.hpp"
-#include "logging.hpp"
-#include "intel_npu/npuw_private_properties.hpp"
-#include "compiled_model.hpp"
-#include "llm_compiled_model.hpp"
-
-std::shared_ptr<ov::ICompiledModel>
-ov::npuw::CompiledModelFactory::create(const std::shared_ptr<ov::Model>& model,
-                                       const std::shared_ptr<const ov::IPlugin>& plugin,
-                                       const ov::AnyMap& properties) {
-    LOG_VERB(__PRETTY_FUNCTION__);
-    LOG_BLOCK();
-    std::shared_ptr<ov::ICompiledModel> compiled_model;
-    auto use_dynamic_llm_key = ov::intel_npu::npuw::dynamic_llm::enabled.name();
-    if (properties.count(use_dynamic_llm_key) &&
-        properties.at(use_dynamic_llm_key).as<bool>() == true) {
-        LOG_DEBUG("ov::npuw::LLMCompiledModel will be created.");
-        compiled_model = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, properties);
-    }
-    else {
-        LOG_DEBUG("ov::npuw::CompiledModel will be created.");
-        compiled_model = std::make_shared<ov::npuw::CompiledModel>(model, plugin, properties);
-    }
-    LOG_DEBUG("Done");
-    return compiled_model;
-}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model_factory.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model_factory.hpp
deleted file mode 100644
index d190f88aeab95d..00000000000000
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model_factory.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright (C) 2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-
-#include <memory>
-
-#include "common.hpp"
-#include "openvino/openvino.hpp"
-#include "openvino/runtime/icompiled_model.hpp"
-#include "openvino/runtime/so_ptr.hpp"
-
-namespace intel_npu {
-class Plugin;
-}
-
-namespace ov {
-namespace npuw {
-
-class CompiledModelFactory {
-public:
-    static std::shared_ptr<ov::ICompiledModel> create(const std::shared_ptr<ov::Model>& model,
-                                                      const std::shared_ptr<const ov::IPlugin>& plugin,
-                                                      const ov::AnyMap& properties);
-};
-
-}  // namespace npuw
-}  // namespace ov
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index ef1274276cb935..c0e9f8b935396f 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -4,9 +4,8 @@
 
 #include "llm_compiled_model.hpp"
 
 #include "llm_infer_request.hpp"
-
-#include "openvino/runtime/iasync_infer_request.hpp"
 #include "openvino/pass/stateful_to_stateless.hpp"
+#include "openvino/runtime/iasync_infer_request.hpp"
 
 namespace {
 uint32_t align_to(uint32_t value, uint32_t alignment) {
@@ -16,10 +15,10 @@ uint32_t align_to(uint32_t value, uint32_t alignment) {
 std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr<ov::Model>& model) {
     const auto kStartOutputKVCacheLayers = 1u;
     for (int i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) {
-        auto kvout  = model->output(i);
+        auto kvout = model->output(i);
         auto kvrslt = kvout.get_node();
-        auto kvcat  = kvrslt->inputs()[0].get_source_output().get_node();
-        auto kvval  = kvcat->inputs()[1].get_source_output();
+        auto kvcat = kvrslt->inputs()[0].get_source_output().get_node();
+        auto kvval = kvcat->inputs()[1].get_source_output();
         kvval.set_names({kvout.get_any_name()});
         kvrslt->inputs()[0].replace_source_output(kvval);
     }
@@ -144,16 +143,15 @@ T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_va
 
 ov::AnyMap get_baseline_common_config() {
     ov::AnyMap config = {
-        { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm" },
-        { "NPUW_DEVICES", "NPU" },
-        { "NPU_USE_NPUW", "YES" },
-        { "NPUW_FOLD", "YES" },
-        { "NPUW_DCOFF_TYPE", "f16" },
-        { "NPUW_DCOFF_SCALE", "YES"},
-        { "NPUW_WEIGHTS_BANK", "shared" },
-        { "NPUW_SLICE_OUT", "YES" },
-        { "NPUW_FUNCALL_ASYNC", "YES" }
-    };
+        {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
+        {"NPUW_DEVICES", "NPU"},
+        {"NPU_USE_NPUW", "YES"},
+        {"NPUW_FOLD", "YES"},
+        {"NPUW_DCOFF_TYPE", "f16"},
+        {"NPUW_DCOFF_SCALE", "YES"},
+        {"NPUW_WEIGHTS_BANK", "shared"},
+        {"NPUW_SLICE_OUT", "YES"},
+        {"NPUW_FUNCALL_ASYNC", "YES"}};
     return config;
 }
 
@@ -168,17 +166,14 @@ ov::AnyMap get_default_common_config(const std::shared_ptr<ov::Model>& model) {
     return config;
 }
 
-ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model,
-                                      const std::optional<NPUDesc>& npudesc) {
+ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model, const std::optional<NPUDesc>& npudesc) {
     auto config = get_default_common_config(model);
     if (is_cw_compressed(model)) {
         config.emplace("NPUW_DQ", "YES");
     } else {
         config.emplace("NPUW_PMM", "NO");
     }
-    if (npudesc.has_value() &&
-        npudesc->arch == "4000" &&
-        npudesc->max_tiles != -1) {
+    if (npudesc.has_value() && npudesc->arch == "4000" && npudesc->max_tiles != -1) {
         config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles);
     }
     return config;
@@ -186,9 +181,9 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model,
 
 ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model,
                                        const std::optional<NPUDesc>& npudesc,
-                                       const ::intel_npu::npuw::dynamic_llm::GenerateHint hint) {
+                                       const ::intel_npu::npuw::llm::GenerateHint hint) {
     auto config = get_default_common_config(model);
-    if (hint == ::intel_npu::npuw::dynamic_llm::GenerateHint::BEST_PERF) {
+    if (hint == ::intel_npu::npuw::llm::GenerateHint::BEST_PERF) {
         config.emplace("NPUW_ONLINE_PIPELINE", "NONE");
     }
     // NB: Unconditionally set for generation model
@@ -216,12 +211,10 @@ void drop_cache_dir(ov::AnyMap& config) {
     }
 }
 
-void split_llm_properties(const ov::AnyMap& properties,
-                          ov::AnyMap& dyn_llm_properties,
-                          ov::AnyMap& other_properties) {
+void split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) {
     for (auto it = properties.begin(); it != properties.end(); ++it) {
         if (it->first.find("NPUW_LLM") != it->first.npos) {
-            dyn_llm_properties.insert(*it);
+            llm_properties.insert(*it);
         } else {
             other_properties.insert(*it);
         }
@@ -235,27 +228,27 @@ std::map<std::string, std::string> any_copy(const ov::AnyMap& params) {
     }
     return result;
 }
-} // namespace
+}  // namespace
 
 ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
                                              const std::shared_ptr<const ov::IPlugin>& plugin,
                                              const ov::AnyMap& properties)
-    : ov::ICompiledModel(model, plugin),
-    m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
-    m_cfg(m_options_desc) {
+    : ov::npuw::ICompiledModel(model, plugin),
+      m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
+      m_cfg(m_options_desc) {
     LOG_VERB(__PRETTY_FUNCTION__);
     LOG_BLOCK();
 
     ::intel_npu::registerNpuwLlmOptions(*m_options_desc);
 
-    std::map<std::string, ov::Any> npuw_dyn_llm_props;
-    std::map<std::string, ov::Any> other_props;
-    split_llm_properties(properties, npuw_dyn_llm_props, other_props);
-    m_cfg.update(any_copy(npuw_dyn_llm_props));
+    std::map<std::string, ov::Any> npuw_llm_props;
+    std::map<std::string, ov::Any> other_props;
+    split_llm_properties(properties, npuw_llm_props, other_props);
+    m_cfg.update(any_copy(npuw_llm_props));
 
     // (1) Make template model to be kvcache model, used in generation phase
     auto kvcache_model = model->clone();
-    // (2) Expose KV-cache input and output layers from kvcache model
+    // (2) Expose KV-cache input and output layers from kvcache model
     ov::pass::StatefulToStateless().run_on_model(kvcache_model);
 
     // (3) Create prefill model from passed template model
@@ -271,10 +264,9 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
 
     const uint32_t kMaxPromptLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
     const uint32_t kMinResponseLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);
-    const ::intel_npu::npuw::dynamic_llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>();
+    const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>();
     KVAxesPosition axes = get_kv_axes(model_desc.type);
-    m_kvcache_desc = KVCacheDesc
-        { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len};
+    m_kvcache_desc = KVCacheDesc{kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len};
 
     // (7) Make prefill model with static shapes
     reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
     // (8) Make kvcache model with static shapes
@@ -285,7 +277,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     ov::AnyMap properties_copy = other_props;
     auto prefill_config = get_default_prefill_config(model, npudesc);
     // NB: GENERATE_HINT is only applicable for default generate config!
-    const ::intel_npu::npuw::dynamic_llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
+    const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
     auto generate_config = get_default_generate_config(model, npudesc, generate_hint);
     merge_config_with(prefill_config, properties_copy);
     merge_config_with(generate_config, properties_copy);
@@ -333,11 +325,6 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_llm_in
     return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr, m_kvcache_desc);
 }
 
-std::shared_ptr<ov::IAsyncInferRequest> ov::npuw::LLMCompiledModel::create_infer_request() const {
-    auto internal_request = create_sync_infer_request();
-    return std::make_shared<ov::IAsyncInferRequest>(internal_request, get_task_executor(), get_callback_executor());
-}
-
 void ov::npuw::LLMCompiledModel::implement_properties() {
 #define BIND(N, T) \
@@ -348,10 +335,10 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
         }                                                                  \
     }
 
-    m_prop_to_opt.insert({BIND(npuw::dynamic_llm::enabled, NPUW_LLM),
-                          BIND(npuw::dynamic_llm::model_desc, NPUW_LLM_MODEL_DESC),
-                          BIND(npuw::dynamic_llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN),
-                          BIND(npuw::dynamic_llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN),
-                          BIND(npuw::dynamic_llm::generate_hint, NPUW_LLM_GENERATE_HINT)});
+    m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM),
+                          BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC),
+                          BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN),
+                          BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN),
+                          BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT)});
 #undef BIND
 }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
index fe025e750169f1..1a748997fd48fa 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp
@@ -12,33 +12,28 @@ namespace ov {
 namespace npuw {
 
 class LLMInferRequest;
 
-class LLMCompiledModel : public ov::ICompiledModel {
+class LLMCompiledModel : public ov::npuw::ICompiledModel {
     using GetPropertiesMap =
-        std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>;
+        std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>;
+
 public:
     struct KVCacheDesc {
-        uint32_t max_prompt_size = 1024u;
-        uint32_t total_size = 1152u;
+        uint32_t max_prompt_size = 0u;
+        uint32_t total_size = 0u;
         uint32_t num_stored_tokens = 0u;
-        uint32_t dim = 2u;
+        uint32_t dim = 0u;
     };
 
     LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
                      const std::shared_ptr<const ov::IPlugin>& plugin,
                      const ov::AnyMap& properties);
+    LLMCompiledModel() = delete;
     void export_model(std::ostream& model) const override;
     std::shared_ptr<ov::Model> get_runtime_model() const override;
 
     void set_property(const ov::AnyMap& properties) override;
     ov::Any get_property(const std::string& name) const override;
 
-    std::shared_ptr<ov::IAsyncInferRequest> create_infer_request() const override;
-
-    std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc;
-    ::intel_npu::Config m_cfg;
-    GetPropertiesMap m_prop_to_opt;
-
 private:
     friend class LLMInferRequest;
 
@@ -46,10 +41,14 @@ class LLMCompiledModel : public ov::ICompiledModel {
     std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
     void implement_properties();
 
+    std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc;
+    ::intel_npu::Config m_cfg;
+    GetPropertiesMap m_prop_to_opt;
+
     KVCacheDesc m_kvcache_desc;
     std::shared_ptr<ov::npuw::CompiledModel> m_kvcache_compiled;
     std::shared_ptr<ov::npuw::CompiledModel> m_prefill_compiled;
 };
 
-} // namespace npuw
-} // namespace ov
+}  // namespace npuw
+}  // namespace ov
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index 705a8e07bbe0c1..e399aaa011eca7 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -4,18 +4,21 @@
 
 #include "llm_infer_request.hpp"
 
+#include <regex>
+
 #include "llm_compiled_model.hpp"
 #include "openvino/runtime/iasync_infer_request.hpp"
 
-#include <regex>
-
 template <typename T>
 void fill_tensor(ov::SoPtr<ov::ITensor> tensor, T fill_val, size_t offset = 0u) {
     T* tensor_data = tensor->data<T>();
     std::fill(tensor_data + offset, tensor_data + tensor->get_size(), fill_val);
 }
 
-ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor, uint32_t dim, uint32_t start_pos, uint32_t end_pos) {
+ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor,
+                                         uint32_t dim,
+                                         uint32_t start_pos,
+                                         uint32_t end_pos) {
     ov::Shape start_shape(std::vector<size_t>(tensor->get_shape().size(), 0u));
     start_shape[dim] = start_pos;
     ov::Shape end_shape = tensor->get_shape();
@@ -54,12 +57,12 @@ void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
     LOG_VERB(__PRETTY_FUNCTION__);
     LOG_BLOCK();
 
-    auto prefill_compiled = m_prefill_request->get_compiled_model();
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0u);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0u);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0u);
     fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0u);
     m_kvcache_desc.num_stored_tokens = 0u;
+
     LOG_VERB("Done");
 }
 
@@ -71,13 +74,9 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
 
     prepare_for_new_conversation();
 
-    auto prefill_compiled = m_prefill_request->get_compiled_model();
-
     auto padded_input_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids"));
     const size_t offset = padded_input_ids->get_size() - input_ids->get_size();
-    std::copy_n(input_ids->data<int64_t>(),
-                input_ids->get_size(),
-                padded_input_ids->data<int64_t>() + offset);
+    std::copy_n(input_ids->data<int64_t>(), input_ids->get_size(), padded_input_ids->data<int64_t>() + offset);
 
     auto padded_attention_mask = m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask"));
     std::copy_n(attention_mask->data<int64_t>(),
@@ -85,9 +84,7 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
                 padded_attention_mask->data<int64_t>() + offset);
 
     auto padded_position_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids"));
-    std::copy_n(position_ids->data<int64_t>(),
-                position_ids->get_size(),
-                padded_position_ids->data<int64_t>() + offset);
+    std::copy_n(position_ids->data<int64_t>(), position_ids->get_size(), padded_position_ids->data<int64_t>() + offset);
 
     m_prefill_request->infer();
     m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
@@ -106,7 +103,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
 
     // NB: KV-cache is full, further generation is impossible
     if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) {
-        OPENVINO_THROW("KV-Cache is full");
+        OPENVINO_THROW("KV-Cache is full.");
     }
 
     if (m_need_copy_kvcache) {
@@ -119,21 +116,25 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
         auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
+
+        // FIXME: We don't need to fill whole tensor with 0s, but only tensor.size() - num_stored_tokens
+        // taking into account kvcache dimension.
         fill_tensor<ov::float16>(kvcache_in_tensor, 0);
 
-        auto prefill_out_slice = make_tensor_slice(
-            prefill_out_tensor, m_kvcache_desc.dim,
-            m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, m_kvcache_desc.max_prompt_size
-        );
+        auto prefill_out_slice =
+            make_tensor_slice(prefill_out_tensor,
+                              m_kvcache_desc.dim,
+                              m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens,
+                              m_kvcache_desc.max_prompt_size);
+
+        auto kvcache_in_slice =
+            make_tensor_slice(kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens);
 
-        auto kvcache_in_slice = make_tensor_slice(
-            kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens
-        );
         prefill_out_slice->copy_to(kvcache_in_slice._ptr);
     }
 
     LOG_VERB("Prepare attention mask pattern.");
-    auto* attention_mask_data = m_kvcache_request->get_tensor(
-        m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
+    auto* attention_mask_data =
+        m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
     attention_mask_data[m_kvcache_desc.total_size - 1] = 1;
 
     m_need_copy_kvcache = false;
@@ -160,9 +161,10 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         const auto& output_name = kvcache_compiled->outputs()[kStartOutputKVCacheLayers + i].get_any_name();
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
         auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
-        auto kvcache_in_slice = make_tensor_slice(
-            kvcache_in_tensor, m_kvcache_desc.dim, m_kvcache_desc.num_stored_tokens - 1, m_kvcache_desc.num_stored_tokens
-        );
+        auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor,
+                                                  m_kvcache_desc.dim,
+                                                  m_kvcache_desc.num_stored_tokens - 1,
+                                                  m_kvcache_desc.num_stored_tokens);
         auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
         kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
     }
@@ -172,9 +174,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
 void ov::npuw::LLMInferRequest::infer() {
     const auto& inputs = get_inputs();
 
-    auto input_ids      = get_tensor(inputs[0]);
+    auto input_ids = get_tensor(inputs[0]);
     auto attention_mask = get_tensor(inputs[1]);
-    auto position_ids   = get_tensor(inputs[2]);
+    auto position_ids = get_tensor(inputs[2]);
+
+    OPENVINO_ASSERT(ov::element::i64 == input_ids->get_element_type());
+    OPENVINO_ASSERT(ov::element::i64 == attention_mask->get_element_type());
+    OPENVINO_ASSERT(ov::element::i64 == position_ids->get_element_type());
 
     if (input_ids->get_size() != 1) {
         infer_prefill(input_ids, attention_mask, position_ids);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
index bef90aab4a4d69..3703d41f0c2950 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp
@@ -6,9 +6,9 @@
 
 #include <memory>
 
-#include "openvino/runtime/isync_infer_request.hpp"
-#include "openvino/core/descriptor/output.hpp"
 #include "llm_compiled_model.hpp"
"openvino/core/descriptor/output.hpp" +#include "openvino/runtime/isync_infer_request.hpp" namespace ov { namespace npuw { @@ -24,8 +24,12 @@ class LLMInferRequest final : public ov::ISyncInferRequest { void check_tensors() const override{}; - virtual std::vector get_profiling_info() const { return {}; } - virtual std::vector> query_state() const { return {}; } + virtual std::vector get_profiling_info() const { + return {}; + } + virtual std::vector> query_state() const { + return {}; + } private: void prepare_for_new_conversation(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/logging.hpp b/src/plugins/intel_npu/src/plugin/npuw/logging.hpp index ca0ece85f7f2bb..95c9a742db7842 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/logging.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/logging.hpp @@ -64,5 +64,5 @@ void dump_failure(const std::shared_ptr& model, const std::string& de } while (0) #ifdef _MSC_VER - #define __PRETTY_FUNCTION__ __FUNCSIG__ +# define __PRETTY_FUNCTION__ __FUNCSIG__ #endif diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index ac9ac64f6e5046..28ac490b04ce1e 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -7,6 +7,7 @@ #include #include "compiled_model.hpp" +#include "npuw/compiled_model.hpp" #include "driver_compiler_adapter.hpp" #include "intel_npu/common/device_helpers.hpp" #include "intel_npu/common/igraph.hpp" @@ -16,7 +17,6 @@ #include "intel_npu/config/npuw.hpp" #include "intel_npu/config/runtime.hpp" #include "intel_npu/utils/zero/zero_init.hpp" -#include "npuw/compiled_model_factory.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/parameter.hpp" #include "openvino/runtime/intel_npu/properties.hpp" @@ -631,7 +631,7 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< if (localProperties.count(ov::cache_dir.name()) || !_globalConfig.get().empty()) { OPENVINO_THROW("Option 'CACHE_DIR' is not supported with NPU_USE_NPUW!"); } - return ov::npuw::CompiledModelFactory::create(model->clone(), shared_from_this(), localProperties); + return ov::npuw::ICompiledModel::create(model->clone(), shared_from_this(), localProperties); } else { // NPUW is disabled, remove the key from the properties localProperties.erase(useNpuwKey);