Skip to content

Commit

Permalink
Another round of review
Browse files Browse the repository at this point in the history
  • Loading branch information
AsyaPronina committed Nov 26, 2024
1 parent 978cdd7 commit 96cfc44
Show file tree
Hide file tree
Showing 14 changed files with 209 additions and 274 deletions.
18 changes: 0 additions & 18 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,24 +172,6 @@ struct OptionPrinter final {
}
};

template <typename T>
struct OptionPrinter<std::vector<T>> final {
    /// Renders a vector option value as a comma-separated string,
    /// delegating per-element formatting to OptionPrinter<T>.
    /// @param val  vector of option values (may be empty — yields "").
    /// @return the joined string, e.g. "a,b,c".
    static std::string toString(const std::vector<T>& val) {
        std::stringstream ss;
        const std::size_t size = val.size();
        std::size_t counter = 0;
        // NOTE: fixed — original code referenced OptionPrinter<V>, but the
        // template parameter here is T; V is undeclared in this scope.
        for (const auto& el : val) {  // const& avoids copying each element
            ss << OptionPrinter<T>::toString(el);
            // Append the separator after every element except the last.
            if (counter < size - 1) {
                ss << ",";
            }
            ++counter;
        }
        return ss.str();
    }
};

template <typename K, typename V>
struct OptionPrinter<std::map<K, V>> final {
static std::string toString(const std::map<K, V>& val) {
Expand Down
74 changes: 35 additions & 39 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,47 +66,44 @@ DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, CompileTime);
DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, CompileTime);
DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime);
DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::dynamic_llm::enabled, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::dynamic_llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::dynamic_llm::min_response_len, CompileTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);

namespace npuw {
namespace dynamic_llm {
namespace llm {
// Descriptor of the LLM model passed to NPUW via the NPUW_LLM_MODEL_DESC
// option. Populated by NPUW_LLM_MODEL_DESC::parse() from a string map with
// keys "type", "name_or_path" and "num_key_value_heads".
struct ModelDesc {
    std::string type;          // value of the "type" key in the option map
    std::string name_or_path;  // value of the "name_or_path" key
    int num_key_value_heads;   // parsed with std::stoi from "num_key_value_heads"
};
enum class GenerateHint {
FAST_COMPILE,
BEST_PERF
};
} // namespace dynamic_llm
} // namespace npuw
enum class GenerateHint { FAST_COMPILE, BEST_PERF };
} // namespace llm
} // namespace npuw

struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::dynamic_llm::ModelDesc> {
struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::llm::ModelDesc> {
static std::string_view key() {
return ov::intel_npu::npuw::dynamic_llm::model_desc.name();
return ov::intel_npu::npuw::llm::model_desc.name();
}

static constexpr std::string_view getTypeName() {
return "::intel_npu::npuw::dynamic_llm::ModelDesc";
return "::intel_npu::npuw::llm::ModelDesc";
}

static ::intel_npu::npuw::dynamic_llm::ModelDesc defaultValue() {
static ::intel_npu::npuw::llm::ModelDesc defaultValue() {
return {};
}

static ::intel_npu::npuw::dynamic_llm::ModelDesc parse(std::string_view val) {
::intel_npu::npuw::dynamic_llm::ModelDesc res;
static ::intel_npu::npuw::llm::ModelDesc parse(std::string_view val) {
::intel_npu::npuw::llm::ModelDesc res;
std::map<std::string, std::string> res_map = OptionParser<std::map<std::string, std::string>>::parse(val);
res.type = res_map["type"];
res.name_or_path = res_map["name_or_path"];
res.num_key_value_heads = std::stoi(res_map["num_key_value_heads"]);
return res;
}

static std::string toString(const ::intel_npu::npuw::dynamic_llm::ModelDesc& val) {
static std::string toString(const ::intel_npu::npuw::llm::ModelDesc& val) {
std::string res;
std::map<std::string, std::string> res_map;
res_map["type"] = val.type;
Expand All @@ -124,46 +121,45 @@ struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::
}
};

struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::dynamic_llm::GenerateHint> {
struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
static std::string_view key() {
return ov::intel_npu::npuw::dynamic_llm::generate_hint.name();
return ov::intel_npu::npuw::llm::generate_hint.name();
}

static constexpr std::string_view getTypeName() {
return "::intel_npu::npuw::dynamic_llm::GenerateHint";
return "::intel_npu::npuw::llm::GenerateHint";
}

static ::intel_npu::npuw::dynamic_llm::GenerateHint defaultValue() {
return ::intel_npu::npuw::dynamic_llm::GenerateHint::FAST_COMPILE;
static ::intel_npu::npuw::llm::GenerateHint defaultValue() {
return ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE;
}

static ::intel_npu::npuw::dynamic_llm::GenerateHint parse(std::string_view val) {
::intel_npu::npuw::dynamic_llm::GenerateHint res;
static ::intel_npu::npuw::llm::GenerateHint parse(std::string_view val) {
::intel_npu::npuw::llm::GenerateHint res;

if (val == "FAST_COMPILE") {
res = ::intel_npu::npuw::dynamic_llm::GenerateHint::FAST_COMPILE;
}
else if (val == "BEST_PERF") {
res = ::intel_npu::npuw::dynamic_llm::GenerateHint::BEST_PERF;
res = ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE;
} else if (val == "BEST_PERF") {
res = ::intel_npu::npuw::llm::GenerateHint::BEST_PERF;
} else {
OPENVINO_THROW("Unsupported \"GENERATE_HINT\" provided: ",
val, ". Please select either \"FAST_COMPILE\" or \"BEST_PERF\".");
val,
". Please select either \"FAST_COMPILE\" or \"BEST_PERF\".");
}
return res;
}

static std::string toString(const ::intel_npu::npuw::dynamic_llm::GenerateHint& val) {
static std::string toString(const ::intel_npu::npuw::llm::GenerateHint& val) {
std::string res;
switch (val) {
case ::intel_npu::npuw::dynamic_llm::GenerateHint::FAST_COMPILE:
res = "FAST_COMPILE";
break;
case ::intel_npu::npuw::dynamic_llm::GenerateHint::BEST_PERF:
res = "BEST_PERF";
break;
default:
OPENVINO_THROW("Can't convert provided \"GENERATE_HINT\" : ",
int(val), " to string.");
case ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE:
res = "FAST_COMPILE";
break;
case ::intel_npu::npuw::llm::GenerateHint::BEST_PERF:
res = "BEST_PERF";
break;
default:
OPENVINO_THROW("Can't convert provided \"GENERATE_HINT\" : ", int(val), " to string.");
}
return res;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -370,38 +370,38 @@ static constexpr ov::Property<std::string> inputs_outputs{"NPUW_DUMP_IO"};
static constexpr ov::Property<std::string> io_iters{"NPUW_DUMP_IO_ITERS"};
} // namespace dump

namespace dynamic_llm {
namespace llm {
/**
* @brief
* Type: bool.
* Tell NPUW that you want to pass dynamic stateful LLM model
* Default value: false.
*/
static constexpr ov::Property<bool> enabled {"NPUW_LLM"};
static constexpr ov::Property<bool> enabled{"NPUW_LLM"};

/**
* @brief
* Type: std::map<std::string, std::string>.
* Tell NPUW about your LLM model.
* Default value: empty map.
*/
static constexpr ov::Property<std::map<std::string, std::string>> model_desc {"NPUW_LLM_MODEL_DESC"};
static constexpr ov::Property<std::map<std::string, std::string>> model_desc{"NPUW_LLM_MODEL_DESC"};

/**
* @brief
* Type: uint32_t.
* Tell NPUW your desirable max prompt length.
* Default value: 1024.
*/
static constexpr ov::Property<uint32_t> max_prompt_len {"NPUW_LLM_MAX_PROMPT_LEN"};
static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"};

/**
* @brief
* Type: uint32_t.
* Tell NPUW your desirable min response length.
* Default value: 128.
*/
static constexpr ov::Property<uint32_t> min_response_len {"NPUW_LLM_MIN_RESPONSE_LEN"};
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};

/**
* @brief
Expand All @@ -410,9 +410,9 @@ static constexpr ov::Property<uint32_t> min_response_len {"NPUW_LLM_MIN_RESPONSE
* Possible values: "FAST_COMPILE", "BEST_PERF".
* Default value: "FAST_COMPILE".
*/
static constexpr ov::Property<std::string> generate_hint {"NPUW_LLM_GENERATE_HINT"};
static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};

} // namespace llm_dynamic
} // namespace llm

} // namespace npuw
} // namespace intel_npu
Expand Down
5 changes: 0 additions & 5 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,6 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_DUMP_IO>();
desc.add<NPUW_DUMP_IO_ITERS>();
#endif
desc.add<NPUW_LLM>();
desc.add<NPUW_LLM_MODEL_DESC>();
desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
desc.add<NPUW_LLM_GENERATE_HINT>();
}

void intel_npu::registerNpuwLlmOptions(OptionsDesc& desc) {
Expand Down
113 changes: 66 additions & 47 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,14 @@
#include "intel_npu/config/config.hpp"
#include "intel_npu/config/npuw.hpp"
#include "intel_npu/npuw_private_properties.hpp"
#include "llm_compiled_model.hpp"
#include "openvino/runtime/device_id_parser.hpp"
#include "openvino/runtime/internal_properties.hpp"
#include "openvino/runtime/properties.hpp"
#include "transformations/convert_precision.hpp"

#include "llm_compiled_model.hpp"

namespace {
void split_properties(const ov::AnyMap& properties,
ov::AnyMap& npu_plugin_properties,
Expand Down Expand Up @@ -85,10 +88,33 @@ ov::npuw::DeviceProperties get_properties_per_device(const std::shared_ptr<const
} // namespace npuw
} // namespace ov

// Factory for NPUW compiled models. Dispatches on the NPUW_LLM property:
// when it is present and evaluates to true, an LLMCompiledModel is built;
// otherwise the regular NPUW CompiledModel is created.
std::shared_ptr<ov::npuw::ICompiledModel> ov::npuw::ICompiledModel::create(
    const std::shared_ptr<ov::Model>& model,
    const std::shared_ptr<const ov::IPlugin>& plugin,
    const ov::AnyMap& properties) {
    LOG_VERB(__PRETTY_FUNCTION__);
    LOG_BLOCK();
    std::shared_ptr<ov::npuw::ICompiledModel> compiled_model;
    // Name of the ov::intel_npu::npuw::llm::enabled property ("NPUW_LLM").
    auto use_llm_key = ov::intel_npu::npuw::llm::enabled.name();
    if (properties.count(use_llm_key) && properties.at(use_llm_key).as<bool>() == true) {
        LOG_DEBUG("ov::npuw::LLMCompiledModel will be created.");
        compiled_model = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, properties);
    } else {
        LOG_DEBUG("ov::npuw::CompiledModel will be created.");
        compiled_model = std::make_shared<ov::npuw::CompiledModel>(model, plugin, properties);
    }
    LOG_DEBUG("Done");
    return compiled_model;
}

// Thin base-class constructor: simply forwards model and plugin to
// ov::ICompiledModel. Concrete behavior lives in the derived classes.
ov::npuw::ICompiledModel::ICompiledModel(const std::shared_ptr<ov::Model>& model,
                                         const std::shared_ptr<const ov::IPlugin>& plugin)
    : ov::ICompiledModel(model, plugin) {}

ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties)
: ov::ICompiledModel(model, plugin),
: ov::npuw::ICompiledModel(model, plugin),
m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
m_cfg(m_options_desc),
m_name(model->get_friendly_name()),
Expand Down Expand Up @@ -875,8 +901,6 @@ void ov::npuw::CompiledModel::implement_properties() {
// request. So the vector will define public properties.
// 3. Create mappings for all remaining (private) NPUW-specific properties
// to getters of their values from config, related to ov::npuw::CompiledModel.
// 4. Fill default values for (private) NPUW-specific, dynamic stateful
// model-specific properties.

#define GET_PLUGIN_PROP(property) return get_plugin()->get_property(property.name(), ov::AnyMap());

Expand Down Expand Up @@ -963,52 +987,47 @@ void ov::npuw::CompiledModel::implement_properties() {
} \
}

m_prop_to_opt.insert({BIND(use_npuw, NPU_USE_NPUW),
BIND(npuw::devices, NPUW_DEVICES),
BIND(npuw::submodel_device, NPUW_SUBMODEL_DEVICE),
BIND(npuw::partitioning::online::pipeline, NPUW_ONLINE_PIPELINE),
BIND(npuw::partitioning::online::min_size, NPUW_ONLINE_MIN_SIZE),
BIND(npuw::partitioning::online::keep_blocks, NPUW_ONLINE_KEEP_BLOCKS),
BIND(npuw::partitioning::online::keep_block_size, NPUW_ONLINE_KEEP_BLOCK_SIZE),
BIND(npuw::partitioning::online::avoid, NPUW_ONLINE_AVOID),
BIND(npuw::partitioning::online::isolate, NPUW_ONLINE_ISOLATE),
BIND(npuw::partitioning::online::nofold, NPUW_ONLINE_NO_FOLD),
BIND(npuw::partitioning::online::dump_plan, NPUW_ONLINE_DUMP_PLAN),
BIND(npuw::partitioning::plan, NPUW_PLAN),
BIND(npuw::partitioning::fold, NPUW_FOLD),
BIND(npuw::partitioning::cwai, NPUW_CWAI),
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT),
BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
BIND(npuw::cache_dir, NPUW_CACHE_DIR),
BIND(npuw::accuracy::check, NPUW_ACC_CHECK),
BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH),
BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE),
m_prop_to_opt.insert({
BIND(use_npuw, NPU_USE_NPUW),
BIND(npuw::devices, NPUW_DEVICES),
BIND(npuw::submodel_device, NPUW_SUBMODEL_DEVICE),
BIND(npuw::partitioning::online::pipeline, NPUW_ONLINE_PIPELINE),
BIND(npuw::partitioning::online::min_size, NPUW_ONLINE_MIN_SIZE),
BIND(npuw::partitioning::online::keep_blocks, NPUW_ONLINE_KEEP_BLOCKS),
BIND(npuw::partitioning::online::keep_block_size, NPUW_ONLINE_KEEP_BLOCK_SIZE),
BIND(npuw::partitioning::online::avoid, NPUW_ONLINE_AVOID),
BIND(npuw::partitioning::online::isolate, NPUW_ONLINE_ISOLATE),
BIND(npuw::partitioning::online::nofold, NPUW_ONLINE_NO_FOLD),
BIND(npuw::partitioning::online::dump_plan, NPUW_ONLINE_DUMP_PLAN),
BIND(npuw::partitioning::plan, NPUW_PLAN),
BIND(npuw::partitioning::fold, NPUW_FOLD),
BIND(npuw::partitioning::cwai, NPUW_CWAI),
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT),
BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
BIND(npuw::cache_dir, NPUW_CACHE_DIR),
BIND(npuw::accuracy::check, NPUW_ACC_CHECK),
BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH),
BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE),
#ifdef NPU_PLUGIN_DEVELOPER_BUILD
BIND(npuw::dump::full, NPUW_DUMP_FULL),
BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS),
BIND(npuw::dump::subgraphs_on_fail, NPUW_DUMP_SUBS_ON_FAIL),
BIND(npuw::dump::inputs_outputs, NPUW_DUMP_IO),
BIND(npuw::dump::io_iters, NPUW_DUMP_IO_ITERS),
BIND(npuw::dump::full, NPUW_DUMP_FULL),
BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS),
BIND(npuw::dump::subgraphs_on_fail, NPUW_DUMP_SUBS_ON_FAIL),
BIND(npuw::dump::inputs_outputs, NPUW_DUMP_IO),
BIND(npuw::dump::io_iters, NPUW_DUMP_IO_ITERS),
#endif
// 4.
BIND(npuw::dynamic_llm::enabled, NPUW_LLM),
BIND(npuw::dynamic_llm::model_desc, NPUW_LLM_MODEL_DESC),
BIND(npuw::dynamic_llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN),
BIND(npuw::dynamic_llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN),
BIND(npuw::dynamic_llm::generate_hint, NPUW_LLM_GENERATE_HINT)
});
#undef BIND
}
9 changes: 8 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,16 @@ class Plugin;

namespace ov {
namespace npuw {
// Common base class for NPUW compiled models. The static create() factory
// selects the concrete implementation (LLM-specific or regular NPUW
// CompiledModel) based on the NPUW_LLM property in `properties`.
class ICompiledModel : public ov::ICompiledModel {
public:
    // Builds the appropriate compiled-model subtype for the given model,
    // plugin and property set.
    static std::shared_ptr<ov::npuw::ICompiledModel> create(const std::shared_ptr<ov::Model>& model,
                                                            const std::shared_ptr<const ov::IPlugin>& plugin,
                                                            const ov::AnyMap& properties);
    ICompiledModel(const std::shared_ptr<ov::Model>& model, const std::shared_ptr<const ov::IPlugin>& plugin);
};

class InferRequest;
class CompiledModel : public ov::ICompiledModel {
class CompiledModel : public ov::npuw::ICompiledModel {
using DevList = std::vector<std::string>;
using GetPropertiesMap =
std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>;
Expand Down
Loading

0 comments on commit 96cfc44

Please sign in to comment.