Skip to content

Commit

Permalink
Another round of review
Browse files Browse the repository at this point in the history
  • Loading branch information
AsyaPronina committed Nov 26, 2024
1 parent 978cdd7 commit 96cfc44
Show file tree
Hide file tree
Showing 14 changed files with 209 additions and 274 deletions.
18 changes: 0 additions & 18 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,24 +172,6 @@ struct OptionPrinter final {
}
};

template <typename T>
struct OptionPrinter<std::vector<T>> final {
    /// Renders a vector option value as a comma-separated string,
    /// delegating per-element formatting to OptionPrinter<T>.
    /// @param val  vector of option values (may be empty — yields "").
    /// @return the joined string, e.g. "a,b,c".
    static std::string toString(const std::vector<T>& val) {
        std::stringstream ss;
        const std::size_t size = val.size();
        std::size_t counter = 0;
        // NOTE: fixed — original code referenced OptionPrinter<V>, but the
        // template parameter here is T; V is undeclared in this scope.
        for (const auto& el : val) {  // const& avoids copying each element
            ss << OptionPrinter<T>::toString(el);
            // Append the separator after every element except the last.
            if (counter < size - 1) {
                ss << ",";
            }
            ++counter;
        }
        return ss.str();
    }
};

template <typename K, typename V>
struct OptionPrinter<std::map<K, V>> final {
static std::string toString(const std::map<K, V>& val) {
Expand Down
74 changes: 35 additions & 39 deletions src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,47 +66,44 @@ DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, CompileTime);
DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, CompileTime);
DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime);
DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::dynamic_llm::enabled, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::dynamic_llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::dynamic_llm::min_response_len, CompileTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);

namespace npuw {
namespace dynamic_llm {
namespace llm {
// Descriptor of the LLM model passed to NPUW via the NPUW_LLM_MODEL_DESC
// option. Populated by NPUW_LLM_MODEL_DESC::parse() from a string map with
// keys "type", "name_or_path" and "num_key_value_heads".
struct ModelDesc {
    std::string type;          // value of the "type" key in the option map
    std::string name_or_path;  // value of the "name_or_path" key
    int num_key_value_heads;   // parsed with std::stoi from "num_key_value_heads"
};
enum class GenerateHint {
FAST_COMPILE,
BEST_PERF
};
} // namespace dynamic_llm
} // namespace npuw
enum class GenerateHint { FAST_COMPILE, BEST_PERF };
} // namespace llm
} // namespace npuw

struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::dynamic_llm::ModelDesc> {
struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::llm::ModelDesc> {
static std::string_view key() {
return ov::intel_npu::npuw::dynamic_llm::model_desc.name();
return ov::intel_npu::npuw::llm::model_desc.name();
}

static constexpr std::string_view getTypeName() {
return "::intel_npu::npuw::dynamic_llm::ModelDesc";
return "::intel_npu::npuw::llm::ModelDesc";
}

static ::intel_npu::npuw::dynamic_llm::ModelDesc defaultValue() {
static ::intel_npu::npuw::llm::ModelDesc defaultValue() {
return {};
}

static ::intel_npu::npuw::dynamic_llm::ModelDesc parse(std::string_view val) {
::intel_npu::npuw::dynamic_llm::ModelDesc res;
static ::intel_npu::npuw::llm::ModelDesc parse(std::string_view val) {
::intel_npu::npuw::llm::ModelDesc res;
std::map<std::string, std::string> res_map = OptionParser<std::map<std::string, std::string>>::parse(val);
res.type = res_map["type"];
res.name_or_path = res_map["name_or_path"];
res.num_key_value_heads = std::stoi(res_map["num_key_value_heads"]);
return res;
}

static std::string toString(const ::intel_npu::npuw::dynamic_llm::ModelDesc& val) {
static std::string toString(const ::intel_npu::npuw::llm::ModelDesc& val) {
std::string res;
std::map<std::string, std::string> res_map;
res_map["type"] = val.type;
Expand All @@ -124,46 +121,45 @@ struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::
}
};

struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::dynamic_llm::GenerateHint> {
struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> {
static std::string_view key() {
return ov::intel_npu::npuw::dynamic_llm::generate_hint.name();
return ov::intel_npu::npuw::llm::generate_hint.name();
}

static constexpr std::string_view getTypeName() {
return "::intel_npu::npuw::dynamic_llm::GenerateHint";
return "::intel_npu::npuw::llm::GenerateHint";
}

static ::intel_npu::npuw::dynamic_llm::GenerateHint defaultValue() {
return ::intel_npu::npuw::dynamic_llm::GenerateHint::FAST_COMPILE;
static ::intel_npu::npuw::llm::GenerateHint defaultValue() {
return ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE;
}

static ::intel_npu::npuw::dynamic_llm::GenerateHint parse(std::string_view val) {
::intel_npu::npuw::dynamic_llm::GenerateHint res;
static ::intel_npu::npuw::llm::GenerateHint parse(std::string_view val) {
::intel_npu::npuw::llm::GenerateHint res;

if (val == "FAST_COMPILE") {
res = ::intel_npu::npuw::dynamic_llm::GenerateHint::FAST_COMPILE;
}
else if (val == "BEST_PERF") {
res = ::intel_npu::npuw::dynamic_llm::GenerateHint::BEST_PERF;
res = ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE;
} else if (val == "BEST_PERF") {
res = ::intel_npu::npuw::llm::GenerateHint::BEST_PERF;
} else {
OPENVINO_THROW("Unsupported \"GENERATE_HINT\" provided: ",
val, ". Please select either \"FAST_COMPILE\" or \"BEST_PERF\".");
val,
". Please select either \"FAST_COMPILE\" or \"BEST_PERF\".");
}
return res;
}

static std::string toString(const ::intel_npu::npuw::dynamic_llm::GenerateHint& val) {
static std::string toString(const ::intel_npu::npuw::llm::GenerateHint& val) {
std::string res;
switch (val) {
case ::intel_npu::npuw::dynamic_llm::GenerateHint::FAST_COMPILE:
res = "FAST_COMPILE";
break;
case ::intel_npu::npuw::dynamic_llm::GenerateHint::BEST_PERF:
res = "BEST_PERF";
break;
default:
OPENVINO_THROW("Can't convert provided \"GENERATE_HINT\" : ",
int(val), " to string.");
case ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE:
res = "FAST_COMPILE";
break;
case ::intel_npu::npuw::llm::GenerateHint::BEST_PERF:
res = "BEST_PERF";
break;
default:
OPENVINO_THROW("Can't convert provided \"GENERATE_HINT\" : ", int(val), " to string.");
}
return res;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -370,38 +370,38 @@ static constexpr ov::Property<std::string> inputs_outputs{"NPUW_DUMP_IO"};
static constexpr ov::Property<std::string> io_iters{"NPUW_DUMP_IO_ITERS"};
} // namespace dump

namespace dynamic_llm {
namespace llm {
/**
* @brief
* Type: bool.
* Tell NPUW that you want to pass dynamic stateful LLM model
* Default value: false.
*/
static constexpr ov::Property<bool> enabled {"NPUW_LLM"};
static constexpr ov::Property<bool> enabled{"NPUW_LLM"};

/**
* @brief
* Type: std::map<std::string, std::string>.
* Tell NPUW about your LLM model.
* Default value: empty map.
*/
static constexpr ov::Property<std::map<std::string, std::string>> model_desc {"NPUW_LLM_MODEL_DESC"};
static constexpr ov::Property<std::map<std::string, std::string>> model_desc{"NPUW_LLM_MODEL_DESC"};

/**
* @brief
* Type: uint32_t.
* Tell NPUW your desirable max prompt length.
* Default value: 1024.
*/
static constexpr ov::Property<uint32_t> max_prompt_len {"NPUW_LLM_MAX_PROMPT_LEN"};
static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"};

/**
* @brief
* Type: uint32_t.
* Tell NPUW your desirable min response length.
* Default value: 128.
*/
static constexpr ov::Property<uint32_t> min_response_len {"NPUW_LLM_MIN_RESPONSE_LEN"};
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};

/**
* @brief
Expand All @@ -410,9 +410,9 @@ static constexpr ov::Property<uint32_t> min_response_len {"NPUW_LLM_MIN_RESPONSE
* Possible values: "FAST_COMPILE", "BEST_PERF".
* Default value: "FAST_COMPILE".
*/
static constexpr ov::Property<std::string> generate_hint {"NPUW_LLM_GENERATE_HINT"};
static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};

} // namespace llm_dynamic
} // namespace llm

} // namespace npuw
} // namespace intel_npu
Expand Down
5 changes: 0 additions & 5 deletions src/plugins/intel_npu/src/al/src/config/npuw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,6 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
desc.add<NPUW_DUMP_IO>();
desc.add<NPUW_DUMP_IO_ITERS>();
#endif
desc.add<NPUW_LLM>();
desc.add<NPUW_LLM_MODEL_DESC>();
desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
desc.add<NPUW_LLM_GENERATE_HINT>();
}

void intel_npu::registerNpuwLlmOptions(OptionsDesc& desc) {
Expand Down
113 changes: 66 additions & 47 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,14 @@
#include "intel_npu/config/config.hpp"
#include "intel_npu/config/npuw.hpp"
#include "intel_npu/npuw_private_properties.hpp"
#include "llm_compiled_model.hpp"
#include "openvino/runtime/device_id_parser.hpp"
#include "openvino/runtime/internal_properties.hpp"
#include "openvino/runtime/properties.hpp"
#include "transformations/convert_precision.hpp"

#include "llm_compiled_model.hpp"

namespace {
void split_properties(const ov::AnyMap& properties,
ov::AnyMap& npu_plugin_properties,
Expand Down Expand Up @@ -85,10 +88,33 @@ ov::npuw::DeviceProperties get_properties_per_device(const std::shared_ptr<const
} // namespace npuw
} // namespace ov

// Factory for NPUW compiled models. Dispatches on the NPUW_LLM property:
// when it is present and evaluates to true, an LLMCompiledModel is built;
// otherwise the regular NPUW CompiledModel is created.
std::shared_ptr<ov::npuw::ICompiledModel> ov::npuw::ICompiledModel::create(
    const std::shared_ptr<ov::Model>& model,
    const std::shared_ptr<const ov::IPlugin>& plugin,
    const ov::AnyMap& properties) {
    LOG_VERB(__PRETTY_FUNCTION__);
    LOG_BLOCK();
    std::shared_ptr<ov::npuw::ICompiledModel> compiled_model;
    // Name of the ov::intel_npu::npuw::llm::enabled property ("NPUW_LLM").
    auto use_llm_key = ov::intel_npu::npuw::llm::enabled.name();
    if (properties.count(use_llm_key) && properties.at(use_llm_key).as<bool>() == true) {
        LOG_DEBUG("ov::npuw::LLMCompiledModel will be created.");
        compiled_model = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, properties);
    } else {
        LOG_DEBUG("ov::npuw::CompiledModel will be created.");
        compiled_model = std::make_shared<ov::npuw::CompiledModel>(model, plugin, properties);
    }
    LOG_DEBUG("Done");
    return compiled_model;
}

// Thin base-class constructor: simply forwards model and plugin to
// ov::ICompiledModel. Concrete behavior lives in the derived classes.
ov::npuw::ICompiledModel::ICompiledModel(const std::shared_ptr<ov::Model>& model,
                                         const std::shared_ptr<const ov::IPlugin>& plugin)
    : ov::ICompiledModel(model, plugin) {}

ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const ov::AnyMap& properties)
: ov::ICompiledModel(model, plugin),
: ov::npuw::ICompiledModel(model, plugin),
m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
m_cfg(m_options_desc),
m_name(model->get_friendly_name()),
Expand Down Expand Up @@ -875,8 +901,6 @@ void ov::npuw::CompiledModel::implement_properties() {
// request. So the vector will define public properties.
// 3. Create mappings for all remaining (private) NPUW-specific properties
// to getters of their values from config, related to ov::npuw::CompiledModel.
// 4. Fill default values for (private) NPUW-specific, dynamic stateful
// model-specific properties.

#define GET_PLUGIN_PROP(property) return get_plugin()->get_property(property.name(), ov::AnyMap());

Expand Down Expand Up @@ -963,52 +987,47 @@ void ov::npuw::CompiledModel::implement_properties() {
} \
}

m_prop_to_opt.insert({BIND(use_npuw, NPU_USE_NPUW),
BIND(npuw::devices, NPUW_DEVICES),
BIND(npuw::submodel_device, NPUW_SUBMODEL_DEVICE),
BIND(npuw::partitioning::online::pipeline, NPUW_ONLINE_PIPELINE),
BIND(npuw::partitioning::online::min_size, NPUW_ONLINE_MIN_SIZE),
BIND(npuw::partitioning::online::keep_blocks, NPUW_ONLINE_KEEP_BLOCKS),
BIND(npuw::partitioning::online::keep_block_size, NPUW_ONLINE_KEEP_BLOCK_SIZE),
BIND(npuw::partitioning::online::avoid, NPUW_ONLINE_AVOID),
BIND(npuw::partitioning::online::isolate, NPUW_ONLINE_ISOLATE),
BIND(npuw::partitioning::online::nofold, NPUW_ONLINE_NO_FOLD),
BIND(npuw::partitioning::online::dump_plan, NPUW_ONLINE_DUMP_PLAN),
BIND(npuw::partitioning::plan, NPUW_PLAN),
BIND(npuw::partitioning::fold, NPUW_FOLD),
BIND(npuw::partitioning::cwai, NPUW_CWAI),
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT),
BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
BIND(npuw::cache_dir, NPUW_CACHE_DIR),
BIND(npuw::accuracy::check, NPUW_ACC_CHECK),
BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH),
BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE),
m_prop_to_opt.insert({
BIND(use_npuw, NPU_USE_NPUW),
BIND(npuw::devices, NPUW_DEVICES),
BIND(npuw::submodel_device, NPUW_SUBMODEL_DEVICE),
BIND(npuw::partitioning::online::pipeline, NPUW_ONLINE_PIPELINE),
BIND(npuw::partitioning::online::min_size, NPUW_ONLINE_MIN_SIZE),
BIND(npuw::partitioning::online::keep_blocks, NPUW_ONLINE_KEEP_BLOCKS),
BIND(npuw::partitioning::online::keep_block_size, NPUW_ONLINE_KEEP_BLOCK_SIZE),
BIND(npuw::partitioning::online::avoid, NPUW_ONLINE_AVOID),
BIND(npuw::partitioning::online::isolate, NPUW_ONLINE_ISOLATE),
BIND(npuw::partitioning::online::nofold, NPUW_ONLINE_NO_FOLD),
BIND(npuw::partitioning::online::dump_plan, NPUW_ONLINE_DUMP_PLAN),
BIND(npuw::partitioning::plan, NPUW_PLAN),
BIND(npuw::partitioning::fold, NPUW_FOLD),
BIND(npuw::partitioning::cwai, NPUW_CWAI),
BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
BIND(npuw::partitioning::par_matmul_merge_dims, NPUW_PMM),
BIND(npuw::partitioning::slice_out, NPUW_SLICE_OUT),
BIND(npuw::partitioning::spatial, NPUW_SPATIAL),
BIND(npuw::partitioning::spatial_nway, NPUW_SPATIAL_NWAY),
BIND(npuw::partitioning::spatial_dyn, NPUW_SPATIAL_DYN),
BIND(npuw::partitioning::host_gather, NPUW_HOST_GATHER),
BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
BIND(npuw::parallel_compilation, NPUW_PARALLEL_COMPILE),
BIND(npuw::funcall_async, NPUW_FUNCALL_ASYNC),
BIND(npuw::unfold_ireqs, NPUW_UNFOLD_IREQS),
BIND(npuw::weights_bank, NPUW_WEIGHTS_BANK),
BIND(npuw::weights_bank_alloc, NPUW_WEIGHTS_BANK_ALLOC),
BIND(npuw::cache_dir, NPUW_CACHE_DIR),
BIND(npuw::accuracy::check, NPUW_ACC_CHECK),
BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH),
BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE),
#ifdef NPU_PLUGIN_DEVELOPER_BUILD
BIND(npuw::dump::full, NPUW_DUMP_FULL),
BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS),
BIND(npuw::dump::subgraphs_on_fail, NPUW_DUMP_SUBS_ON_FAIL),
BIND(npuw::dump::inputs_outputs, NPUW_DUMP_IO),
BIND(npuw::dump::io_iters, NPUW_DUMP_IO_ITERS),
BIND(npuw::dump::full, NPUW_DUMP_FULL),
BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS),
BIND(npuw::dump::subgraphs_on_fail, NPUW_DUMP_SUBS_ON_FAIL),
BIND(npuw::dump::inputs_outputs, NPUW_DUMP_IO),
BIND(npuw::dump::io_iters, NPUW_DUMP_IO_ITERS),
#endif
// 4.
BIND(npuw::dynamic_llm::enabled, NPUW_LLM),
BIND(npuw::dynamic_llm::model_desc, NPUW_LLM_MODEL_DESC),
BIND(npuw::dynamic_llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN),
BIND(npuw::dynamic_llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN),
BIND(npuw::dynamic_llm::generate_hint, NPUW_LLM_GENERATE_HINT)
});
#undef BIND
}
9 changes: 8 additions & 1 deletion src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,16 @@ class Plugin;

namespace ov {
namespace npuw {
// Common base class for NPUW compiled models. The static create() factory
// selects the concrete implementation (LLM-specific or regular NPUW
// CompiledModel) based on the NPUW_LLM property in `properties`.
class ICompiledModel : public ov::ICompiledModel {
public:
    // Builds the appropriate compiled-model subtype for the given model,
    // plugin and property set.
    static std::shared_ptr<ov::npuw::ICompiledModel> create(const std::shared_ptr<ov::Model>& model,
                                                            const std::shared_ptr<const ov::IPlugin>& plugin,
                                                            const ov::AnyMap& properties);
    ICompiledModel(const std::shared_ptr<ov::Model>& model, const std::shared_ptr<const ov::IPlugin>& plugin);
};

class InferRequest;
class CompiledModel : public ov::ICompiledModel {
class CompiledModel : public ov::npuw::ICompiledModel {
using DevList = std::vector<std::string>;
using GetPropertiesMap =
std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>;
Expand Down
Loading

0 comments on commit 96cfc44

Please sign in to comment.