diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp
index adcaeaaaa31a6f..65826fb96a58e1 100644
--- a/src/plugins/intel_cpu/src/config.cpp
+++ b/src/plugins/intel_cpu/src/config.cpp
@@ -453,5 +453,15 @@ void Config::updateProperties() {
     _config.insert({ov::hint::num_requests.name(), std::to_string(hintNumRequests)});
 }
 
+void Config::applyRtInfo(const std::shared_ptr<const ov::Model>& model) {
+    if (model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) {
+        this->kvCachePrecision = model->get_rt_info<ov::element::Type>({"runtime_options", ov::hint::kv_cache_precision.name()});
+    }
+    if (model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) {
+        this->fcDynamicQuantizationGroupSize =
+            model->get_rt_info<uint64_t>({"runtime_options", ov::hint::dynamic_quantization_group_size.name()});
+    }
+}
+
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h
index 79cdf3a5e827ec..1f402eb306c435 100644
--- a/src/plugins/intel_cpu/src/config.h
+++ b/src/plugins/intel_cpu/src/config.h
@@ -104,6 +104,8 @@ struct Config {
 
     void updateProperties();
 
+    void applyRtInfo(const std::shared_ptr<const ov::Model>& model);
+
     std::map<std::string, std::string> _config;
 
     int modelPreferThreads = -1;
diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp
index 5c88772eeedabc..07916380268c93 100644
--- a/src/plugins/intel_cpu/src/plugin.cpp
+++ b/src/plugins/intel_cpu/src/plugin.cpp
@@ -246,6 +246,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
     // update the props after the perf mode translated to configs
     // TODO: Clarify the behavior of SetConfig method. Skip eng_config or not?
     Config conf = engConfig;
+    conf.applyRtInfo(cloned_model);
     conf.readProperties(config, modelType);
 
     Transformations transformations(cloned_model, conf);
@@ -519,6 +520,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr<const ov::Model>&
     Config conf = engConfig;
     Config::ModelType modelType = getModelType(model);
+    conf.applyRtInfo(model);
     conf.readProperties(config, modelType);
 
     auto context = std::make_shared<GraphContext>(conf, fake_w_cache, false);
@@ -574,7 +576,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::import_model(std::istream& model_str
     Config conf = engConfig;
     Config::ModelType modelType = getModelType(model);
-
+    conf.applyRtInfo(model);
     // check ov::loaded_from_cache property and erase it to avoid exception in readProperties.
     auto _config = config;
     const auto& it = _config.find(ov::loaded_from_cache.name());
diff --git a/src/plugins/intel_cpu/src/plugin.h b/src/plugins/intel_cpu/src/plugin.h
index 2548ba2c1cc8af..8973478d30403f 100644
--- a/src/plugins/intel_cpu/src/plugin.h
+++ b/src/plugins/intel_cpu/src/plugin.h
@@ -50,7 +50,6 @@ class Plugin : public ov::IPlugin {
     void get_performance_streams(Config& config, const std::shared_ptr<ov::Model>& model) const;
     void calculate_streams(Config& conf, const std::shared_ptr<ov::Model>& model, bool imported = false) const;
 
-    Config engConfig;
     /* Explicily configured streams have higher priority than performance hints.
        So track if streams is set explicitly (not auto-configured) */
diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
index 365e7c56dcef82..177daa69836d18 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp
@@ -316,4 +316,35 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPUExecutionDevice) {
     ASSERT_EQ(value.as<std::string>(), "CPU");
 }
 
+TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimeOptions) {
+    ov::Core ie;
+    ov::Any type;
+    ov::Any size;
+    ov::CompiledModel compiledModel;
+    model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
+    model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
+    OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName));
+    OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
+    OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
+    ASSERT_EQ(type.as<ov::element::Type>(), ov::element::f16);
+    ASSERT_EQ(size.as<uint64_t>(), 0);
+}
+
+TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimeOptionsWithCompileConfig) {
+    ov::Core ie;
+    ov::Any type;
+    ov::Any size;
+    ov::CompiledModel compiledModel;
+    model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name());
+    model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name());
+    ov::AnyMap config;
+    config[ov::hint::kv_cache_precision.name()] = "u8";
+    config[ov::hint::dynamic_quantization_group_size.name()] = "16";
+    OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName, config));
+    OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision));
+    OV_ASSERT_NO_THROW(size =
+                           compiledModel.get_property(ov::hint::dynamic_quantization_group_size));
+    ASSERT_EQ(type.as<ov::element::Type>(), ov::element::u8);
+    ASSERT_EQ(size.as<uint64_t>(), 16);
+}
+
 }  // namespace