diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
index 42c1c3fb47aa42..172586831252a9 100644
--- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
+++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst
@@ -130,7 +130,7 @@ make sure to :doc:`install OpenVINO with GenAI <../../get-started/install-openvi
           image_write("baseline.bmp", image)
 
     For more information, refer to the
-    `Python sample `__
+    `Python sample `__
 
 .. tab-item:: C++
    :sync: cpp
diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp
index 8a2985a284769a..aa067da4f360fd 100644
--- a/src/common/transformations/src/transformations/convert_precision.cpp
+++ b/src/common/transformations/src/transformations/convert_precision.cpp
@@ -8,6 +8,7 @@
 #include
 
 #include "itt.hpp"
+#include "openvino/core/rt_info/weightless_caching_attributes.hpp"
 #include "openvino/op/ops.hpp"
 #include "openvino/pass/constant_folding.hpp"
 #include "openvino/pass/manager.hpp"
@@ -1405,6 +1406,13 @@ bool fuse_type_to_constant(const std::shared_ptr<ov::Node>& node,
         new_const->validate_and_infer_types();
         new_const->set_friendly_name(constant->get_friendly_name());
         ov::copy_runtime_info(constant, new_const);
+
+        const auto& rt_info = node->get_rt_info();
+        auto weightless_caching_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
+        if (weightless_caching_attr != rt_info.end()) {
+            new_const->get_rt_info()[ov::WeightlessCacheAttribute::get_type_info_static()] =
+                weightless_caching_attr->second;
+        }
         return true;
     }
     return false;
diff --git a/src/common/transformations/tests/utils/convert_precision.cpp b/src/common/transformations/tests/utils/convert_precision.cpp
index 318f15ab1a64dc..c2b7133506aebe 100644
--- a/src/common/transformations/tests/utils/convert_precision.cpp
+++ b/src/common/transformations/tests/utils/convert_precision.cpp
@@ -13,6 +13,7 @@
 #include
 
 #include "common_test_utils/ov_test_utils.hpp"
 #include "openvino/core/model.hpp"
+#include "openvino/core/rt_info/weightless_caching_attributes.hpp"
 #include "openvino/opsets/opset1.hpp"
 #include "openvino/opsets/opset10.hpp"
 #include "openvino/opsets/opset15.hpp"
@@ -2702,3 +2703,38 @@ TEST(TransformationTests, ConvertPrecision_assign_read_value_preserve_orig_types
     FunctionsComparator::Result result = func_comparator(model_ref, model);
     ASSERT_TRUE(result.valid) << result.message;
 }
+
+TEST(TransformationTests, ConvertPrecision_assign_read_value_preserve_weightless_cache_info_as_rt_attribute) {
+    pass::Manager manager;
+
+    auto some_value = opset10::Constant::create(element::f32, Shape{1}, {2});
+    auto& node_rt_info = some_value->get_rt_info();
+    ov::WeightlessCacheAttribute attr(element::f32.size(), 0, element::f32);
+    node_rt_info[ov::WeightlessCacheAttribute::get_type_info_static()] = attr;
+
+    ov::ParameterVector inputParams;
+    ov::ResultVector results;
+    results.push_back(std::make_shared<ov::op::v0::Result>(some_value->output(0)));
+    auto model = std::make_shared<ov::Model>(results, inputParams);
+
+    type_to_fuse_map empty_type_to_fuse_map = {};
+    bool keep_precision_sensitive_in_fp32 = false;
+    bool convert_input_output_precision = false;
+    bool store_original_precision_as_rt_attribute = true;
+    manager.register_pass<pass::ConvertPrecision>(precisions_map{{element::f32, element::f16}},
+                                                  empty_type_to_fuse_map,
+                                                  keep_precision_sensitive_in_fp32,
+                                                  convert_input_output_precision,
+                                                  store_original_precision_as_rt_attribute);
+    manager.run_passes(model);
+
+    const auto& ops = model->get_ops();
+    auto it = std::find_if(ops.begin(), ops.end(), [](const std::shared_ptr<ov::Node>& node) {
+        return ov::op::util::is_constant(node);
+    });
+
+    ASSERT_TRUE(it != ops.end());
+    const auto& new_rt_info = (*it)->get_rt_info();
+    auto weightless_caching_attr_it = new_rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
+    ASSERT_TRUE(weightless_caching_attr_it != new_rt_info.end());
+}
diff --git a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp
index fedcb030fb52cf..e3cf2609b26c8d 100644
--- a/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp
+++ b/src/core/dev_api/openvino/core/rt_info/weightless_caching_attributes.hpp
@@ -5,6 +5,7 @@
 #pragma once
 
 #include "openvino/core/core_visibility.hpp"
+#include "openvino/core/node.hpp"
 #include "openvino/core/runtime_attribute.hpp"
 
 namespace ov {
@@ -25,14 +26,16 @@ class OPENVINO_API WeightlessCacheAttribute : public RuntimeAttribute {
 
     WeightlessCacheAttribute() = delete;
 
-    WeightlessCacheAttribute(size_t original_size, size_t bin_offset)
+    WeightlessCacheAttribute(size_t original_size, size_t bin_offset, ov::element::Type original_dtype)
         : original_size(original_size),
-          bin_offset(bin_offset) {}
+          bin_offset(bin_offset),
+          original_dtype(original_dtype) {}
 
     bool is_copyable() const override;
 
     size_t original_size;
     size_t bin_offset;
+    ov::element::Type original_dtype;
 };
 
 }  // namespace ov
diff --git a/src/core/include/openvino/core/graph_util.hpp b/src/core/include/openvino/core/graph_util.hpp
index 66c640a62314df..f5694ca89fee51 100644
--- a/src/core/include/openvino/core/graph_util.hpp
+++ b/src/core/include/openvino/core/graph_util.hpp
@@ -21,6 +21,10 @@
 #include "openvino/op/parameter.hpp"
 #include "openvino/pass/serialize.hpp"
 
+#ifdef OPENVINO_CPP_VER_17
+#    include <filesystem>
+#endif
+
 namespace ov {
 
 OPENVINO_API
@@ -288,27 +292,45 @@ bool replace_node_update_name(const std::shared_ptr<Node>& target, const std::sh
 /// \param bin_path Path where .bin file will be saved (optional).
 ///        The same name as for xml_path will be used by default.
 /// \param version Version of the generated IR (optional).
+/// \{
 OPENVINO_API
 void serialize(const std::shared_ptr<const ov::Model>& m,
                const std::string& xml_path,
                const std::string& bin_path = "",
                ov::pass::Serialize::Version version = ov::pass::Serialize::Version::UNSPECIFIED);
+#ifdef OPENVINO_CPP_VER_17
+template <class Path, std::enable_if_t<std::is_same_v<Path, std::filesystem::path>>* = nullptr>
+void serialize(const std::shared_ptr<const ov::Model>& m,
+               const Path& xml_path,
+               const Path& bin_path = {""},
+               ov::pass::Serialize::Version version = ov::pass::Serialize::Version::UNSPECIFIED) {
+    serialize(m, xml_path.string(), bin_path.string(), version);
+}
+#endif
+/// \}
+
 /// \brief Save given model into IR. Floating point weights are compressed to FP16 by default.
 /// This method saves a model to IR applying all necessary transformations that usually applied
-/// in model conversion flow provided by mo tool. Paricularly, floatting point weights are compressed to FP16.
+/// in model conversion flow provided by mo tool. Particularly, floating point weights are compressed to FP16.
 /// \param model Model which will be converted to IR representation.
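
The rt_info copy added to fuse_type_to_constant() above is the core of this change: WeightlessCacheAttribute overrides is_copyable() precisely so that generic rt_info copying (ov::copy_runtime_info) does not propagate it, so without the explicit copy, ConvertPrecision would replace an f32 constant with an f16 one and drop the attribute that maps the constant back to its region of the original .bin file. A minimal sketch of how a consumer might read the extended attribute; load_weights_region is a hypothetical helper, not part of this patch:

    #include "openvino/core/node.hpp"
    #include "openvino/core/rt_info/weightless_caching_attributes.hpp"

    // Hypothetical loader; stands in for whatever mechanism fetches bytes from the .bin file.
    void load_weights_region(size_t bin_offset, size_t byte_size, ov::element::Type dtype);

    void restore_constant_from_bin(const std::shared_ptr<ov::Node>& node) {
        const auto& rt_info = node->get_rt_info();
        auto it = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
        if (it == rt_info.end())
            return;  // constant did not originate from the .bin file
        const auto& attr = it->second.as<ov::WeightlessCacheAttribute>();
        // bin_offset/original_size locate the blob in the .bin file; original_dtype
        // (the field added by this patch) records the precision of those bytes, so
        // the loader knows to re-apply the same f32->f16 style conversion on import.
        load_weights_region(attr.bin_offset, attr.original_size, attr.original_dtype);
    }
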
/// \param output_model Path to the output model file, must have extension .xml -/// \param compress_to_fp16 Whether to compress floatting point weights to FP16 (true by default) +/// \param compress_to_fp16 Whether to compress floating point weights to FP16 (true by default) OPENVINO_API void save_model(const std::shared_ptr& model, const std::string& output_model, bool compress_to_fp16 = true); - #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) OPENVINO_API void save_model(const std::shared_ptr& model, const std::wstring& output_model, bool compress_to_fp16 = true); #endif -} // namespace ov \ No newline at end of file + +#ifdef OPENVINO_CPP_VER_17 +template >* = nullptr> +void save_model(const std::shared_ptr& model, const Path& output_model, bool compress_to_fp16 = true) { + save_model(model, output_model.string(), compress_to_fp16); +} +#endif +} // namespace ov diff --git a/src/core/include/openvino/pass/serialize.hpp b/src/core/include/openvino/pass/serialize.hpp index fc3e743d4005dc..d0eaadde346bf6 100644 --- a/src/core/include/openvino/pass/serialize.hpp +++ b/src/core/include/openvino/pass/serialize.hpp @@ -11,6 +11,10 @@ #include "openvino/opsets/opset.hpp" #include "openvino/pass/pass.hpp" +#ifdef OPENVINO_CPP_VER_17 +# include +#endif + namespace ov { namespace pass { @@ -35,6 +39,13 @@ class OPENVINO_API Serialize : public ov::pass::ModelPass { Serialize(const std::string& xmlPath, const std::string& binPath, Version version = Version::UNSPECIFIED); +#ifdef OPENVINO_CPP_VER_17 + Serialize(const std::filesystem::path& xmlPath, + const std::filesystem::path& binPath, + Version version = Version::UNSPECIFIED) + : Serialize(xmlPath.string(), binPath.string(), version) {} +#endif + private: std::ostream* m_xmlFile; std::ostream* m_binFile; diff --git a/src/core/tests/pass/serialization/deterministicity.cpp b/src/core/tests/pass/serialization/deterministicity.cpp index 8441da501eb9bf..a93f092889d2a1 100644 --- a/src/core/tests/pass/serialization/deterministicity.cpp +++ b/src/core/tests/pass/serialization/deterministicity.cpp @@ -296,6 +296,47 @@ TEST_P(SerializationDeterministicityInputOutputTest, FromIrModel) { EXPECT_TRUE(files_equal(xml_2, xml_1)); } +#ifdef OPENVINO_CPP_VER_17 +TEST_P(SerializationDeterministicityInputOutputTest, FromOvModelBybPath) { + auto irVersion = GetParam(); + + std::shared_ptr modelRef; + { + auto parameter0 = std::make_shared(ov::element::f32, ov::Shape{1, 3, 22, 22}); + parameter0->set_friendly_name("input0"); + auto result0 = std::make_shared(parameter0); + result0->set_friendly_name("output0"); + auto parameter1 = std::make_shared(ov::element::f32, ov::Shape{1, 3, 22, 22}); + parameter1->set_friendly_name("input1"); + auto result1 = std::make_shared(parameter1); + result1->set_friendly_name("output1"); + modelRef = + std::make_shared(ov::NodeVector{result0, result1}, ov::ParameterVector{parameter0, parameter1}); + } + + auto& expected1 = modelRef; + const auto out_xml_path = std::filesystem::path(m_out_xml_path_1); + const auto out_bin_path = std::filesystem::path(m_out_bin_path_1); + ov::pass::Serialize(out_xml_path, out_bin_path, irVersion).run_on_model(modelRef); + auto expected2 = ov::test::readModel(m_out_xml_path_1, m_out_bin_path_1); + + ov::pass::Serialize(m_out_xml_path_2, m_out_bin_path_2, irVersion).run_on_model(expected2); + + EXPECT_EQ(input0Name, expected1->input(0).get_node()->get_friendly_name()); + EXPECT_EQ(input1Name, expected1->input(1).get_node()->get_friendly_name()); + EXPECT_EQ(output0Name, 
expected1->output(0).get_node()->get_friendly_name()); + EXPECT_EQ(output1Name, expected1->output(1).get_node()->get_friendly_name()); + EXPECT_EQ(input0Name, expected2->input(0).get_node()->get_friendly_name()); + EXPECT_EQ(input1Name, expected2->input(1).get_node()->get_friendly_name()); + EXPECT_EQ(output0Name, expected2->output(0).get_node()->get_friendly_name()); + EXPECT_EQ(output1Name, expected2->output(1).get_node()->get_friendly_name()); + + std::ifstream xml_1(m_out_xml_path_1, std::ios::in | std::ios::binary); + std::ifstream xml_2(m_out_xml_path_2, std::ios::in | std::ios::binary); + EXPECT_TRUE(files_equal(xml_1, xml_2)); +} +#endif + INSTANTIATE_TEST_SUITE_P(DeterministicityInputOutput, SerializationDeterministicityInputOutputTest, ::testing::Values(ov::pass::Serialize::Version::IR_V10, ov::pass::Serialize::Version::IR_V11)); diff --git a/src/core/tests/pass/serialization/serialize.cpp b/src/core/tests/pass/serialization/serialize.cpp index e45d5d1d1434ff..5cb1965feebdd7 100644 --- a/src/core/tests/pass/serialization/serialize.cpp +++ b/src/core/tests/pass/serialization/serialize.cpp @@ -74,6 +74,23 @@ TEST_P(SerializationTest, SaveModel) { }); } +#ifdef OPENVINO_CPP_VER_17 +TEST_P(SerializationTest, CompareFunctionsByPath) { + const auto out_xml_path = std::filesystem::path(m_out_xml_path); + const auto out_bin_path = std::filesystem::path(m_out_bin_path); + CompareSerialized([&out_xml_path, &out_bin_path](const auto& m) { + ov::pass::Serialize(out_xml_path, out_bin_path).run_on_model(m); + }); +} + +TEST_P(SerializationTest, SaveModelByPath) { + const auto out_xml_path = std::filesystem::path(m_out_xml_path); + CompareSerialized([&out_xml_path](const auto& m) { + ov::save_model(m, out_xml_path, false); + }); +} +#endif + INSTANTIATE_TEST_SUITE_P( IRSerialization, SerializationTest, diff --git a/src/frontends/ir/src/ir_deserializer.cpp b/src/frontends/ir/src/ir_deserializer.cpp index 2d1dfba956ea72..d7e250f9916302 100644 --- a/src/frontends/ir/src/ir_deserializer.cpp +++ b/src/frontends/ir/src/ir_deserializer.cpp @@ -950,10 +950,12 @@ std::shared_ptr ov::XmlDeserializer::create_node(const std::vector(pugixml::get_uint64_attr(dn, "size")), - static_cast(pugixml::get_uint64_attr(dn, "offset"))); + static_cast(pugixml::get_uint64_attr(dn, "offset")), + ov::element::Type(pugixml::get_str_attr(dn, "element_type"))); } } diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index 39a2d20c092835..05a0e0a2cf6a0e 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -1329,6 +1329,58 @@ std::set> jit_logical_and_emitter::get_supported_prec return {{element::f32, element::f32}}; } +/// LOGICAL_OR /// +jit_logical_or_emitter::jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { + prepare_table(); +} + +jit_logical_or_emitter::jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc) + : jit_emitter(host, host_isa, exec_prc) { + prepare_table(); +} + +size_t jit_logical_or_emitter::get_inputs_count() const { return 2; } + +size_t jit_logical_or_emitter::get_aux_vecs_count() const { return 1; 
} + +size_t jit_logical_or_emitter::get_aux_gprs_count() const { return 1; } + +void jit_logical_or_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_logical_or_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + const TReg src1 = TReg(in_vec_idxs[0]); + const TReg src2 = TReg(in_vec_idxs[1]); + const TReg dst = TReg(out_vec_idxs[0]); + const TReg aux = TReg(aux_vec_idxs[0]); + + h->orr(dst.b16, src1.b16, src2.b16); + h->ld1r(aux.s, table_val2("one")); + h->and_(dst.b16, dst.b16, aux.b16); +} + +void jit_logical_or_emitter::register_table_entries() { + push_arg_entry_of("one", 0x3f800000, true); +} + +std::set> jit_logical_or_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}}; +} + /// LOGICAL_NOT /// jit_logical_not_emitter::jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index 2173a1487f1057..be4e51cd0b759d 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -628,6 +628,34 @@ class jit_logical_and_emitter : public jit_emitter { void register_table_entries() override; }; +class jit_logical_or_emitter : public jit_emitter { +public: + jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_logical_or_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& n); + + size_t get_inputs_count() const override; + + size_t get_aux_vecs_count() const override; + + size_t get_aux_gprs_count() const override; + + static std::set> get_supported_precisions( + const std::shared_ptr& node = nullptr); + +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; + + void register_table_entries() override; +}; + class jit_logical_not_emitter : public jit_emitter { public: jit_logical_not_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index 0374888e3d7fcb..912fe23fcd1fcf 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -38,6 +38,7 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwiseIsNaN, Algorithm::EltwiseLessEqual, Algorithm::EltwiseLogicalAnd, + Algorithm::EltwiseLogicalOr, Algorithm::EltwiseLogicalNot, Algorithm::EltwiseLogicalXor, Algorithm::EltwiseMaximum, diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp 
b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index cfe36f78cc40f9..b3fe7018d23677 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -655,6 +655,7 @@ std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitte OV_CASE(Algorithm::EltwiseIsInf, ov::intel_cpu::aarch64::jit_is_inf_emitter), OV_CASE(Algorithm::EltwiseLessEqual, ov::intel_cpu::aarch64::jit_less_equal_emitter), OV_CASE(Algorithm::EltwiseLogicalAnd, ov::intel_cpu::aarch64::jit_logical_and_emitter), + OV_CASE(Algorithm::EltwiseLogicalOr, ov::intel_cpu::aarch64::jit_logical_or_emitter), OV_CASE(Algorithm::EltwiseLogicalNot, ov::intel_cpu::aarch64::jit_logical_not_emitter), OV_CASE(Algorithm::EltwiseLogicalXor, ov::intel_cpu::aarch64::jit_logical_xor_emitter), OV_CASE(Algorithm::EltwiseIsNaN, ov::intel_cpu::aarch64::jit_is_nan_emitter), @@ -845,6 +846,7 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseIsNaN, jit_is_nan_emitter), OV_CASE(Algorithm::EltwiseLessEqual, jit_less_equal_emitter), OV_CASE(Algorithm::EltwiseLogicalAnd, jit_logical_and_emitter), + OV_CASE(Algorithm::EltwiseLogicalOr, jit_logical_or_emitter), OV_CASE(Algorithm::EltwiseLogicalNot, jit_logical_not_emitter), OV_CASE(Algorithm::EltwiseLogicalXor, jit_logical_xor_emitter), OV_CASE(Algorithm::EltwiseMaximum, jit_maximum_emitter), diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp index 461f063ec26bc5..8a9a35b1e92fe9 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp @@ -4,15 +4,170 @@ #pragma once #include +#include #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/memory.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/util/op_types.hpp" +#include "openvino/pass/manager.hpp" #include "openvino/runtime/shared_buffer.hpp" #include "openvino/util/mmap_object.hpp" #include "primitive.hpp" +#include "transformations/convert_precision.hpp" namespace cldnn { +struct weights_mem { + std::shared_ptr>> shared_buf = nullptr; + std::shared_ptr transformed_constant = nullptr; + + const uint8_t* get_loaded_data() { + if (transformed_constant) { + return reinterpret_cast(transformed_constant->get_data_ptr()); + } + OPENVINO_ASSERT(shared_buf); + return shared_buf->get_ptr(); + } +}; + +struct weightless_cache_manager { + void set_constant_info(size_t bin_offset, + size_t original_size, + ov::element::Type original_dtype, + ov::element::Type curr_dtype, + ov::Shape shape) { + this->bin_offset = bin_offset; + this->original_size = original_size; + this->original_dtype = original_dtype; + this->curr_dtype = curr_dtype; + this->shape = shape; + do_weightless_caching = true; + + if (original_dtype != curr_dtype) { + do_precision_conversion = true; + } + } + + void invalidate() { + do_weightless_caching = false; + } + + void set_new_dtype(ov::element::Type curr_dtype) { + this->curr_dtype = curr_dtype; + do_precision_conversion = original_dtype != curr_dtype; + } + + bool save(BinaryOutputBuffer& ob, size_t data_size) const { + if (!do_weightless_caching) { + ob << false; + return false; + } + + ob << true; + ob << bin_offset; + ob << do_precision_conversion; + if (do_precision_conversion) { + ob << original_size; + ob << make_data(&original_dtype, sizeof(ov::element::Type)); + ob << 
make_data(&curr_dtype, sizeof(ov::element::Type)); + + size_t num_dims = shape.size(); + ob << make_data(&num_dims, sizeof(size_t)); + ob << make_data(shape.data(), num_dims * sizeof(ov::Shape::value_type)); + } + return true; + } + + std::shared_ptr load(BinaryInputBuffer& ib, + std::shared_ptr mapped_weights, + size_t data_size) { + ib >> do_weightless_caching; + if (!do_weightless_caching) { + return nullptr; + } + + OPENVINO_ASSERT(mapped_weights != nullptr, "mmap object is null"); + + ib >> bin_offset; + ib >> do_precision_conversion; + if (do_precision_conversion) { + ib >> original_size; + ib >> make_data(&original_dtype, sizeof(ov::element::Type)); + ib >> make_data(&curr_dtype, sizeof(ov::element::Type)); + + size_t num_dims = 0; + ib >> make_data(&num_dims, sizeof(size_t)); + shape.resize(num_dims); + ib >> make_data(shape.data(), num_dims * sizeof(ov::Shape::value_type)); + } else { + original_size = data_size; + } + + auto mem_obj = std::make_shared(); + mem_obj->shared_buf = std::make_shared>>( + mapped_weights->data() + bin_offset, + original_size, + mapped_weights); + + if (should_run_transformations()) { + run_transformations(mem_obj); + } + return mem_obj; + } + +private: + bool do_weightless_caching = false; + bool do_precision_conversion = false; + + size_t bin_offset = SIZE_MAX; + size_t original_size = SIZE_MAX; + ov::element::Type original_dtype = ov::element::Type_t::undefined; + ov::element::Type curr_dtype = ov::element::Type_t::undefined; + ov::Shape shape; + + bool should_run_transformations() { + return do_precision_conversion; + } + + void run_transformations(std::shared_ptr mem_obj) { + auto orig_constant = std::make_shared(original_dtype, + shape, + mem_obj->shared_buf->get_ptr(), + mem_obj->shared_buf); + + ov::ParameterVector inputParams; + ov::ResultVector results; + results.push_back(std::make_shared(orig_constant->output(0))); + auto model = std::make_shared(results, inputParams, "aux"); + + ov::pass::Manager manager("Plugin:GPU:weightless_cache_transformations"); + + if (do_precision_conversion) { + precisions_map fp_convert_precision_map = { + {original_dtype, curr_dtype}}; + type_to_fuse_map empty_fuse_map = {}; + const bool keep_precision_sensitive_in_fp32 = false; + const bool convert_input_output_precision = false; + const bool store_original_precision_as_rt_attribute = true; + manager.register_pass(fp_convert_precision_map, + empty_fuse_map, + keep_precision_sensitive_in_fp32, + convert_input_output_precision, + store_original_precision_as_rt_attribute); + } + + manager.run_passes(model); + const auto& ops = model->get_ops(); + auto it = std::find_if(ops.begin(), ops.end(), [](const std::shared_ptr& node) { + return ov::op::util::is_constant(node); + }); + OPENVINO_ASSERT(it != ops.end()); + mem_obj->transformed_constant = std::dynamic_pointer_cast(*it); + OPENVINO_ASSERT(mem_obj->transformed_constant->get_element_type() == curr_dtype); + } +}; + /// @brief Provides input data to topology. /// @details This primitive allows to pass data which is known at topology creation. /// For example, weights and biases for scoring networks. @@ -20,21 +175,32 @@ namespace cldnn { struct data : public primitive_base { CLDNN_DECLARE_PRIMITIVE(data) - data() : primitive_base("", {}) {} + data() : primitive_base("", {}) { + cache_info = std::make_shared(); + } /// @brief Constructs data primitive. /// @param id This primitive id. /// @param mem @ref memory object which contains data. 
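
The blob record that weightless_cache_manager::save() writes and load() parses back is easiest to audit flattened out. An illustrative sketch of the stream layout, assuming the field order shown above; the real code streams the fields one by one, so the struct below is documentation rather than something that is memcpy'd:

    #include <cstddef>
    #include "openvino/core/shape.hpp"
    #include "openvino/core/type/element_type.hpp"

    // Stream order of one data primitive's weightless-caching record.
    struct weightless_record_sketch {
        bool do_weightless_caching;        // false => raw weight bytes follow instead
        size_t bin_offset;                 // where the constant lives in the original .bin
        bool do_precision_conversion;      // true when original_dtype != curr_dtype
        // The remaining fields are serialized only when do_precision_conversion is
        // true; otherwise load() falls back to original_size = data_size:
        size_t original_size;              // byte size of the constant before conversion
        ov::element::Type original_dtype;  // precision stored in the .bin file
        ov::element::Type curr_dtype;      // precision the compiled model expects
        size_t num_dims;                   // rank, followed by num_dims ov::Shape::value_type dims
    };
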
/// @note If memory is attached by memory::attach(), the attached buffer should be valid till network build. - data(const primitive_id& id, memory::ptr mem) - : primitive_base(id, {}), mem(std::move(mem)) {} + data(const primitive_id& id, memory::ptr mem) : primitive_base(id, {}), mem(std::move(mem)) { + cache_info = std::make_shared(); + } + + data(const primitive_id& id, memory::ptr mem, std::shared_ptr cache_info) + : primitive_base(id, {}), + mem(std::move(mem)), + cache_info(cache_info) { + if (!cache_info) { + this->cache_info = std::make_shared(); + } + } /// @brief @ref memory object which contains data. /// @note If memory is attached by memory::attach(), the attached buffer should be valid till network build. memory::ptr mem; - size_t original_size = SIZE_MAX; - size_t bin_offset = SIZE_MAX; + std::shared_ptr cache_info; size_t hash() const override { size_t seed = primitive::hash(); @@ -53,13 +219,8 @@ struct data : public primitive_base { size_t data_size = mem->size(); ob << make_data(&data_size, sizeof(size_t)); - bool is_cache_without_weights = bin_offset != SIZE_MAX && data_size == original_size; - - if (is_cache_without_weights) { - ob << true; - ob << bin_offset; - } else { - ob << false; + bool do_weightless_caching = cache_info->save(ob, data_size); + if (!do_weightless_caching) { if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { ob << make_data(mem->buffer_ptr(), data_size); } else { @@ -88,26 +249,12 @@ struct data : public primitive_base { mem = ib.get_engine().allocate_memory(output_layout, _allocation_type, false); - bool is_cache_without_weights; - ib >> is_cache_without_weights; - if (is_cache_without_weights && mapped_weights == nullptr) { - OPENVINO_THROW("mmap object is null"); - } - - std::shared_ptr>> shared_buf; - if (is_cache_without_weights) { - ib >> bin_offset; - original_size = data_size; - - shared_buf = std::make_shared>>( - mapped_weights->data() + bin_offset, - data_size, - mapped_weights); - } + auto mem_obj = cache_info->load(ib, mapped_weights, data_size); + bool is_weightless_caching_enabled = mem_obj != nullptr; if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) { - if (is_cache_without_weights) { - std::memcpy(reinterpret_cast(mem->buffer_ptr()), shared_buf->get_ptr(), data_size); + if (is_weightless_caching_enabled) { + std::memcpy(reinterpret_cast(mem->buffer_ptr()), mem_obj->get_loaded_data(), data_size); } else { ib >> make_data(mem->buffer_ptr(), data_size); } @@ -116,8 +263,8 @@ struct data : public primitive_base { auto& strm = ib.get_engine().get_service_stream(); if (data_size < DATA_BLOCK_SIZE || output_layout.format.is_image_2d()) { std::vector _buf(data_size); - if (is_cache_without_weights) { - std::memcpy(reinterpret_cast(_buf.data()), shared_buf->get_ptr(), data_size); + if (is_weightless_caching_enabled) { + std::memcpy(reinterpret_cast(_buf.data()), mem_obj->get_loaded_data(), data_size); } else { ib >> make_data(_buf.data(), data_size); } @@ -135,9 +282,9 @@ struct data : public primitive_base { size_t copy_size = (data_size > (dst_offset + DATA_BLOCK_SIZE)) ? 
DATA_BLOCK_SIZE : (data_size - dst_offset); if (buf_flag) { - if (is_cache_without_weights) { + if (is_weightless_caching_enabled) { std::memcpy(reinterpret_cast(_buf1.data()), - shared_buf->get_ptr() + dst_offset, + mem_obj->get_loaded_data() + dst_offset, copy_size); } else { ib >> make_data(_buf1.data(), copy_size); @@ -148,9 +295,9 @@ struct data : public primitive_base { } ev1 = mem->copy_from(strm, _buf1.data(), src_offset, dst_offset, copy_size, is_blocking); } else { - if (is_cache_without_weights) { + if (is_weightless_caching_enabled) { std::memcpy(reinterpret_cast(_buf2.data()), - shared_buf->get_ptr() + dst_offset, + mem_obj->get_loaded_data() + dst_offset, copy_size); } else { ib >> make_data(_buf2.data(), copy_size); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp index 85173e9eb33e7c..a4129800733875 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp @@ -74,11 +74,14 @@ void propagate_constants::run(program& p) { // replace all constant nodes which are relevant for inference (either used by non-const user or marked as output) // with recomputed cldnn::data for (auto& cout : to_replace) { - auto& id_to_replace = cout.first; - auto mem_impl = cout.second; - - auto const_data = - std::make_shared("_cldnn_const_prop_" + id_to_replace, mem_impl /* <<< REMOVE ME WHEN POSSIBLE */); + auto& id_to_replace = std::get<0>(cout); + auto mem_impl = std::get<1>(cout); + auto cache_info = std::get<2>(cout); + auto in_layout = std::get<3>(cout); + + auto const_data = std::make_shared("_cldnn_const_prop_" + id_to_replace, + mem_impl, /* <<< REMOVE ME WHEN POSSIBLE */ + cache_info); auto& new_node = p.get_or_create(const_data); auto& curr_node = p.get_node(id_to_replace); @@ -92,6 +95,25 @@ void propagate_constants::run(program& p) { } } + auto is_reorder_with_only_dtype_change = [&](program_node& dst) { + if (!in_layout) { + return false; + } + auto& dst_layout = dst.get_output_layout(); + if (in_layout->data_type == dst_layout.data_type) { + return false; + } + + auto aux_layout = dst_layout; + aux_layout.data_type = in_layout->data_type; + return aux_layout == *in_layout.get(); + }; + if (is_reorder_with_only_dtype_change(new_node)) { + new_node.as().get_primitive()->cache_info->set_new_dtype(new_node.get_output_layout().data_type); + } else { + new_node.as().get_primitive()->cache_info->invalidate(); + } + curr_node.dependencies.clear(); // remove all constant users (as they will be either removed or replaced by cldnn::data which does not have any // dependencies) @@ -113,9 +135,10 @@ bool propagate_constants::has_non_const_user(program_node& node) const { return false; } -std::list> propagate_constants::calculate(engine& engine, - const ExecutionConfig& config, - std::shared_ptr task_executor) { +std::list, std::shared_ptr>> +propagate_constants::calculate(engine& engine, + const ExecutionConfig& config, + std::shared_ptr task_executor) { if (!has_non_trivial_constants) return {}; @@ -123,15 +146,37 @@ std::list> propagate_constants::calculate(e cf_config.set_property(ov::intel_gpu::optimize_data(false)); cf_config.set_property(ov::intel_gpu::custom_outputs(const_outputs)); network::ptr net = network::build_network(engine, nodes, cf_config, task_executor, true); - for (auto& cin : const_inputs) + std::map, std::shared_ptr>> + weightless_cache_map; + for (auto& cin : 
const_inputs) { net->set_input_data(cin->id(), cin->get_attached_memory_ptr()); + auto users = cin->get_users(); + if (users.size() == 1 && users.front()->is_type()) { + auto rprim = users.front()->as().get_primitive(); + auto id = rprim->id; + auto cache_ptr = cin->as().get_primitive()->cache_info; + auto layout_ptr = std::make_shared(cin->get_output_layout()); + weightless_cache_map.emplace(id, std::make_pair(cache_ptr, layout_ptr)); + } + } + net->execute({}); net->reset_execution(true); // wait for computations to complete auto outputs = net->get_outputs(); - std::list> ret; - for (auto& out : outputs) ret.push_back({out->id(), out->output_memory_ptr()}); + std::list, std::shared_ptr>> + ret; + for (auto& out : outputs) { + std::shared_ptr cache_ptr = nullptr; + std::shared_ptr layout_ptr = nullptr; + auto it = weightless_cache_map.find(out->id()); + if (it != weightless_cache_map.end()) { + cache_ptr = it->second.first; + layout_ptr = it->second.second; + } + ret.push_back({out->id(), out->output_memory_ptr(), cache_ptr, layout_ptr}); + } return ret; } diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 490076a37f788e..0b7c3d85c37e27 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -211,9 +211,10 @@ class propagate_constants : public base_pass { private: void run(program& p) override; - std::list> calculate(engine& engine, - const ExecutionConfig& config, - std::shared_ptr task_executor); + std::list, std::shared_ptr>> + calculate(engine& engine, + const ExecutionConfig& config, + std::shared_ptr task_executor); bool has_non_const_user(program_node& node) const; void handle_constant(program& prog, program_node& node); void add_constant(program& prog, program_node& node); diff --git a/src/plugins/intel_gpu/src/plugin/program_builder.cpp b/src/plugins/intel_gpu/src/plugin/program_builder.cpp index 368e25abe2ddac..a9bb813d0ce587 100644 --- a/src/plugins/intel_gpu/src/plugin/program_builder.cpp +++ b/src/plugins/intel_gpu/src/plugin/program_builder.cpp @@ -14,6 +14,7 @@ #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/plugin/program_builder.hpp" +#include "intel_gpu/primitives/data.hpp" #include "intel_gpu/runtime/itt.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/primitives/mutable_data.hpp" @@ -311,11 +312,15 @@ void ProgramBuilder::add_primitive(const ov::Node& op, std::shared_ptrm_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE) { if (auto data_prim = dynamic_cast(prim.get())) { auto rt_info = op.get_rt_info(); + auto weightless_cache_attr = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static()); if (weightless_cache_attr != rt_info.end()) { - data_prim->bin_offset = weightless_cache_attr->second.as().bin_offset; - data_prim->original_size = - weightless_cache_attr->second.as().original_size; + auto& attr = weightless_cache_attr->second.as(); + data_prim->cache_info->set_constant_info(attr.bin_offset, + attr.original_size, + attr.original_dtype, + op.get_output_element_type(0), + op.get_output_shape(0)); } } } diff --git a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp index 839b2640ca180c..17e1ed6d0a9bbe 100644 --- a/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/behavior/model_cache.cpp @@ -8,48 +8,40 @@ 
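
Since propagate_constants::calculate() now returns a four-element std::tuple that run() unpacks with std::get<0..3>, C++17 structured bindings would express the same loop more directly. A possible cleanup, sketched under the assumption that the GPU plugin builds as C++17; it is not part of this patch:

    // Hypothetical rewrite of the loop header in propagate_constants::run():
    for (auto& [id_to_replace, mem_impl, cache_info, in_layout] : to_replace) {
        auto const_data = std::make_shared<data>("_cldnn_const_prop_" + id_to_replace,
                                                 mem_impl, /* <<< REMOVE ME WHEN POSSIBLE */
                                                 cache_info);
        // ... rest of the body unchanged; in_layout feeds the
        // is_reorder_with_only_dtype_change() check directly.
    }
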
#include "common_test_utils/common_utils.hpp" #include "common_test_utils/file_utils.hpp" #include "common_test_utils/ov_tensor_utils.hpp" -#include "common_test_utils/subgraph_builders/2_input_subtract.hpp" -#include "common_test_utils/subgraph_builders/concat_with_params.hpp" -#include "common_test_utils/subgraph_builders/conv_bias.hpp" -#include "common_test_utils/subgraph_builders/conv_pool_relu.hpp" -#include "common_test_utils/subgraph_builders/conv_pool_relu_no_reshapes.hpp" -#include "common_test_utils/subgraph_builders/conv_pool_relu_non_zero.hpp" -#include "common_test_utils/subgraph_builders/convert_transpose.hpp" -#include "common_test_utils/subgraph_builders/detection_output.hpp" -#include "common_test_utils/subgraph_builders/kso_func.hpp" -#include "common_test_utils/subgraph_builders/matmul_bias.hpp" -#include "common_test_utils/subgraph_builders/multi_single_conv.hpp" -#include "common_test_utils/subgraph_builders/multiple_input_outpput_double_concat.hpp" -#include "common_test_utils/subgraph_builders/nested_branch_conv_concat.hpp" -#include "common_test_utils/subgraph_builders/nested_split_conv_concat.hpp" #include "common_test_utils/subgraph_builders/read_concat_split_assign.hpp" #include "common_test_utils/subgraph_builders/single_concat_with_constant.hpp" -#include "common_test_utils/subgraph_builders/single_conv.hpp" -#include "common_test_utils/subgraph_builders/single_split.hpp" -#include "common_test_utils/subgraph_builders/split_concat.hpp" -#include "common_test_utils/subgraph_builders/split_conv_concat.hpp" -#include "common_test_utils/subgraph_builders/split_multi_conv_concat.hpp" #include "common_test_utils/subgraph_builders/ti_with_lstm_cell.hpp" #include "common_test_utils/test_common.hpp" #include "openvino/pass/serialize.hpp" namespace { -class CheckWeightlessCacheAccuracy : public ::testing::Test, - public ::testing::WithParamInterface { +typedef std::tuple testParams; + +class CheckWeightlessCacheAccuracy : public ::testing::Test, public ::testing::WithParamInterface { public: - static std::string get_test_case_name(::testing::TestParamInfo obj) { - bool use_compile_model_api = obj.param; + static std::string get_test_case_name(::testing::TestParamInfo obj) { + bool use_compile_model_api_; + ov::element::Type inference_mode_; + ov::element::Type model_dtype_; + std::tie(use_compile_model_api_, inference_mode_, model_dtype_) = obj.param; std::ostringstream result; - result << "use_compile_model_api=" << use_compile_model_api; + const char separator = '_'; + result << "use_compile_model_api=" << use_compile_model_api_ << separator; + result << "inference_mode=" << inference_mode_ << separator; + result << "model_dtype=" << model_dtype_; return result.str(); } + protected: std::shared_ptr model; std::string xml_path; std::string bin_path; std::string cache_path; - bool use_compile_model_api; // for loading from cache + std::string cache_dir; + bool use_compile_model_api; // for loading from cache + ov::element::Type inference_mode; + ov::element::Type model_dtype; void SetUp() override; void TearDown() override; @@ -61,36 +53,46 @@ void CheckWeightlessCacheAccuracy::SetUp() { xml_path = filePrefix + ".xml"; bin_path = filePrefix + ".bin"; cache_path = filePrefix + ".blob"; - use_compile_model_api = GetParam(); + cache_dir = filePrefix + "_cache_dir"; + + std::tie(use_compile_model_api, inference_mode, model_dtype) = GetParam(); } void CheckWeightlessCacheAccuracy::TearDown() { std::remove(xml_path.c_str()); std::remove(bin_path.c_str()); 
std::remove(cache_path.c_str()); + + ov::test::utils::removeFilesWithExt(cache_dir, "blob"); + ov::test::utils::removeFilesWithExt(cache_dir, "cl_cache"); + ov::test::utils::removeDir(cache_dir); } void CheckWeightlessCacheAccuracy::run() { - ov::AnyMap config = { ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE) }; - ov::AnyMap config_with_weights_path = { ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE), ov::weights_path(bin_path) }; + ov::AnyMap config = {ov::cache_dir(cache_dir), + ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE), + ov::hint::inference_precision(inference_mode)}; + ov::AnyMap config_with_weights_path = {ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE), + ov::weights_path(bin_path), + ov::hint::inference_precision(inference_mode)}; auto core = ov::test::utils::PluginCache::get().core(); ov::pass::Serialize(xml_path, bin_path).run_on_model(model); ov::CompiledModel compiled_model; - OV_ASSERT_NO_THROW(compiled_model = core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config)); + compiled_model = core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config); - auto ofstr = std::ofstream(cache_path, std::ofstream::binary); - OV_ASSERT_NO_THROW(compiled_model.export_model(ofstr)); - ofstr.close(); + if (!use_compile_model_api) { + auto ofstr = std::ofstream(cache_path, std::ofstream::binary); + compiled_model.export_model(ofstr); + ofstr.close(); + } auto ifstr = std::ifstream(cache_path, std::ifstream::binary); ov::CompiledModel imported_model; if (use_compile_model_api) { - OV_ASSERT_NO_THROW(imported_model = - core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config)); + imported_model = core->compile_model(xml_path, ov::test::utils::DEVICE_GPU, config); } else { - OV_ASSERT_NO_THROW(imported_model = - core->import_model(ifstr, ov::test::utils::DEVICE_GPU, config_with_weights_path)); + imported_model = core->import_model(ifstr, ov::test::utils::DEVICE_GPU, config_with_weights_path); } ifstr.close(); @@ -99,39 +101,57 @@ void CheckWeightlessCacheAccuracy::run() { for (size_t param_idx = 0; param_idx < model->get_parameters().size(); ++param_idx) { auto input = model->get_parameters().at(param_idx); - auto tensor = ov::test::utils::create_and_fill_tensor(input->get_element_type(), input->get_shape()); + auto tensor = ov::test::utils::create_and_fill_tensor_real_distribution(input->get_element_type(), + input->get_shape(), + -100, + 100, + param_idx); orig_req.set_tensor(input, tensor); new_req.set_tensor(input, tensor); } - OV_ASSERT_NO_THROW(orig_req.infer()); - OV_ASSERT_NO_THROW(new_req.infer()); + orig_req.infer(); + new_req.infer(); auto result_vector = model->get_results(); for (auto& res : result_vector) { auto orig_out = orig_req.get_tensor(res); auto new_out = new_req.get_tensor(res); - ov::test::utils::compare(orig_out, new_out); + ov::test::utils::compare(orig_out, new_out, inference_mode); } } TEST_P(CheckWeightlessCacheAccuracy, ReadConcatSplitAssign) { - model = ov::test::utils::make_read_concat_split_assign({1, 1, 2, 4}, ov::element::f16); - run(); + OV_ASSERT_NO_THROW(model = ov::test::utils::make_read_concat_split_assign({1, 1, 2, 4}, model_dtype)); + OV_ASSERT_NO_THROW(run()); } TEST_P(CheckWeightlessCacheAccuracy, SingleConcatWithConstant) { - model = ov::test::utils::make_single_concat_with_constant({1, 1, 2, 4}, ov::element::f16); - run(); + OV_ASSERT_NO_THROW(model = ov::test::utils::make_single_concat_with_constant({1, 1, 2, 4}, model_dtype)); + OV_ASSERT_NO_THROW(run()); } TEST_P(CheckWeightlessCacheAccuracy, TiWithLstmCell) { - model = 
ov::test::utils::make_ti_with_lstm_cell(ov::element::f16);
-    run();
+    OV_ASSERT_NO_THROW(model = ov::test::utils::make_ti_with_lstm_cell(model_dtype));
+    OV_ASSERT_NO_THROW(run());
 }
 
-INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracy, CheckWeightlessCacheAccuracy,
-                         ::testing::Bool(),
+const std::vector<ov::element::Type> inference_modes = {
+    ov::element::f32,
+    ov::element::f16,
+};
+
+const std::vector<ov::element::Type> model_dtypes = {
+    ov::element::f32,
+    ov::element::f16,
+    ov::element::bf16,
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_CheckWeightlessCacheAccuracy,
+                         CheckWeightlessCacheAccuracy,
+                         ::testing::Combine(::testing::Bool(),
+                                            ::testing::ValuesIn(inference_modes),
+                                            ::testing::ValuesIn(model_dtypes)),
                          CheckWeightlessCacheAccuracy::get_test_case_name);
 
 } // namespace
diff --git a/src/plugins/intel_gpu/tests/unit/shape_infer/eltwise_si_test.cpp b/src/plugins/intel_gpu/tests/unit/shape_infer/eltwise_si_test.cpp
index 7abdbcb8c2fc52..7b4f27b5af05b4 100644
--- a/src/plugins/intel_gpu/tests/unit/shape_infer/eltwise_si_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/shape_infer/eltwise_si_test.cpp
@@ -23,11 +23,11 @@ using namespace ov;
 namespace shape_infer_tests {
 
 struct eltwise_test_params {
-    layout input1_layout;
-    layout input2_layout;
+    cldnn::layout input1_layout;
+    cldnn::layout input2_layout;
     eltwise_mode mode;
     AutoBroadcastSpec auto_broadcast_spec;
-    layout expected_layout;
+    cldnn::layout expected_layout;
     std::vector stride;
 };
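
A note on the LOGICAL_OR emitter added earlier in this patch: like the existing logical_and emitter, it computes the result without branches by OR-ing the raw f32 lanes and masking with the bit pattern of 1.0f (0x3f800000). A scalar model of the same instruction sequence, useful for checking the vector code; it assumes, as the surrounding eltwise pipeline appears to guarantee and as the logical_and emitter already relies on, that boolean operands arrive canonicalized as 0.0f/1.0f lanes:

    #include <cstdint>
    #include <cstring>

    // Scalar model of emit_isa() above: h->orr(dst, a, b); h->and_(dst, dst, "one").
    // For inputs restricted to {0.0f, 1.0f}, (bits(a) | bits(b)) is either all-zero
    // or exactly 0x3f800000, the IEEE-754 pattern of 1.0f; the final mask keeps the
    // result canonical.
    float logical_or_f32(float a, float b) {
        uint32_t ua, ub;
        const uint32_t one_bits = 0x3f800000u;  // bits of 1.0f, the "one" table entry
        std::memcpy(&ua, &a, sizeof(ua));
        std::memcpy(&ub, &b, sizeof(ub));
        const uint32_t bits = (ua | ub) & one_bits;
        float out;
        std::memcpy(&out, &bits, sizeof(out));
        return out;  // 1.0f if either input was 1.0f, else 0.0f
    }

The trick is meaningful only on f32 lanes, which is also why get_supported_precisions() for this emitter advertises just {f32, f32}.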