From 4d5e1c72ef4e3b599878749b67cc140947d44baf Mon Sep 17 00:00:00 2001 From: Mircea-Aurelian Dan Date: Mon, 27 Jan 2025 10:20:26 +0200 Subject: [PATCH] [intel-npu] Support new internal `cached_model_buffer` config for memory mapped cached blobs (#27822) ### Details: - *Based on the new `import_model` API from PR #27644 and the new plugins property to pass the mmap buffer from PR #27981* - *Added a `BlobContainer` base class held by `IGraph` objects, with derived classes `BlobContainerAlignedBuffer` for the new `import_model` API and `BlobContainerVector` for the old one* - *Refactored `getGraphHandle` function to allow passing `const uint8_t` and `size_t` params instead of `std::vector<uint8_t>`* ### Tickets: - *157192* --------- Signed-off-by: Alexandru Enache Co-authored-by: Alexandru Enache Co-authored-by: Oleg Pipikin --- .../intel_npu/common/blob_container.hpp | 81 +++++++++++++++++++ .../intel_npu/common/icompiler_adapter.hpp | 2 +- .../include/intel_npu/common/igraph.hpp | 5 +- .../intel_npu/src/common/src/igraph.cpp | 9 +-- .../include/driver_compiler_adapter.hpp | 2 +- .../compiler_adapter/include/driver_graph.hpp | 2 +- .../include/plugin_compiler_adapter.hpp | 2 +- .../compiler_adapter/include/plugin_graph.hpp | 2 +- .../include/ze_graph_ext_wrappers.hpp | 2 +- .../src/driver_compiler_adapter.cpp | 10 ++- .../src/compiler_adapter/src/driver_graph.cpp | 11 +-- .../src/plugin_compiler_adapter.cpp | 19 +++-- .../src/compiler_adapter/src/plugin_graph.cpp | 19 +++-- .../src/ze_graph_ext_wrappers.cpp | 12 +-- .../intel_npu/src/plugin/include/metrics.hpp | 3 +- .../intel_npu/src/plugin/src/plugin.cpp | 31 +++++-- 16 files changed, 161 insertions(+), 51 deletions(-) create mode 100644 src/plugins/intel_npu/src/common/include/intel_npu/common/blob_container.hpp diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/blob_container.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/blob_container.hpp new file mode 100644 index 00000000000000..2f6b31aceacd5d --- /dev/null 
+++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/blob_container.hpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "openvino/runtime/shared_buffer.hpp" + +namespace intel_npu { + +class BlobContainer { +public: + /** + * @brief Returns the address at the beginning of the blob. + */ + virtual const void* get_ptr() const = 0; + + /** + * @brief Size of the blob. + */ + virtual size_t size() const = 0; + + /** + * @brief Returns true if the blob can be deallocated from memory, false otherwise. + */ + virtual bool release_from_memory() = 0; + + virtual ~BlobContainer() = default; +}; + +class BlobContainerVector : public BlobContainer { +public: + BlobContainerVector(std::vector blob) : _blob(std::move(blob)) {} + + const void* get_ptr() const override { + return reinterpret_cast(_blob.data()); + } + + size_t size() const override { + return _blob.size(); + } + + bool release_from_memory() override { + _blob.clear(); + _blob.shrink_to_fit(); + return true; + } + +private: + std::vector _blob; +}; + +class BlobContainerAlignedBuffer : public BlobContainer { +public: + BlobContainerAlignedBuffer(const std::shared_ptr& blobSO, size_t ovHeaderOffset, uint64_t size) + : _size(size), + _ovHeaderOffset(ovHeaderOffset), + _blobSO(blobSO) {} + + const void* get_ptr() const override { + return _blobSO->get_ptr(_ovHeaderOffset); + } + + size_t size() const override { + return _size; + } + + bool release_from_memory() override { + return false; + } + +private: + uint64_t _size; + size_t _ovHeaderOffset; + std::shared_ptr _blobSO; +}; + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp index bedf0aaeeca966..a86d942627c6b5 100644 --- 
a/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/icompiler_adapter.hpp @@ -12,7 +12,7 @@ class ICompilerAdapter { public: virtual std::shared_ptr compile(const std::shared_ptr& model, const Config& config) const = 0; - virtual std::shared_ptr parse(std::vector network, const Config& config) const = 0; + virtual std::shared_ptr parse(std::unique_ptr blobPtr, const Config& config) const = 0; virtual ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const = 0; virtual uint32_t get_version() const = 0; diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp index fc5aec9158151c..ec4d7091ac6345 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp @@ -8,6 +8,7 @@ #include #include +#include "intel_npu/common/blob_container.hpp" #include "intel_npu/network_metadata.hpp" #include "intel_npu/utils/zero/zero_init.hpp" #include "intel_npu/utils/zero/zero_utils.hpp" @@ -21,7 +22,7 @@ class IGraph : public std::enable_shared_from_this { IGraph(ze_graph_handle_t handle, NetworkMetadata metadata, const Config& config, - std::optional> blob); + std::unique_ptr blobPtr); virtual size_t export_blob(std::ostream& stream) const = 0; @@ -89,7 +90,7 @@ class IGraph : public std::enable_shared_from_this { // first inference starts running std::mutex _mutex; - std::vector _blob; + std::unique_ptr _blobPtr; uint32_t _unique_id = 0; uint32_t _last_submitted_id; diff --git a/src/plugins/intel_npu/src/common/src/igraph.cpp b/src/plugins/intel_npu/src/common/src/igraph.cpp index 9a53928c9a3d9e..f641813e44c0e7 100644 --- a/src/plugins/intel_npu/src/common/src/igraph.cpp +++ b/src/plugins/intel_npu/src/common/src/igraph.cpp @@ -17,14 +17,11 @@ namespace intel_npu { 
IGraph::IGraph(ze_graph_handle_t handle, NetworkMetadata metadata, const Config& config, - std::optional> blob) + std::unique_ptr blobPtr) : _handle(handle), _metadata(std::move(metadata)), - _logger("IGraph", config.get()) { - if (blob.has_value()) { - _blob = std::move(*blob); - } -} + _blobPtr(std::move(blobPtr)), + _logger("IGraph", config.get()) {} const NetworkMetadata& IGraph::get_metadata() const { return _metadata; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp index 6801e26c2fed73..3a2af03df8cead 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/driver_compiler_adapter.hpp @@ -23,7 +23,7 @@ class DriverCompilerAdapter final : public ICompilerAdapter { std::shared_ptr compile(const std::shared_ptr& model, const Config& config) const override; - std::shared_ptr parse(std::vector network, const Config& config) const override; + std::shared_ptr parse(std::unique_ptr blobPtr, const Config& config) const override; ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const override; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp index cf3d54c6b363e5..ac89a790291d2e 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp @@ -21,7 +21,7 @@ class DriverGraph final : public IGraph { ze_graph_handle_t graphHandle, NetworkMetadata metadata, const Config& config, - std::optional> blob); + std::unique_ptr blobPtr); size_t export_blob(std::ostream& stream) const override; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp 
b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp index 61870e718a088e..c60b80bcfaa314 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_compiler_adapter.hpp @@ -21,7 +21,7 @@ class PluginCompilerAdapter final : public ICompilerAdapter { std::shared_ptr compile(const std::shared_ptr& model, const Config& config) const override; - std::shared_ptr parse(std::vector network, const Config& config) const override; + std::shared_ptr parse(std::unique_ptr blobPtr, const Config& config) const override; ov::SupportedOpsMap query(const std::shared_ptr& model, const Config& config) const override; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp index 9c88ace1c29d23..61d4a6ed866529 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp @@ -23,7 +23,7 @@ class PluginGraph final : public IGraph { const std::shared_ptr& zeroInitStruct, ze_graph_handle_t graphHandle, NetworkMetadata metadata, - std::vector blob, + std::unique_ptr blobPtr, const Config& config); size_t export_blob(std::ostream& stream) const override; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp index a80beb8c57305d..df538521d856f1 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp @@ -35,7 +35,7 @@ class ZeGraphExtWrappers { const std::string& buildFlags, const uint32_t& flags) const; - ze_graph_handle_t getGraphHandle(const std::vector& network) const; + ze_graph_handle_t getGraphHandle(const uint8_t& data, size_t size) const; 
NetworkMetadata getNetworkMeta(ze_graph_handle_t graphHandle) const; diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp index d7c4def10c8c93..1d19854618a237 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_compiler_adapter.cpp @@ -200,14 +200,16 @@ std::shared_ptr DriverCompilerAdapter::compile(const std::shared_ptr DriverCompilerAdapter::parse(std::vector network, const Config& config) const { +std::shared_ptr DriverCompilerAdapter::parse(std::unique_ptr blobPtr, + const Config& config) const { OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "DriverCompilerAdapter", "parse"); _logger.debug("parse start"); - ze_graph_handle_t graphHandle = _zeGraphExt->getGraphHandle(network); + ze_graph_handle_t graphHandle = + _zeGraphExt->getGraphHandle(*reinterpret_cast(blobPtr->get_ptr()), blobPtr->size()); _logger.debug("parse end"); OV_ITT_TASK_NEXT(PARSE_BLOB, "getNetworkMeta"); @@ -218,7 +220,7 @@ std::shared_ptr DriverCompilerAdapter::parse(std::vector networ graphHandle, std::move(networkMeta), config, - std::optional>(std::move(network))); + std::move(blobPtr)); } ov::SupportedOpsMap DriverCompilerAdapter::query(const std::shared_ptr& model, diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp index a29412075c7e39..48ae84a6c841ea 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp @@ -15,8 +15,8 @@ DriverGraph::DriverGraph(const std::shared_ptr& zeGraphExt, ze_graph_handle_t graphHandle, NetworkMetadata metadata, const Config& config, - std::optional> blob) - : IGraph(graphHandle, std::move(metadata), config, std::move(blob)), + std::unique_ptr blobPtr) + : 
IGraph(graphHandle, std::move(metadata), config, std::move(blobPtr)), _zeGraphExt(zeGraphExt), _zeroInitStruct(zeroInitStruct), _logger("DriverGraph", config.get()) { @@ -140,7 +140,7 @@ void DriverGraph::initialize(const Config& config) { } bool DriverGraph::release_blob(const Config& config) { - if (_blob.empty() || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 || + if (_blobPtr == nullptr || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 || config.get()) { return false; } @@ -153,8 +153,9 @@ bool DriverGraph::release_blob(const Config& config) { return false; } - _blob.clear(); - _blob.shrink_to_fit(); + if (!_blobPtr->release_from_memory()) { + return false; + } _logger.debug("Blob is released"); diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp index 72fab52d6cf895..809e1c88e05a71 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_compiler_adapter.cpp @@ -80,6 +80,7 @@ std::shared_ptr PluginCompilerAdapter::compile(const std::shared_ptrcompile(model, config); + auto blobPtr = std::make_unique(std::move(networkDesc.compiledNetwork)); _logger.debug("compile end"); ze_graph_handle_t graphHandle = nullptr; @@ -87,7 +88,8 @@ std::shared_ptr PluginCompilerAdapter::compile(const std::shared_ptrgetGraphHandle(networkDesc.compiledNetwork); + graphHandle = + _zeGraphExt->getGraphHandle(*reinterpret_cast(blobPtr->get_ptr()), blobPtr->size()); } catch (...) { _logger.info("Failed to obtain the level zero graph handle. Inference requests for this model are not " "allowed. 
Only exports are available"); @@ -99,21 +101,28 @@ std::shared_ptr PluginCompilerAdapter::compile(const std::shared_ptr PluginCompilerAdapter::parse(std::vector network, const Config& config) const { +std::shared_ptr PluginCompilerAdapter::parse(std::unique_ptr blobPtr, + const Config& config) const { OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "PluginCompilerAdapter", "parse"); _logger.debug("parse start"); + std::vector network(blobPtr->size()); + network.assign(reinterpret_cast(blobPtr->get_ptr()), + reinterpret_cast(blobPtr->get_ptr()) + blobPtr->size()); auto networkMeta = _compiler->parse(network, config); + network.clear(); + network.shrink_to_fit(); _logger.debug("parse end"); ze_graph_handle_t graphHandle = nullptr; if (_zeGraphExt) { - graphHandle = _zeGraphExt->getGraphHandle(network); + graphHandle = + _zeGraphExt->getGraphHandle(*reinterpret_cast(blobPtr->get_ptr()), blobPtr->size()); } return std::make_shared(_zeGraphExt, @@ -121,7 +130,7 @@ std::shared_ptr PluginCompilerAdapter::parse(std::vector networ _zeroInitStruct, graphHandle, std::move(networkMeta), - std::move(network), + std::move(blobPtr), config); } diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp index d0c24a82e03937..726a1196b7c88b 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp @@ -15,9 +15,9 @@ PluginGraph::PluginGraph(const std::shared_ptr& zeGraphExt, const std::shared_ptr& zeroInitStruct, ze_graph_handle_t graphHandle, NetworkMetadata metadata, - std::vector blob, + std::unique_ptr blobPtr, const Config& config) - : IGraph(graphHandle, std::move(metadata), config, std::optional>(std::move(blob))), + : IGraph(graphHandle, std::move(metadata), config, std::move(blobPtr)), _zeGraphExt(zeGraphExt), _zeroInitStruct(zeroInitStruct), _compiler(compiler), @@ -31,7 +31,7 @@ 
PluginGraph::PluginGraph(const std::shared_ptr& zeGraphExt, } size_t PluginGraph::export_blob(std::ostream& stream) const { - stream.write(reinterpret_cast(_blob.data()), _blob.size()); + stream.write(reinterpret_cast(_blobPtr->get_ptr()), _blobPtr->size()); if (!stream) { _logger.error("Write blob to stream failed. Blob is broken!"); @@ -40,21 +40,26 @@ size_t PluginGraph::export_blob(std::ostream& stream) const { if (_logger.level() >= ov::log::Level::INFO) { std::uint32_t result = 1171117u; - for (const uint8_t* it = _blob.data(); it != _blob.data() + _blob.size(); ++it) { + for (const uint8_t* it = reinterpret_cast(_blobPtr->get_ptr()); + it != reinterpret_cast(_blobPtr->get_ptr()) + _blobPtr->size(); + ++it) { result = ((result << 7) + result) + static_cast(*it); } std::stringstream str; - str << "Blob size: " << _blob.size() << ", hash: " << std::hex << result; + str << "Blob size: " << _blobPtr->size() << ", hash: " << std::hex << result; _logger.info(str.str().c_str()); } _logger.info("Write blob to stream successfully."); - return _blob.size(); + return _blobPtr->size(); } std::vector PluginGraph::process_profiling_output(const std::vector& profData, const Config& config) const { - return _compiler->process_profiling_output(profData, _blob, config); + std::vector blob(_blobPtr->size()); + blob.assign(reinterpret_cast(_blobPtr->get_ptr()), + reinterpret_cast(_blobPtr->get_ptr()) + _blobPtr->size()); + return _compiler->process_profiling_output(profData, blob, config); } void PluginGraph::set_argument_value(uint32_t argi, const void* argv) const { diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp index a3626a79475dcd..d5e793d4fff9fe 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp @@ -365,19 +365,15 @@ ze_graph_handle_t 
ZeGraphExtWrappers::getGraphHandle(std::pair& network) const { +ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const uint8_t& blobData, size_t blobSize) const { ze_graph_handle_t graphHandle; - if (network.empty()) { + if (blobSize == 0) { OPENVINO_THROW("Empty blob"); } - ze_graph_desc_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, - nullptr, - ZE_GRAPH_FORMAT_NATIVE, - network.size(), - network.data(), - nullptr}; + ze_graph_desc_t desc = + {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, nullptr, ZE_GRAPH_FORMAT_NATIVE, blobSize, &blobData, nullptr}; _logger.debug("getGraphHandle - perform pfnCreate"); auto result = _zeroInitStruct->getGraphDdiTable().pfnCreate(_zeroInitStruct->getContext(), diff --git a/src/plugins/intel_npu/src/plugin/include/metrics.hpp b/src/plugins/intel_npu/src/plugin/include/metrics.hpp index 91f78a9cd773f6..357d8b51da475a 100644 --- a/src/plugins/intel_npu/src/plugin/include/metrics.hpp +++ b/src/plugins/intel_npu/src/plugin/include/metrics.hpp @@ -68,7 +68,8 @@ class Metrics final { ov::intel_npu::batch_mode.name(), ov::hint::execution_mode.name()}; - const std::vector _internalSupportedProperties = {ov::internal::caching_properties.name()}; + const std::vector _internalSupportedProperties = {ov::internal::caching_properties.name(), + ov::internal::caching_with_mmap.name()}; // Metric to provide a hint for a range for number of async infer requests. 
(bottom bound, upper bound, step) const std::tuple _rangeForAsyncInferRequests{1u, 10u, 1u}; diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index cfcec542e6219e..23e2e04fbe0ba7 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -808,7 +808,17 @@ std::shared_ptr Plugin::import_model(std::istream& stream, c npu_plugin_properties.insert(*it); } } - const std::map propertiesMap = any_copy(npu_plugin_properties); + + std::shared_ptr modelBuffer; + // ov::internal::cached_model_buffer has no corresponding "Config" implementation thus we need to remove it from the + // list of properties + if (npu_plugin_properties.count(ov::internal::cached_model_buffer.name())) { + modelBuffer = + npu_plugin_properties.at(ov::internal::cached_model_buffer.name()).as>(); + npu_plugin_properties.erase(ov::internal::cached_model_buffer.name()); + } + + const auto propertiesMap = any_copy(npu_plugin_properties); auto localConfig = merge_configs(_globalConfig, propertiesMap, OptionMode::RunTime); _logger.setLevel(localConfig.get()); @@ -837,16 +847,23 @@ std::shared_ptr Plugin::import_model(std::istream& stream, c OPENVINO_THROW("Incompatible blob version!"); } + std::unique_ptr blobPtr; auto graphSize = storedMeta->get_blob_size(); - std::vector blob(graphSize); - stream.read(reinterpret_cast(blob.data()), graphSize); - if (!stream) { - OPENVINO_THROW("Failed to read data from stream!"); + if (modelBuffer == nullptr) { + std::vector blob(graphSize); + stream.read(reinterpret_cast(blob.data()), graphSize); + if (!stream) { + OPENVINO_THROW("Failed to read data from stream!"); + } + _logger.debug("Successfully read %zu bytes into blob.", graphSize); + + blobPtr = std::make_unique(std::move(blob)); + } else { + blobPtr = std::make_unique(modelBuffer, stream.tellg(), graphSize); } - _logger.debug("Successfully read %zu bytes into blob.", graphSize); - auto graph = 
compiler->parse(std::move(blob), localConfig); + auto graph = compiler->parse(std::move(blobPtr), localConfig); graph->update_network_name("net" + std::to_string(_compiledModelLoadCounter++)); const std::shared_ptr modelDummy =