Skip to content

Commit

Permalink
[intel-npu] Support new internal cached_model_buffer config for mem…
Browse files Browse the repository at this point in the history
…ory mapped cached blobs (openvinotoolkit#27822)

### Details:
- *Based on <s>new `import_model` API from PR openvinotoolkit#27644</s>&emsp;new
plugins property to pass mmap buffer from PR openvinotoolkit#27981*
- *Added `BlobContainer` class for `IGraph` objects that may derive with
`BlobContainerAlignedBuffer` for the new `import_model` API and
`BlobContainerVector` for the old one*
- *Refactored `getGraphHandle` function to allow passing `const uint8_t`
and `size_t` params instead of `std::vector<uint8_t>`*

### Tickets:
 - *157192*

---------

Signed-off-by: Alexandru Enache <[email protected]>
Co-authored-by: Alexandru Enache <[email protected]>
Co-authored-by: Oleg Pipikin <[email protected]>
  • Loading branch information
3 people authored and aobolensk committed Jan 28, 2025
1 parent d5a3c7e commit 4d5e1c7
Show file tree
Hide file tree
Showing 16 changed files with 161 additions and 51 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <memory>
#include <vector>

#include "openvino/runtime/shared_buffer.hpp"

namespace intel_npu {

/**
* @brief Abstract, read-only view over the bytes of a blob.
*
* Implementations decide how the bytes are stored and whether that storage can be given up
* while the container itself stays alive.
*/
class BlobContainer {
public:
/**
* @brief Returns the address at the beginning of the blob.
*/
virtual const void* get_ptr() const = 0;

/**
* @brief Size of the blob, in bytes.
*/
virtual size_t size() const = 0;

/**
* @brief Asks the container to drop the blob's storage.
* @return true if the storage was released, false if this container does not release it.
*/
virtual bool release_from_memory() = 0;

// Virtual so that implementations are destroyed correctly through a BlobContainer pointer.
virtual ~BlobContainer() = default;
};

/**
 * @brief BlobContainer that owns the blob bytes in a std::vector.
 */
class BlobContainerVector : public BlobContainer {
public:
    /**
     * @brief Takes ownership of the blob bytes.
     * @param blob Blob contents; moved into the container.
     */
    BlobContainerVector(std::vector<uint8_t> blob) : _blob(std::move(blob)) {}

    /**
     * @brief Returns the address of the first blob byte (the vector's data pointer).
     */
    const void* get_ptr() const override {
        return _blob.data();
    }

    /**
     * @brief Number of bytes currently held.
     */
    size_t size() const override {
        return _blob.size();
    }

    /**
     * @brief Frees the owned storage; afterwards size() is 0.
     * @return Always true.
     */
    bool release_from_memory() override {
        // shrink_to_fit() is only a non-binding request, so the allocation could survive it.
        // Swapping with an empty temporary hands the buffer to the temporary, whose destructor
        // frees it unconditionally.
        std::vector<uint8_t>().swap(_blob);
        return true;
    }

private:
    std::vector<uint8_t> _blob;
};

class BlobContainerAlignedBuffer : public BlobContainer {
public:
BlobContainerAlignedBuffer(const std::shared_ptr<ov::AlignedBuffer>& blobSO, size_t ovHeaderOffset, uint64_t size)
: _size(size),
_ovHeaderOffset(ovHeaderOffset),
_blobSO(blobSO) {}

const void* get_ptr() const override {
return _blobSO->get_ptr(_ovHeaderOffset);
}

size_t size() const override {
return _size;
}

bool release_from_memory() override {
return false;
}

private:
uint64_t _size;
size_t _ovHeaderOffset;
std::shared_ptr<ov::AlignedBuffer> _blobSO;
};

} // namespace intel_npu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class ICompilerAdapter {
public:
virtual std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model,
const Config& config) const = 0;
virtual std::shared_ptr<IGraph> parse(std::vector<uint8_t> network, const Config& config) const = 0;
virtual std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const = 0;
virtual ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const = 0;
virtual uint32_t get_version() const = 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <mutex>
#include <vector>

#include "intel_npu/common/blob_container.hpp"
#include "intel_npu/network_metadata.hpp"
#include "intel_npu/utils/zero/zero_init.hpp"
#include "intel_npu/utils/zero/zero_utils.hpp"
Expand All @@ -21,7 +22,7 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
IGraph(ze_graph_handle_t handle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob);
std::unique_ptr<BlobContainer> blobPtr);

virtual size_t export_blob(std::ostream& stream) const = 0;

Expand Down Expand Up @@ -89,7 +90,7 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
// first inference starts running
std::mutex _mutex;

std::vector<uint8_t> _blob;
std::unique_ptr<BlobContainer> _blobPtr;

uint32_t _unique_id = 0;
uint32_t _last_submitted_id;
Expand Down
9 changes: 3 additions & 6 deletions src/plugins/intel_npu/src/common/src/igraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,11 @@ namespace intel_npu {
IGraph::IGraph(ze_graph_handle_t handle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob)
std::unique_ptr<BlobContainer> blobPtr)
: _handle(handle),
_metadata(std::move(metadata)),
_logger("IGraph", config.get<LOG_LEVEL>()) {
if (blob.has_value()) {
_blob = std::move(*blob);
}
}
_blobPtr(std::move(blobPtr)),
_logger("IGraph", config.get<LOG_LEVEL>()) {}

const NetworkMetadata& IGraph::get_metadata() const {
return _metadata;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class DriverCompilerAdapter final : public ICompilerAdapter {

std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

std::shared_ptr<IGraph> parse(std::vector<uint8_t> network, const Config& config) const override;
std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const override;

ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class DriverGraph final : public IGraph {
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob);
std::unique_ptr<BlobContainer> blobPtr);

size_t export_blob(std::ostream& stream) const override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class PluginCompilerAdapter final : public ICompilerAdapter {

std::shared_ptr<IGraph> compile(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

std::shared_ptr<IGraph> parse(std::vector<uint8_t> network, const Config& config) const override;
std::shared_ptr<IGraph> parse(std::unique_ptr<BlobContainer> blobPtr, const Config& config) const override;

ov::SupportedOpsMap query(const std::shared_ptr<const ov::Model>& model, const Config& config) const override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class PluginGraph final : public IGraph {
const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct,
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
std::vector<uint8_t> blob,
std::unique_ptr<BlobContainer> blobPtr,
const Config& config);

size_t export_blob(std::ostream& stream) const override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class ZeGraphExtWrappers {
const std::string& buildFlags,
const uint32_t& flags) const;

ze_graph_handle_t getGraphHandle(const std::vector<uint8_t>& network) const;
ze_graph_handle_t getGraphHandle(const uint8_t& data, size_t size) const;

NetworkMetadata getNetworkMeta(ze_graph_handle_t graphHandle) const;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,14 +200,16 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::compile(const std::shared_ptr<con
graphHandle,
std::move(networkMeta),
config,
std::nullopt);
nullptr);
}

std::shared_ptr<IGraph> DriverCompilerAdapter::parse(std::vector<uint8_t> network, const Config& config) const {
std::shared_ptr<IGraph> DriverCompilerAdapter::parse(std::unique_ptr<BlobContainer> blobPtr,
const Config& config) const {
OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "DriverCompilerAdapter", "parse");

_logger.debug("parse start");
ze_graph_handle_t graphHandle = _zeGraphExt->getGraphHandle(network);
ze_graph_handle_t graphHandle =
_zeGraphExt->getGraphHandle(*reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()), blobPtr->size());
_logger.debug("parse end");

OV_ITT_TASK_NEXT(PARSE_BLOB, "getNetworkMeta");
Expand All @@ -218,7 +220,7 @@ std::shared_ptr<IGraph> DriverCompilerAdapter::parse(std::vector<uint8_t> networ
graphHandle,
std::move(networkMeta),
config,
std::optional<std::vector<uint8_t>>(std::move(network)));
std::move(blobPtr));
}

ov::SupportedOpsMap DriverCompilerAdapter::query(const std::shared_ptr<const ov::Model>& model,
Expand Down
11 changes: 6 additions & 5 deletions src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ DriverGraph::DriverGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
const Config& config,
std::optional<std::vector<uint8_t>> blob)
: IGraph(graphHandle, std::move(metadata), config, std::move(blob)),
std::unique_ptr<BlobContainer> blobPtr)
: IGraph(graphHandle, std::move(metadata), config, std::move(blobPtr)),
_zeGraphExt(zeGraphExt),
_zeroInitStruct(zeroInitStruct),
_logger("DriverGraph", config.get<LOG_LEVEL>()) {
Expand Down Expand Up @@ -140,7 +140,7 @@ void DriverGraph::initialize(const Config& config) {
}

bool DriverGraph::release_blob(const Config& config) {
if (_blob.empty() || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 ||
if (_blobPtr == nullptr || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 ||
config.get<PERF_COUNT>()) {
return false;
}
Expand All @@ -153,8 +153,9 @@ bool DriverGraph::release_blob(const Config& config) {
return false;
}

_blob.clear();
_blob.shrink_to_fit();
if (!_blobPtr->release_from_memory()) {
return false;
}

_logger.debug("Blob is released");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,16 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::compile(const std::shared_ptr<con

_logger.debug("compile start");
auto networkDesc = _compiler->compile(model, config);
auto blobPtr = std::make_unique<BlobContainerVector>(std::move(networkDesc.compiledNetwork));
_logger.debug("compile end");

ze_graph_handle_t graphHandle = nullptr;

if (_zeGraphExt) {
// Depending on the config, we may get an error when trying to get the graph handle from the compiled network
try {
graphHandle = _zeGraphExt->getGraphHandle(networkDesc.compiledNetwork);
graphHandle =
_zeGraphExt->getGraphHandle(*reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()), blobPtr->size());
} catch (...) {
_logger.info("Failed to obtain the level zero graph handle. Inference requests for this model are not "
"allowed. Only exports are available");
Expand All @@ -99,29 +101,36 @@ std::shared_ptr<IGraph> PluginCompilerAdapter::compile(const std::shared_ptr<con
_zeroInitStruct,
graphHandle,
std::move(networkDesc.metadata),
std::move(networkDesc.compiledNetwork),
std::move(blobPtr),
config);
}

std::shared_ptr<IGraph> PluginCompilerAdapter::parse(std::vector<uint8_t> network, const Config& config) const {
std::shared_ptr<IGraph> PluginCompilerAdapter::parse(std::unique_ptr<BlobContainer> blobPtr,
const Config& config) const {
OV_ITT_TASK_CHAIN(PARSE_BLOB, itt::domains::NPUPlugin, "PluginCompilerAdapter", "parse");

_logger.debug("parse start");
std::vector<uint8_t> network(blobPtr->size());
network.assign(reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()),
reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()) + blobPtr->size());
auto networkMeta = _compiler->parse(network, config);
network.clear();
network.shrink_to_fit();
_logger.debug("parse end");

ze_graph_handle_t graphHandle = nullptr;

if (_zeGraphExt) {
graphHandle = _zeGraphExt->getGraphHandle(network);
graphHandle =
_zeGraphExt->getGraphHandle(*reinterpret_cast<const uint8_t*>(blobPtr->get_ptr()), blobPtr->size());
}

return std::make_shared<PluginGraph>(_zeGraphExt,
_compiler,
_zeroInitStruct,
graphHandle,
std::move(networkMeta),
std::move(network),
std::move(blobPtr),
config);
}

Expand Down
19 changes: 12 additions & 7 deletions src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ PluginGraph::PluginGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,
const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct,
ze_graph_handle_t graphHandle,
NetworkMetadata metadata,
std::vector<uint8_t> blob,
std::unique_ptr<BlobContainer> blobPtr,
const Config& config)
: IGraph(graphHandle, std::move(metadata), config, std::optional<std::vector<uint8_t>>(std::move(blob))),
: IGraph(graphHandle, std::move(metadata), config, std::move(blobPtr)),
_zeGraphExt(zeGraphExt),
_zeroInitStruct(zeroInitStruct),
_compiler(compiler),
Expand All @@ -31,7 +31,7 @@ PluginGraph::PluginGraph(const std::shared_ptr<ZeGraphExtWrappers>& zeGraphExt,
}

size_t PluginGraph::export_blob(std::ostream& stream) const {
stream.write(reinterpret_cast<const char*>(_blob.data()), _blob.size());
stream.write(reinterpret_cast<const char*>(_blobPtr->get_ptr()), _blobPtr->size());

if (!stream) {
_logger.error("Write blob to stream failed. Blob is broken!");
Expand All @@ -40,21 +40,26 @@ size_t PluginGraph::export_blob(std::ostream& stream) const {

if (_logger.level() >= ov::log::Level::INFO) {
std::uint32_t result = 1171117u;
for (const uint8_t* it = _blob.data(); it != _blob.data() + _blob.size(); ++it) {
for (const uint8_t* it = reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr());
it != reinterpret_cast<const uint8_t*>(_blobPtr->get_ptr()) + _blobPtr->size();
++it) {
result = ((result << 7) + result) + static_cast<uint32_t>(*it);
}

std::stringstream str;
str << "Blob size: " << _blob.size() << ", hash: " << std::hex << result;
str << "Blob size: " << _blobPtr->size() << ", hash: " << std::hex << result;
_logger.info(str.str().c_str());
}
_logger.info("Write blob to stream successfully.");
return _blob.size();
return _blobPtr->size();
}

/**
 * @brief Forwards raw profiling data, together with a copy of the blob, to the compiler.
 *
 * The compiler call is given the blob as a std::vector<uint8_t>, so the bytes held by the
 * blob container are copied into a temporary vector for the duration of the call.
 *
 * @param profData Raw profiling output.
 * @param config Configuration forwarded to the compiler.
 * @return Whatever the compiler's process_profiling_output() returns.
 *
 * NOTE(review): _blobPtr is dereferenced without a null check — confirm every PluginGraph is
 * constructed with a blob container.
 */
std::vector<ov::ProfilingInfo> PluginGraph::process_profiling_output(const std::vector<uint8_t>& profData,
                                                                     const Config& config) const {
    const auto* blobData = static_cast<const uint8_t*>(_blobPtr->get_ptr());
    // Range-construct the copy; allocating size() zeroed bytes and then assign()-ing over
    // them writes the buffer twice.
    std::vector<uint8_t> blob(blobData, blobData + _blobPtr->size());
    return _compiler->process_profiling_output(profData, blob, config);
}

void PluginGraph::set_argument_value(uint32_t argi, const void* argv) const {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -365,19 +365,15 @@ ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(std::pair<size_t, std::shar
return graphHandle;
}

ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const std::vector<uint8_t>& network) const {
ze_graph_handle_t ZeGraphExtWrappers::getGraphHandle(const uint8_t& blobData, size_t blobSize) const {
ze_graph_handle_t graphHandle;

if (network.empty()) {
if (blobSize == 0) {
OPENVINO_THROW("Empty blob");
}

ze_graph_desc_t desc = {ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES,
nullptr,
ZE_GRAPH_FORMAT_NATIVE,
network.size(),
network.data(),
nullptr};
ze_graph_desc_t desc =
{ZE_STRUCTURE_TYPE_GRAPH_DESC_PROPERTIES, nullptr, ZE_GRAPH_FORMAT_NATIVE, blobSize, &blobData, nullptr};

_logger.debug("getGraphHandle - perform pfnCreate");
auto result = _zeroInitStruct->getGraphDdiTable().pfnCreate(_zeroInitStruct->getContext(),
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_npu/src/plugin/include/metrics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ class Metrics final {
ov::intel_npu::batch_mode.name(),
ov::hint::execution_mode.name()};

const std::vector<ov::PropertyName> _internalSupportedProperties = {ov::internal::caching_properties.name()};
const std::vector<ov::PropertyName> _internalSupportedProperties = {ov::internal::caching_properties.name(),
ov::internal::caching_with_mmap.name()};

// Metric to provide a hint for a range for number of async infer requests. (bottom bound, upper bound, step)
const std::tuple<uint32_t, uint32_t, uint32_t> _rangeForAsyncInferRequests{1u, 10u, 1u};
Expand Down
Loading

0 comments on commit 4d5e1c7

Please sign in to comment.