Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CPU] Add dumping of the memory statistics #28441

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
a700aaf
Add memory stats dump interface
maxnick Dec 9, 2024
13ce11e
Dump statistics initial working state
maxnick Dec 10, 2024
3ab9711
Store initial boxes in static manager as they are
maxnick Dec 10, 2024
d14103f
Fix optimal memory calculation algo
maxnick Dec 11, 2024
d2f5a3f
Add optimal memory size calculation for dynamic memory regions
maxnick Dec 11, 2024
b8a05ce
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Dec 12, 2024
3b7185d
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Dec 16, 2024
b63b482
Print unique boxes
maxnick Dec 16, 2024
4287d47
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Dec 16, 2024
54f62a4
Dump constants statistics
maxnick Dec 18, 2024
0254602
Remove redundant includes
maxnick Jan 7, 2025
258baa4
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 7, 2025
d761c35
Resolve merge conflict
maxnick Jan 7, 2025
186d257
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 9, 2025
4241086
Fix typos
maxnick Jan 9, 2025
94bd8ff
Partially put under the DEBUG_CAPS macro
maxnick Jan 10, 2025
142f266
Move memory statistics dumping into a specific routine
maxnick Jan 10, 2025
1080454
Add dump to csv
maxnick Jan 10, 2025
48505af
Split Debug Caps and ordinary compilation
maxnick Jan 14, 2025
258f1a8
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 15, 2025
c11eac5
Fix code format
maxnick Jan 15, 2025
0382ce8
Enhance docs with the information about mem stats usage
maxnick Jan 15, 2025
158285f
Clean up code
maxnick Jan 15, 2025
f75bd16
Fix code style
maxnick Jan 16, 2025
2047ce1
Support multi compiled model scenarios
maxnick Jan 16, 2025
9c4c38f
Use size_t accumulators in the statistics routines
maxnick Jan 17, 2025
3d7d9ef
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 17, 2025
1e7af3d
Fix merge
maxnick Jan 17, 2025
cb0d647
Fix clang tidy
maxnick Jan 17, 2025
2b76687
Remove level add cout as output type
maxnick Jan 22, 2025
6d93b32
Code style fix
maxnick Jan 22, 2025
5047c48
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 23, 2025
7f71ad1
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/plugins/intel_cpu/docs/debug_capabilities/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,8 @@ Use the following cmake option to enable debug capabilities:
Internal performance counter will be enabled automatically.
* [Average counters](average_counters.md)
`OV_CPU_AVERAGE_COUNTERS=filename`
* Memory statistics
`OV_CPU_MEMORY_STATISTICS_LEVEL=1`
Set this environment variable to dump memory usage statistics to the standard output when the compiled model is destroyed.
`OV_CPU_MEMORY_STATISTICS_PATH=<file_path>.csv`
Add this environment variable to dump memory usage statistics to a *.csv file.
12 changes: 10 additions & 2 deletions src/plugins/intel_cpu/src/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
#include "openvino/runtime/threading/cpu_streams_info.hpp"
#include "openvino/runtime/threading/executor_manager.hpp"
#include "openvino/util/common_util.hpp"
#include "transformations/transformation_pipeline.h"
#include "transformations/utils/utils.hpp"
#include "utils/debug_capabilities.h"
#include "utils/memory_stats_dump.hpp"
#include "utils/serialize.hpp"

#if defined(OV_CPU_WITH_ACL)
Expand All @@ -42,6 +42,14 @@ struct ImmediateSerialExecutor : public ov::threading::ITaskExecutor {
std::mutex _mutex;
};

// Tears down the compiled model.
// When the model was split into sub compiled models, release them and the
// shared memory table explicitly before member destruction — presumably to
// drop references into the shared SubMemoryManager while it is still alive
// (TODO confirm against SubMemoryManager ownership semantics).
CompiledModel::~CompiledModel() {
    if (m_has_sub_compiled_models) {
        m_sub_compiled_models.clear();
        m_sub_memory_manager->_memorys_table.clear();
    }
    // Debug-caps build only: dump per-graph memory usage statistics on destruction
    // (see docs/debug_capabilities/README.md, OV_CPU_MEMORY_STATISTICS_* variables).
    CPU_DEBUG_CAP_ENABLE(dumpMemoryStats(m_cfg.debugCaps, m_name, m_graphs, m_socketWeights));
}

CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const Config& cfg,
Expand Down
11 changes: 3 additions & 8 deletions src/plugins/intel_cpu/src/compiled_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ namespace intel_cpu {

class CompiledModel : public ov::ICompiledModel {
public:
typedef std::shared_ptr<CompiledModel> Ptr;

struct GraphGuard : public Graph {
std::mutex _mutex;
struct Lock : public std::unique_lock<std::mutex> {
Expand All @@ -30,20 +32,13 @@ class CompiledModel : public ov::ICompiledModel {
};

public:
typedef std::shared_ptr<CompiledModel> Ptr;

CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const Config& cfg,
const bool loaded_from_cache,
const std::shared_ptr<SubMemoryManager> sub_memory_manager = nullptr);

~CompiledModel() {
if (m_has_sub_compiled_models) {
m_sub_compiled_models.clear();
m_sub_memory_manager->_memorys_table.clear();
}
}
~CompiledModel();

std::shared_ptr<ov::IAsyncInferRequest> create_infer_request() const override;

Expand Down
4 changes: 4 additions & 0 deletions src/plugins/intel_cpu/src/cpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,10 @@ void MemoryBlockWithReuse::free() {
m_useExternalStorage = false;
}

// Returns the block size in bytes (per the declaration comment in cpu_memory.h).
// m_memUpperBound presumably tracks the high-water mark of sizes requested via
// resize() — TODO confirm against MemoryBlockWithReuse::resize(), not visible here.
size_t MemoryBlockWithReuse::size() const {
    return m_memUpperBound;
}

void MemoryBlockWithReuse::release(void* ptr) {}

void MemoryBlockWithReuse::destroy(void* ptr) {
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/cpu_memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class MemoryBlockWithReuse : public IMemoryBlock {
bool resize(size_t size) override;
bool hasExtBuffer() const noexcept override;
void free();
size_t size() const; // in bytes

private:
bool m_useExternalStorage = false;
Expand Down
11 changes: 10 additions & 1 deletion src/plugins/intel_cpu/src/dnnl_scratch_pad.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,25 @@ namespace intel_cpu {

/**
 * @brief Owns a reusable memory block that serves as a scratch pad for oneDNN primitives.
 *
 * The scratch pad memory grows on demand and is shared between the Memory objects
 * created through createScratchPadMem(). size() exposes the currently reserved
 * byte count of the underlying reusable block for memory statistics reporting.
 */
class DnnlScratchPad {
    MemoryBlockPtr blockPtr;
    // Non-owning observer of the MemoryBlockWithReuse wrapped by blockPtr;
    // its lifetime is bound to blockPtr, so it stays valid as long as this object lives.
    // Kept only to query the reserved size, which DnnlMemoryBlock does not expose.
    MemoryBlockWithReuse* baseBlockPtr = nullptr;
    dnnl::engine eng;

public:
    DnnlScratchPad(const dnnl::engine& eng, int numa_node = -1) : eng(eng) {
        auto baseMemoryBlock = make_unique<MemoryBlockWithReuse>(numa_node);
        baseBlockPtr = baseMemoryBlock.get();
        blockPtr = std::make_shared<DnnlMemoryBlock>(std::move(baseMemoryBlock));
    }

    // Creates a Memory object with the given descriptor backed by the shared scratch pad block.
    MemoryPtr createScratchPadMem(const MemoryDescPtr& md) {
        return std::make_shared<Memory>(eng, md, blockPtr);
    }

    // Currently reserved scratch pad size in bytes; 0 if no underlying block is attached.
    size_t size() const {
        return baseBlockPtr ? baseBlockPtr->size() : 0;
    }
};

using DnnlScratchPadPtr = std::shared_ptr<DnnlScratchPad>;
Expand Down
33 changes: 17 additions & 16 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model>& model,
}

// update output precisions of producers to avoid extra reorders
// do this only in case output configration is not provided explicitly
// do this only in case output configuration is not provided explicitly
if (outputConfigs.empty()) {
for (auto& output : outputNodesMap) {
const auto& outputNode = output.second;
Expand Down Expand Up @@ -736,7 +736,7 @@ static inline bool isConstOutput(EdgePtr edge) {
}

void Graph::AllocateWithReuse(const std::vector<size_t>& syncNodesInds) {
edgeClusters edge_clusters = MemoryControl::findEdgeClusters(graphEdges);
EdgeClusters edge_clusters = MemoryControl::findEdgeClusters(graphEdges);

size_t remaining_edge_clusters_count = edge_clusters.size();

Expand All @@ -754,7 +754,7 @@ void Graph::AllocateWithReuse(const std::vector<size_t>& syncNodesInds) {
// Special allocation for string tensors
if (edge->getDesc().getPrecision() == element::string &&
edge->getStatus() == Edge::Status::NeedAllocation) {
StringMemory::StringMemoryBlockPtr memBlcok;
StringMemory::StringMemoryBlockPtr memBlock;
if (edge->getParent()->isConstant()) {
if (edge->getParent()->getType() == Type::Input) {
auto constNode = static_cast<node::Input*>(edge->getParent().get());
Expand All @@ -769,11 +769,11 @@ void Graph::AllocateWithReuse(const std::vector<size_t>& syncNodesInds) {
"' and '",
edge->getChild()->getName(),
"' must have StringMemory.");
memBlcok = stringMemory->getStringMemoryBlockPtr();
memBlock = stringMemory->getStringMemoryBlockPtr();
} else {
auto memory = std::make_shared<StringMemory>(getEngine(), edge->getDesc());
edge->reuse(memory);
memBlcok = memory->getStringMemoryBlockPtr();
memBlock = memory->getStringMemoryBlockPtr();
}
for (auto& edge_c : cluster) {
if (edge_c == edge) {
Expand All @@ -782,7 +782,7 @@ void Graph::AllocateWithReuse(const std::vector<size_t>& syncNodesInds) {
OPENVINO_ASSERT(edge_c->getDesc().getPrecision() == element::string,
"All edges in the cluster must be string.");
if (edge_c->getStatus() == Edge::Status::NotAllocated) {
auto memory = std::make_shared<StringMemory>(getEngine(), edge_c->getDesc(), memBlcok);
auto memory = std::make_shared<StringMemory>(getEngine(), edge_c->getDesc(), memBlock);
edge_c->reuse(memory);
} else {
OPENVINO_THROW("[CPU] String tensors allocation in the cluster. Edge between nodes '",
Expand Down Expand Up @@ -912,10 +912,11 @@ void Graph::AllocateWithReuse(const std::vector<size_t>& syncNodesInds) {
memoryRegions.erase(it, memoryRegions.end());

// Set up the memory control subsystem.
this->m_pMemoryControl = &(getGraphContext()->getNetworkMemoryControl()->createMemoryControlUnit(syncNodesInds));
auto memoryBlocks = m_pMemoryControl->insert(memoryRegions);
this->m_pMemoryControl = &(getGraphContext()->getNetworkMemoryControl()->createMemoryControlUnit(this->_name));
m_pMemoryControl->insert(memoryRegions, syncNodesInds);
auto memoryBlocks = m_pMemoryControl->solve();

// attach all the not yet allocated edges to the memory contol
// attach all the not yet allocated edges to the memory control
for (auto&& item : memoryBlocks) {
int count = 0;
for (auto&& edge : edge_clusters[item.first]) {
Expand Down Expand Up @@ -1464,7 +1465,7 @@ void Graph::Infer(SyncInferRequest* request) {
const int numaId = GetNumaNodeId(m_context);

if (!m_pMemoryControl) {
OPENVINO_THROW("Memory control unit is not initilized in graph: ", GetName());
OPENVINO_THROW("Memory control unit is not initalized in graph: ", GetName());
}

if (!m_pMemoryControl->allocated()) {
Expand Down Expand Up @@ -1494,7 +1495,7 @@ void Graph::Infer(SyncInferRequest* request) {
void Graph::SortTopologically() {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::SortTopologically");

// Set execIndex of all nodes to default invaild value
// Set execIndex of all nodes to default invalid value
for (auto& node : graphNodes) {
node->execIndex = -1;
}
Expand Down Expand Up @@ -1806,8 +1807,8 @@ void Graph::EnforceInferencePrecision() {
if (one_of(parent->getType(),
Type::Convolution, // conv nets
Type::FullyConnected, // conv / bert nets
Type::RNNCell, // recurent nets
Type::RNNSeq, // recurent nets
Type::RNNCell, // recurrent nets
Type::RNNSeq, // recurrent nets
Type::MatMul, // bert nets
Type::ROIPooling, // object detection nets
Type::Interpolate, // super resolution nets
Expand Down Expand Up @@ -1837,7 +1838,7 @@ void Graph::EnforceInferencePrecision() {

/* Skip low-precision float point enforcement for tail of the graph by forming set of nodes to skip.
* Necessary to maintain accuracy.
* Experiments show zero peformance impact on average */
* Experiments show zero performance impact on average */
std::unordered_set<NodePtr> nodesToSkip;
// starting from output nodes
for (const auto& entry : outputNodesMap) {
Expand Down Expand Up @@ -1883,7 +1884,7 @@ void Graph::EnforceInferencePrecision() {
if (parent->getType() == Type::Input && one_of(node->getType(), Type::Eltwise, Type::Subgraph))
return true;

// exclude Convert after Range since it may cause precision loss when integter type to LP.
// exclude Convert after Range since it may cause precision loss when integer type to LP.
if (parent->getType() == Type::Range && node->getType() == Type::Convert) {
return true;
}
Expand Down Expand Up @@ -1912,7 +1913,7 @@ void Graph::EnforceInferencePrecision() {
if (node->getOriginalOutputPrecisionAtPort(i) != ov::element::f32)
continue;

// exclude Convert before Range since it may cause precision loss when integter type to LP.
// exclude Convert before Range since it may cause precision loss when integer type to LP.
// TODO: Incorrect subgraph is generated by ONNX FE + ticket 117861.
const auto& child = node->getChildEdgeAt(i)->getChild();
if (child->getType() == Type::Range && node->getType() == Type::Convert)
Expand Down
Loading
Loading