Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CPU] Add dumping of the memory statistics #28441

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
a700aaf
Add memory stats dump interface
maxnick Dec 9, 2024
13ce11e
Dump statistics initial working state
maxnick Dec 10, 2024
3ab9711
Store initial boxes in static manager as they are
maxnick Dec 10, 2024
d14103f
Fix optimal memory calculation algo
maxnick Dec 11, 2024
d2f5a3f
Add optimal memory size calculation for dynamic memory regions
maxnick Dec 11, 2024
b8a05ce
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Dec 12, 2024
3b7185d
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Dec 16, 2024
b63b482
Print unique boxes
maxnick Dec 16, 2024
4287d47
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Dec 16, 2024
54f62a4
Dump constants statistics
maxnick Dec 18, 2024
0254602
Remove redundant includes
maxnick Jan 7, 2025
258baa4
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 7, 2025
d761c35
Resolve merge conflict
maxnick Jan 7, 2025
186d257
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 9, 2025
4241086
Fix typos
maxnick Jan 9, 2025
94bd8ff
Partially put under the DEBUG_CAPS macro
maxnick Jan 10, 2025
142f266
Move memory statistics dumping into a specific routine
maxnick Jan 10, 2025
1080454
Add dump to csv
maxnick Jan 10, 2025
48505af
Split Debug Caps and ordinary compilation
maxnick Jan 14, 2025
258f1a8
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 15, 2025
c11eac5
Fix code format
maxnick Jan 15, 2025
0382ce8
Enhance docs with the information about mem stats usage
maxnick Jan 15, 2025
158285f
Clean up code
maxnick Jan 15, 2025
f75bd16
Fix code style
maxnick Jan 16, 2025
2047ce1
Support multi compiled model scenarios
maxnick Jan 16, 2025
9c4c38f
Use size_t accumulators in the statistics routines
maxnick Jan 17, 2025
3d7d9ef
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 17, 2025
1e7af3d
Fix merge
maxnick Jan 17, 2025
cb0d647
Fix clang tidy
maxnick Jan 17, 2025
2b76687
Remove level add cout as output type
maxnick Jan 22, 2025
6d93b32
Code style fix
maxnick Jan 22, 2025
5047c48
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 23, 2025
7f71ad1
Merge remote-tracking branch 'origin/master' into memory_stats
maxnick Jan 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/plugins/intel_cpu/docs/debug_capabilities/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,8 @@ Use the following cmake option to enable debug capabilities:
Internal performance counter will be enabled automatically.
* [Average counters](average_counters.md)
`OV_CPU_AVERAGE_COUNTERS=filename`
* Memory statistics
`OV_CPU_MEMORY_STATISTICS_LEVEL=1`
Set this environment variable to dump memory usage statistics to the standard output when the compiled model is destroyed.
`OV_CPU_MEMORY_STATISTICS_PATH=<file_path>.csv`
Add this environment variable to dump memory usage statistics to a *.csv file.
12 changes: 10 additions & 2 deletions src/plugins/intel_cpu/src/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
#include "openvino/runtime/threading/cpu_streams_info.hpp"
#include "openvino/runtime/threading/executor_manager.hpp"
#include "openvino/util/common_util.hpp"
#include "transformations/transformation_pipeline.h"
#include "transformations/utils/utils.hpp"
#include "utils/debug_capabilities.h"
#include "utils/memory_stats_dump.hpp"
#include "utils/serialize.hpp"

#if defined(OV_CPU_WITH_ACL)
Expand All @@ -42,6 +42,14 @@ struct ImmediateSerialExecutor : public ov::threading::ITaskExecutor {
std::mutex _mutex;
};

// Tears down the compiled model.
// When the model was split into sub compiled models, release them and the
// shared memory table explicitly before member destruction — presumably to
// drop references into the shared SubMemoryManager while it is still alive
// (TODO confirm against SubMemoryManager ownership semantics).
CompiledModel::~CompiledModel() {
    if (m_has_sub_compiled_models) {
        m_sub_compiled_models.clear();
        m_sub_memory_manager->_memorys_table.clear();
    }
    // Debug-caps build only: dump per-graph memory usage statistics on destruction
    // (see docs/debug_capabilities/README.md, OV_CPU_MEMORY_STATISTICS_* variables).
    CPU_DEBUG_CAP_ENABLE(dumpMemoryStats(m_cfg.debugCaps, m_name, m_graphs, m_socketWeights));
}

CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const Config& cfg,
Expand Down
11 changes: 3 additions & 8 deletions src/plugins/intel_cpu/src/compiled_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ namespace intel_cpu {

class CompiledModel : public ov::ICompiledModel {
public:
typedef std::shared_ptr<CompiledModel> Ptr;

struct GraphGuard : public Graph {
std::mutex _mutex;
struct Lock : public std::unique_lock<std::mutex> {
Expand All @@ -30,20 +32,13 @@ class CompiledModel : public ov::ICompiledModel {
};

public:
typedef std::shared_ptr<CompiledModel> Ptr;

CompiledModel(const std::shared_ptr<ov::Model>& model,
const std::shared_ptr<const ov::IPlugin>& plugin,
const Config& cfg,
const bool loaded_from_cache,
const std::shared_ptr<SubMemoryManager> sub_memory_manager = nullptr);

~CompiledModel() {
if (m_has_sub_compiled_models) {
m_sub_compiled_models.clear();
m_sub_memory_manager->_memorys_table.clear();
}
}
~CompiledModel();

std::shared_ptr<ov::IAsyncInferRequest> create_infer_request() const override;

Expand Down
4 changes: 4 additions & 0 deletions src/plugins/intel_cpu/src/cpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,10 @@ void MemoryBlockWithReuse::free() {
m_useExternalStorage = false;
}

// Returns the block size in bytes (per the declaration comment in cpu_memory.h).
// m_memUpperBound presumably tracks the high-water mark of sizes requested via
// resize() — TODO confirm against MemoryBlockWithReuse::resize(), not visible here.
size_t MemoryBlockWithReuse::size() const {
    return m_memUpperBound;
}

void MemoryBlockWithReuse::release(void* ptr) {}

void MemoryBlockWithReuse::destroy(void* ptr) {
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/cpu_memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class MemoryBlockWithReuse : public IMemoryBlock {
bool resize(size_t size) override;
bool hasExtBuffer() const noexcept override;
void free();
size_t size() const; // in bytes

private:
bool m_useExternalStorage = false;
Expand Down
11 changes: 10 additions & 1 deletion src/plugins/intel_cpu/src/dnnl_scratch_pad.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,25 @@ namespace intel_cpu {

/**
 * @brief Owns a reusable memory block that serves as a scratch pad for oneDNN primitives.
 *
 * The scratch pad memory grows on demand and is shared between the Memory objects
 * created through createScratchPadMem(). size() exposes the currently reserved
 * byte count of the underlying reusable block for memory statistics reporting.
 */
class DnnlScratchPad {
    MemoryBlockPtr blockPtr;
    // Non-owning observer of the MemoryBlockWithReuse wrapped by blockPtr;
    // its lifetime is bound to blockPtr, so it stays valid as long as this object lives.
    // Kept only to query the reserved size, which DnnlMemoryBlock does not expose.
    MemoryBlockWithReuse* baseBlockPtr = nullptr;
    dnnl::engine eng;

public:
    DnnlScratchPad(const dnnl::engine& eng, int numa_node = -1) : eng(eng) {
        auto baseMemoryBlock = make_unique<MemoryBlockWithReuse>(numa_node);
        baseBlockPtr = baseMemoryBlock.get();
        blockPtr = std::make_shared<DnnlMemoryBlock>(std::move(baseMemoryBlock));
    }

    // Creates a Memory object with the given descriptor backed by the shared scratch pad block.
    MemoryPtr createScratchPadMem(const MemoryDescPtr& md) {
        return std::make_shared<Memory>(eng, md, blockPtr);
    }

    // Currently reserved scratch pad size in bytes; 0 if no underlying block is attached.
    size_t size() const {
        return baseBlockPtr ? baseBlockPtr->size() : 0;
    }
};

using DnnlScratchPadPtr = std::shared_ptr<DnnlScratchPad>;
Expand Down
33 changes: 17 additions & 16 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model>& model,
}

// update output precisions of producers to avoid extra reorders
// do this only in case output configration is not provided explicitly
// do this only in case output configuration is not provided explicitly
if (outputConfigs.empty()) {
for (auto& output : outputNodesMap) {
const auto& outputNode = output.second;
Expand Down Expand Up @@ -736,7 +736,7 @@ static inline bool isConstOutput(EdgePtr edge) {
}

void Graph::AllocateWithReuse(const std::vector<size_t>& syncNodesInds) {
edgeClusters edge_clusters = MemoryControl::findEdgeClusters(graphEdges);
EdgeClusters edge_clusters = MemoryControl::findEdgeClusters(graphEdges);

size_t remaining_edge_clusters_count = edge_clusters.size();

Expand All @@ -754,7 +754,7 @@ void Graph::AllocateWithReuse(const std::vector<size_t>& syncNodesInds) {
// Special allocation for string tensors
if (edge->getDesc().getPrecision() == element::string &&
edge->getStatus() == Edge::Status::NeedAllocation) {
StringMemory::StringMemoryBlockPtr memBlcok;
StringMemory::StringMemoryBlockPtr memBlock;
if (edge->getParent()->isConstant()) {
if (edge->getParent()->getType() == Type::Input) {
auto constNode = static_cast<node::Input*>(edge->getParent().get());
Expand All @@ -769,11 +769,11 @@ void Graph::AllocateWithReuse(const std::vector<size_t>& syncNodesInds) {
"' and '",
edge->getChild()->getName(),
"' must have StringMemory.");
memBlcok = stringMemory->getStringMemoryBlockPtr();
memBlock = stringMemory->getStringMemoryBlockPtr();
} else {
auto memory = std::make_shared<StringMemory>(getEngine(), edge->getDesc());
edge->reuse(memory);
memBlcok = memory->getStringMemoryBlockPtr();
memBlock = memory->getStringMemoryBlockPtr();
}
for (auto& edge_c : cluster) {
if (edge_c == edge) {
Expand All @@ -782,7 +782,7 @@ void Graph::AllocateWithReuse(const std::vector<size_t>& syncNodesInds) {
OPENVINO_ASSERT(edge_c->getDesc().getPrecision() == element::string,
"All edges in the cluster must be string.");
if (edge_c->getStatus() == Edge::Status::NotAllocated) {
auto memory = std::make_shared<StringMemory>(getEngine(), edge_c->getDesc(), memBlcok);
auto memory = std::make_shared<StringMemory>(getEngine(), edge_c->getDesc(), memBlock);
edge_c->reuse(memory);
} else {
OPENVINO_THROW("[CPU] String tensors allocation in the cluster. Edge between nodes '",
Expand Down Expand Up @@ -912,10 +912,11 @@ void Graph::AllocateWithReuse(const std::vector<size_t>& syncNodesInds) {
memoryRegions.erase(it, memoryRegions.end());

// Set up the memory control subsystem.
this->m_pMemoryControl = &(getGraphContext()->getNetworkMemoryControl()->createMemoryControlUnit(syncNodesInds));
auto memoryBlocks = m_pMemoryControl->insert(memoryRegions);
this->m_pMemoryControl = &(getGraphContext()->getNetworkMemoryControl()->createMemoryControlUnit(this->_name));
m_pMemoryControl->insert(memoryRegions, syncNodesInds);
auto memoryBlocks = m_pMemoryControl->solve();

// attach all the not yet allocated edges to the memory contol
// attach all the not yet allocated edges to the memory control
for (auto&& item : memoryBlocks) {
int count = 0;
for (auto&& edge : edge_clusters[item.first]) {
Expand Down Expand Up @@ -1464,7 +1465,7 @@ void Graph::Infer(SyncInferRequest* request) {
const int numaId = GetNumaNodeId(m_context);

if (!m_pMemoryControl) {
OPENVINO_THROW("Memory control unit is not initilized in graph: ", GetName());
OPENVINO_THROW("Memory control unit is not initalized in graph: ", GetName());
}

if (!m_pMemoryControl->allocated()) {
Expand Down Expand Up @@ -1494,7 +1495,7 @@ void Graph::Infer(SyncInferRequest* request) {
void Graph::SortTopologically() {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "Graph::SortTopologically");

// Set execIndex of all nodes to default invaild value
// Set execIndex of all nodes to default invalid value
for (auto& node : graphNodes) {
node->execIndex = -1;
}
Expand Down Expand Up @@ -1806,8 +1807,8 @@ void Graph::EnforceInferencePrecision() {
if (one_of(parent->getType(),
Type::Convolution, // conv nets
Type::FullyConnected, // conv / bert nets
Type::RNNCell, // recurent nets
Type::RNNSeq, // recurent nets
Type::RNNCell, // recurrent nets
Type::RNNSeq, // recurrent nets
Type::MatMul, // bert nets
Type::ROIPooling, // object detection nets
Type::Interpolate, // super resolution nets
Expand Down Expand Up @@ -1837,7 +1838,7 @@ void Graph::EnforceInferencePrecision() {

/* Skip low-precision float point enforcement for tail of the graph by forming set of nodes to skip.
* Necessary to maintain accuracy.
* Experiments show zero peformance impact on average */
* Experiments show zero performance impact on average */
std::unordered_set<NodePtr> nodesToSkip;
// starting from output nodes
for (const auto& entry : outputNodesMap) {
Expand Down Expand Up @@ -1883,7 +1884,7 @@ void Graph::EnforceInferencePrecision() {
if (parent->getType() == Type::Input && one_of(node->getType(), Type::Eltwise, Type::Subgraph))
return true;

// exclude Convert after Range since it may cause precision loss when integter type to LP.
// exclude Convert after Range since it may cause precision loss when integer type to LP.
if (parent->getType() == Type::Range && node->getType() == Type::Convert) {
return true;
}
Expand Down Expand Up @@ -1912,7 +1913,7 @@ void Graph::EnforceInferencePrecision() {
if (node->getOriginalOutputPrecisionAtPort(i) != ov::element::f32)
continue;

// exclude Convert before Range since it may cause precision loss when integter type to LP.
// exclude Convert before Range since it may cause precision loss when integer type to LP.
// TODO: Incorrect subgraph is generated by ONNX FE + ticket 117861.
const auto& child = node->getChildEdgeAt(i)->getChild();
if (child->getType() == Type::Range && node->getType() == Type::Convert)
Expand Down
Loading
Loading