If operation memory reuse

openvinotoolkit · Nov 12, 2024 · 18563de · 18563de
1 parent b002dd0
commit 18563de
Show file tree

Hide file tree

Showing 10 changed files with 491 additions and 156 deletions.
diff --git a/src/plugins/intel_cpu/src/cpu_memory.cpp b/src/plugins/intel_cpu/src/cpu_memory.cpp
@@ -204,10 +204,24 @@ void MemoryBlockWithReuse::setExtBuff(void *ptr, size_t size) {
     m_data = decltype(m_data)(ptr, release);
 }
 
+// class MemoryUsage {
+// public:
+//     MemoryUsage() {}
+
+//     ~MemoryUsage() {
+//         std::cout << "Total memory usage: " << total << "\n";
+//     }
+
+//     int total = 0;
+// };
+
 bool MemoryBlockWithReuse::resize(size_t size) {
+    // static MemoryUsage mu;
+
     constexpr int cacheLineSize = 64;
     bool sizeChanged = false;
     if (size > m_memUpperBound) {
+        // mu.total += size;
         void *ptr = dnnl::impl::malloc(size, cacheLineSize);
         if (!ptr) {
             OPENVINO_THROW("Failed to allocate ", size, " bytes of memory");

diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
@@ -20,6 +20,7 @@
 #include <vector>
 
 #include "allocation_context.hpp"
+#include "cpu_types.h"
 #include "edge.h"
 #include "graph_context.h"
 #include "graph_dumper.h"
@@ -283,9 +284,9 @@ static std::tuple<std::vector<NodePtr>, std::vector<size_t>> ExtractExecutableNo
     std::vector<NodePtr> executableGraphNodes;
     for (size_t i = 0; i < graphNodes.size(); i++) {
         const auto& graphNode = graphNodes[i];
-        // if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or
+        if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or
         // if ((!graphNode->isConstant()) || // non-constant executable or
-        if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or
+        // if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or
             (graphNode->isDynamicNode() && !one_of(graphNode->getType(), Type::Input, Type::Output))) { // dynamic, except inputs / outputs
             /* @todo
              * Revise implementation.
@@ -816,7 +817,7 @@ static void AllocateBaseEdges(const EdgeClusters& edgeClusters,
         int count = 0;
         // std::cout << "Processing cluster: " << item.first << "\n";
         for (auto&& edge : edgeClusters[item.first]) {
-            // std::cout << "Processing edge: " << edge->name() << "\n";
+            // std::cout << "Processing base edge: " << edge->name() << "\n";
             if (edge->getStatus() == Edge::Status::NeedAllocation) {
                 // std::cout << "Allocating edge: " << edge->name() << "\n";
 
@@ -851,7 +852,7 @@ static void AllocatedReferencingEdges(const EdgeClusters& clusters) {
             }
 
             std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) {
-                // std::cout << "Processing edge: " << edge->name() << "\n";
+                // std::cout << "Processing referencing edge: " << edge->name() << "\n";
                 if (edge->getStatus() == Edge::Status::NotAllocated) {
                     if (edge->inPlace(Edge::LOOK_DOWN)) {
                         edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN);
@@ -935,15 +936,18 @@ int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) {
     auto syncNodesInds = CreateExecutionGraph();
 
     ResolveInOutInPlaceEdges(graphEdges);
+    // std::cout << "RegisterToAllocationContext: " << offset << "\n";
 
     // nodes are expected to be topologically sorted
     for (size_t execIndex = 0, j = 0; execIndex < graphNodes.size(); execIndex++) {
         const auto& node = graphNodes[execIndex];
-        const auto inputExecIndex = execIndex + offset;
+        const auto inputExecIndex = offset;
         // an offset is the number of nodes in the internal graph minus the current node (-1)
-        offset = node->registerToAllocationContext(inputExecIndex, context) - 1;
-        const auto outputExecIndex = execIndex + offset;
+        offset = node->registerToAllocationContext(inputExecIndex, context);
+        const auto outputExecIndex = offset;
+        offset++;
         context.execIndex[node] = {inputExecIndex, outputExecIndex};
+        // std::cout << node->getName() << " - " << "[" << inputExecIndex << "," << outputExecIndex << "] offset " << offset << "\n";
 
         if (j < syncNodesInds.size() && syncNodesInds[j] == execIndex) {
             context.syncPoints.push_back(inputExecIndex);
@@ -953,7 +957,7 @@ int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) {
 
     context.edges.insert(context.edges.end(), graphEdges.begin(), graphEdges.end());
 
-    return offset;
+    return offset - 1;
 }
 
 AllocationContext Graph::CreateAllocationContext(bool global) {
@@ -1027,8 +1031,9 @@ static EdgeClusters FormEdgeClusters(const std::vector<EdgePtr>& graphEdges) {
 
             addToCluster(edge->getSharedEdge(std::nothrow));
 
-            edgeClusterIndices.emplace(edge, clusterIdx);
-            edgeClusters[clusterIdx].push_back(edge);
+            if (edgeClusterIndices.emplace(edge, clusterIdx).second) {
+                edgeClusters[clusterIdx].push_back(edge);
+            }
         };
 
         addToCluster(edge);
@@ -1058,17 +1063,22 @@ static MemoryRegions FormMemoryRegions(const EdgeClusters& clusters,
 
         int64_t boxSize = 0;
         bool isConst = false, isOutput = false, isInput = false;
+
         // std::cout << "Form memory region for cluster: " << i << "\n";
+
         for (auto &edge : clusters[i]) {
             const auto& parent = edge->getParent();
             const auto& child = edge->getChild();
 
-            // std::cout << "[" << globalExecIndex.at(parent).second << " - " << globalExecIndex.at(child).first << "]"
+            // @todo this can be considered a property of the node: whether it is going to use input / output memory multiple times
+            // in the scope of its execution routine
+            int e_start = parent->getType() == Type::TensorIterator ?  globalExecIndex.at(parent).first : globalExecIndex.at(parent).second;
+            int e_finish = child->getType() == Type::TensorIterator ?  globalExecIndex.at(child).second : globalExecIndex.at(child).first;
+
+            // std::cout << "[" << e_start << " - " << e_finish << "]"
             //           << edge->name()
             //           << "\n";
 
-            int e_start = globalExecIndex.at(parent).second;
-            int e_finish = globalExecIndex.at(child).first;
             // int e_finish = edge->getChild()->getExecIndex();
 
             auto&& desc = edge->getDesc();
@@ -1187,7 +1197,12 @@ SolveMemoryReuse(MemoryControl* memoryControl,
 }
 
 void Graph::Allocate() {
-    const auto globalAllocation = m_context->memoryReuseGlobal();
+    auto globalAllocation = m_context->memoryReuseGlobal();
+
+    if (std::getenv("LOCAL_REUSE")) {
+        globalAllocation = false;
+    }
+
     // Set up the memory control subsystem.
     auto memoryControl = globalAllocation ? m_context->getMemoryControl() : m_context->getNetworkMemoryControl()->createMemoryControlUnit();
 
@@ -1214,7 +1229,8 @@ void Graph::Allocate() {
     // for (const auto& edge : edges) {
     //     const auto& parent = edge->getParent();
     //     const auto& child = edge->getChild();
-    //     std::cout << "[" << allocationContext.execIndex[parent].second << " - " << allocationContext.execIndex[child].first << "]"
+    //     std::cout << "[" << allocationContext.execIndex[parent].second << " - "
+    //               << (child->getType() == Type::TensorIterator ? allocationContext.execIndex[child].second : allocationContext.execIndex[child].first) << "]"
     //               << edge->name()
     //               << "\n";
     // }

diff --git a/src/plugins/intel_cpu/src/memory_control.cpp b/src/plugins/intel_cpu/src/memory_control.cpp
@@ -300,6 +300,15 @@ MemoryControl::RegionHandlerPtr buildHandler(F&& f, Args&&... args) {
 MemoryControl::MemoryControl() {
     // init handlers
     // handler for dynamic tensors
+    if (std::getenv("DISABLE_REUSE")) {
+        // DISABLE_REUSE is set: a single I/O-style handler whose predicate accepts every region, so far simply individual blocks
+        m_handlers.emplace_back(buildHandler<MemoryManagerIO>([](const MemoryRegion& reg) {
+            return true;
+        }));
+
+        return;
+    }
+
     m_handlers.emplace_back(buildHandler<MemoryManagerStatic>([](const MemoryRegion& reg) {
         if (reg.size < 0 || MemoryRegion::RegionType::VARIABLE != reg.type ||
             MemoryRegion::AllocType::POD != reg.alloc_type) {

diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h
@@ -519,7 +519,7 @@ class Node {
 
     virtual int registerToAllocationContext(int offset, AllocationContext& context) {
         (void) context;
-        return offset + 1;
+        return offset;
     }
 
     const std::string & getTypeStr() const {

diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp
@@ -45,7 +45,7 @@ void Composite::selectOptimalPrimitiveDescriptor() {
     std::vector<Input::InputConfig> graphInputConfig;
 
     // @todo should be always inplace after global memory reuse is fully supported by all the nodes
-    bool isInPlace = context->memoryReuseGlobal();
+    bool isInPlace = true;
 
     for (size_t i = 0; i < getParentEdges().size(); i++) {
         auto desc = getParentOutputMemDesc(getParentEdgeAt(i));