Skip to content

Commit

Permalink
If operation memory reuse
Browse files Browse the repository at this point in the history
  • Loading branch information
EgorDuplensky committed Nov 12, 2024
1 parent b002dd0 commit 18563de
Show file tree
Hide file tree
Showing 10 changed files with 491 additions and 156 deletions.
14 changes: 14 additions & 0 deletions src/plugins/intel_cpu/src/cpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,24 @@ void MemoryBlockWithReuse::setExtBuff(void *ptr, size_t size) {
m_data = decltype(m_data)(ptr, release);
}

// class MemoryUsage {
// public:
// MemoryUsage() {}

// ~MemoryUsage() {
// std::cout << "Total memory usage: " << total << "\n";
// }

// int total = 0;
// };

bool MemoryBlockWithReuse::resize(size_t size) {
// static MemoryUsage mu;

constexpr int cacheLineSize = 64;
bool sizeChanged = false;
if (size > m_memUpperBound) {
// mu.total += size;
void *ptr = dnnl::impl::malloc(size, cacheLineSize);
if (!ptr) {
OPENVINO_THROW("Failed to allocate ", size, " bytes of memory");
Expand Down
46 changes: 31 additions & 15 deletions src/plugins/intel_cpu/src/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <vector>

#include "allocation_context.hpp"
#include "cpu_types.h"
#include "edge.h"
#include "graph_context.h"
#include "graph_dumper.h"
Expand Down Expand Up @@ -283,9 +284,9 @@ static std::tuple<std::vector<NodePtr>, std::vector<size_t>> ExtractExecutableNo
std::vector<NodePtr> executableGraphNodes;
for (size_t i = 0; i < graphNodes.size(); i++) {
const auto& graphNode = graphNodes[i];
// if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or
if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or
// if ((!graphNode->isConstant()) || // non-constant executable or
if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or
// if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or
(graphNode->isDynamicNode() && !one_of(graphNode->getType(), Type::Input, Type::Output))) { // dynamic, except inputs / outputs
/* @todo
* Revise implementation.
Expand Down Expand Up @@ -816,7 +817,7 @@ static void AllocateBaseEdges(const EdgeClusters& edgeClusters,
int count = 0;
// std::cout << "Processing cluster: " << item.first << "\n";
for (auto&& edge : edgeClusters[item.first]) {
// std::cout << "Processing edge: " << edge->name() << "\n";
// std::cout << "Processing base edge: " << edge->name() << "\n";
if (edge->getStatus() == Edge::Status::NeedAllocation) {
// std::cout << "Allocating edge: " << edge->name() << "\n";

Expand Down Expand Up @@ -851,7 +852,7 @@ static void AllocatedReferencingEdges(const EdgeClusters& clusters) {
}

std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) {
// std::cout << "Processing edge: " << edge->name() << "\n";
// std::cout << "Processing referencing edge: " << edge->name() << "\n";
if (edge->getStatus() == Edge::Status::NotAllocated) {
if (edge->inPlace(Edge::LOOK_DOWN)) {
edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN);
Expand Down Expand Up @@ -935,15 +936,18 @@ int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) {
auto syncNodesInds = CreateExecutionGraph();

ResolveInOutInPlaceEdges(graphEdges);
// std::cout << "RegisterToAllocationContext: " << offset << "\n";

// nodes are expected to be topologically sorted
for (size_t execIndex = 0, j = 0; execIndex < graphNodes.size(); execIndex++) {
const auto& node = graphNodes[execIndex];
const auto inputExecIndex = execIndex + offset;
const auto inputExecIndex = offset;
// an offset is the number of nodes in the internal graph minus the current node (-1)
offset = node->registerToAllocationContext(inputExecIndex, context) - 1;
const auto outputExecIndex = execIndex + offset;
offset = node->registerToAllocationContext(inputExecIndex, context);
const auto outputExecIndex = offset;
offset++;
context.execIndex[node] = {inputExecIndex, outputExecIndex};
// std::cout << node->getName() << " - " << "[" << inputExecIndex << "," << outputExecIndex << "] offset " << offset << "\n";

if (j < syncNodesInds.size() && syncNodesInds[j] == execIndex) {
context.syncPoints.push_back(inputExecIndex);
Expand All @@ -953,7 +957,7 @@ int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) {

context.edges.insert(context.edges.end(), graphEdges.begin(), graphEdges.end());

return offset;
return offset - 1;
}

AllocationContext Graph::CreateAllocationContext(bool global) {
Expand Down Expand Up @@ -1027,8 +1031,9 @@ static EdgeClusters FormEdgeClusters(const std::vector<EdgePtr>& graphEdges) {

addToCluster(edge->getSharedEdge(std::nothrow));

edgeClusterIndices.emplace(edge, clusterIdx);
edgeClusters[clusterIdx].push_back(edge);
if (edgeClusterIndices.emplace(edge, clusterIdx).second) {
edgeClusters[clusterIdx].push_back(edge);
}
};

addToCluster(edge);
Expand Down Expand Up @@ -1058,17 +1063,22 @@ static MemoryRegions FormMemoryRegions(const EdgeClusters& clusters,

int64_t boxSize = 0;
bool isConst = false, isOutput = false, isInput = false;

// std::cout << "Form memory region for cluster: " << i << "\n";

for (auto &edge : clusters[i]) {
const auto& parent = edge->getParent();
const auto& child = edge->getChild();

// std::cout << "[" << globalExecIndex.at(parent).second << " - " << globalExecIndex.at(child).first << "]"
// @todo this can be considered a property of the node: whether it is going to use input / output memory multiple times
// in the scope of its execution routine
int e_start = parent->getType() == Type::TensorIterator ? globalExecIndex.at(parent).first : globalExecIndex.at(parent).second;
int e_finish = child->getType() == Type::TensorIterator ? globalExecIndex.at(child).second : globalExecIndex.at(child).first;

// std::cout << "[" << e_start << " - " << e_finish << "]"
// << edge->name()
// << "\n";

int e_start = globalExecIndex.at(parent).second;
int e_finish = globalExecIndex.at(child).first;
// int e_finish = edge->getChild()->getExecIndex();

auto&& desc = edge->getDesc();
Expand Down Expand Up @@ -1187,7 +1197,12 @@ SolveMemoryReuse(MemoryControl* memoryControl,
}

void Graph::Allocate() {
const auto globalAllocation = m_context->memoryReuseGlobal();
auto globalAllocation = m_context->memoryReuseGlobal();

if (std::getenv("LOCAL_REUSE")) {
globalAllocation = false;
}

// Set up the memory control subsystem.
auto memoryControl = globalAllocation ? m_context->getMemoryControl() : m_context->getNetworkMemoryControl()->createMemoryControlUnit();

Expand All @@ -1214,7 +1229,8 @@ void Graph::Allocate() {
// for (const auto& edge : edges) {
// const auto& parent = edge->getParent();
// const auto& child = edge->getChild();
// std::cout << "[" << allocationContext.execIndex[parent].second << " - " << allocationContext.execIndex[child].first << "]"
// std::cout << "[" << allocationContext.execIndex[parent].second << " - "
// << (child->getType() == Type::TensorIterator ? allocationContext.execIndex[child].second : allocationContext.execIndex[child].first) << "]"
// << edge->name()
// << "\n";
// }
Expand Down
9 changes: 9 additions & 0 deletions src/plugins/intel_cpu/src/memory_control.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,15 @@ MemoryControl::RegionHandlerPtr buildHandler(F&& f, Args&&... args) {
MemoryControl::MemoryControl() {
// init handlers
// handler for dynamic tensors
if (std::getenv("DISABLE_REUSE")) {
//handler for I/O tensors, so far simply individual blocks
m_handlers.emplace_back(buildHandler<MemoryManagerIO>([](const MemoryRegion& reg) {
return true;
}));

return;
}

m_handlers.emplace_back(buildHandler<MemoryManagerStatic>([](const MemoryRegion& reg) {
if (reg.size < 0 || MemoryRegion::RegionType::VARIABLE != reg.type ||
MemoryRegion::AllocType::POD != reg.alloc_type) {
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/node.h
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ class Node {

// Default allocation-context registration hook: ignores `context` and derives its result from `offset` alone.
// NOTE(review): two consecutive return statements follow. As written, only the first
// (`offset + 1`) can execute and the second (`offset`) is unreachable. This looks like the
// removed and added lines of a diff shown together; confirm which one is intended.
virtual int registerToAllocationContext(int offset, AllocationContext& context) {
(void) context;  // silence the unused-parameter warning
return offset + 1;
return offset;
}

const std::string & getTypeStr() const {
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/nodes/composite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ void Composite::selectOptimalPrimitiveDescriptor() {
std::vector<Input::InputConfig> graphInputConfig;

// @todo should always be in-place once global memory reuse is fully supported by all the nodes
bool isInPlace = context->memoryReuseGlobal();
bool isInPlace = true;

for (size_t i = 0; i < getParentEdges().size(); i++) {
auto desc = getParentOutputMemDesc(getParentEdgeAt(i));
Expand Down
Loading

0 comments on commit 18563de

Please sign in to comment.