diff --git a/src/plugins/intel_cpu/src/cpu_memory.cpp b/src/plugins/intel_cpu/src/cpu_memory.cpp index 8e5fe8d72fd1f2..ad9448645914ce 100644 --- a/src/plugins/intel_cpu/src/cpu_memory.cpp +++ b/src/plugins/intel_cpu/src/cpu_memory.cpp @@ -204,10 +204,24 @@ void MemoryBlockWithReuse::setExtBuff(void *ptr, size_t size) { m_data = decltype(m_data)(ptr, release); } +// class MemoryUsage { +// public: +// MemoryUsage() {} + +// ~MemoryUsage() { +// std::cout << "Total memory usage: " << total << "\n"; +// } + +// int total = 0; +// }; + bool MemoryBlockWithReuse::resize(size_t size) { + // static MemoryUsage mu; + constexpr int cacheLineSize = 64; bool sizeChanged = false; if (size > m_memUpperBound) { + // mu.total += size; void *ptr = dnnl::impl::malloc(size, cacheLineSize); if (!ptr) { OPENVINO_THROW("Failed to allocate ", size, " bytes of memory"); diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 7b519bd75d3fcf..494b3ecc00f9d9 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -20,6 +20,7 @@ #include #include "allocation_context.hpp" +#include "cpu_types.h" #include "edge.h" #include "graph_context.h" #include "graph_dumper.h" @@ -283,9 +284,9 @@ static std::tuple, std::vector> ExtractExecutableNo std::vector executableGraphNodes; for (size_t i = 0; i < graphNodes.size(); i++) { const auto& graphNode = graphNodes[i]; - // if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or + if ((!graphNode->isConstant() && CPU_DEBUG_CAPS_ALWAYS_TRUE(!graphNode->canBeSkipped())) || // non-constant executable or // if ((!graphNode->isConstant()) || // non-constant executable or - if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or + // if ((!graphNode->isConstant() && !graphNode->canBeSkipped()) || // non-constant executable or (graphNode->isDynamicNode() && !one_of(graphNode->getType(), 
Type::Input, Type::Output))) { // dynamic, except inputs / outputs /* @todo * Revise implementation. @@ -816,7 +817,7 @@ static void AllocateBaseEdges(const EdgeClusters& edgeClusters, int count = 0; // std::cout << "Processing cluster: " << item.first << "\n"; for (auto&& edge : edgeClusters[item.first]) { - // std::cout << "Processing edge: " << edge->name() << "\n"; + // std::cout << "Processing base edge: " << edge->name() << "\n"; if (edge->getStatus() == Edge::Status::NeedAllocation) { // std::cout << "Allocating edge: " << edge->name() << "\n"; @@ -851,7 +852,7 @@ static void AllocatedReferencingEdges(const EdgeClusters& clusters) { } std::for_each(edges_to_process.rbegin(), edges_to_process.rend(), [](const EdgePtr& edge) { - // std::cout << "Processing edge: " << edge->name() << "\n"; + // std::cout << "Processing referencing edge: " << edge->name() << "\n"; if (edge->getStatus() == Edge::Status::NotAllocated) { if (edge->inPlace(Edge::LOOK_DOWN)) { edge->getChild()->resolveInPlaceEdges(Edge::LOOK_DOWN); @@ -935,15 +936,18 @@ int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) { auto syncNodesInds = CreateExecutionGraph(); ResolveInOutInPlaceEdges(graphEdges); + // std::cout << "RegisterToAllocationContext: " << offset << "\n"; // nodes are expected to be topologically sorted for (size_t execIndex = 0, j = 0; execIndex < graphNodes.size(); execIndex++) { const auto& node = graphNodes[execIndex]; - const auto inputExecIndex = execIndex + offset; + const auto inputExecIndex = offset; // an offset is the number of nodes in the internal graph minus the current node (-1) - offset = node->registerToAllocationContext(inputExecIndex, context) - 1; - const auto outputExecIndex = execIndex + offset; + offset = node->registerToAllocationContext(inputExecIndex, context); + const auto outputExecIndex = offset; + offset++; context.execIndex[node] = {inputExecIndex, outputExecIndex}; + // std::cout << node->getName() << " - " << "[" << 
inputExecIndex << "," << outputExecIndex << "] offset " << offset << "\n"; if (j < syncNodesInds.size() && syncNodesInds[j] == execIndex) { context.syncPoints.push_back(inputExecIndex); @@ -953,7 +957,7 @@ int Graph::RegisterToAllocationContext(int offset, AllocationContext& context) { context.edges.insert(context.edges.end(), graphEdges.begin(), graphEdges.end()); - return offset; + return offset - 1; } AllocationContext Graph::CreateAllocationContext(bool global) { @@ -1027,8 +1031,9 @@ static EdgeClusters FormEdgeClusters(const std::vector& graphEdges) { addToCluster(edge->getSharedEdge(std::nothrow)); - edgeClusterIndices.emplace(edge, clusterIdx); - edgeClusters[clusterIdx].push_back(edge); + if (edgeClusterIndices.emplace(edge, clusterIdx).second) { + edgeClusters[clusterIdx].push_back(edge); + } }; addToCluster(edge); @@ -1058,17 +1063,22 @@ static MemoryRegions FormMemoryRegions(const EdgeClusters& clusters, int64_t boxSize = 0; bool isConst = false, isOutput = false, isInput = false; + // std::cout << "Form memory region for cluster: " << i << "\n"; + for (auto &edge : clusters[i]) { const auto& parent = edge->getParent(); const auto& child = edge->getChild(); - // std::cout << "[" << globalExecIndex.at(parent).second << " - " << globalExecIndex.at(child).first << "]" + // @todo this can be considered as a property of the node, whether it is going to use input / output memory multiple times + // in scope of its execution routine + int e_start = parent->getType() == Type::TensorIterator ? globalExecIndex.at(parent).first : globalExecIndex.at(parent).second; + int e_finish = child->getType() == Type::TensorIterator ? 
globalExecIndex.at(child).second : globalExecIndex.at(child).first; + + // std::cout << "[" << e_start << " - " << e_finish << "]" // << edge->name() // << "\n"; - int e_start = globalExecIndex.at(parent).second; - int e_finish = globalExecIndex.at(child).first; // int e_finish = edge->getChild()->getExecIndex(); auto&& desc = edge->getDesc(); @@ -1187,7 +1197,12 @@ SolveMemoryReuse(MemoryControl* memoryControl, } void Graph::Allocate() { - const auto globalAllocation = m_context->memoryReuseGlobal(); + auto globalAllocation = m_context->memoryReuseGlobal(); + + if (std::getenv("LOCAL_REUSE")) { + globalAllocation = false; + } + // Set up the memory control subsystem. auto memoryControl = globalAllocation ? m_context->getMemoryControl() : m_context->getNetworkMemoryControl()->createMemoryControlUnit(); @@ -1214,7 +1229,8 @@ void Graph::Allocate() { // for (const auto& edge : edges) { // const auto& parent = edge->getParent(); // const auto& child = edge->getChild(); - // std::cout << "[" << allocationContext.execIndex[parent].second << " - " << allocationContext.execIndex[child].first << "]" + // std::cout << "[" << allocationContext.execIndex[parent].second << " - " + // << (child->getType() == Type::TensorIterator ? allocationContext.execIndex[child].second : allocationContext.execIndex[child].first) << "]" // << edge->name() // << "\n"; // } diff --git a/src/plugins/intel_cpu/src/memory_control.cpp b/src/plugins/intel_cpu/src/memory_control.cpp index 8f1885daeae8fc..443dab8230e30d 100644 --- a/src/plugins/intel_cpu/src/memory_control.cpp +++ b/src/plugins/intel_cpu/src/memory_control.cpp @@ -300,6 +300,15 @@ MemoryControl::RegionHandlerPtr buildHandler(F&& f, Args&&... 
args) { MemoryControl::MemoryControl() { // init handlers // handler for dynamic tensors + if (std::getenv("DISABLE_REUSE")) { + //handler for I/O tensors, so far simply individual blocks + m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { + return true; + })); + + return; + } + m_handlers.emplace_back(buildHandler([](const MemoryRegion& reg) { if (reg.size < 0 || MemoryRegion::RegionType::VARIABLE != reg.type || MemoryRegion::AllocType::POD != reg.alloc_type) { diff --git a/src/plugins/intel_cpu/src/node.h b/src/plugins/intel_cpu/src/node.h index c9d85f2123d6b7..5364839e282925 100644 --- a/src/plugins/intel_cpu/src/node.h +++ b/src/plugins/intel_cpu/src/node.h @@ -519,7 +519,7 @@ class Node { virtual int registerToAllocationContext(int offset, AllocationContext& context) { (void) context; - return offset + 1; + return offset; } const std::string & getTypeStr() const { diff --git a/src/plugins/intel_cpu/src/nodes/composite.cpp b/src/plugins/intel_cpu/src/nodes/composite.cpp index 44b79d8e6982ab..f64cf0c991eec4 100644 --- a/src/plugins/intel_cpu/src/nodes/composite.cpp +++ b/src/plugins/intel_cpu/src/nodes/composite.cpp @@ -45,7 +45,7 @@ void Composite::selectOptimalPrimitiveDescriptor() { std::vector graphInputConfig; // @todo should be always inplace after global memory reuse is fully supported by all the nodes - bool isInPlace = context->memoryReuseGlobal(); + bool isInPlace = true; for (size_t i = 0; i < getParentEdges().size(); i++) { auto desc = getParentOutputMemDesc(getParentEdgeAt(i)); diff --git a/src/plugins/intel_cpu/src/nodes/if.cpp b/src/plugins/intel_cpu/src/nodes/if.cpp index 1b6102ff954689..c74f542c9aacd2 100644 --- a/src/plugins/intel_cpu/src/nodes/if.cpp +++ b/src/plugins/intel_cpu/src/nodes/if.cpp @@ -4,6 +4,7 @@ #include "if.h" +#include "nodes/node_config.h" #include "openvino/op/if.hpp" #include "common/cpu_memcpy.h" @@ -70,128 +71,256 @@ bool If::isSupportedOperation(const std::shared_ptr& op, std::st } If::If(const 
std::shared_ptr& op, const GraphContext::CPtr context) : - Node(op, context, InternalDynShapeInferFactory()), ovOp(op) { + Node(op, context, InternalDynShapeInferFactory()), m_op(op) { std::string errorMessage; if (!isSupportedOperation(op, errorMessage)) { OPENVINO_THROW_NOT_IMPLEMENTED(errorMessage); } } -void If::getSupportedDescriptors() { - auto ifOp = ov::as_type_ptr(ovOp); - - const std::shared_ptr& thenBody = ifOp->get_then_body(); - const std::shared_ptr& elseBody = ifOp->get_else_body(); - subGraphThen.CreateGraph(thenBody, context); - subGraphElse.CreateGraph(elseBody, context); - - const auto& inMapThen = subGraphThen.GetInputNodesMap(); - for (const auto& param : ifOp->get_then_body()->get_parameters()) { - auto inNode = inMapThen.find(ifOp->get_then_body()->get_parameter_index(param)); - if (inNode != inMapThen.end()) { - inputMemThen.push_back(getToMemories(inNode->second.get(), 0)); - } else { - OPENVINO_THROW("Then body of node If with name ", - getName(), - " does not have input with name: ", - param->get_friendly_name()); - } +// void If::getSupportedDescriptors() { +// auto ifOp = ov::as_type_ptr(ovOp); + +// const std::shared_ptr& thenBody = ifOp->get_then_body(); +// const std::shared_ptr& elseBody = ifOp->get_else_body(); +// subGraphThen.CreateGraph(thenBody, context); +// subGraphElse.CreateGraph(elseBody, context); + +// const auto& inMapThen = subGraphThen.GetInputNodesMap(); +// for (const auto& param : ifOp->get_then_body()->get_parameters()) { +// auto inNode = inMapThen.find(ifOp->get_then_body()->get_parameter_index(param)); +// if (inNode != inMapThen.end()) { +// inputMemThen.push_back(getToMemories(inNode->second.get(), 0)); +// } else { +// OPENVINO_THROW("Then body of node If with name ", +// getName(), +// " does not have input with name: ", +// param->get_friendly_name()); +// } +// } + +// const auto& inMapElse = subGraphElse.GetInputNodesMap(); +// for (const auto& param : ifOp->get_else_body()->get_parameters()) { +// auto 
inNode = inMapElse.find(ifOp->get_else_body()->get_parameter_index(param)); +// if (inNode != inMapElse.end()) { +// inputMemElse.push_back(getToMemories(inNode->second.get(), 0)); +// } else { +// OPENVINO_THROW("Else body of node If with name ", +// getName(), +// " does not have input with name: ", +// param->get_friendly_name()); +// } +// } + +// const auto &outMapThen = subGraphThen.GetOutputNodesMap(); +// for (const auto& out : ifOp->get_then_body()->get_results()) { +// auto outNode = outMapThen.find(ifOp->get_then_body()->get_result_index(out)); +// if (outNode != outMapThen.end()) { +// auto outMem = outNode->second->getSrcMemoryAtPort(0); +// outputMemThen.push_back(outMem); +// } else { +// OPENVINO_THROW("Then body of node If with name ", getName(), " does not have output with name: ", out->get_friendly_name()); +// } +// } + +// const auto &outMapElse = subGraphElse.GetOutputNodesMap(); +// for (const auto& out : ifOp->get_else_body()->get_results()) { +// auto outNode = outMapElse.find(ifOp->get_else_body()->get_result_index(out)); +// if (outNode != outMapElse.end()) { +// auto outMem = outNode->second->getSrcMemoryAtPort(0); +// outputMemElse.push_back(outMem); +// } else { +// OPENVINO_THROW("Else body of node If with name ", getName(), " does not have output with name: ", out->get_friendly_name()); +// } +// } + +// // Port map: outputs +// for (const auto& desc : ifOp->get_output_descriptions(0)) { +// auto body_output_idx = desc->m_body_value_index; +// thenOutputPortMap.emplace_back(PortMap { +// static_cast(desc->m_output_index), static_cast(body_output_idx)}); +// } +// for (const auto& desc : ifOp->get_output_descriptions(1)) { +// auto body_output_idx = desc->m_body_value_index; +// elseOutputPortMap.emplace_back(PortMap { +// static_cast(desc->m_output_index), static_cast(body_output_idx)}); +// } + +// for (const auto& desc : ifOp->get_input_descriptions(0)) { +// auto body_input_index = desc->m_body_parameter_index; +// 
thenInputPortMap.emplace_back(PortMap { +// static_cast(desc->m_input_index), static_cast(body_input_index)}); +// } +// for (const auto& desc : ifOp->get_input_descriptions(1)) { +// auto body_input_index = desc->m_body_parameter_index; +// elseInputPortMap.emplace_back(PortMap { +// static_cast(desc->m_input_index), static_cast(body_input_index)}); +// } +// } + +// void If::initSupportedPrimitiveDescriptors() { +// if (!supportedPrimitiveDescriptors.empty()) +// return; + +// NodeConfig config; +// config.inConfs.reserve(getParentEdges().size()); +// config.outConfs.reserve(getChildEdges().size()); + +// for (size_t i = 0; i < inputShapes.size(); i++) { +// PortConfig dataConf {}; +// auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); +// dataConf.setMemDesc(descCreator->createSharedDesc(getOriginalInputPrecisionAtPort(i), getInputShapeAtPort(i))); +// config.inConfs.emplace_back(dataConf); +// } + +// for (size_t i = 0; i < outputShapes.size(); i++) { +// PortConfig dataConf {}; +// auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); +// dataConf.setMemDesc(descCreator->createSharedDesc(getOriginalOutputPrecisionAtPort(i), getOutputShapeAtPort(i))); +// config.outConfs.push_back(dataConf); +// } + +// supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); +// } + +void If::selectOptimalPrimitiveDescriptor() { + // for the input configuration, just always use the parent configuration + auto ifOp = ov::as_type_ptr(m_op); + const auto numThenParameters = ifOp->get_then_body()->get_parameters().size(); + const auto numThenResults = ifOp->get_then_body()->get_results().size(); + const auto numElseParameters = ifOp->get_else_body()->get_parameters().size(); + const auto numElseResults = ifOp->get_else_body()->get_results().size(); + + std::vector inConfs(inputShapes.size()); + std::vector outConfs(outputShapes.size()); + + std::vector thenInputConfig(numThenParameters); + std::vector 
elseInputConfig(numElseParameters); + + // @todo should be always inplace when global memory reuse is fully supported by all the nodes + bool isInPlace = true; + + std::vector thenOutputConfig(numThenResults, + node::Input::OutputConfig{true, isInPlace}); + std::vector elseOutputConfig(numElseResults, + node::Input::OutputConfig{true, isInPlace}); + + auto thenInputDescriptions = ifOp->get_input_descriptions(0); + auto elseInputDescriptions = ifOp->get_input_descriptions(1); + + auto conditionDesc = getParentOutputMemDesc(getParentEdgeAt(0)); + inConfs.at(0) = PortConfig(conditionDesc); + + for (const auto& description : thenInputDescriptions) { + const auto inIdx = description->m_input_index; + const auto paramIdx = description->m_body_parameter_index; + auto desc = getParentOutputMemDesc(getParentEdgeAt(inIdx)); + inConfs.at(inIdx) = PortConfig(desc); + thenInputConfig.at(paramIdx) = node::Input::InputConfig{desc, isInPlace}; } - const auto& inMapElse = subGraphElse.GetInputNodesMap(); - for (const auto& param : ifOp->get_else_body()->get_parameters()) { - auto inNode = inMapElse.find(ifOp->get_else_body()->get_parameter_index(param)); - if (inNode != inMapElse.end()) { - inputMemElse.push_back(getToMemories(inNode->second.get(), 0)); - } else { - OPENVINO_THROW("Else body of node If with name ", - getName(), - " does not have input with name: ", - param->get_friendly_name()); - } + for (const auto& description : elseInputDescriptions) { + const auto inIdx = description->m_input_index; + const auto paramIdx = description->m_body_parameter_index; + auto desc = getParentOutputMemDesc(getParentEdgeAt(inIdx)); + inConfs.at(inIdx) = PortConfig(desc); + elseInputConfig.at(paramIdx) = node::Input::InputConfig{desc, isInPlace}; } - const auto &outMapThen = subGraphThen.GetOutputNodesMap(); - for (const auto& out : ifOp->get_then_body()->get_results()) { - auto outNode = outMapThen.find(ifOp->get_then_body()->get_result_index(out)); - if (outNode != outMapThen.end()) { - 
auto outMem = outNode->second->getSrcMemoryAtPort(0); - outputMemThen.push_back(outMem); - } else { - OPENVINO_THROW("Then body of node If with name ", getName(), " does not have output with name: ", out->get_friendly_name()); - } - } + // configure the inner graph to get the information about output memory descriptors + m_thenGraph.Init(ifOp->get_then_body(), context, thenInputConfig, thenOutputConfig); + m_elseGraph.Init(ifOp->get_else_body(), context, elseInputConfig, elseOutputConfig); - const auto &outMapElse = subGraphElse.GetOutputNodesMap(); - for (const auto& out : ifOp->get_else_body()->get_results()) { - auto outNode = outMapElse.find(ifOp->get_else_body()->get_result_index(out)); - if (outNode != outMapElse.end()) { - auto outMem = outNode->second->getSrcMemoryAtPort(0); - outputMemElse.push_back(outMem); - } else { - OPENVINO_THROW("Else body of node If with name ", getName(), " does not have output with name: ", out->get_friendly_name()); - } - } + // for the output descriptors, use the configuration of the graph's output nodes + auto thenOutputDescriptors = m_thenGraph.getOutputMemoryDescriptors(); + auto elseOutputDescriptors = m_elseGraph.getOutputMemoryDescriptors(); + auto thenOutputDescriptions = ifOp->get_output_descriptions(0); + auto elseOutputDescriptions = ifOp->get_output_descriptions(1); - // Port map: outputs - for (const auto& desc : ifOp->get_output_descriptions(0)) { - auto body_output_idx = desc->m_body_value_index; - thenOutputPortMap.emplace_back(PortMap { - static_cast(desc->m_output_index), static_cast(body_output_idx)}); - } - for (const auto& desc : ifOp->get_output_descriptions(1)) { - auto body_output_idx = desc->m_body_value_index; - elseOutputPortMap.emplace_back(PortMap { - static_cast(desc->m_output_index), static_cast(body_output_idx)}); + for (const auto& description : thenOutputDescriptions) { + auto outIdx = description->m_output_index; + auto resultIdx = description->m_body_value_index; + outConfs.at(outIdx) = 
PortConfig(thenOutputDescriptors.at(resultIdx)); } - for (const auto& desc : ifOp->get_input_descriptions(0)) { - auto body_input_index = desc->m_body_parameter_index; - thenInputPortMap.emplace_back(PortMap { - static_cast(desc->m_input_index), static_cast(body_input_index)}); - } - for (const auto& desc : ifOp->get_input_descriptions(1)) { - auto body_input_index = desc->m_body_parameter_index; - elseInputPortMap.emplace_back(PortMap { - static_cast(desc->m_input_index), static_cast(body_input_index)}); + for (const auto& description : elseOutputDescriptions) { + auto outIdx = description->m_output_index; + auto resultIdx = description->m_body_value_index; + outConfs.at(outIdx) = PortConfig(elseOutputDescriptors.at(resultIdx)); } + + const NodeConfig config(inConfs, outConfs); + + supportedPrimitiveDescriptors.clear(); + supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef); + + selectPrimitiveDescriptorByIndex(0); } -void If::initSupportedPrimitiveDescriptors() { - if (!supportedPrimitiveDescriptors.empty()) - return; +int If::registerToAllocationContext(int offset, AllocationContext& context) { + // if (!this->context->memoryReuseGlobal()) + // return Node::registerToAllocationContext(offset, context); + auto ifOp = ov::as_type_ptr(m_op); + + for (const auto& description : ifOp->get_input_descriptions(0)) { + const auto inIdx = description->m_input_index; + const auto paramIdx = description->m_body_parameter_index; + auto parentEdge = getParentEdgeAt(inIdx); + auto inputEdges = m_thenGraph.GetInputNodesMap().at(paramIdx)->getChildEdgesAtPort(0); + + for (const auto& inputEdge : inputEdges) { + OPENVINO_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", + inputEdge->name()); + inputEdge->sharedMemFrom(parentEdge); + } + } - NodeConfig config; - config.inConfs.reserve(getParentEdges().size()); - config.outConfs.reserve(getChildEdges().size()); + for (const auto& description : 
ifOp->get_input_descriptions(1)) { + const auto inIdx = description->m_input_index; + const auto paramIdx = description->m_body_parameter_index; + auto parentEdge = getParentEdgeAt(inIdx); + auto inputEdges = m_elseGraph.GetInputNodesMap().at(paramIdx)->getChildEdgesAtPort(0); + + for (const auto& inputEdge : inputEdges) { + OPENVINO_ASSERT(inputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", + inputEdge->name()); + inputEdge->sharedMemFrom(parentEdge); + } + } + + for (const auto& description : ifOp->get_output_descriptions(0)) { + const auto outIdx = description->m_output_index; + const auto resultIdx = description->m_body_value_index; + auto childEdge = getChildEdgeAt(outIdx); + auto outputEdge = m_thenGraph.GetOutputNodesMap().at(resultIdx)->getParentEdgeAt(0); - for (size_t i = 0; i < inputShapes.size(); i++) { - PortConfig dataConf {}; - auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); - dataConf.setMemDesc(descCreator->createSharedDesc(getOriginalInputPrecisionAtPort(i), getInputShapeAtPort(i))); - config.inConfs.emplace_back(dataConf); + OPENVINO_ASSERT(outputEdge->getStatus() == Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", + outputEdge->name()); + outputEdge->sharedMemFrom(childEdge); } - for (size_t i = 0; i < outputShapes.size(); i++) { - PortConfig dataConf {}; - auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); - dataConf.setMemDesc(descCreator->createSharedDesc(getOriginalOutputPrecisionAtPort(i), getOutputShapeAtPort(i))); - config.outConfs.push_back(dataConf); + for (const auto& description : ifOp->get_output_descriptions(1)) { + const auto outIdx = description->m_output_index; + const auto resultIdx = description->m_body_value_index; + auto childEdge = getChildEdgeAt(outIdx); + auto outputEdge = m_elseGraph.GetOutputNodesMap().at(resultIdx)->getParentEdgeAt(0); + + OPENVINO_ASSERT(outputEdge->getStatus() == 
Edge::Status::Uninitialized, + "Expected Uninitialized state for edge: ", + outputEdge->name()); + outputEdge->sharedMemFrom(childEdge); } - supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown); + offset = m_thenGraph.RegisterToAllocationContext(offset, context); + return m_elseGraph.RegisterToAllocationContext(offset, context); } void If::createPrimitive() { - const auto& eng = getEngine(); - prepareBeforeMappers(true, eng); - prepareBeforeMappers(false, eng); - prepareAfterMappers(true, eng); - prepareAfterMappers(false, eng); - - if (inputShapesDefined()) { - updateLastInputDims(); - } + m_thenGraph.Activate({}, {}); + m_elseGraph.Activate({}, {}); } void If::prepareBeforeMappers(const bool isThen, const dnnl::engine& eng) { @@ -246,16 +375,16 @@ std::deque If::getToMemories(const Node* node, const size_t port) con void If::execute(dnnl::stream strm) { const bool condition = static_cast((getSrcDataAtPortAs(0))[0]); - auto& beforeMappers = condition ? beforeThenMappers : beforeElseMappers; - auto& afterMappers = condition ? afterThenMappers : afterElseMappers; - auto& subGraph = condition ? subGraphThen : subGraphElse; + // auto& beforeMappers = condition ? beforeThenMappers : beforeElseMappers; + // auto& afterMappers = condition ? afterThenMappers : afterElseMappers; + auto& graph = condition ? 
m_thenGraph : m_elseGraph; // - for (auto &mapper : beforeMappers) - mapper->execute(strm); - subGraph.ResetInferCount(); - subGraph.Infer(); - for (auto &mapper : afterMappers) - mapper->execute(strm); + graph.ResetInferCount(); + graph.Infer(); + // for (auto &mapper : beforeMappers) + // mapper->execute(strm); + // for (auto &mapper : afterMappers) + // mapper->execute(strm); } void If::executeDynamicImpl(dnnl::stream strm) { diff --git a/src/plugins/intel_cpu/src/nodes/if.h b/src/plugins/intel_cpu/src/nodes/if.h index 3c279f028754b8..2b1124d204e650 100644 --- a/src/plugins/intel_cpu/src/nodes/if.h +++ b/src/plugins/intel_cpu/src/nodes/if.h @@ -20,8 +20,10 @@ class If : public Node { If(const std::shared_ptr& op, const GraphContext::CPtr context); static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - void initSupportedPrimitiveDescriptors() override; - void getSupportedDescriptors() override; + // void initSupportedPrimitiveDescriptors() override; + void getSupportedDescriptors() override {} + void selectOptimalPrimitiveDescriptor() override; + int registerToAllocationContext(int offset, AllocationContext& context) override; void createPrimitive() override; bool created() const override; void execute(dnnl::stream strm) override; @@ -60,8 +62,8 @@ class If : public Node { ptrdiff_t size; }; - Graph subGraphThen; - Graph subGraphElse; + Graph m_thenGraph; + Graph m_elseGraph; std::vector> inputMemThen, inputMemElse; std::deque outputMemThen, outputMemElse; @@ -77,7 +79,7 @@ class If : public Node { elseInputPortMap, elseOutputPortMap; - const std::shared_ptr ovOp; + const std::shared_ptr m_op; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/lora.cpp b/src/plugins/intel_cpu/src/nodes/lora.cpp index 17e4835675dc7d..fe83b616fd7302 100644 --- a/src/plugins/intel_cpu/src/nodes/lora.cpp +++ b/src/plugins/intel_cpu/src/nodes/lora.cpp @@ -52,7 +52,7 @@ void LoRA::selectOptimalPrimitiveDescriptor() { 
inConfs.emplace_back(mainInputDesc); // @todo should be always inplace after global memory reuse is fully supported by all the nodes - bool isInPlace = context->memoryReuseGlobal(); + bool isInPlace = true; graphInputConfig.emplace_back(node::Input::InputConfig{mainInputDesc, isInPlace}); for (size_t i = 1; i < getParentEdges().size(); i++) { @@ -89,9 +89,8 @@ void LoRA::selectOptimalPrimitiveDescriptor() { } int LoRA::registerToAllocationContext(int offset, AllocationContext& context) { - if (!this->context->memoryReuseGlobal()) - return Node::registerToAllocationContext(offset, context); - + // if (!this->context->memoryReuseGlobal()) + // return Node::registerToAllocationContext(offset, context); for (size_t i = 0; i < getOriginalInputsNumber(); i++) { auto parentEdge = getParentEdgeAt(i); auto inputEdges = m_graph.GetInputNodesMap().at(i)->getChildEdgesAtPort(0); diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp index dcf2b0f8ffd5ee..4afa1aa3d09aa6 100644 --- a/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.cpp @@ -402,25 +402,195 @@ TensorIterator::TensorIterator(const std::shared_ptr& op, const GraphC } } -void TensorIterator::getSupportedDescriptors() { - auto tiOp = ov::as_type_ptr(ngraphOp); - if (!tiOp) { - THROW_ERROR("cannot be cast to ov::op::util::SubGraphOp"); - } - const std::shared_ptr body = tiOp->get_function(); - sub_graph.CreateGraph(body, context); +// void TensorIterator::getSupportedDescriptors() { +void TensorIterator::selectOptimalPrimitiveDescriptor() { + // auto tiOp = ov::as_type_ptr(ngraphOp); + // if (!tiOp) { + // THROW_ERROR("cannot be cast to ov::op::util::SubGraphOp"); + // } + // const std::shared_ptr body = tiOp->get_function(); + // sub_graph.CreateGraph(body, context); + + // const auto &inMap = sub_graph.GetInputNodesMap(); + // for (const auto &param : tiOp->get_function()->get_parameters()) { + // auto 
inNode = inMap.find(tiOp->get_function()->get_parameter_index(param)); + // if (inNode != inMap.end()) { + // input_mems.push_back(getToMemories(inNode->second.get(), 0)); + // } + // } + + // const auto &outMap = sub_graph.GetOutputNodesMap(); + // for (const auto &out : tiOp->get_function()->get_results()) { + // auto outNode = outMap.find(tiOp->get_function()->get_result_index(out)); + // if (outNode != outMap.end()) { + // auto outMem = outNode->second->getSrcMemoryAtPort(0); + // output_mem.push_back(outMem); + // } + // } + + // // Port map: outputs + // for (const auto& desc : tiOp->get_output_descriptions()) { + // auto body_output_idx = desc->m_body_value_index; + + // std::string type_name = desc->get_type_info().name; + // if (type_name == "ConcatOutputDescription") { + // auto output_desc = ov::as_type_ptr(desc); + // OPENVINO_ASSERT(output_desc != nullptr); + + // outputPortMap.emplace_back(PortMap { + // static_cast(output_desc->m_output_index), static_cast(body_output_idx), + // static_cast(output_desc->m_axis), static_cast(output_desc->m_stride), + // static_cast(output_desc->m_start), static_cast(output_desc->m_end), + // static_cast(output_desc->m_part_size)}); + // } else if (type_name == "BodyOutputDescription") { + // auto output_desc = ov::as_type_ptr(desc); + // OPENVINO_ASSERT(output_desc != nullptr); + + // outputPortMap.emplace_back(PortMap { + // static_cast(output_desc->m_output_index), static_cast(body_output_idx), -1, 1, 0, -1, 1}); + // } else { + // OPENVINO_THROW("Incorrect type of the output description."); + // } + // } + + // // Port map : inputs and back edges + // for (const auto& desc : tiOp->get_input_descriptions()) { + // auto body_input_index = desc->m_body_parameter_index; + + // if (auto slice_desc = ov::as_type_ptr(desc)) { + // inputPortMap.emplace_back(PortMap { + // static_cast(slice_desc->m_input_index), static_cast(body_input_index), + // static_cast(slice_desc->m_axis), static_cast(slice_desc->m_stride), + // 
static_cast(slice_desc->m_start), static_cast(slice_desc->m_end), + // static_cast(slice_desc->m_part_size)}); + // } else if (auto merge_desc = ov::as_type_ptr(desc)) { + // inputPortMap.emplace_back(PortMap { + // static_cast(merge_desc->m_input_index), static_cast(body_input_index), -1, 1, 0, -1, 1}); + + // auto body_output_idx = merge_desc->m_body_value_index; + + // backEdges.emplace_back(PortMap { + // static_cast(body_output_idx), static_cast(body_input_index), -1, 1, 0, -1, 1}); + // } else if (auto inv_desc = ov::as_type_ptr(desc)) { + // inputPortMap.emplace_back(PortMap { + // static_cast(inv_desc->m_input_index), static_cast(body_input_index), -1, 1, 0, -1, 1}); + // } else { + // THROW_ERROR("has incorrect type of the input description."); + // } + // } + + // if (auto loopOp = ov::as_type_ptr(ngraphOp)) { + // algorithm = Algorithm::TensorIteratorLoop; + // auto spec_port = loopOp->get_special_body_ports(); + // if (spec_port.current_iteration_input_idx != -1) { + // loopBodyCurrentIterationIdx.push_back(spec_port.current_iteration_input_idx); + // } + // if (spec_port.body_condition_output_idx != -1) { + // loopBodyConditionOutputIdx = spec_port.body_condition_output_idx; + // } + // loopTripCountIdx = 0; + // loopExecutionConditionIdx = 1; + // } else if (auto ti = ov::as_type_ptr(ngraphOp)) { + // algorithm = Algorithm::TensorIteratorCommon; + // } else { + // THROW_ERROR("isn't supported!"); + // } + // supportedPrimitiveDescriptors.emplace_back(make_plain_config(ngraphOp), impl_desc_type::unknown); + // selectPrimitiveDescriptorByIndex(0); + + // for the input configuration, just always use the parent configuration + auto subgraphOp = ov::as_type_ptr(ngraphOp); + // const auto numParameters = subgraphOp->get_function()->get_parameters().size(); + // const auto numResults = subgraphOp->get_function()->get_results().size(); + + // std::vector inConfs(inputShapes.size()); + // std::vector outConfs(outputShapes.size()); + + // std::vector 
inputConfig(numParameters); + // std::vector outputConfig(numResults); + + // // @todo should be always inplace when global memory reuse is fully supported by all the nodes + // bool isInPlace = false; + + // for (const auto& description : subgraphOp->get_output_descriptions()) { + // const auto outIdx = description->m_output_index; + // const auto resultIdx = description->m_body_value_index; + + // const auto &origShape = subgraphOp->get_output_partial_shape(outIdx); + // const auto& shape = Shape(origShape.rank().get_length() == 0 ? ov::PartialShape{1} : origShape); + // const auto prec = subgraphOp->get_output_element_type(outIdx); + + // auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); + // auto desc = descCreator->createSharedDesc(prec, shape); + + // outConfs.at(outIdx) = PortConfig(desc); + // outputConfig.at(resultIdx) = node::Input::OutputConfig{desc, isInPlace}; + // } + + // auto inputDescriptions = subgraphOp->get_input_descriptions(); + + // for (const auto& description : inputDescriptions) { + // const auto inIdx = description->m_input_index; + // const auto paramIdx = description->m_body_parameter_index; + + // const auto &origShape = subgraphOp->get_input_partial_shape(inIdx); + // const auto& shape = Shape(origShape.rank().get_length() == 0 ? 
ov::PartialShape{1} : origShape); + // const auto prec = subgraphOp->get_input_element_type(inIdx); + + // auto descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp); + // auto desc = descCreator->createSharedDesc(prec, shape); + + // // auto desc = getParentOutputMemDesc(getParentEdgeAt(inIdx)); + // inConfs.at(inIdx) = PortConfig(desc); + // inputConfig.at(paramIdx) = node::Input::InputConfig{desc, isInPlace}; + // } + + // configure the inner graph to get the information about output memory descriptors + // sub_graph.Init(subgraphOp->get_function(), context, inputConfig, outputConfig); + sub_graph.Init(subgraphOp->get_function(), context); + + // for the output descriptors, use the configuration of the graph's output nodes + // auto outputDescriptors = sub_graph.getOutputMemoryDescriptors(); + // auto outputDescriptions = subgraphOp->get_output_descriptions(); + + // for (const auto& description : outputDescriptions) { + // auto outIdx = description->m_output_index; + // auto resultIdx = description->m_body_value_index; + // outConfs.at(outIdx) = PortConfig(outputDescriptors.at(resultIdx)); + // } + + // const NodeConfig config(inConfs, outConfs); + + supportedPrimitiveDescriptors.clear(); + supportedPrimitiveDescriptors.emplace_back(make_plain_config(ngraphOp), impl_desc_type::unknown); + // supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::undef); + + selectPrimitiveDescriptorByIndex(0); +} + +// void TensorIterator::initSupportedPrimitiveDescriptors() { +// if (!supportedPrimitiveDescriptors.empty()) +// return; + +// supportedPrimitiveDescriptors.emplace_back(make_plain_config(ngraphOp), impl_desc_type::unknown); +// } + +void TensorIterator::createPrimitive() { + sub_graph.Activate(); + + auto subgraphOp = ov::as_type_ptr(ngraphOp); const auto &inMap = sub_graph.GetInputNodesMap(); - for (const auto &param : tiOp->get_function()->get_parameters()) { - auto inNode = 
inMap.find(tiOp->get_function()->get_parameter_index(param)); + for (const auto &param : subgraphOp->get_function()->get_parameters()) { + auto inNode = inMap.find(subgraphOp->get_function()->get_parameter_index(param)); if (inNode != inMap.end()) { input_mems.push_back(getToMemories(inNode->second.get(), 0)); } } const auto &outMap = sub_graph.GetOutputNodesMap(); - for (const auto &out : tiOp->get_function()->get_results()) { - auto outNode = outMap.find(tiOp->get_function()->get_result_index(out)); + for (const auto &out : subgraphOp->get_function()->get_results()) { + auto outNode = outMap.find(subgraphOp->get_function()->get_result_index(out)); if (outNode != outMap.end()) { auto outMem = outNode->second->getSrcMemoryAtPort(0); output_mem.push_back(outMem); @@ -428,7 +598,7 @@ void TensorIterator::getSupportedDescriptors() { } // Port map: outputs - for (const auto& desc : tiOp->get_output_descriptions()) { + for (const auto& desc : subgraphOp->get_output_descriptions()) { auto body_output_idx = desc->m_body_value_index; std::string type_name = desc->get_type_info().name; @@ -453,7 +623,7 @@ void TensorIterator::getSupportedDescriptors() { } // Port map : inputs and back edges - for (const auto& desc : tiOp->get_input_descriptions()) { + for (const auto& desc : subgraphOp->get_input_descriptions()) { auto body_input_index = desc->m_body_parameter_index; if (auto slice_desc = ov::as_type_ptr(desc)) { @@ -494,16 +664,7 @@ void TensorIterator::getSupportedDescriptors() { } else { THROW_ERROR("isn't supported!"); } -} -void TensorIterator::initSupportedPrimitiveDescriptors() { - if (!supportedPrimitiveDescriptors.empty()) - return; - - supportedPrimitiveDescriptors.emplace_back(make_plain_config(ngraphOp), impl_desc_type::unknown); -} - -void TensorIterator::createPrimitive() { if (loopBodyConditionOutputIdx == -1) continue_cond_check.reset(new staticValueCheck(true)); // always true if (loopExecutionConditionIdx == -1) { @@ -521,6 +682,10 @@ void 
TensorIterator::createPrimitive() { } } +int TensorIterator::registerToAllocationContext(int offset, AllocationContext& context) { + return sub_graph.RegisterToAllocationContext(offset, context); +} + bool TensorIterator::needPrepareParams() const { if (getAlgorithm() == Algorithm::TensorIteratorLoop) { const auto tripCountPtr = getSrcDataAtPortAs(loopTripCountIdx); @@ -876,7 +1041,6 @@ int TensorIterator::getNumIteration(const std::vector& inputPortMap, co return static_cast(length / step); }; - int numIterations = 1; bool isDefault = true; for (const auto& rule : inputPortMap) { diff --git a/src/plugins/intel_cpu/src/nodes/tensoriterator.h b/src/plugins/intel_cpu/src/nodes/tensoriterator.h index 41c086288f0cdb..623df59be41f86 100644 --- a/src/plugins/intel_cpu/src/nodes/tensoriterator.h +++ b/src/plugins/intel_cpu/src/nodes/tensoriterator.h @@ -106,9 +106,11 @@ class TensorIterator : public Node { TensorIterator(const std::shared_ptr& op, const GraphContext::CPtr context); static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - void initSupportedPrimitiveDescriptors() override; - void getSupportedDescriptors() override; + void initSupportedPrimitiveDescriptors() override {}; + void getSupportedDescriptors() override {}; + void selectOptimalPrimitiveDescriptor() override; void createPrimitive() override; + int registerToAllocationContext(int offset, AllocationContext& context) override; bool created() const override; void execute(dnnl::stream strm) override; bool canBeSkipped() const override { return false; }