diff --git a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
index 94d1cc324fa51a..9b86a1433acb06 100644
--- a/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
+++ b/src/plugins/intel_cpu/src/dnnl_postops_composer.cpp
@@ -13,8 +13,8 @@
 
 #include "cpu_types.h"
 #include "memory_desc/dnnl_blocked_memory_desc.h"
-#include "nodes/executors/memory_arguments.hpp"
 #include "nodes/executors/common/common_utils.hpp"
+#include "nodes/executors/memory_arguments.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "utils/cpu_utils.hpp"
 #include "utils/debug_capabilities.h"
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
index 4beb54a5df3a8f..38566868e0575d 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_fullyconnected.cpp
@@ -24,239 +24,6 @@
 namespace ov {
 namespace intel_cpu {
 
-static VectorDims makeDummyInputDims(const Shape& inShape, const Shape& wShape) {
-    const auto& weightDims = wShape.getStaticDims();
-
-    auto inMinDims = inShape.getMinDims();
-    auto inMaxDims = inShape.getMaxDims();
-    inMinDims.back() = weightDims.back();
-    inMaxDims.back() = weightDims.back();
-
-    return MemoryDescUtils::makeDummyShape(Shape(inMinDims, inMaxDims)).getStaticDims();
-}
-
-static VectorDims makeDummyOutputDims(const VectorDims& inShape, const VectorDims& wShape, const size_t out_rank) {
-    size_t activationRank = inShape.size();
-    size_t channelRank = wShape.size() - 1;
-    // activation   weight    output_shape
-    // NCHW         CoCHW     NCo
-    // TNC          CoC       TNCo
-    // NC           CoC       NCo
-    VectorDims outputShape(out_rank, 1);
-    // set Co
-    outputShape.back() = wShape[0];
-    // set batch dims
-    size_t batchRank = activationRank - channelRank;
-    size_t startIdx = out_rank - batchRank - 1;
-    for (size_t i = 0; i < batchRank; i++) {
-        outputShape[i + startIdx] = inShape[i];
-    }
-
-    return outputShape;
-}
-
-static DnnlMemoryDescPtr makeTransposedWeightDescriptor(const DnnlMemoryDescPtr srcDesc,
-                                                        const DnnlMemoryDescPtr dstDesc) {
-    const auto& weiDesc = srcDesc->getDnnlDesc();
-    const auto reorderedWeiDesc = dnnl::memory::desc{weiDesc.get_dims(), weiDesc.get_data_type(), dnnl::memory::format_tag::ba};
-    const auto transposedWeiDesc = reorderedWeiDesc.reshape(dstDesc->getDnnlDesc().get_dims());
-
-    return DnnlExtensionUtils::makeDescriptor(transposedWeiDesc);
-}
-
-static ov::optional<MemoryPtr> convertWeightPrecision(MemoryPtr input, MemoryPtr output, ov::element::Type weightPrecision) {
-    MemoryArgs memoryArgs;
-    memoryArgs[ARG_SRC] = input;
-    memoryArgs[ARG_DST] = output;
-
-    auto aclWeightsConverter = std::make_shared<acl_fc_executor::ACLWeightsConverter>();
-    if (aclWeightsConverter->update(memoryArgs)) {
-        aclWeightsConverter->execute(memoryArgs);
-        return ov::optional<MemoryPtr>(memoryArgs.at(ARG_DST));
-    }
-
-    if (!node::Convert::isSupportedDesc(input->getDesc()) ||
-        !node::Convert::isSupportedDesc(output->getDesc())) {
-        return {};
-    }
-
-    auto data = static_cast<const uint8_t*>(input->getData());
-    std::vector<uint8_t> tmpBuff;
-    tmpBuff.resize(output->getSize());
-    cpu_convert(data, tmpBuff.data(), DnnlExtensionUtils::DataTypeToElementType(input->getDataType()),
-                weightPrecision, input->getSize() / input->getDesc().getPrecision().size());
-
-    return ov::optional<MemoryPtr>(std::make_shared<Memory>(output->getPrimitive().get_engine(),
-                                                            output->getDesc().cloneWithNewPrecision(weightPrecision),
-                                                            tmpBuff.data()));
-}
-
-static ov::optional<MemoryPtr> reorderDataFallback(MemoryPtr input, MemoryPtr output, ExecutorContext::CPtr context) {
-    if (output->getDataType() == input->getDataType()) {
-        return {};
-    }
-    const auto inPrc = DnnlExtensionUtils::DataTypeToElementType(input->getDataType());
-    auto convertedDstMemoryDesc = output->getDesc().cloneWithNewPrecision(inPrc);
-    dnnl::reorder reorderWithoutConvert = getReorderPrim(context->getRuntimeCache(),
-                                                         output->getPrimitive().get_engine(),
-                                                         input->getPrimitive().get_desc(),
-                                                         MemoryDescUtils::convertToDnnlMemoryDesc(convertedDstMemoryDesc)->getDnnlDesc());
-
-    if (reorderWithoutConvert && parse_impl_name(reorderWithoutConvert.get_primitive_desc()->impl()->name()) != ref_any) {
-        auto convertOutput = convertWeightPrecision(input, output, inPrc);
-        if (!convertOutput) {
-            return {};
-        }
-        input = *convertOutput;
-
-        if (reorderWithoutConvert) {
-            dnnl::stream loc_stream(output->getPrimitive().get_engine(), dnnl::stream::flags::in_order);
-            reorderWithoutConvert.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}});
-            return ov::optional<MemoryPtr>(output);
-        }
-    }
-    return {};
-}
-
-static MemoryPtr reorderData(DnnlMemoryDescPtr srcWeightDesc,
-                             DnnlMemoryDescPtr dstWeightDesc,
-                             MemoryCPtr weightsMem,
-                             ExecutorContext::CPtr context) {
-    MemoryPtr input = std::make_shared<Memory>(context->getEngine(), srcWeightDesc, weightsMem->getData());
-    MemoryPtr output = std::make_shared<Memory>(context->getEngine(), dstWeightDesc);
-    if (!input->getDesc().isDefined() || !output->getDesc().isDefined())
-        OPENVINO_THROW("Can't reorder data with dynamic shapes");
-
-    if (input->getShape().hasZeroDims() || output->getShape().hasZeroDims()) {
-        return output;
-    }
-
-    if (input->getDesc().isCompatible(output->getDesc())) {
-        auto srcPtr = static_cast<uint8_t*>(input->getData());
-        auto dstPtr = static_cast<uint8_t*>(output->getData());
-        auto copySize = output->getSize();
-        cpu_memcpy(dstPtr, srcPtr, copySize);
-        return output;
-    }
-
-    // try directly reorder
-    auto engine = output->getPrimitive().get_engine();
-    dnnl::reorder directReorder = getReorderPrim(context->getRuntimeCache(),
-                                                 engine,
-                                                 input->getPrimitive().get_desc(),
-                                                 output->getPrimitive().get_desc());
-
-    if (!directReorder || parse_impl_name(directReorder.get_primitive_desc()->impl()->name()) == ref_any) {
-        // try precision conversion then do the reorder
-        auto fallbackOutput = reorderDataFallback(input, output, context);
-        if (fallbackOutput) {
-            return *fallbackOutput;
-        }
-    }
-    // if precision conversion does not work then do direct reference reorder
-    if (directReorder) {
-        dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order);
-        directReorder.execute(loc_stream, {{DNNL_ARG_FROM, input->getPrimitive()}, {DNNL_ARG_TO, output->getPrimitive()}});
-    } else {
-        OPENVINO_THROW("Could not make onednn reorder.");
-    }
-    return output;
-}
-
-static MemoryPtr reorderWeights(const MemoryArgs &memory,
-                                const ExecutorContext::CPtr context,
-                                ACLFCAttrs& aclfcAttrs,
-                                DnnlMemoryDescPtr dnnlSrcDesc,
-                                DnnlMemoryDescPtr dnnlDstDesc) {
-    auto create = [&]() {
-        MemoryPtr weightsMemory = memory.at(ARG_WEI);
-        if (aclfcAttrs.isWeightsRepacked || aclfcAttrs.isConvertedWeights) {
-            weightsMemory = reorderData(dnnlSrcDesc, dnnlDstDesc, memory.at(ARG_WEI), context);
-            DEBUG_LOG("ACLFullyConnectedExecutor: cache miss, perform packing");
-        }
-        return weightsMemory;
-    };
-
-    auto weightCache = context->getWeightsCache();
-    if (weightCache != nullptr) {
-        const auto& wgtDims = memory.at(ARG_WEI)->getStaticDims();
-        const auto N = wgtDims[0];
-        const auto K = wgtDims[1];
-        std::string format = "fc_acl_" + std::to_string(N) + "_" + std::to_string(K);
-        const std::string string_hash = format + "_" + std::to_string(memory.at(ARG_WEI)->getSize()) + "_" +
-                                        std::to_string(reinterpret_cast<uint64_t>(memory.at(ARG_WEI)->getData()));
-        DEBUG_LOG("ACLFullyConnectedExecutor: findOrCreate, string_hash: ", string_hash);
-        return *weightCache->findOrCreate(string_hash, create);
-    }
-
-    DEBUG_LOG("ACLFullyConnectedExecutor: Weights cache is not available");
-    return create();
-}
-
-static MemoryPtr prepareWeightMemory(const MemoryArgs &memory,
-                                     const ExecutorContext::CPtr context,
-                                     const FCAttrs &attrs,
-                                     ACLFCAttrs& aclfcAttrs,
-                                     const PostOps &postOps,
-                                     arm_compute::WeightFormat& expectedWeightFormat,
-                                     arm_compute::TensorInfo& weiTensorInfo) {
-    MemoryArgs memoryArgs;
-    memoryArgs[ARG_BIAS]  = memory.at(ARG_BIAS);
-    memoryArgs[ARG_WEI]   = memory.at(ARG_WEI);
-
-    auto originalWeightsDesc = memory.at(ARG_WEI)->getDescPtr();
-
-    // normalize weights to 2D
-    const auto& wgtDims = originalWeightsDesc->getShape().getStaticDims();
-    const VectorDims wgtDims2D = reshapeDownToRank<2>(wgtDims);
-
-    originalWeightsDesc = std::make_shared<CpuBlockedMemoryDesc>(originalWeightsDesc->getPrecision(), Shape{wgtDims2D});
-
-    auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(originalWeightsDesc);
-    auto dstDesc = originalWeightsDesc->cloneWithNewPrecision(aclfcAttrs.inputPrecision);
-    auto dnnlDstDesc = MemoryDescUtils::convertToDnnlMemoryDesc(dstDesc);
-
-    if (memory.at(ARG_SRC_0)->getShape().isDynamic()) {
-        const auto& inShape = memory.at(ARG_SRC_0)->getShape();
-        const auto& wShape = originalWeightsDesc->getShape();
-        const auto& inDymmyDims = makeDummyInputDims(inShape, wShape);
-        const auto& outDymmyDims = makeDummyOutputDims(inDymmyDims, wShape.getStaticDims(), memory.at(ARG_DST)->getShape().getRank());
-        memoryArgs[ARG_SRC_0] = std::make_shared<Memory>(context->getEngine(),
-                                                         memory.at(ARG_SRC_0)->getDescPtr()->cloneWithNewDims(inDymmyDims));
-        memoryArgs[ARG_DST] = std::make_shared<Memory>(context->getEngine(),
-                                                       memory.at(ARG_DST)->getDescPtr()->cloneWithNewDims(outDymmyDims));
-    } else {
-        memoryArgs[ARG_SRC_0] = memory.at(ARG_SRC_0);
-        memoryArgs[ARG_DST]   = memory.at(ARG_DST);
-    }
-
-    // TODO: ACLWeightFormatGenerator should be replaced with Reorder executor
-    // that calls ACL NEReorder + NETranspose or dnnl::reorder depending on backend availability
-    auto aclWeightsRepack = std::make_shared<acl_fc_executor::ACLWeightFormatGenerator>(attrs, postOps, memoryArgs);
-    bool isNeededReorder = aclWeightsRepack->update(memoryArgs);
-    expectedWeightFormat = isNeededReorder ? aclWeightsRepack->getOptImplWeightFormat() : arm_compute::WeightFormat::UNSPECIFIED;
-    weiTensorInfo = aclWeightsRepack->getTensorInfo(ACLArgs::ACL_WEI);
-
-    if (isNeededReorder) {
-        dnnl::impl::dim_t o_dim = 0;
-        dnnl::impl::dim_t inner_dim = 1;
-        std::vector<dnnl::impl::dim_t> remaining_dims = {};
-        auto weights_md_ = dnnlDstDesc->getDnnlDesc().get();
-        dnnl::impl::cpu::acl::acl_utils::reorder_to_weight_format(weiTensorInfo, *weights_md_, expectedWeightFormat,
-                                                                  inner_dim, o_dim, remaining_dims, {});
-        if (aclfcAttrs.weightsNonTransposed) {
-            dnnlSrcDesc = makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlDstDesc);
-        }
-        aclfcAttrs.isWeightsRepacked = true;
-        return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
-    }
-    if (!aclfcAttrs.weightsNonTransposed) {
-        dnnlDstDesc = makeTransposedWeightDescriptor(dnnlDstDesc, dnnlSrcDesc);
-        aclfcAttrs.isWeightsRepacked = true;
-    }
-    return reorderWeights(memory, context, aclfcAttrs, dnnlSrcDesc, dnnlDstDesc);
-}
-
 static bool checkPostOps(const PostOps &postOps) {
     if (postOps.empty()) {
         return true;
diff --git a/src/plugins/intel_cpu/src/nodes/executors/common/common_utils.hpp b/src/plugins/intel_cpu/src/nodes/executors/common/common_utils.hpp
index bb590ba59e181a..49e46d2273c798 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/common/common_utils.hpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/common/common_utils.hpp
@@ -61,5 +61,5 @@ static std::vector<float> getDeQuantizedScales(const MemoryArgs& memory) {
     return DQScales;
 }
 
-} // namespace intel_cpu
-} // namespace ov
\ No newline at end of file
+}  // namespace intel_cpu
+}  // namespace ov
\ No newline at end of file