From 8cc715cd8d7885425883e89b11d360ff87c0256c Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Sat, 23 Nov 2024 18:55:48 +0100 Subject: [PATCH] [Snippets] Move BrgemmCopyB repacking logic outside the Subgraph (#27007) ### Details: Currently, CopyB repacking is always performed inside the Subgraph. When the batch of the MatMul B input is significantly smaller than the batch of the A input and the parallel work amount is big enough, this may lead to inefficient execution, since repacking of the B input is performed in each parallel task, whereas a single repacking iteration per B batch is sufficient. Within this PR, CopyB repacking is moved outside the snippets kernel and is performed via the common reorder primitive just before the snippets kernel execution. ### Tickets: - *CVS-154383* --- .../pass/mha_parallel_wa_optimizer.hpp | 53 ++++ .../include/snippets/lowered/pass/pass.hpp | 16 ++ .../lowered/pass/runtime_optimizer.hpp | 52 ++++ .../snippets/lowered/pass/serialize_base.hpp | 4 +- .../lowered/pass/serialize_control_flow.hpp | 7 +- .../lowered/pass/serialize_data_flow.hpp | 7 +- .../include/snippets/runtime_configurator.hpp | 123 ++++----- .../snippets/utils/linear_ir_pass_dumper.hpp | 4 +- .../pass/mha_parallel_wa_optimizer.cpp | 175 +++++++++++++ src/common/snippets/src/lowered/pass/pass.cpp | 19 ++ .../snippets/src/pass/collapse_subgraph.cpp | 5 +- .../snippets/src/pass/split_dimension_m.cpp | 15 ++ .../snippets/src/runtime_configurator.cpp | 246 +++--------------- .../tests/include/utils/split_dim_m.hpp | 37 +++ .../tests/src/pass/mha_tokenization.cpp | 2 +- .../snippets/tests/src/utils/split_dim_m.cpp | 72 +++++ .../snippets/cpu_runtime_configurator.cpp | 69 +---- .../snippets/cpu_runtime_configurator.hpp | 39 +-- src/plugins/intel_cpu/src/nodes/reorder.cpp | 6 - src/plugins/intel_cpu/src/nodes/subgraph.cpp | 79 ++++-- src/plugins/intel_cpu/src/nodes/subgraph.h | 14 +- .../snippets/x64/op/brgemm_cpu.cpp | 24 +- .../snippets/x64/op/brgemm_cpu.hpp | 1 - .../snippets/x64/op/brgemm_utils.cpp | 18 ++ .../snippets/x64/op/brgemm_utils.hpp | 8 +- .../x64/pass/eliminate_brgemm_copy_b.cpp | 46 ++++ .../x64/pass/eliminate_brgemm_copy_b.hpp | 29 +++ .../adjust_brgemm_copy_b_loop_ports.cpp | 57 ++-- .../adjust_brgemm_copy_b_loop_ports.hpp | 9 +- .../brgemm_copy_b_loop_ports_adjuster.cpp | 49 ++++ .../brgemm_copy_b_loop_ports_adjuster.hpp | 33 +++ .../x64/pass/lowered/brgemm_cpu_blocking.cpp | 12 +- .../lowered/external_repacking_adjuster.cpp | 72 +++++ .../lowered/external_repacking_adjuster.hpp | 32 +++ 34 files changed, 969 insertions(+), 465 deletions(-) create mode 100644 src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp create mode 100644 src/common/snippets/include/snippets/lowered/pass/runtime_optimizer.hpp create mode 100644 src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp create mode 100644 src/common/snippets/tests/include/utils/split_dim_m.hpp create mode 100644 src/common/snippets/tests/src/utils/split_dim_m.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp
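Editor's note: a minimal standalone sketch of the motivation above (the `repack_b` counter and batch sizes are hypothetical, not the actual plugin code). When B's batch is broadcast over A's much larger batch, per-task repacking inside the kernel does redundant work, while a one-shot reorder before the kernel repacks each B batch exactly once:

```cpp
#include <cstddef>
#include <iostream>

// Hypothetical helper: count how many times the B input gets repacked.
static std::size_t repack_count = 0;
void repack_b(std::size_t /*b_batch_idx*/) { ++repack_count; }

int main() {
    const std::size_t batch_a = 16;  // batch of MatMul input A == parallel work amount
    const std::size_t batch_b = 2;   // batch of MatMul input B (broadcast over A's batch)

    // Before this PR: repacking lives inside the Snippets kernel,
    // so every parallel task repacks the B slice it consumes.
    repack_count = 0;
    for (std::size_t task = 0; task < batch_a; ++task)
        repack_b(task % batch_b);
    std::cout << "repacking inside the kernel:  " << repack_count << " calls\n";  // 16

    // After this PR: a reorder primitive repacks B once per B batch
    // just before the kernel is executed.
    repack_count = 0;
    for (std::size_t b = 0; b < batch_b; ++b)
        repack_b(b);
    std::cout << "repacking outside the kernel: " << repack_count << " calls\n";  // 2
}
```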
diff --git a/src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp b/src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp new file mode 100644 index 00000000000000..9af247cd52ecab --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp @@ -0,0 +1,53 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_info.hpp" +#include "snippets/lowered/pass/runtime_optimizer.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +/** + * @class MHAParallelWAOptimizer + * @brief Optimizes the dynamic MHA execution by increasing the parallel work amount: Brgemm's "M" dimension is divided into "parallel_m" + * and "kernel_m". Uses heuristics from snippets::pass::SplitDimensionM for dimension splitting. + * The optimizer performs the following steps: + * - Identifies applicable Brgemm operations within the LinearIR. + * - Finds parameters whose shapes and layouts need to be adjusted after the split. + * - Determines loops that should be adjusted. + */ +class MHAParallelWAOptimizer : public lowered::pass::RuntimeOptimizer { +public: + MHAParallelWAOptimizer() = default; + MHAParallelWAOptimizer(const lowered::LinearIRCPtr& linear_ir, const RuntimeConfigurator* configurator); + + bool run(const lowered::LinearIR& linear_ir) override; + bool applicable() const override { return !m_loops_to_split.empty(); } + +private: + static std::unordered_set<lowered::ExpressionPtr> find_applicable_brgemms(const lowered::LinearIRCPtr& linear_ir); + static std::unordered_set<size_t> find_unsqueezed_params( + const lowered::LinearIRCPtr& linear_ir, + const std::unordered_set<lowered::ExpressionPtr>& brgemms); + static std::vector<lowered::ExpandedLoopInfoPtr> find_loops_to_split( + const lowered::LinearIRCPtr& linear_ir, + const std::unordered_set<size_t>& unsqueezed_params); + + std::vector<lowered::ExpandedLoopInfoPtr> m_loops_to_split{}; + std::unordered_set<size_t> m_unsqueezed_params{}; + std::vector<std::vector<size_t>> m_optimized_layouts{}; + std::vector<size_t> m_dim_M_idces{}; + size_t m_concurrency = 0; + + static const size_t m_dim_M_idx; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov \ No newline at end of file
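Editor's note: the header above delegates the actual splitting heuristics to snippets::pass::SplitDimensionM. A hedged re-implementation of its simplest "ideal" case (illustrative only; `split_m` is not the real API) shows how M is divided into a parallel part and a kernel part:

```cpp
#include <cstddef>
#include <iostream>
#include <utility>

// Split m_dim into {parallel_m, kernel_m} so that batch_dim * parallel_m
// covers the optimal parallel work amount when such a divisor exists.
std::pair<std::size_t, std::size_t> split_m(std::size_t batch_dim, std::size_t m_dim, std::size_t concurrency) {
    const std::size_t lower_bound = concurrency / batch_dim;
    if (lower_bound * batch_dim == concurrency && m_dim % lower_bound == 0)
        return {lower_bound, m_dim / lower_bound};  // every thread runs the kernel exactly once
    return {1, m_dim};  // no split (the real pass tries further fallbacks)
}

int main() {
    // batch = 20, M = 32, 40 threads: M is split into 2 x 16, giving 20 * 2 == 40 parallel tasks.
    const auto split = split_m(20, 32, 40);
    std::cout << "parallel_m = " << split.first << ", kernel_m = " << split.second << "\n";
}
```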
diff --git a/src/common/snippets/include/snippets/lowered/pass/pass.hpp b/src/common/snippets/include/snippets/lowered/pass/pass.hpp index 446f96d30a27cf..2758ab85070341 --- a/src/common/snippets/include/snippets/lowered/pass/pass.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/pass.hpp @@ -67,6 +67,21 @@ class Pass : public PassBase { virtual bool run(lowered::LinearIR& linear_ir) = 0; }; +/** + * @interface ConstPass + * @brief Base class for LIR passes which are performed on a full LIR body but don't change it + * @ingroup snippets + */ +class ConstPass : public PassBase { +public: + /** + * @brief Apply the pass to the Linear IR + * @param linear_ir the target Linear IR + * @return status of the pass + */ + virtual bool run(const lowered::LinearIR& linear_ir) = 0; +}; + /** * @interface RangedPass * @brief Base class for LIR passes which are performed on a range of a LIR body @@ -114,6 +129,7 @@ class PassPipeline { void register_positioned_passes(const std::vector& pos_passes); void run(lowered::LinearIR& linear_ir) const; + void run(const lowered::LinearIR& linear_ir) const; void run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) const; /** diff --git a/src/common/snippets/include/snippets/lowered/pass/runtime_optimizer.hpp b/src/common/snippets/include/snippets/lowered/pass/runtime_optimizer.hpp new file mode 100644 index 00000000000000..ed37a1c6c58bca --- /dev/null +++ b/src/common/snippets/include/snippets/lowered/pass/runtime_optimizer.hpp @@ -0,0 +1,52 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/pass/pass.hpp" +#include "snippets/runtime_configurator.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +/** + * @class RuntimeOptimizer + * @brief Base class for runtime optimizers that operate on LinearIR and RuntimeConfigurator during + * RuntimeConfigurator::update stage. + */ +class RuntimeOptimizer : public ConstPass { +public: + RuntimeOptimizer() = default; + RuntimeOptimizer(const RuntimeConfigurator* configurator) : m_configurator(configurator) { + OPENVINO_ASSERT(configurator, "RuntimeConfigurator mustn't be nullptr"); + } + /** + * @brief Defines if this pass is applicable. If it is not applicable, its registration in the pass pipeline can be skipped. + */ + virtual bool applicable() const = 0; + + /** + * @brief Creates an instance of the specified pass type and checks if it is applicable. + * If the pass is applicable, it is registered in the provided pipeline. + * @param pipeline The pipeline in which the pass should be registered. + * @param args The arguments to be forwarded to the pass constructor. + */ + template <typename T, typename... Args, typename = std::enable_if_t<std::is_base_of<RuntimeOptimizer, T>::value>> + static void register_if_applicable(PassPipeline& pipeline, Args&&... args) {
+ auto pass = std::make_shared<T>(std::forward<Args>(args)...); + if (pass->applicable()) { + pipeline.register_pass(pass); + } + } + +protected: + const RuntimeConfigurator* m_configurator = nullptr; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov diff --git a/src/common/snippets/include/snippets/lowered/pass/serialize_base.hpp b/src/common/snippets/include/snippets/lowered/pass/serialize_base.hpp index 51cc528a155a00..560744f4eb09d8 --- a/src/common/snippets/include/snippets/lowered/pass/serialize_base.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/serialize_base.hpp @@ -16,9 +16,9 @@ namespace pass { * @brief Base class for LinearIR serialization passes * @ingroup snippets */ -class SerializeBase : public Pass { +class SerializeBase : public ConstPass { public: - OPENVINO_RTTI("SerializeBase", "Pass") + OPENVINO_RTTI("SerializeBase", "ConstPass") SerializeBase(const std::string& xml_path); protected: diff --git a/src/common/snippets/include/snippets/lowered/pass/serialize_control_flow.hpp b/src/common/snippets/include/snippets/lowered/pass/serialize_control_flow.hpp index 602e9d9df7ce32..2e8f91aed6c08d --- a/src/common/snippets/include/snippets/lowered/pass/serialize_control_flow.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/serialize_control_flow.hpp @@ -22,12 +22,7 @@ class SerializeControlFlow : public SerializeBase { OPENVINO_RTTI("SerializeControlFlow", "Pass", SerializeBase) SerializeControlFlow(const std::string& xml_path, bool update_dynamic_ops = false) : SerializeBase(xml_path), m_update_dynamic_ops{update_dynamic_ops} {} - - bool run(LinearIR& linear_ir) override { - return run(const_cast<const LinearIR&>(linear_ir)); - } - // We need a const method to run from functions that can't change LIR - bool run(const LinearIR& linear_ir); + bool run(const LinearIR& linear_ir) override; private: const bool m_update_dynamic_ops = false; diff --git a/src/common/snippets/include/snippets/lowered/pass/serialize_data_flow.hpp b/src/common/snippets/include/snippets/lowered/pass/serialize_data_flow.hpp index ce5b3855400264..ecbc1a834ce388 --- a/src/common/snippets/include/snippets/lowered/pass/serialize_data_flow.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/serialize_data_flow.hpp @@ -23,12 +23,7 @@ class SerializeDataFlow : public SerializeBase { public: OPENVINO_RTTI("SerializeDataFlow", "Pass", SerializeBase) SerializeDataFlow(const std::string& xml_path) : SerializeBase(xml_path) {} - - bool run(LinearIR& linear_ir) override { - return run(const_cast<const LinearIR&>(linear_ir)); - } - // We need a const method to run from functions that can't change LIR - bool run(const LinearIR& linear_ir); + bool run(const LinearIR& linear_ir) override; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 10c15a4621a72a..866e98843fcd50 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -4,9 +4,9 @@ #pragma once +#include "snippets/kernel_executor_table.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_info.hpp" -#include "snippets/kernel_executor_table.hpp" #include "snippets/lowered/pass/pass.hpp" namespace ov { @@ -44,12 +44,15 @@ class RuntimeConfig { size_t tensor_rank = 0; size_t tile_rank = 0; + std::vector<ov::snippets::VectorDims> io_shapes = {}; + std::vector<std::vector<size_t>> io_layouts = {}; std::vector<ov::snippets::VectorDims> io_data_offsets = {};
ov::snippets::VectorDims master_shape = {}; size_t buffer_scratchpad_size = 0; std::vector<size_t> buffer_cluster_offsets {}; KernelExecutorTablePtr kernel_executor_table = std::make_shared<KernelExecutorTable>(); + std::vector<ov::snippets::VectorDims> latest_shapes = {}; }; /** @@ -83,18 +86,62 @@ class RuntimeConfigurator { */ void reset_kernel_executor_table() const; -protected: + // Getters for private members + std::shared_ptr<RuntimeConfig> get_config() const { return m_config; } + size_t get_io_num() const { return m_io_num; } + size_t get_in_num() const { return m_in_num; } + const std::vector<snippets::lowered::PortDescriptorPtr>& get_io_descs() const { return m_io_descs; } + const std::vector<size_t>& get_io_data_sizes() const { return m_io_data_sizes; } + const std::map<size_t, std::set<lowered::ExpressionPtr>>& get_dynamic_buffer_clusters() const { return m_dynamic_buffer_clusters; } + /** - * @brief Update RuntimeConfig based on LinearIR + * @brief Computes the offsets for each dimension of a tensor shape. + * + * This function calculates the offsets for each dimension of a tensor shape, which represent the distance between + * consecutive elements of the corresponding dimension. If a dimension size is 1, the next dimension starts + * immediately, and the stride is 0. + * @param shape The shape for offset computation. + * @param idx The index to get the corresponding offsets and io_data_sizes. + * @param idx_stride Defines the number of dimensions that should be skipped in the offsets vector. + */ + void compute_offsets(const ov::snippets::VectorDims& shape, size_t idx, size_t idx_stride) const; + struct UnifiedLoopInfoRtParams { + size_t work_amount = 0; + std::vector<int64_t> ptr_increments; + std::vector<int64_t> finalization_offsets; + }; + /** + * @brief Retrieves the runtime parameters for a given UnifiedLoopInfo. + * @param unified_loop_info The UnifiedLoopInfo for which the runtime parameters are to be retrieved. + * @return A UnifiedLoopInfoRtParams object containing the runtime parameters. + */ + static UnifiedLoopInfoRtParams get_loop_runtime_params(const lowered::UnifiedLoopInfoPtr& unified_loop_info); + using LoopInfoRuntimeParamsMap = std::unordered_map<lowered::UnifiedLoopInfoPtr, UnifiedLoopInfoRtParams>; + /** + * @brief Update Loop information in LinearIR: Unified and ExpandedLoopInfo * @param linear_ir LinearIR - * @todo Ticket 148891: Rewrite on PassPipeline */ - virtual void update(const lowered::LinearIRCPtr& linear_ir); + static void update_loop_info(const lowered::LinearIRCPtr& linear_ir); + /** + * @brief Updates the ExpandedLoopInfo based on the initialized runtime parameters. + * @param expanded_loop_info The ExpandedLoopInfo to be updated. + * @param initialized_info_map A map containing the initialized runtime parameters for UnifiedLoopInfo.
+ */ + static void update_expanded_loop_info(const lowered::ExpandedLoopInfoPtr& expanded_loop_info, + LoopInfoRuntimeParamsMap& initializated_info_map); /** * @brief Update tensor rank based on master shape * @param master_shape Master shape */ - virtual void update_tensor_rank(const ov::snippets::VectorDims& master_shape); + virtual void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const; + +protected: + /** + * @brief Update RuntimeConfig based on LinearIR + * @param linear_ir LinearIR + * @todo Ticket 148891: Rewrite on PassPipeline + */ + virtual void update(const lowered::LinearIRCPtr& linear_ir); /** * @brief Allocate and intialize fields in RuntimeConfig and RuntimeConfigurator * @param linear_ir LinearIR @@ -120,21 +167,6 @@ class RuntimeConfigurator { * @param linear_ir LinearIR */ virtual void init_tensor_rank(const lowered::LinearIRCPtr& linear_ir) const; - - struct UnifiedLoopInfoRtParams { - size_t work_amount = 0; - std::vector ptr_increments; - std::vector finalization_offsets; - }; - static UnifiedLoopInfoRtParams get_loop_runtime_params(const lowered::UnifiedLoopInfoPtr& unified_loop_info); - using LoopInfoRuntimeParamsMap = std::unordered_map; - /** - * @brief Update Loop informations in LinearIR: Unified and ExpandedLoopInfo - * @param linear_ir LinearIR - */ - static void update_loop_info(const lowered::LinearIRCPtr& linear_ir); - static void update_expanded_loop_info(const lowered::ExpandedLoopInfoPtr& expanded_loop_info, - LoopInfoRuntimeParamsMap& initializated_info_map); /** * @brief Update Buffer scratchpad size and offsets if needed * Note: `update_loop_info` must be called before @@ -146,8 +178,7 @@ class RuntimeConfigurator { * @param shapes shapes used in offsets computation * @param layouts layouts used in offsets computation */ - void update_data_offsets(const std::vector& shapes, - const std::vector>& layouts) const; + void update_data_offsets() const; /** * @brief Extract shapes from m_io_descs */ @@ -157,43 +188,6 @@ class RuntimeConfigurator { */ std::vector> extract_layouts() const; - class MHAParallelWAOptimizer { - public: - MHAParallelWAOptimizer() = default; - MHAParallelWAOptimizer(const ov::snippets::lowered::LinearIRCPtr& linear_ir, RuntimeConfigurator* configurator); - /** - * @brief Checks if the current master shape can be optimized, and if yes, updates all the necessary runtime information - * @return status if the optimization is applied - */ - bool optimize(); - - private: - /** - * @brief Checks if optimizer is enabled - * @todo Ticket 148891: when RuntimeConfigurator::update will be rewritten on PassPipeline, this method should be removed - * We will not just register MHAParallelWAOptimizer in case if it is not needed - */ - bool enabled() const; - - static std::unordered_set find_applicable_brgemms(const ov::snippets::lowered::LinearIRCPtr& linear_ir); - static std::unordered_set find_unsqueezed_params( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::unordered_set& brgemms); - static std::vector find_loops_to_split( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::unordered_set& unsqueezed_params); - - RuntimeConfigurator* configurator = nullptr; - - std::vector loops_to_split{}; - std::unordered_set unsqueezed_params{}; - std::vector> optimized_layouts{}; - std::vector m_dim_idces{}; - size_t concurrency = 0; - - static const size_t m_dim_idx; - } m_optimizer; - std::shared_ptr m_config = nullptr; size_t m_io_num = 0; @@ -203,7 +197,14 @@ class RuntimeConfigurator { // 
[cluster_id -> buffer expressions ] std::map<size_t, std::set<lowered::ExpressionPtr>> m_dynamic_buffer_clusters = {}; - std::vector<ov::snippets::VectorDims> m_latest_shapes = {}; + // WA: until ticket 148891 is implemented, two pass pipelines for runtime optimizers are necessary, since different + // optimizers must be called at different pipeline stages: + // - Intermediate optimizers must be called right after `update_loop_info` + // - Final optimizers must be called after all other RuntimeConfigurator's update methods + // When all updates are rewritten as a PassPipeline, PositionedPasses can be used to precisely define the place of + // the additional optimizers + lowered::pass::PassPipeline m_intermediate_optimizers; + lowered::pass::PassPipeline m_final_optimizers; }; } // namespace snippets diff --git a/src/common/snippets/include/snippets/utils/linear_ir_pass_dumper.hpp b/src/common/snippets/include/snippets/utils/linear_ir_pass_dumper.hpp index 85abfc9a91ab31..c8c145d7eac075 --- a/src/common/snippets/include/snippets/utils/linear_ir_pass_dumper.hpp +++ b/src/common/snippets/include/snippets/utils/linear_ir_pass_dumper.hpp @@ -16,7 +16,7 @@ namespace snippets { class LIRPassDump { public: - explicit LIRPassDump(lowered::LinearIR& linear_ir, std::string pass_name) + explicit LIRPassDump(const lowered::LinearIR& linear_ir, std::string pass_name) : linear_ir(linear_ir), pass_name(std::move(pass_name)), debug_config(linear_ir.get_config().debug_config) { dump("_in"); } @@ -44,7 +44,7 @@ class LIRPassDump { num++; } - lowered::LinearIR& linear_ir; + const lowered::LinearIR& linear_ir; const std::string pass_name; const DebugCapsConfig& debug_config; }; diff --git a/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp b/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp new file mode 100644 index 00000000000000..2f57d6422cf11d --- /dev/null +++ b/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp @@ -0,0 +1,175 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/lowered/pass/mha_parallel_wa_optimizer.hpp" + +#include "snippets/itt.hpp" +#include "snippets/lowered/loop_info.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/pass/split_dimension_m.hpp" +#include "snippets/utils/loop_utils.hpp" +#include "snippets/utils/utils.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +using namespace ov::snippets::pass; + +const size_t MHAParallelWAOptimizer::m_dim_M_idx = 1; + +MHAParallelWAOptimizer::MHAParallelWAOptimizer(const lowered::LinearIRCPtr& linear_ir, const RuntimeConfigurator* configurator) + : lowered::pass::RuntimeOptimizer(configurator) { + if (linear_ir->get_config().m_enable_domain_optimization || !linear_ir->is_dynamic()) + return; + + const auto brgemms = find_applicable_brgemms(linear_ir); + if (brgemms.empty()) + return; + + m_concurrency = linear_ir->get_config().m_min_parallel_work_amount; + m_unsqueezed_params = find_unsqueezed_params(linear_ir, brgemms); + OPENVINO_ASSERT(!m_unsqueezed_params.empty(), "unsqueezed_params mustn't be empty after initialization"); + m_loops_to_split = find_loops_to_split(linear_ir, m_unsqueezed_params); + + m_dim_M_idces.resize(configurator->get_io_num()); + m_optimized_layouts.resize(configurator->get_io_num()); + for (size_t i = 0; i < configurator->get_io_num(); ++i) { + const auto& layout = configurator->get_io_descs()[i]->get_layout(); + const auto dim_idx = i < configurator->get_in_num() ?
utils::get_input_dim_idx(layout, m_dim_M_idx) + : utils::get_output_dim_idx(layout, m_dim_M_idx); + m_dim_M_idces[i] = dim_idx; + const auto m_idx = i < configurator->get_in_num() ? dim_idx : layout.size() - 2; + m_optimized_layouts[i] = SplitDimensionM::get_updated_order(layout, m_idx); + } +} + +bool MHAParallelWAOptimizer::run(const lowered::LinearIR& linear_ir) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::MHAParallelWAOptimizer") + const auto& config = m_configurator->get_config(); + size_t new_batch_dim, new_kernel_dim; + if (!SplitDimensionM::split(config->master_shape, m_concurrency, new_batch_dim, new_kernel_dim)) + return false; + auto& master_shape = config->master_shape; + *++master_shape.rbegin() = new_kernel_dim; + master_shape.insert(master_shape.cbegin() + master_shape.size() - 2, new_batch_dim); + m_configurator->update_tensor_rank(master_shape); + + RuntimeConfigurator::LoopInfoRuntimeParamsMap initialized_info; + auto updater = [&](const lowered::LoopInfoPtr& loop_info) { + if (const auto unified_loop_info = ov::as_type_ptr(loop_info)) { + if (initialized_info.count(unified_loop_info) == 0) { + if (!ov::is_type(unified_loop_info)) + unified_loop_info->set_work_amount(new_kernel_dim); + snippets::utils::update_data_pointer_shifts(unified_loop_info); + initialized_info[unified_loop_info] = RuntimeConfigurator::get_loop_runtime_params(unified_loop_info); + } + } else if (const auto expanded_loop_info = ov::as_type_ptr(loop_info)) { + m_configurator->update_expanded_loop_info(expanded_loop_info, initialized_info); + } else { + OPENVINO_THROW("Failed to update loop info: unknown type!"); + } + }; + lowered::LoopInfoSet updated_loops; + for (const auto& loop : m_loops_to_split) { + loop->apply(updater, updated_loops); + } + + for (size_t i = 0; i < m_configurator->get_io_num(); ++i) { + config->io_shapes[i] = m_unsqueezed_params.count(i) + ? SplitDimensionM::unsqueeze_m_dim(config->io_shapes[i], m_dim_M_idces[i]) + : SplitDimensionM::reshape_m_dim(config->io_shapes[i], m_dim_M_idces[i], new_batch_dim, new_kernel_dim); + } + config->io_layouts = m_optimized_layouts; + return true; +} + +std::unordered_set MHAParallelWAOptimizer::find_applicable_brgemms(const lowered::LinearIRCPtr& linear_ir) { + auto is_brgemm = [](const lowered::ExpressionPtr& expr) { + return ov::is_type(expr->get_node()); + }; + auto brgemm_it = std::find_if(linear_ir->begin(), linear_ir->end(), is_brgemm); + std::unordered_set brgemms; + while (brgemm_it != linear_ir->end()) { + brgemms.insert(*brgemm_it); + brgemm_it = std::find_if(std::next(brgemm_it), linear_ir->end(), is_brgemm); + } + const auto& loop_manager = linear_ir->get_loop_manager(); + auto applicable_brgemm = [&loop_manager](const lowered::ExpressionPtr& expr) { + const auto& loop_idces = expr->get_loop_ids(); + if (loop_idces.empty()) + return false; + const auto& outermost_loop = loop_manager->get_loop_info(loop_idces[0]); + if (!snippets::utils::is_dynamic_value(outermost_loop->get_work_amount())) + return false; + bool loop_by_m = true; + outermost_loop->iterate_through_ports([&loop_by_m](const lowered::LoopPort& port) { + if (port.is_incremented && port.dim_idx != m_dim_M_idx) + loop_by_m = false; + }); + return loop_by_m; + }; + return std::all_of(brgemms.begin(), brgemms.end(), applicable_brgemm) ? 
brgemms : std::unordered_set<lowered::ExpressionPtr>{}; +} + +std::unordered_set<size_t> MHAParallelWAOptimizer::find_unsqueezed_params( + const lowered::LinearIRCPtr& linear_ir, + const std::unordered_set<lowered::ExpressionPtr>& brgemms) { + const auto& params = linear_ir->get_parameters(); + std::unordered_set<size_t> unsqueezed_params; + auto add_param = [&params, &unsqueezed_params](const lowered::ExpressionPtr& expr) { + if (ov::is_type<ov::op::v0::Parameter>(expr->get_node())) { + auto found_param = std::find(params.begin(), params.end(), expr); + OPENVINO_ASSERT(found_param != params.end(), "find_param didn't find a parameter for expr"); + unsqueezed_params.insert(std::distance(params.begin(), found_param)); + } + }; + + std::unordered_set<lowered::ExpressionPtr> visited; + for (const auto& brgemm : brgemms) { + const auto& brgemm_b_input = brgemm->get_input_port_connector(1)->get_source().get_expr(); + utils::visit_path(brgemm_b_input, visited, add_param, true); + } + return unsqueezed_params; +} + +std::vector<lowered::ExpandedLoopInfoPtr> MHAParallelWAOptimizer::find_loops_to_split( + const lowered::LinearIRCPtr& linear_ir, + const std::unordered_set<size_t>& unsqueezed_params) { + const auto loop_manager = linear_ir->get_loop_manager(); + std::set<size_t> loop_idces_to_split; + std::vector<size_t> prev_loop_idces; + + auto add_loop_idx_to_split = [&](const lowered::ExpressionPtr& expr) { + const auto& loop_idces = expr->get_loop_ids(); + if (loop_idces != prev_loop_idces) { + prev_loop_idces = loop_idces; + for (const auto& loop_id : loop_idces) { + const auto expanded_loop_info = loop_manager->get_loop_info<lowered::ExpandedLoopInfo>(loop_id); + if (expanded_loop_info->get_dim_idx() == m_dim_M_idx) { + loop_idces_to_split.insert(loop_id); + } + } + } + }; + + size_t i = 0; + std::unordered_set<lowered::ExpressionPtr> visited; + for (const auto& param : linear_ir->get_parameters()) { + if (unsqueezed_params.count(i++)) + continue; + utils::visit_path(param, visited, add_loop_idx_to_split, false); + } + + const auto& loops_map = linear_ir->get_loop_manager()->get_map(); + std::vector<lowered::ExpandedLoopInfoPtr> loops_to_split; + for (const auto& id : loop_idces_to_split) + loops_to_split.push_back(ov::as_type_ptr<lowered::ExpandedLoopInfo>(loops_map.at(id))); + return loops_to_split; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/lowered/pass/pass.cpp b/src/common/snippets/src/lowered/pass/pass.cpp index f5b902a1a17b8c..235d248d8e9838 --- a/src/common/snippets/src/lowered/pass/pass.cpp +++ b/src/common/snippets/src/lowered/pass/pass.cpp @@ -27,6 +27,23 @@ void PassPipeline::register_pass(const std::shared_ptr<PassBase>& pass) { m_passes.push_back(pass); } +void PassPipeline::run(const lowered::LinearIR& linear_ir) const { + for (const auto& pass : m_passes) { + OPENVINO_ASSERT(pass != nullptr, "PassPipeline has empty pass!"); + SNIPPETS_DEBUG_LIR_PASS_DUMP(linear_ir, pass); + + if (m_pass_config->is_disabled(pass->get_type_info())) { + continue; + } + const auto const_pass = std::dynamic_pointer_cast<ConstPass>(pass); + OPENVINO_ASSERT(const_pass != nullptr, + "Unexpected pass (", + pass->get_type_info(), + ") is registered in PassPipeline.
Only ConstPass is allowed."); + const_pass->run(linear_ir); + } +} + void PassPipeline::run(LinearIR& linear_ir) const { run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); } @@ -41,6 +58,8 @@ void PassPipeline::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearI } if (auto lir_pass = std::dynamic_pointer_cast(pass)) { lir_pass->run(linear_ir); + } else if (auto const_pass = std::dynamic_pointer_cast(pass)) { + const_pass->run(linear_ir); } else if (auto ranged_pass = std::dynamic_pointer_cast(pass)) { ranged_pass->run(linear_ir, begin, end); } else { diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 0f0cc225173479..6348f89598523d 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -51,9 +51,12 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { const auto parent = transpose->get_input_node_shared_ptr(0); const auto child = transpose->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); auto is_brgemm_case = ov::is_type(parent) || ov::is_type(child); + auto decomposition_case = true; // Check for Transpose parent is MatMul inside Subgraph if (const auto subgraph = ov::as_type_ptr(parent)) { if (GetSnippetsSubgraphType(subgraph) != SnippetsSubgraphType::Completed) { + // Transpose decomposition is supported only for Transpose nodes right after Subgraph's parameters + decomposition_case = false; const auto body = subgraph->body_ptr(); const auto subgraph_output = body->get_results()[transpose->input_value(0).get_index()]->get_input_node_shared_ptr(0); is_brgemm_case = is_brgemm_case || ov::is_type(subgraph_output); @@ -63,7 +66,7 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { const auto& order = as_type_ptr(n->get_input_node_shared_ptr(1)); if (order) { const auto order_value = order->cast_vector(); - return (TransposeDecomposition::is_supported_transpose_order(order_value)) || + return (decomposition_case && TransposeDecomposition::is_supported_transpose_order(order_value)) || (is_brgemm_case && FuseTransposeBrgemm::is_supported_transpose_order(order_value)); } } diff --git a/src/common/snippets/src/pass/split_dimension_m.cpp b/src/common/snippets/src/pass/split_dimension_m.cpp index 0f50ad27931e04..ae95a371483163 100644 --- a/src/common/snippets/src/pass/split_dimension_m.cpp +++ b/src/common/snippets/src/pass/split_dimension_m.cpp @@ -34,6 +34,8 @@ bool SplitDimensionM::is_supported_matmul(const std::shared_ptr& std::pair SplitDimensionM::get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { std::pair splited = { 1, m_dim }; + // Ideal case #1: M can be split on the parts one of which complements the batch dimension to the optimal parallel work amount + // In this case, each thread will execute the Snippets kernel once const size_t lower_bound = optimal_parallelism_work_amount / batch_dim; if (lower_bound * batch_dim == optimal_parallelism_work_amount && m_dim % lower_bound == 0) { splited.first = lower_bound; @@ -42,6 +44,19 @@ std::pair SplitDimensionM::get_splited_dimensions(size_t batch_d return splited; } + // Ideal case #2: M is divisible by optimal parallel work amount, and the new_m_dim is big enough + // In this case, each thread will execute the Snippets kernel 'batch_dim' times + if (m_dim % optimal_parallelism_work_amount == 0) { + const auto new_m_dim = m_dim / optimal_parallelism_work_amount; + const size_t min_kernel_m = 64; + if (new_m_dim 
>= min_kernel_m) { + splited.first = optimal_parallelism_work_amount; + splited.second = new_m_dim; + OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); + return splited; + } + } + const size_t upper_bound = utils::div_up(2 * optimal_parallelism_work_amount, batch_dim); for (size_t divisor_0 = upper_bound - 1; divisor_0 > 1; divisor_0--) { size_t divisor_1 = m_dim / divisor_0; diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 9174d93eea3f98..96d13074d042ba 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -7,16 +7,17 @@ #include "snippets/lowered/pass/compute_buffer_allocation_size.hpp" #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_specific_iterations.hpp" -#include "snippets/pass/split_dimension_m.hpp" +#include "snippets/lowered/pass/mha_parallel_wa_optimizer.hpp" #include "snippets/snippets_isa.hpp" -#include "snippets/utils/utils.hpp" #include "snippets/utils/loop_utils.hpp" +#include "snippets/utils/utils.hpp" namespace ov { namespace snippets { using namespace ov::snippets::pass; using namespace ov::snippets::lowered; +using namespace ov::snippets::lowered::pass; #ifdef SNIPPETS_DEBUG_CAPS std::string RuntimeConfig::to_string() const { @@ -51,6 +52,8 @@ const std::shared_ptr& RuntimeConfigurator::get_updated_config(co initialization(linear_ir); update(linear_ir); + // Note: after 'update' is finished, io_shapes can be corrupted, so we move it to latest_shapes to avoid copying + m_config->latest_shapes = std::move(m_config->io_shapes); return m_config; } @@ -60,30 +63,32 @@ void RuntimeConfigurator::initialization(const lowered::LinearIRCPtr& linear_ir) init_buffer_info(linear_ir); OPENVINO_ASSERT(m_io_num > 0, "LinearIR must have parameters and results"); - m_latest_shapes.resize(m_io_num); + m_config->latest_shapes.resize(m_io_num); m_config->io_data_offsets.resize(m_io_num); m_config->tile_rank = linear_ir->get_config().m_loop_depth; - m_optimizer = MHAParallelWAOptimizer(linear_ir, this); + + RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); } void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { m_config->master_shape = linear_ir->get_master_shape(); - update_loop_info(linear_ir); + m_config->io_shapes = extract_shapes(); + m_config->io_layouts = extract_layouts(); + if (linear_ir->is_dynamic()) + update_loop_info(linear_ir); - if (!m_optimizer.optimize()) { - // If the optimization was not applied, offsets are updated using shapes from descriptors - auto shapes = extract_shapes(); - update_data_offsets(shapes, extract_layouts()); - m_latest_shapes = std::move(shapes); - } + m_intermediate_optimizers.run(*linear_ir); // Update KernelExecutor Table should be before `update_buffer_scratchpad_size` // because `ComputeAllocationSize` depends on subtensors which are updated in the table get_kernel_executor_table()->update_state(linear_ir); update_buffer_scratchpad_size(linear_ir); + + update_data_offsets(); + m_final_optimizers.run(*linear_ir); } -void RuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { +void RuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) const { m_config->tensor_rank = master_shape.size(); } @@ -257,8 +262,9 @@ void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRC 
OPENVINO_ASSERT(!utils::is_dynamic_value(m_config->buffer_scratchpad_size), "Buffer scratchpad size must be defined!"); } -void RuntimeConfigurator::update_data_offsets(const std::vector& shapes, - const std::vector>& layouts) const { +void RuntimeConfigurator::update_data_offsets() const { + const auto& shapes = m_config->io_shapes; + const auto& layouts = m_config->io_layouts; OPENVINO_ASSERT(shapes.size() == m_io_num, "Number of custom shapes must be 0 or be equal to m_io_num"); OPENVINO_ASSERT(layouts.size() == m_io_num, "Number of custom layouts must be 0 or be equal to m_io_num"); for (size_t i = 0; i < m_io_num; ++i) { @@ -271,26 +277,17 @@ void RuntimeConfigurator::update_data_offsets(const std::vector& sha // shape: s0, s1, s2 == 1, s3 // offsets: s1*s3, s3, 0, 1 const auto& shape = shapes[i]; - if (shape == m_latest_shapes[i]) + OPENVINO_ASSERT(m_config->tensor_rank >= shape.size(), "Incorrect tensor rank!"); + if (shape == m_config->latest_shapes[i]) continue; - - const auto& layout = layouts[i]; - auto& offsets = m_config->io_data_offsets[i]; - - offsets.resize(m_config->tensor_rank); - std::fill(offsets.begin(), offsets.end(), 0); if (utils::is_dynamic_vdims(shape)) return; - size_t dim_step = m_io_data_sizes[i]; - offsets[offsets.size() - 1] = dim_step; - - OPENVINO_ASSERT(m_config->tensor_rank >= shape.size(), "Incorrect tensor rank!"); const auto idx_stride = m_config->tensor_rank - shape.size(); - for (int i = static_cast(shape.size()) - 2; i >= 0; i--) { - dim_step *= shape[i + 1]; - offsets[i + idx_stride] = shape[i] != 1 ? dim_step : 0; - } + compute_offsets(shape, i, idx_stride); + + auto& offsets = m_config->io_data_offsets[i]; + const auto& layout = layouts[i]; if (!layout.empty()) { std::vector reordered_offsets(offsets.size()); const auto is_input = i < m_in_num; @@ -318,8 +315,21 @@ std::vector> RuntimeConfigurator::extract_layouts() const { return layouts; } +void RuntimeConfigurator::compute_offsets(const ov::snippets::VectorDims& shape, size_t idx, size_t idx_stride) const { + auto& offsets = m_config->io_data_offsets[idx]; + auto dim_step = m_io_data_sizes[idx]; + + offsets.resize(m_config->tensor_rank); + std::fill(offsets.begin(), offsets.end(), 0); + offsets[offsets.size() - 1] = dim_step; + for (int i = static_cast(shape.size()) - 2; i >= 0; i--) { + dim_step *= shape[i + 1]; + offsets[i + idx_stride] = shape[i] != 1 ? 
dim_step : 0; + } +} + void RuntimeConfigurator::set_kernel_executor_table(std::shared_ptr table) const { - OPENVINO_ASSERT(table, "Failed to update Kernel Executo Table: passed table is missed"); + OPENVINO_ASSERT(table, "Failed to update Kernel Executor Table: passed table is missed"); m_config->kernel_executor_table = std::move(table); } @@ -339,181 +349,5 @@ RuntimeConfigurator::UnifiedLoopInfoRtParams RuntimeConfigurator::get_loop_runti }); return rt_params; } - -const size_t RuntimeConfigurator::MHAParallelWAOptimizer::m_dim_idx = 1; - -RuntimeConfigurator::MHAParallelWAOptimizer::MHAParallelWAOptimizer( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - RuntimeConfigurator* configurator) - : configurator(configurator) { - OPENVINO_ASSERT(configurator != nullptr, "Configurator is nullptr"); - - if (linear_ir->get_config().m_enable_domain_optimization || !linear_ir->is_dynamic()) - return; - - const auto brgemms = find_applicable_brgemms(linear_ir); - // Parallel WA optimization is Brgemm related - if (brgemms.empty()) - return; - - concurrency = linear_ir->get_config().m_min_parallel_work_amount; - // At the moment this optimization is Brgemm related so there must be `unsqueezed_params` - unsqueezed_params = find_unsqueezed_params(linear_ir, brgemms); - OPENVINO_ASSERT(!unsqueezed_params.empty(), "unsqueezed_params mustn't be empty after initialization"); - loops_to_split = find_loops_to_split(linear_ir, unsqueezed_params); - - m_dim_idces.resize(configurator->m_io_num); - optimized_layouts.resize(configurator->m_io_num); - for (size_t i = 0; i < configurator->m_io_num; ++i) { - const auto& layout = configurator->m_io_descs[i]->get_layout(); - const auto dim_idx = i < configurator->m_in_num ? utils::get_input_dim_idx(layout, m_dim_idx) - : utils::get_output_dim_idx(layout, m_dim_idx); - m_dim_idces[i] = dim_idx; - optimized_layouts[i] = SplitDimensionM::get_updated_order(layout, i < configurator->m_in_num ? 
dim_idx : layout.size() - 2); - } -} - -bool RuntimeConfigurator::MHAParallelWAOptimizer::enabled() const { - return !loops_to_split.empty(); -} - -bool RuntimeConfigurator::MHAParallelWAOptimizer::optimize() { - OPENVINO_ASSERT(configurator != nullptr, "Configurator is nullptr"); - if (!enabled()) - return false; - - size_t new_batch_dim, new_kernel_dim; - if (!SplitDimensionM::split(configurator->m_config->master_shape, concurrency, new_batch_dim, new_kernel_dim)) - return false; - - auto& master_shape = configurator->m_config->master_shape; - *++master_shape.rbegin() = new_kernel_dim; - master_shape.insert(master_shape.cbegin() + master_shape.size() - 2, new_batch_dim); - configurator->update_tensor_rank(master_shape); - - LoopInfoRuntimeParamsMap initialized_info; - auto updater = [&](const lowered::LoopInfoPtr& loop_info) { - if (const auto unified_loop_info = ov::as_type_ptr(loop_info)) { - if (initialized_info.count(unified_loop_info) == 0) { - if (!ov::is_type(unified_loop_info)) - unified_loop_info->set_work_amount(new_kernel_dim); - utils::update_data_pointer_shifts(unified_loop_info); - initialized_info[unified_loop_info] = get_loop_runtime_params(unified_loop_info); - } - } else if (const auto expanded_loop_info = ov::as_type_ptr(loop_info)) { - configurator->update_expanded_loop_info(expanded_loop_info, initialized_info); - } else { - OPENVINO_THROW("Failed to update loop info: unknown type!"); - } - }; - lowered::LoopInfoSet updated_loops; - for (const auto& loop : loops_to_split) { - loop->apply(updater, updated_loops); - } - - auto shapes = configurator->extract_shapes(); - for (size_t i = 0; i < configurator->m_io_num; ++i) { - shapes[i] = unsqueezed_params.count(i) - ? SplitDimensionM::unsqueeze_m_dim(shapes[i], m_dim_idces[i]) - : SplitDimensionM::reshape_m_dim(shapes[i], m_dim_idces[i], new_batch_dim, new_kernel_dim); - } - configurator->update_data_offsets(shapes, optimized_layouts); - configurator->m_latest_shapes = std::move(shapes); - return true; -} - -std::unordered_set RuntimeConfigurator::MHAParallelWAOptimizer::find_applicable_brgemms( - const lowered::LinearIRCPtr& linear_ir) { - auto is_brgemm = [](const ExpressionPtr& expr) { - return ov::is_type(expr->get_node()); - }; - auto brgemm_it = std::find_if(linear_ir->begin(), linear_ir->end(), is_brgemm); - std::unordered_set brgemms; - while (brgemm_it != linear_ir->end()) { - brgemms.insert(*brgemm_it); - brgemm_it = std::find_if(std::next(brgemm_it), linear_ir->end(), is_brgemm); - } - const auto& loop_manager = linear_ir->get_loop_manager(); - // Brgemm is applicable if it has dynamic loop by M - // The loop by M is necessary since only in this case we can regulate BrgemmExecutor parameters (via loop's work amount) - // Only dynamic loops are applicable since in static case LoopEnd expressions are not updated during code generation and compiled as is - // Ticket: 148805 - auto applicable_brgemm = [&loop_manager](const ExpressionPtr& expr) { - const auto& loop_idces = expr->get_loop_ids(); - if (loop_idces.empty()) - return false; - const auto& outermost_loop = loop_manager->get_loop_info(loop_idces[0]); - if (!utils::is_dynamic_value(outermost_loop->get_work_amount())) - return false; - bool loop_by_m = true; - outermost_loop->iterate_through_ports([&loop_by_m](const LoopPort& port) { - if (port.is_incremented && port.dim_idx != m_dim_idx) - loop_by_m = false; - }); - return loop_by_m; - }; - // Note: if at least one brgemm is inapplicable, the parallel work amount optimization can't be applied - return 
std::all_of(brgemms.begin(), brgemms.end(), applicable_brgemm) ? brgemms : std::unordered_set{}; -} - -std::unordered_set RuntimeConfigurator::MHAParallelWAOptimizer::find_unsqueezed_params( - const lowered::LinearIRCPtr& linear_ir, - const std::unordered_set& brgemms) { - const auto& params = linear_ir->get_parameters(); - std::unordered_set unsqueezed_params; - auto add_param = [¶ms, &unsqueezed_params](const ExpressionPtr& expr) { - if (ov::is_type(expr->get_node())) { - auto found_param = std::find(params.begin(), params.end(), expr); - OPENVINO_ASSERT(found_param != params.end(), "find_param didn't found parameter for expr"); - unsqueezed_params.insert(std::distance(params.begin(), found_param)); - } - }; - - std::unordered_set visited; - for (const auto& brgemm : brgemms) { - const auto& brgemm_b_input = brgemm->get_input_port_connector(1)->get_source().get_expr(); - utils::visit_path(brgemm_b_input, visited, add_param, true); - } - return unsqueezed_params; -} - -std::vector RuntimeConfigurator::MHAParallelWAOptimizer::find_loops_to_split( - const lowered::LinearIRCPtr& linear_ir, - const std::unordered_set& unsqueezed_params) { - const auto loop_manager = linear_ir->get_loop_manager(); - std::set loop_idces_to_split; - std::vector prev_loop_idces; - - auto add_loop_idx_to_split = [&](const ExpressionPtr& expr) { - const auto& loop_idces = expr->get_loop_ids(); - if (loop_idces != prev_loop_idces) { - prev_loop_idces = loop_idces; - for (const auto& loop_id : loop_idces) { - const auto expanded_loop_info = loop_manager->get_loop_info(loop_id); - if (expanded_loop_info->get_dim_idx() == m_dim_idx) { - loop_idces_to_split.insert(loop_id); - } - } - } - }; - - size_t i = 0; - std::unordered_set visited; - // The idea is to traverse LIR down from the M dimension related parameters - // and find all the outermost loops: these loops will be split in runtime - for (const auto& param : linear_ir->get_parameters()) { - // Ops after non related params mustn't be traversed - if (unsqueezed_params.count(i++)) - continue; - utils::visit_path(param, visited, add_loop_idx_to_split, false); - } - - const auto& loops_map = linear_ir->get_loop_manager()->get_map(); - std::vector loops_to_split; - for (const auto& id : loop_idces_to_split) - loops_to_split.push_back(ov::as_type_ptr(loops_map.at(id))); - return loops_to_split; -} - } // namespace snippets } // namespace ov diff --git a/src/common/snippets/tests/include/utils/split_dim_m.hpp b/src/common/snippets/tests/include/utils/split_dim_m.hpp new file mode 100644 index 00000000000000..3e04c2a911d76a --- /dev/null +++ b/src/common/snippets/tests/include/utils/split_dim_m.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace test { +namespace snippets { + +struct InputData { + size_t cur_batch; + size_t cur_m; + size_t concurrency; +}; + +struct ReferenceData { + bool is_split; + size_t batch_m; + size_t kernel_m; +}; + +struct SplitDimensionMParams { + InputData input; + ReferenceData reference; +}; + +class SplitDimensionMTest : public testing::TestWithParam { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index c5932ed690d670..382257f935cc49 100644 --- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ 
b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -204,7 +204,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM_withMul) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHAWOTranspose_SplitM) { const auto& f = MHAWOTransposeSplitMFunction(std::vector{{10, 9216, 128}, {10, 128, 9216}, {10, 9216, 128}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{10, 3, 3072, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); + std::vector{{10, 18, 512, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); model = f.getOriginal(); model_ref = f.getReference(); config.set_concurrency(18); diff --git a/src/common/snippets/tests/src/utils/split_dim_m.cpp b/src/common/snippets/tests/src/utils/split_dim_m.cpp new file mode 100644 index 00000000000000..9e801fceae02e9 --- /dev/null +++ b/src/common/snippets/tests/src/utils/split_dim_m.cpp @@ -0,0 +1,72 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "utils/split_dim_m.hpp" + +#include "common_test_utils/ov_test_utils.hpp" +#include "snippets/pass/split_dimension_m.hpp" +#include "snippets/utils/utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string SplitDimensionMTest::getTestCaseName(testing::TestParamInfo obj) { + const auto& input = obj.param.input; + const auto& reference = obj.param.reference; + std::ostringstream result; + result << "Batch=" << input.cur_batch << "_"; + result << "CurM=" << input.cur_m << "_"; + result << "OptimalParallelWorkAmount=" << input.concurrency << "_"; + result << "IsSplit=" << reference.is_split << "_"; + result << "BatchM=" << reference.batch_m << "_"; + result << "KernelM=" << reference.kernel_m; + return result.str(); +} + +TEST_P(SplitDimensionMTest, SplitDimensionM) { + const auto& input = GetParam().input; + const auto& reference = GetParam().reference; + + // last_dim is fixed since it doesn't affect the SplitDimensionM result. 
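// Editorial walk-through (illustrative, not part of the original test): for InputData{20, 32, 40},
// lower_bound = 40 / 20 = 2; since 2 * 20 == 40 and 32 % 2 == 0, M = 32 is split into batch_m = 2
// and kernel_m = 16, so all 40 parallel tasks run the kernel exactly once (reference {true, 2, 16}).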
+ static const size_t last_dim = 1024; + ov::Shape shape = {input.cur_batch, input.cur_m, last_dim}; + size_t batch_m_dim, new_m_dim; + bool result = ov::snippets::pass::SplitDimensionM::split(shape, + input.concurrency, + batch_m_dim, + new_m_dim); + + ASSERT_EQ(result, reference.is_split); + if (result) { + ASSERT_EQ(batch_m_dim, reference.batch_m); + ASSERT_EQ(new_m_dim, reference.kernel_m); + } +} + +namespace SplitDimensionMInstantiation { +const std::vector split_dimension_cases = { + // Negative test cases: split is not needed + {InputData{40 /*cur_batch*/, 32 /*cur_m*/, 40 /*concurrency*/}, ReferenceData{false /*is_split*/}}, + {InputData{65, 32, 40}, ReferenceData{false}}, + + // Positive test cases + {InputData{20 /*cur_batch*/, 32 /*cur_m*/, 40 /*concurrency*/}, ReferenceData{true /*is_split*/, 2 /*batch_m*/, 16 /*kernel_m*/}}, + {InputData{30, 60, 40}, ReferenceData{true, 2, 30}}, + {InputData{10, 100, 40}, ReferenceData{true, 4, 25}}, + {InputData{15, 45, 40}, ReferenceData{true, 5, 9}}, + {InputData{25, 50, 40}, ReferenceData{true, 2, 25}}, + {InputData{5, 16384, 40}, ReferenceData{true, 8, 2048}}, + {InputData{5, 16384, 32}, ReferenceData{true, 32, 512}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SplitDimensionM, + SplitDimensionMTest, + ::testing::ValuesIn(split_dimension_cases), + SplitDimensionMTest::getTestCaseName); + +} // namespace SplitDimensionMInstantiation +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 1c3d283ab673b1..b2758735b2d27a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -8,10 +8,12 @@ #include "snippets/utils/utils.hpp" #ifndef OPENVINO_ARCH_ARM64 -#include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp" +#include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" +#include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif namespace ov { namespace intel_cpu { +using namespace ov::snippets::lowered::pass; const size_t CPURuntimeConfigurator::rank6D = 6; @@ -41,37 +43,19 @@ CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigur void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { RuntimeConfigurator::initialization(linear_ir); - if (linear_ir->is_dynamic()) { - loopPortsAdjuster = BrgemmCopyBLoopPortsAdjuster(linear_ir); - } +#ifndef OPENVINO_ARCH_ARM64 + RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); + RuntimeOptimizer::register_if_applicable(m_final_optimizers, linear_ir, this); +#endif } void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { - m_config->master_shape = linear_ir->get_master_shape(); - if (linear_ir->is_dynamic()) { - update_loop_info(linear_ir); - } - - if (!m_optimizer.optimize()) { - // If the optimization was not applied, offsets are updated using shapes from descriptors - auto shapes = extract_shapes(); - update_data_offsets(shapes, extract_layouts()); - m_latest_shapes = std::move(shapes); - } + RuntimeConfigurator::update(linear_ir); if (linear_ir->is_dynamic()) - loopPortsAdjuster.optimize(); - - // Update KernelExecutor Table should be before 
`update_buffer_scratchpad_size` - // because `ComputeAllocationSize` depends on subtensors which are updated in the table - get_kernel_executor_table()->update_state(linear_ir); - update_buffer_scratchpad_size(linear_ir); - - if (linear_ir->is_dynamic()) { update_loop_args(linear_ir); - } } -void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { +void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) const { m_config->tensor_rank = std::max(master_shape.size(), rank6D); } @@ -101,40 +85,5 @@ void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::Linea } } } -#ifdef OPENVINO_ARCH_ARM64 -CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { -} - -void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() { -} -#else -CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { - const auto& pass = std::make_shared(); - pass->run(*linear_ir); - const auto& affected_uni_loops = pass->get_affected_loops(); - const auto& loop_map = linear_ir->get_loop_manager()->get_map(); - for (const auto& p : loop_map) { - if (const auto& exp_loop = ov::as_type_ptr(p.second)) { - const auto& uni_loop = exp_loop->get_unified_loop_info(); - if (affected_uni_loops.count(uni_loop)) - m_affected_uni2exp_map[uni_loop].push_back(exp_loop); - } - } -} - -void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() { - for (const auto& p : m_affected_uni2exp_map) { - const auto& uni_loop = p.first; - const auto& exp_loops = p.second; - snippets::RuntimeConfigurator::LoopInfoRuntimeParamsMap initialized_info; - if (intel_cpu::pass::AdjustBrgemmCopyBLoopPorts::update_loop_info(uni_loop)) { - initialized_info[uni_loop] = get_loop_runtime_params(uni_loop); - for (const auto& exp_loop : exp_loops) - update_expanded_loop_info(exp_loop, initialized_info); - } - } -} -#endif - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index d8ef9772e813ff..42ce35a3c66c2b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -4,10 +4,10 @@ #pragma once -#include "snippets/runtime_configurator.hpp" - -#include "snippets/lowered/port_descriptor.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" +#include "memory_desc/cpu_blocked_memory_desc.h" +#include "snippets/lowered/port_descriptor.hpp" +#include "snippets/runtime_configurator.hpp" namespace ov { namespace intel_cpu { @@ -22,48 +22,25 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { #endif std::vector loop_args = {}; + std::unordered_map m_in_requested_descs = {}; }; class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { public: CPURuntimeConfigurator(); -protected: /** - * @brief Update RuntimeConfig based on LinearIR + * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig * @param linear_ir LinearIR */ + void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; +protected: void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; - /** - * @brief Update tensor rank based on master shape - * @param master_shape 
Master shape - */ - void update_tensor_rank(const ov::snippets::VectorDims& master_shape) override; - /** - * @brief Initializes tensor rank of config - * @param linear_ir LinearIR - */ + void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const override; void init_tensor_rank(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const override; void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; - /** - * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig - * @param linear_ir LinearIR - */ - void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; static const size_t rank6D; - - class BrgemmCopyBLoopPortsAdjuster { - public: - BrgemmCopyBLoopPortsAdjuster() = default; - BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir); - - void optimize(); - - private: - std::unordered_map> m_affected_uni2exp_map; - } loopPortsAdjuster; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index 9b521cdb3b57c7..7257e31369bd66 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -17,13 +17,7 @@ #include #include -#include "convert.h" #include "cpu/x64/cpu_isa_traits.hpp" -#include "nodes/common/cpu_convert.h" -#include "nodes/common/cpu_memcpy.h" -#include "nodes/common/reorder_prim.h" -#include "openvino/core/parallel.hpp" -#include "shape_inference/shape_inference_pass_through.hpp" #include "utils/precision_support.h" #include "nodes/executors/executor.hpp" #include "nodes/executors/transpose_list.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index ee24dd66493204..a23835d398cbe7 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -35,6 +35,7 @@ #include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp" #include "transformations/snippets/x64/pass/remove_converts.hpp" #include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp" +#include "transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp" #include "transformations/snippets/x64/pass/enforce_precision.hpp" #include "transformations/snippets/x64/shape_inference.hpp" #include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp" @@ -79,14 +80,14 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { const BufferScratchpadAllocator& allocator) : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) {} - void exec(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { const auto& callable = m_schedule->get_callable(); auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { init_call_args(call_args, inMemPtrs, outMemPtrs, ithr); }; - auto caller = [&](jit_snippets_call_args& call_args, const size_t* indexes) { - callable(&call_args, indexes); + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { + callable(&call_args, indexes.data()); }; if (m_parallel_exec_domain.size() == rank6D) { @@ -127,7 +128,7 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { reset_exec_table_state = snippet_config->kernel_executor_table->get_state_reset(); } - void 
exec(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { const auto& callable = m_schedule->get_callable(); OPENVINO_ASSERT(data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!"); @@ -144,7 +145,7 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { init_call_args(call_args, ithr); }; - auto caller = [&](jit_snippets_call_args& call_args, const size_t* indexes) { + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); callable(&call_args); }; @@ -181,17 +182,17 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { } inline void update_ptrs(jit_snippets_call_args& call_args, const std::vector& src_ptrs, - const std::vector& dst_ptrs, const size_t* indexes) const { + const std::vector& dst_ptrs, const std::vector& indexes) const { for (size_t i = 0; i < src_ptrs.size(); i++) { auto i_ptr = src_ptrs[i]; - for (size_t j = 0; j < data_offsets[i].size() - 1; j++) { + for (size_t j = 0; j < indexes.size(); j++) { i_ptr += data_offsets[i][j] * indexes[j]; } call_args.src_ptrs[i] = i_ptr; } for (size_t i = 0; i < dst_ptrs.size(); i++) { auto i_ptr = dst_ptrs[i]; - for (size_t j = 0; j < data_offsets[i + src_ptrs.size()].size() - 1; j++) { + for (size_t j = 0; j < indexes.size(); j++) { i_ptr += data_offsets[i + src_ptrs.size()][j] * indexes[j]; } call_args.dst_ptrs[i] = i_ptr; @@ -648,6 +649,8 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() { } SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::Before, ov::snippets::pass::PropagatePrecision, ov::intel_cpu::pass::BrgemmToBrgemmCPU); + SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::After, ov::intel_cpu::pass::BrgemmToBrgemmCPU, + ov::intel_cpu::pass::EliminateBrgemmCopyB); SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64(Place::PipelineEnd, ov::intel_cpu::pass::RemoveConverts); SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(Place::PipelineEnd, ov::intel_cpu::pass::MulAddToFMA); @@ -782,7 +785,12 @@ void Subgraph::prepareParams() { snippet->get_runtime_configurator()->set_kernel_executor_table(code_gen->get()->lowering_result.kernel_executor_table); } const auto& snippet_config = ov::as_type_ptr(snippet->update_runtime_config()); - return std::make_shared(key.attrs, code_gen, start_offset_in, start_offset_out, snippet_config, allocator); + return std::make_shared(key.attrs, + code_gen, + start_offset_in, + start_offset_out, + snippet_config, + allocator); } else { // Static case: // 1. Update runtime config to get static scheduling data (io data offsets, parallel domain) which will be compiled in JIT code @@ -793,7 +801,12 @@ void Subgraph::prepareParams() { [&snippet_config](const SubgraphCodeGeneratorKey& key) -> std::shared_ptr { return std::make_shared(key.attrs, snippet_config); }); - return std::make_shared(key.attrs, code_gen_result.first, start_offset_in, start_offset_out, snippet_config, allocator); + return std::make_shared(key.attrs, + code_gen_result.first, + start_offset_in, + start_offset_out, + snippet_config, + allocator); } }; @@ -846,7 +859,7 @@ bool Subgraph::created() const { void Subgraph::execute(dnnl::stream strm) { OPENVINO_ASSERT(execPtr, "Can't execute Subgraph node. 
Primitive wasn't created");
-    execPtr->exec(srcMemPtrs, dstMemPtrs);
+    execPtr->execute(strm, srcMemPtrs, dstMemPtrs);
 }
 
 void Subgraph::executeDynamicImpl(dnnl::stream strm) {
@@ -893,7 +906,16 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr<Subgraph::SubgraphAttrs>& snippet_attrs,
     m_buffer_scratchpad_size = snippet_config->buffer_scratchpad_size;
     OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), "Undefined buffer scratchpad size!");
-    m_buffer_scratchpad = allocator(static_cast<size_t>(m_nthreads) * m_buffer_scratchpad_size);
+    m_internal_buffer_size = static_cast<size_t>(m_nthreads) * m_buffer_scratchpad_size;
+    m_in_requested_descs = snippet_config->m_in_requested_descs;
+    const auto external_repacking_buffer_size =
+        std::accumulate(m_in_requested_descs.begin(),
+                        m_in_requested_descs.end(),
+                        size_t(0),
+                        [](size_t sum, const std::pair<size_t, CpuBlockedMemoryDescPtr>& requested_desc_elem) {
+                            return sum + requested_desc_elem.second->getCurrentMemSize();
+                        });
+    m_buffer_scratchpad = allocator(m_internal_buffer_size + external_repacking_buffer_size);
 
 #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS)
     const auto target = std::dynamic_pointer_cast<const CPUTargetMachine>(snippet_attrs->snippet->get_generator()->get_target_machine());
@@ -919,7 +941,7 @@ void Subgraph::SubgraphExecutor::segfault_detector() {
 #endif
 
 void Subgraph::SubgraphExecutor::parallel_for6d(const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
-                                                const std::function<void(jit_snippets_call_args&, const size_t*)>& caller) {
+                                                const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&)>& caller) {
     const auto& dom = m_parallel_exec_domain;
 
 #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS)
@@ -933,7 +955,7 @@ void Subgraph::SubgraphExecutor::parallel_for6d(const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
-        size_t indexes[5] = {0, 0, 0, 0, 0};
+        std::vector<size_t> indexes{0, 0, 0, 0, 0};
         parallel_it_init(start, indexes[0], dom[0], indexes[1], dom[1], indexes[2], dom[2], indexes[3], dom[3], indexes[4], dom[4]);
         for (size_t iwork = start; iwork < end; ++iwork) {
             caller(call_args, indexes);
@@ -943,7 +965,7 @@ void Subgraph::SubgraphExecutor::parallel_forNd(const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
-                                                const std::function<void(jit_snippets_call_args&, const size_t*)>& caller) {
+                                                const std::function<void(jit_snippets_call_args&, const std::vector<size_t>&)>& caller) {
     const auto& dom = m_parallel_exec_domain;
 
 #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS)
@@ -965,11 +987,36 @@ void Subgraph::SubgraphExecutor::parallel_forNd(const std::function<void(jit_snippets_call_args&, size_t)>& initializer,
+void Subgraph::SubgraphExecutor::execute(const dnnl::stream& strm, const std::vector<MemoryPtr>& inMemPtrs, const std::vector<MemoryPtr>& outMemPtrs) {
+    if (!m_in_requested_descs.empty()) {
+        auto reorderedInMemPtrs = reorder_inputs(strm, inMemPtrs);
+        exec_impl(reorderedInMemPtrs, outMemPtrs);
+    } else {
+        exec_impl(inMemPtrs, outMemPtrs);
+    }
+}
+
+std::vector<MemoryPtr> Subgraph::SubgraphExecutor::reorder_inputs(const dnnl::stream& strm, const std::vector<MemoryPtr>& inMemPtrs) {
+    auto reordered_in_ptrs = inMemPtrs;
+    size_t offset = m_internal_buffer_size;
+    for (const auto& requested_descs_elem : m_in_requested_descs) {
+        const auto in_idx = requested_descs_elem.first;
+        const auto& requested_desc = requested_descs_elem.second;
+
+        const void* data_ptr = m_buffer_scratchpad->getDataAs<uint8_t>() + offset;
+        const auto scratch_mem = std::make_shared<Memory>(strm.get_engine(), requested_desc, data_ptr, false);
+        scratch_mem->load(*reordered_in_ptrs[in_idx]);
+        reordered_in_ptrs[in_idx] = scratch_mem;
+        offset += requested_desc->getCurrentMemSize();
+    }
+    return reordered_in_ptrs;
+}
+
 } // namespace node
 } // namespace intel_cpu
 } // namespace ov
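For orientation, a minimal std-only sketch of the scratchpad layout introduced above (all names here are hypothetical, not part of the patch): the per-thread internal buffers come first, then one contiguous slice per externally repacked input, which is exactly the walk reorder_inputs() performs starting at m_internal_buffer_size.

// A self-contained sketch of the new scratchpad partitioning (std-only; names hypothetical).
#include <cstddef>
#include <vector>

struct RequestedDesc {
    size_t byte_size;  // stand-in for CpuBlockedMemoryDesc::getCurrentMemSize()
};

// Mirrors SubgraphExecutor: the ctor sizes one allocation; reorder_inputs() then walks the
// external region starting at internal_size, handing each repacked input its own slice.
std::vector<size_t> plan_scratchpad(size_t nthreads, size_t per_thread_size,
                                    const std::vector<RequestedDesc>& descs,
                                    size_t& total_size) {
    const size_t internal_size = nthreads * per_thread_size;  // m_internal_buffer_size
    std::vector<size_t> repacking_offsets;
    size_t offset = internal_size;
    for (const auto& d : descs) {
        repacking_offsets.push_back(offset);  // where this input's repacked copy lives
        offset += d.byte_size;
    }
    total_size = offset;  // internal buffers + external repacking buffers
    return repacking_offsets;
}

diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h
index ffd7944c59d48a..8040da0a98ef57 100644
--- a/src/plugins/intel_cpu/src/nodes/subgraph.h
+++ b/src/plugins/intel_cpu/src/nodes/subgraph.h
@@ -129,13 +129,15 @@ class 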
Subgraph::SubgraphExecutor { const BufferScratchpadAllocator& allocator); virtual ~SubgraphExecutor() = default; - virtual void exec(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; + void execute(const dnnl::stream& strm, const std::vector& inMemPtrs, const std::vector& outMemPtrs); protected: + virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; + void parallel_for6d(const std::function& initializer, - const std::function& caller); + const std::function&)>& caller); void parallel_forNd(const std::function& initializer, - const std::function& caller); + const std::function&)>& caller); inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const { if (m_buffer_scratchpad_size > 0) @@ -151,6 +153,7 @@ class Subgraph::SubgraphExecutor { // Buffer scratchpad MemoryPtr m_buffer_scratchpad = nullptr; size_t m_buffer_scratchpad_size = 0; + size_t m_internal_buffer_size = 0; const size_t rank6D = 6; @@ -164,6 +167,11 @@ class Subgraph::SubgraphExecutor { bool enabled_segfault_detector = false; inline void segfault_detector(); #endif + +private: + std::vector reorder_inputs(const dnnl::stream& strm, const std::vector& inMemPtrs); + + std::unordered_map m_in_requested_descs = {}; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp index b40bd88f31726b..1c3e90bbccf34f 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp @@ -68,13 +68,9 @@ void BrgemmCPU::custom_constructor_validate_and_infer_types(std::vector INTERNAL_OP_SCOPE(BrgemmCPU_constructor_validate_and_infer_types); validate_inputs(); - // During ctor call, BrgemmCPU doesn't know his port descriptors. - // So we use port descs from source inputs - const auto brgemm_copy = with_repacking(m_type) ? get_brgemm_copy() : nullptr; - const auto planar_input_shapes = - std::vector{ snippets::utils::get_planar_pshape(get_input_partial_shape(0), layout_a), - brgemm_copy ? 
snippets::utils::get_planar_pshape(brgemm_copy->input(0)) - : snippets::utils::get_planar_pshape(get_input_partial_shape(1), layout_b) }; + const std::vector planar_input_shapes{ + snippets::utils::get_planar_pshape(get_input_partial_shape(0), layout_a), + snippets::utils::get_planar_pshape(get_input_partial_shape(1), layout_b)}; auto output_shape = infer_output_partial_shape(planar_input_shapes); set_output_type(0, get_output_type(), snippets::utils::get_planar_pshape(output_shape, layout_c)); @@ -130,20 +126,6 @@ std::shared_ptr BrgemmCPU::clone_with_new_inputs(const OutputVector& new_a } } -std::shared_ptr BrgemmCPU::get_brgemm_copy() const { - OPENVINO_ASSERT(one_of(m_type, BRGEMM_TYPE::REPACKING_ONLY, BRGEMM_TYPE::WITH_COMPENSATIONS, BRGEMM_TYPE::WITH_AMX), "Brgemm doesn't need BrgemmCopyB"); - auto b_input_node = get_input_node_shared_ptr(1); - if (const auto brgemm_copy_b = ov::as_type_ptr(b_input_node)) { - return brgemm_copy_b; - } - if (ov::is_type(b_input_node)) { - if (const auto brgemm_copy_b = ov::as_type_ptr(b_input_node->get_input_node_shared_ptr(0))) { - return brgemm_copy_b; - } - } - OPENVINO_THROW("BrgemmCopyB hasn't been found!"); -} - size_t BrgemmCPU::get_offset_scratch() const { OPENVINO_ASSERT(with_scratchpad(m_type) && get_input_size() == 3, "Offset of scratchpad must be only in Brgemm with scratchpad on 3rd input"); return get_input_offset(2); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp index a646ffc792fd6d..a781bc7ddd4e15 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp @@ -44,7 +44,6 @@ class BrgemmCPU : public snippets::op::Brgemm { BRGEMM_TYPE get_type() const { return m_type; } size_t get_offset_scratch() const; - std::shared_ptr get_brgemm_copy() const; bool visit_attributes(AttributeVisitor& visitor) override; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp index adc215ef1d9900..6a4fc83d409355 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp @@ -6,7 +6,10 @@ #include "dnnl_extension_utils.h" #include "emitters/utils.hpp" +#include "snippets/lowered/expressions/buffer_expression.hpp" +#include "snippets/op/buffer.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "utils/general_utils.h" using namespace Xbyak; @@ -83,6 +86,21 @@ size_t compute_inner_n_block(const ov::element::Type& precision) { default: OPENVINO_THROW("BrgemmCopyB doesn't support precision ", precision); } } + +ov::snippets::lowered::ExpressionPtr get_copy_b_expr(const ov::snippets::lowered::ExpressionPtr& brgemm_expr) { + OPENVINO_ASSERT(ov::is_type(brgemm_expr->get_node()), "get_copy_b_expr must be called only for BrgemmCPU node"); + const auto b_input_expr = brgemm_expr->get_input_port_connector(1)->get_source().get_expr(); + if (ov::is_type(b_input_expr->get_node())) { + return b_input_expr; + } else if (ov::is_type(b_input_expr)) { + OPENVINO_ASSERT(b_input_expr->get_input_count() >= 1, "BufferExpression on brgemm's B input must have at least one input"); + const auto input_buffer_expr = 
b_input_expr->get_input_port_connector(0)->get_source().get_expr();
+        if (ov::is_type<BrgemmCopyB>(input_buffer_expr->get_node())) {
+            return input_buffer_expr;
+        }
+    }
+    return nullptr;
+}
 } // namespace repacking
 } // namespace brgemm_utils
 } // namespace intel_cpu
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp
index aeb5b22cd56129..0d8e3f5fb6fc9b 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp
@@ -18,7 +18,7 @@ enum class BRGEMM_TYPE {
     STAND_ALONE,         // No extra requirements, used for f32|f32
     WITH_AMX,            // i8|i8 or bf16|bf16 on AMX system - needs BrgemmCopyB and scratchpad
     WITH_COMPENSATIONS,  // i8|i8 (non-AMX system) - needs BrgemmCopyB for data repacking and compensations
-    REPACKING_ONLY       // u8|i8 or bf16|bf16 (non-AMX system) - needs BrgemmCopyB on second input for data repacking
+    REPACKING_ONLY,      // u8|i8, or bf16|bf16 (non-AMX system), or brgemm with transpose_b=true - needs BrgemmCopyB on second input for data repacking
 };
 
 dnnl::impl::cpu::x64::cpu_isa_t get_primitive_isa(const ov::element::Type& dt_in0, bool is_with_amx);
@@ -56,6 +56,12 @@ T compute_LDB(T n_block, const ov::element::Type& precision) {
                n_block : std::max(n_block, static_cast<T>(compute_inner_n_block(precision)));
 }
+/**
+ * @brief Retrieves the BrgemmCopyB expression that produces the B input of the given BrgemmCPU expression.
+ * @param brgemm_expr The expression pointer for the BrgemmCPU operation.
+ * @return The expression pointer for the BrgemmCopyB operation, or nullptr if the repacking has been extracted outside the Subgraph.
+ */
+snippets::lowered::ExpressionPtr get_copy_b_expr(const snippets::lowered::ExpressionPtr& brgemm_expr);
 } // namespace repacking
 } // namespace brgemm_utils
 } // namespace intel_cpu
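The producer walk in get_copy_b_expr() above is compact; a std-only model of the same decision tree (types and names hypothetical) may help:

// B's producer is either the BrgemmCopyB itself or a Buffer expression whose own
// producer is the BrgemmCopyB; anything else means repacking was moved out of the kernel.
#include <memory>

enum class Kind { CopyB, Buffer, Other };

struct Expr {
    Kind kind;
    std::shared_ptr<Expr> producer;  // stand-in for get_input_port_connector(0)->get_source()
};

std::shared_ptr<Expr> find_copy_b(const std::shared_ptr<Expr>& b_input) {
    if (b_input->kind == Kind::CopyB)
        return b_input;  // BrgemmCopyB feeds the Brgemm directly
    if (b_input->kind == Kind::Buffer && b_input->producer && b_input->producer->kind == Kind::CopyB)
        return b_input->producer;  // BrgemmCopyB -> Buffer -> Brgemm chain
    return nullptr;  // repacking was extracted outside the kernel
}

diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp
new file mode 100644
index 00000000000000..4ad2bb8a11a667
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "eliminate_brgemm_copy_b.hpp"
+
+#include "cpu/x64/cpu_isa_traits.hpp"
+#include "openvino/pass/pattern/matcher.hpp"
+#include "openvino/pass/pattern/op/wrap_type.hpp"
+#include "openvino/pass/pattern/op/optional.hpp"
+#include "snippets/itt.hpp"
+#include "snippets/op/rank_normalization.hpp"
+#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() {
+    MATCHER_SCOPE(EliminateBrgemmCopyB);
+    auto m_param = ov::pass::pattern::wrap_type<ov::op::v0::Parameter>();
+    auto m_rank_norm = ov::pass::pattern::optional<ov::snippets::op::RankNormalization>(m_param);
+    auto m_copy_b = ov::pass::pattern::wrap_type<BrgemmCopyB>({m_param});
+
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::EliminateBrgemmCopyB")
+        const auto& pattern_map = m.get_pattern_value_map();
+        const auto& copy_b_out = pattern_map.at(m_copy_b);
+        const auto copy_b_node = ov::as_type_ptr<BrgemmCopyB>(copy_b_out.get_node_shared_ptr());
+        OPENVINO_ASSERT(copy_b_node, "BrgemmCopyB node is null in EliminateBrgemmCopyB transformation");
+
+        const auto& in_desc = 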
snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(copy_b_node->input(0));
+        const auto& layout = in_desc->get_layout();
+        // TODO:
+        // 1. Ticket 157340: support external repacking for copyB with compensations
+        // 2. Ticket 157339: support external repacking for non-planar layout
+        if (!ov::snippets::utils::is_planar_layout(layout) ||
+            brgemm_utils::with_compensations(copy_b_node->get_type()) || transformation_callback(copy_b_node))
+            return false;
+        return ov::replace_output_update_name(copy_b_out, copy_b_node->input_value(0));
+    };
+
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(m_copy_b, matcher_name);
+    register_matcher(m, callback);
+}
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp
new file mode 100644
index 00000000000000..2cdeae53fab026
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "openvino/pass/graph_rewrite.hpp"
+
+namespace ov {
+namespace intel_cpu {
+namespace pass {
+
+/**
+ * @interface EliminateBrgemmCopyB
+ * @brief EliminateBrgemmCopyB identifies BrgemmCopyB nodes whose repacking can be performed outside the Subgraph.
+ *        If this is possible, the CopyB node is removed, and the external repacking is configured at later pipeline stages in the RuntimeConfigurator.
+ *
+ * @ingroup snippets
+ */
+class EliminateBrgemmCopyB: public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("EliminateBrgemmCopyB", "0");
+    EliminateBrgemmCopyB();
+};
+
+
+} // namespace pass
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp
index c421e5cc2a4805..7dfe711a5a5c67 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp
@@ -65,38 +65,35 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li
     bool modified = false;
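The reworked loop below compares each Brgemm's loop IDs with the loop IDs of its repacking expression and adjusts only the surplus blocking loops. A std-only sketch of that selection logic (names hypothetical):

// Pick the blocking loops that wrap the Brgemm but not the repacking expression.
#include <cassert>
#include <cstddef>
#include <vector>

std::vector<size_t> loops_to_adjust(const std::vector<size_t>& brgemm_loop_ids,
                                    const std::vector<size_t>& repacking_loop_ids) {
    // No blocking loops at all: nothing to adjust.
    if (brgemm_loop_ids.empty() && repacking_loop_ids.empty())
        return {};
    assert(brgemm_loop_ids.size() > repacking_loop_ids.size() && "Invalid BrgemmCopyB loop configuration");
    // Shared outer loops stay untouched; only the extra Brgemm blocking loops
    // need their B-input pointer increments recomputed for the repacked layout.
    return std::vector<size_t>(brgemm_loop_ids.begin() + repacking_loop_ids.size(),
                               brgemm_loop_ids.end());
}

+    auto get_repacking_loop_idces = [](const snippets::lowered::ExpressionPtr& brgemm_expr) {
+        // Repacking may be extracted outside the snippets kernel. In this case, the parent expression of the Brgemm's B input is a Parameter.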
+ if (is_type(brgemm_expr->get_input_port_connector(1)->get_source().get_expr()->get_node())) + return std::vector{}; + const auto repacking_expr = brgemm_utils::repacking::get_copy_b_expr(brgemm_expr); + OPENVINO_ASSERT(repacking_expr, "BrgemmCopyB expression is not found"); + return repacking_expr->get_loop_ids(); + }; + for (const auto& expr : linear_ir) { - const auto& node = expr->get_node(); - if (!is_type(node)) + const auto brgemm = ov::as_type_ptr(expr->get_node()); + if (!brgemm || !brgemm_utils::with_repacking(brgemm->get_type())) continue; - const auto& repacking_loop_ids = expr->get_loop_ids(); - const auto& child_ports = expr->get_output_port(0).get_connected_ports(); - OPENVINO_ASSERT(child_ports.size() == 1 && - is_type(child_ports.begin()->get_expr()), - "BrgemmCopyB should have one BufferExpression child"); - auto grandchild_ports = child_ports.begin()->get_expr()->get_output_port(0).get_connected_ports(); - for (const auto& target_port : grandchild_ports) { - const auto& port_node = target_port.get_expr()->get_node(); - if (!is_type(port_node)) { - OPENVINO_ASSERT(is_type(port_node), - "Invalid grandchild of BrgemmCopyB"); - continue; - } - const auto &brgemm_loop_ids = target_port.get_expr()->get_loop_ids(); - // Continue if there is no blocking loop - if (brgemm_loop_ids.empty() && repacking_loop_ids.empty()) - continue; - OPENVINO_ASSERT(brgemm_loop_ids.size() > repacking_loop_ids.size(), "Invalid BrgemmCopyB loop configuration"); - const auto &loop_manager = linear_ir.get_loop_manager(); - for (auto i = repacking_loop_ids.size(); i < brgemm_loop_ids.size(); i++) { - const auto &loop = loop_manager->get_loop_info(brgemm_loop_ids[i]); - auto uni_loop = ov::as_type_ptr(loop); - if (!uni_loop) - uni_loop = ov::as_type_ptr(loop)->get_unified_loop_info(); - if (!m_affected_loops.count(uni_loop) && update_loop_info(uni_loop)) { - m_affected_loops.insert(uni_loop); - modified = true; - } + const auto& brgemm_loop_ids = expr->get_loop_ids(); + const auto& repacking_loop_ids = get_repacking_loop_idces(expr); + // Continue if there is no blocking loop + if (brgemm_loop_ids.empty() && repacking_loop_ids.empty()) + continue; + + OPENVINO_ASSERT(brgemm_loop_ids.size() > repacking_loop_ids.size(), "Invalid BrgemmCopyB loop configuration"); + const auto &loop_manager = linear_ir.get_loop_manager(); + for (auto i = repacking_loop_ids.size(); i < brgemm_loop_ids.size(); i++) { + const auto &loop = loop_manager->get_loop_info(brgemm_loop_ids[i]); + auto uni_loop = ov::as_type_ptr(loop); + if (!uni_loop) + uni_loop = ov::as_type_ptr(loop)->get_unified_loop_info(); + if (!m_affected_loops.count(uni_loop) && update_loop_info(uni_loop)) { + m_affected_loops.insert(uni_loop); + modified = true; } } } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp index 5c65c7a0282823..794c55d868158a 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp @@ -18,14 +18,11 @@ namespace pass { * Finds loop ports connected to BrgemmCopyB and sets appropriate pointer increments. 
 * @ingroup snippets
 */
-class AdjustBrgemmCopyBLoopPorts: public snippets::lowered::pass::Pass {
+class AdjustBrgemmCopyBLoopPorts: public snippets::lowered::pass::ConstPass {
 public:
     AdjustBrgemmCopyBLoopPorts() = default;
-    OPENVINO_RTTI("AdjustBrgemmCopyBLoopPorts", "Pass");
-    bool run(const snippets::lowered::LinearIR& linear_ir);
-    bool run(snippets::lowered::LinearIR& linear_ir) override {
-        return run(const_cast<const snippets::lowered::LinearIR&>(linear_ir));
-    }
+    OPENVINO_RTTI("AdjustBrgemmCopyBLoopPorts", "ConstPass");
+    bool run(const snippets::lowered::LinearIR& linear_ir) override;
     static bool update_loop_info(const snippets::lowered::UnifiedLoopInfoPtr& uni_loop_info);
     const std::unordered_set<snippets::lowered::UnifiedLoopInfoPtr>& get_affected_loops() { return m_affected_loops; }
 private:
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp
new file mode 100644
index 00000000000000..d88e0660e9e6fb
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp
@@ -0,0 +1,49 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "brgemm_copy_b_loop_ports_adjuster.hpp"
+
+#include "snippets/itt.hpp"
+#include "snippets/lowered/loop_manager.hpp"
+#include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir,
+                                                           const CPURuntimeConfigurator* configurator)
+    : ov::snippets::lowered::pass::RuntimeOptimizer(configurator) {
+    if (!linear_ir->is_dynamic())
+        return;
+
+    const auto& pass = std::make_shared<intel_cpu::pass::AdjustBrgemmCopyBLoopPorts>();
+    pass->run(*linear_ir);
+    const auto& affected_uni_loops = pass->get_affected_loops();
+    const auto& loop_map = linear_ir->get_loop_manager()->get_map();
+    for (const auto& p : loop_map) {
+        if (const auto& exp_loop = ov::as_type_ptr<ov::snippets::lowered::ExpandedLoopInfo>(p.second)) {
+            const auto& uni_loop = exp_loop->get_unified_loop_info();
+            if (affected_uni_loops.count(uni_loop))
+                m_affected_uni2exp_map[uni_loop].push_back(exp_loop);
+        }
+    }
+}
+
+bool BrgemmCopyBLoopPortsAdjuster::run(const snippets::lowered::LinearIR& linear_ir) {
+    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmCopyBLoopPortsAdjuster")
+    for (const auto& p : m_affected_uni2exp_map) {
+        const auto& uni_loop = p.first;
+        const auto& exp_loops = p.second;
+        snippets::RuntimeConfigurator::LoopInfoRuntimeParamsMap initialized_info;
+        if (intel_cpu::pass::AdjustBrgemmCopyBLoopPorts::update_loop_info(uni_loop)) {
+            initialized_info[uni_loop] = snippets::RuntimeConfigurator::get_loop_runtime_params(uni_loop);
+            for (const auto& exp_loop : exp_loops)
+                snippets::RuntimeConfigurator::update_expanded_loop_info(exp_loop, initialized_info);
+        }
+    }
+    return true;
+}
+
+} // namespace intel_cpu
+} // namespace ov
\ No newline at end of file
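The constructor above builds a unified-to-expanded loop map once, so run() never has to rescan the loop manager. A std-only sketch of that grouping step (types hypothetical):

// Group expanded loops under their unified loop once; run() can then update all
// expansions of an affected unified loop with a single map lookup.
#include <memory>
#include <set>
#include <unordered_map>
#include <vector>

struct UnifiedLoop {};
struct ExpandedLoop {
    std::shared_ptr<UnifiedLoop> unified;  // stand-in for get_unified_loop_info()
};

using Uni2ExpMap = std::unordered_map<std::shared_ptr<UnifiedLoop>,
                                      std::vector<std::shared_ptr<ExpandedLoop>>>;

Uni2ExpMap group_affected(const std::vector<std::shared_ptr<ExpandedLoop>>& all_loops,
                          const std::set<std::shared_ptr<UnifiedLoop>>& affected) {
    Uni2ExpMap map;
    for (const auto& exp : all_loops)
        if (affected.count(exp->unified))  // keep only loops the analysis pass marked
            map[exp->unified].push_back(exp);
    return map;
}

diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp
new file mode 100644
index 00000000000000..7b9f30ac96e4b1
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp
@@ -0,0 +1,33 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0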
+//
+
+#pragma once
+
+#include "emitters/snippets/cpu_runtime_configurator.hpp"
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/loop_info.hpp"
+#include "snippets/lowered/pass/runtime_optimizer.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+/**
+ * @class BrgemmCopyBLoopPortsAdjuster
+ * @brief A runtime optimizer that adjusts blocked loop parameters for Brgemm operations that require repacking.
+ */
+class BrgemmCopyBLoopPortsAdjuster : public ov::snippets::lowered::pass::RuntimeOptimizer {
+public:
+    BrgemmCopyBLoopPortsAdjuster() = default;
+    BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, const CPURuntimeConfigurator* configurator);
+
+    bool run(const snippets::lowered::LinearIR& linear_ir) override;
+    bool applicable() const override { return !m_affected_uni2exp_map.empty(); }
+
+private:
+    std::unordered_map<ov::snippets::lowered::UnifiedLoopInfoPtr,
+                       std::vector<ov::snippets::lowered::ExpandedLoopInfoPtr>> m_affected_uni2exp_map;
+};
+
+} // namespace intel_cpu
+} // namespace ov
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp
index 9b3009284e09e8..66d6f4d223c90f 100644
--- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp
@@ -83,11 +83,12 @@ bool BrgemmCPUBlocking::mark_blocking_loops(LinearIR& linear_ir,
     if (stand_alone(type))
         return res;
 
-    const auto copy_b_expr = linear_ir.get_expr_by_node(brgemm->get_brgemm_copy());
-    const ov::snippets::VectorDims full_subtensor(2, get_full_dim_value());
-    copy_b_expr->get_input_port_descriptor(0)->set_subtensor(full_subtensor);
-    copy_b_expr->get_output_port_descriptor(0)->set_subtensor(full_subtensor);
-
+    const auto copy_b_expr = repacking::get_copy_b_expr(brgemm_expr);
+    if (copy_b_expr) {
+        const ov::snippets::VectorDims full_subtensor(2, get_full_dim_value());
+        copy_b_expr->get_input_port_descriptor(0)->set_subtensor(full_subtensor);
+        copy_b_expr->get_output_port_descriptor(0)->set_subtensor(full_subtensor);
+    }
     if (with_amx(type)) {
         move_new_memory_buffer(linear_ir, brgemm_it);
         auto buffer_it = std::prev(brgemm_it);
@@ -98,6 +99,7 @@ bool BrgemmCPUBlocking::mark_blocking_loops(LinearIR& linear_ir,
     if (with_compensations(type)) {
         const ov::snippets::VectorDims compensations_subtensor{1, get_full_dim_value()};
         OPENVINO_ASSERT(brgemm_expr->get_input_count() == 3, "Brgemm must have 3 inputs in case of compensations.");
+        OPENVINO_ASSERT(copy_b_expr, "BrgemmCopyB must be present in case of compensations.");
         const auto& compens_port = brgemm_expr->get_input_port(2);
         compens_port.get_descriptor_ptr()->set_subtensor(compensations_subtensor);
         copy_b_expr->get_output_port_descriptor(1)->set_subtensor(compensations_subtensor);
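The adjuster defined in the next file scans the Subgraph's Parameters and marks those consumed on the B input (port 1) of a repacking BrgemmCPU. A std-only sketch of that selection (names hypothetical):

// A Parameter needs external repacking when some consumer is a repacking Brgemm
// reading it on input port 1 (the B matrix).
#include <cstddef>
#include <set>
#include <vector>

struct Consumer {
    bool is_repacking_brgemm;  // stand-in for with_repacking(brgemm->get_type())
    size_t port_index;
};

std::set<size_t> params_with_external_repacking(const std::vector<std::vector<Consumer>>& consumers_per_param) {
    std::set<size_t> result;
    for (size_t i = 0; i < consumers_per_param.size(); ++i)
        for (const auto& c : consumers_per_param[i])
            if (c.is_repacking_brgemm && c.port_index == 1)
                result.insert(i);
    return result;
}

diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp
new file mode 100644
index 00000000000000..e98c8ebbecf49b
--- /dev/null
+++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp
@@ -0,0 +1,72 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "external_repacking_adjuster.hpp"
+
+#include "emitters/snippets/cpu_runtime_configurator.hpp"
+#include "memory_desc/cpu_blocked_memory_desc.h"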
+#include "snippets/itt.hpp" +#include "snippets/utils/utils.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" + +namespace ov { +namespace intel_cpu { + +BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, + const CPURuntimeConfigurator* configurator) + : snippets::lowered::pass::RuntimeOptimizer(configurator) { + const auto& params = linear_ir->get_parameters(); + for (size_t i = 0; i < params.size(); ++i) { + const auto& param = params[i]; + const auto consumers = param->get_output_port_connector(0)->get_consumers(); + const bool brgemm_with_extracted_repacking = + std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { + auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); + return brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && port.get_index() == 1; + }); + if (brgemm_with_extracted_repacking) { + m_param_idces_with_external_repacking.insert(i); + // Ticket 157339: Support non-planar layout + OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(configurator->get_io_descs()[i]->get_layout()), + "Non-planar layout is not supported for external repacking"); + } + } +} + +bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster") + const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); + auto& optimal_descs = cpu_config->m_in_requested_descs; + for (const auto& i : m_param_idces_with_external_repacking) { + const auto& shape = cpu_config->io_shapes[i]; + const auto& K = *++shape.rbegin(); + const auto& N = *shape.rbegin(); + + const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); + const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); + const size_t brgemm_kernel_rank = 2; + // Firstly, batch dims are set + VectorDims requested_blocked_shape(shape.begin(), shape.end() - brgemm_kernel_rank); + // Then, the blocked dims are formed + requested_blocked_shape.insert( + requested_blocked_shape.end(), + {snippets::utils::div_up(K, vnni_factor), std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), vnni_factor}); + + VectorDims requested_order(shape.size() - brgemm_kernel_rank); + std::iota(requested_order.begin(), requested_order.end(), 0); + const auto last_idx = shape.size() - 1; + requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); + + optimal_descs[i] = std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); + + ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); + shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); + m_configurator->compute_offsets(shape_for_offset, i, 0); + } + return true; +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp new file mode 100644 index 00000000000000..f102af8f23fe5b --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// 
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "emitters/snippets/cpu_runtime_configurator.hpp"
+#include "snippets/lowered/pass/runtime_optimizer.hpp"
+#include "snippets/runtime_configurator.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+/**
+ * @class BrgemmExternalRepackingAdjuster
+ * @brief A runtime optimizer that creates the memory descs for BRGEMM inputs that require external repacking.
+ *        The generated memory descs are stored in the CPU runtime config.
+ */
+class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::RuntimeOptimizer {
+public:
+    BrgemmExternalRepackingAdjuster() = default;
+    BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, const CPURuntimeConfigurator* configurator);
+
+    bool run(const snippets::lowered::LinearIR& linear_ir) override;
+    bool applicable() const override { return !m_param_idces_with_external_repacking.empty(); }
+
+private:
+    std::set<size_t> m_param_idces_with_external_repacking;
+};
+
+} // namespace intel_cpu
+} // namespace ov
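To illustrate the descriptor math in BrgemmExternalRepackingAdjuster::run(), here is a std-only sketch of the blocked-shape computation. The example values (VNNI factor 2 and inner N block 16, as for bf16) are assumptions for illustration, not taken from the patch:

// Compute the requested blocked shape for a repacked B input: batch dims are kept,
// K is split by the VNNI factor, and N is padded up to the kernel's inner block.
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<size_t> blocked_shape_for_b(const std::vector<size_t>& planar_shape,
                                        size_t vnni_factor, size_t inner_n_block) {
    const size_t rank = planar_shape.size();
    const size_t K = planar_shape[rank - 2];
    const size_t N = planar_shape[rank - 1];
    // Batch dims stay as-is ...
    std::vector<size_t> blocked(planar_shape.begin(), planar_shape.end() - 2);
    // ... then the blocked dims are appended.
    blocked.push_back((K + vnni_factor - 1) / vnni_factor);  // div_up(K, vnni_factor)
    blocked.push_back(std::max(N, inner_n_block));
    blocked.push_back(vnni_factor);
    return blocked;  // e.g. [2, 64, 32] with vnni_factor 2, inner_n_block 16 -> [2, 32, 32, 2]
}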