From 1f7baf464e6cebfc5575b4a40f6cf6b29fac596a Mon Sep 17 00:00:00 2001 From: chenhuwa Date: Tue, 19 Sep 2023 13:14:19 +0800 Subject: [PATCH] explicit emitters created, as dynamic binding of virtual function is in compilation stage --- .../src/emitters/x64/jit_emitter.cpp | 73 --------------- .../src/emitters/x64/jit_emitter.hpp | 52 +++++++++-- .../emitters/x64/jit_load_store_emitters.cpp | 19 ++++ .../emitters/x64/jit_load_store_emitters.hpp | 8 ++ .../emitters/x64/jit_snippets_emitters.cpp | 74 ++------------- .../emitters/x64/jit_snippets_emitters.hpp | 4 +- .../intel_cpu/src/nodes/interpolate.cpp | 1 - src/plugins/intel_cpu/src/nodes/subgraph.cpp | 89 ++++++++----------- 8 files changed, 119 insertions(+), 201 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp index 76368db8c2b5a2..f727f8d9d1d7a5 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp @@ -213,78 +213,5 @@ void jit_emitter::emit_code(const std::vector &in_idxs, const std::vecto emitter_postamble(); } -void jit_emitter::internal_call_preamble() const { - // gprs - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15, - h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp}; - size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); - - h->sub(h->rsp, n_gprs_to_save * gpr_size); - for (size_t i = 0; i < n_gprs_to_save; ++i) - h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]); - - // mask regs - // need preserve based on cpu capability, instead of host isa. - // in case there are possibilty that different isa emitters exist in one subgraph KernelEmitter from perf standpoint in the future. - // e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may use avx512 and spoil k-reg. 
- // do not care about platform w/ avx512_common but w/o avx512_core(knight landing), which is obsoleted. - if (cpu::x64::mayiuse(cpu::x64::avx512_core)) { - h->sub(h->rsp, k_mask_num * k_mask_size); - for (size_t i = 0; i < k_mask_num; ++i) { - h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast(i))); - } - } - - // vector regs - // 1. Caller obligation to save vector registers as callee may use them. - // 2. There is an implicit assumption that the host code uses the same - // `isa` as the injector. Once the assumption is wrong, `vecs_count` and - // `vlen` should be replaced with `host_isa::vlen` and - // `host_isa::vecs_count`. - h->sub(h->rsp, get_max_vecs_count() * get_vec_length()); - for (size_t i = 0; i < get_max_vecs_count(); ++i) { - push_vec(h->ptr[h->rsp + i * get_vec_length()], i); - } -} - -void jit_emitter::internal_call_postamble() const { - // restore vector registers - for (int i = static_cast(get_max_vecs_count()) - 1; i >= 0; --i) { - pop_vec(static_cast(i), h->ptr[h->rsp + i * get_vec_length()]); - } - h->add(h->rsp, (get_max_vecs_count()) * get_vec_length()); - - // restore k reg - if (cpu::x64::mayiuse(cpu::x64::avx512_core)) { - for (int i = k_mask_num - 1; i >= 0; --i) { - h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]); - } - h->add(h->rsp, k_mask_num * k_mask_size); - } - - // restore gpr registers - Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15, - h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp}; - size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]); - for (int i = n_gprs_to_save - 1; i >= 0; --i) - h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]); - h->add(h->rsp, n_gprs_to_save * gpr_size); -} - -// additional 16 byte for offset, callee can use arbitrary regs. 
-void jit_emitter::internal_call_rsp_align() const { - h->mov(h->rbx, h->rsp); - h->and_(h->rbx, 0xf); - h->sub(h->rsp, h->rbx); - h->sub(h->rsp, 0x10); - h->mov(h->ptr[h->rsp], h->rbx); -} - -void jit_emitter::internal_call_rsp_restore() const { - h->mov(h->rbx, h->ptr[h->rsp]); - h->add(h->rsp, 0x10); - h->add(h->rsp, h->rbx); -} - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp index 5f3af9311b7907..1f286aa4b4812e 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp @@ -13,9 +13,38 @@ #include +#define SNIPPETS_DEBUG_INFO + namespace ov { namespace intel_cpu { +#ifdef SNIPPETS_DEBUG_INFO + +class LoopBeginEmitter; +class LoopEndEmitter; +class jit_load_emitter; +class jit_store_emitter; + +enum EmitterType { + LoopBegin, + LoopEnd, + Load, + Store, + EmitterNum, +}; + +struct DebugInfo { + EmitterType e_type = EmitterNum; + LoopBeginEmitter* loop_begin_emitter = nullptr; + LoopEndEmitter* loop_end_emitter = nullptr; + jit_load_emitter* load_emitter = nullptr; + jit_store_emitter* store_emitter = nullptr; +}; + +extern DebugInfo g_debug_err_handler; + +#endif + enum emitter_in_out_map { vec_to_vec, vec_to_gpr, @@ -43,7 +72,6 @@ class jit_emitter : public ov::snippets::Emitter { virtual size_t get_inputs_num() const = 0; virtual size_t aux_vecs_count() const; emitter_in_out_map get_in_out_type() const; - virtual void print_debug_info() const {std::cerr << "Debug info was not specified.\n"; } /** * @brief Returns supported precisions. 
@@ -106,6 +134,23 @@ class jit_emitter : public ov::snippets::Emitter { mutable std::vector aux_vec_idxs; mutable std::vector aux_gpr_idxs; +#ifdef SNIPPETS_DEBUG_INFO + virtual void build_debug_info(jit_emitter* p_emitter, EmitterType emitter_type) const { + h->push(h->r15); + h->push(h->r14); + + h->mov(h->r15, reinterpret_cast(&p_emitter)); + h->mov(h->r14, reinterpret_cast(this)); + h->mov(h->qword[h->r15], h->r14); + + h->mov(h->r15, reinterpret_cast(&g_debug_err_handler.e_type)); + h->mov(h->qword[h->r15], emitter_type); + + h->pop(h->r14); + h->pop(h->r15); + } +#endif + static constexpr int k_mask_size = 8; static constexpr int k_mask_num = 8; static constexpr int gpr_size = 8; @@ -133,11 +178,6 @@ class jit_emitter : public ov::snippets::Emitter { } } - virtual void internal_call_preamble() const; - virtual void internal_call_postamble() const; - virtual void internal_call_rsp_align() const; - virtual void internal_call_rsp_restore() const; - private: mutable std::vector preserved_vec_idxs; mutable std::vector preserved_gpr_idxs; diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_load_store_emitters.cpp index ce09fb071ea4ce..bae9bc90ab377c 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_load_store_emitters.cpp @@ -97,6 +97,7 @@ size_t jit_load_emitter::aux_gprs_count() const { } void jit_load_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { + build_debug_info(g_debug_err_handler.load_emitter, EmitterType::Load); const int offset = in_idxs.size() == 2 ? in_idxs[1] : 0; if (host_isa_ == cpu::x64::sse41) { emit_isa(Reg64(in_idxs[0]), static_cast(out_idxs[0]), offset); @@ -587,6 +588,14 @@ void jit_load_emitter::register_table_entries() { } } +void jit_load_emitter::print_debug_info() const { + std::cerr << "Segfault happens in jit_load_emitter." 
<< "\n"; + // std::cerr << "Emitter name:" << name_ << "\n"; + // std::cerr << "load_num_:" << load_num_ << "\n"; + // std::cerr << "src_prc_:" << src_prc_ << "\n"; + // std::cerr << "dst_prc_:" << dst_prc_ << "\n"; +} + /// STORE /// jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, Precision src_prc, Precision dst_prc, int store_num, arithmetic_mode mode, Precision exec_prc, @@ -647,6 +656,8 @@ void jit_store_emitter::emit_data() const { } void jit_store_emitter::emit_impl(const std::vector &in_idxs, const std::vector &out_idxs) const { + build_debug_info(g_debug_err_handler.store_emitter, EmitterType::Store); + // h->int_(11); const int offset = in_idxs.size() == 2 ? in_idxs[1] : 0; if (host_isa_ == cpu::x64::sse41) { emit_isa(static_cast(in_idxs[0]), Reg64(out_idxs[0]), offset); @@ -1251,5 +1262,13 @@ void jit_store_emitter::register_table_entries() { } } +void jit_store_emitter::print_debug_info() const { + std::cerr << "Segfault happens in jit_store_emitter." 
<< "\n"; + // std::cerr << "Emitter name:" << name_ << "\n"; + // std::cerr << "store_num_:" << store_num_ << "\n"; + // std::cerr << "src_prc_:" << src_prc_ << "\n"; + // std::cerr << "dst_prc_:" << dst_prc_ << "\n"; +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_load_store_emitters.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_load_store_emitters.hpp index 7230a81b724f49..0902f87265c10c 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_load_store_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_load_store_emitters.hpp @@ -71,6 +71,8 @@ class jit_load_emitter : public jit_emitter { size_t get_inputs_num() const override; + void print_debug_info() const; + private: template void emit_isa(const Xbyak::Reg64 ®_src, const int out_vec_idx, const int offset) const; @@ -99,6 +101,8 @@ class jit_load_emitter : public jit_emitter { InferenceEngine::Precision dst_prc_; bool is_fill_; std::string fill_value_; + + std::shared_ptr m_load_node; }; class jit_store_emitter : public jit_emitter { @@ -134,6 +138,8 @@ class jit_store_emitter : public jit_emitter { return uni_vcvtneps2bf16_; } + void print_debug_info() const; + private: template void emit_isa(const int in_vec_idx, const Xbyak::Reg64 ®_dst, const int offset) const; @@ -171,6 +177,8 @@ class jit_store_emitter : public jit_emitter { mutable bool data_reg_updated = false; mutable int data_idx = 0; mutable int aux_src_idx = 0; + + std::shared_ptr m_store_node; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp index 31d994f875e135..b990d581eedb02 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp @@ -21,12 +21,6 @@ using namespace dnnl::impl::cpu::x64; namespace ov { namespace intel_cpu { -jit_emitter* g_debug_err_handler = 
nullptr; - -static void assign_emitter(KernelEmitter* error_emitter) { - g_debug_err_handler = error_emitter; -} - using jit_generator = dnnl::impl::cpu::x64::jit_generator; using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t; using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; @@ -58,12 +52,6 @@ jit_container_emitter::jit_container_emitter(jit_generator* h, cpu_isa_t isa, co in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void jit_container_emitter::print_debug_info() const { - // std::cerr << "Emitter type name:" << get_type_name(this) << "\n"; - // std::cerr << "Mapped node friendly name:" << m_kernel_node->get_friendly_name() << "\n"; - std::cerr << "Debug info for jit_container_emitter was printed successfully\n"; -} - void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, snippets::lowered::LinearIR::container& expressions) const { if (expressions.empty()) @@ -345,62 +333,14 @@ void KernelEmitter::emit_impl(const std::vector& in, std::vector data_ptr_regs; transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs); -// init_data_pointers(reg_indexes, reg_const_params, data_ptr_regs); - // h->nop(); - // h->nop(); - // h->nop(); - - // h->push(reg_const_params); - // h->push(reg_indexes); - - // h->mov(reg_indexes, reinterpret_cast(&g_debug_err_handler)); - // h->mov(reg_const_params, reinterpret_cast(this)); - // h->mov(h->qword[reg_indexes], reg_const_params); - - // h->pop(reg_indexes); - // h->pop(reg_const_params); - // h->int_(11); - - // - internal_call_preamble(); - - h->push(h->rax); - h->push(abi_param1); - h->mov(h->rax, reinterpret_cast(&assign_emitter)); - h->mov(abi_param1, reinterpret_cast(this)); - - internal_call_rsp_align(); - h->call(h->rax); - internal_call_rsp_restore(); - - h->pop(abi_param1); - h->pop(h->rax); - - internal_call_postamble(); - h->int_(11); - // - - // h->nop(); - // h->nop(); - // h->nop(); - + init_data_pointers(reg_indexes, reg_const_params, data_ptr_regs); + for 
(const auto& expression : body) { + const auto& emitter = expression->get_emitter(); + std::vector in_regs, out_regs; + std::tie(in_regs, out_regs) = expression->get_reg_info(); + emitter->emit_code(in_regs, out_regs, vec_regs_pool, gp_regs_pool); + } h->postamble(); -//////////////////////////////////////////////////////////////////////////////////////////////// - // h->preamble(); - - // Reg64 reg_indexes = Reg64(static_cast(reg_indexes_idx)); - // Reg64 reg_const_params = Reg64(static_cast(reg_const_params_idx)); - // std::vector data_ptr_regs; - // transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs); - - // init_data_pointers(reg_indexes, reg_const_params, data_ptr_regs); - // for (const auto& expression : body) { - // const auto& emitter = expression->get_emitter(); - // std::vector in_regs, out_regs; - // std::tie(in_regs, out_regs) = expression->get_reg_info(); - // emitter->emit_code(in_regs, out_regs, vec_regs_pool, gp_regs_pool); - // } - // h->postamble(); } LoopBeginEmitter::LoopBeginEmitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) { diff --git a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp index 8c2934b22b40c6..f0d58add543092 100644 --- a/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.hpp @@ -23,8 +23,6 @@ namespace ov { namespace intel_cpu { -extern jit_emitter* g_debug_err_handler; - #define SNIPPETS_MAX_SNIPPETS_DIMS 12 #define SNIPPETS_MAX_HARNESS_DIMS 5 #define SNIPPETS_MAX_TILE_RANK 2 @@ -126,6 +124,7 @@ class LoopBeginEmitter : public jit_emitter { const std::vector &out) const; // todo: it is purely virtual in the base class, but do we need it? 
size_t get_inputs_num() const override {return 0;} + void print_debug_info() const; private: using jit_emitter::emit_code; @@ -148,6 +147,7 @@ class LoopEndEmitter : public jit_emitter { const std::vector &out) const; // todo: it is purely virtual in the base class, but do we need it? size_t get_inputs_num() const override {return 0;} + void print_debug_info() const; private: using jit_emitter::emit_code; diff --git a/src/plugins/intel_cpu/src/nodes/interpolate.cpp b/src/plugins/intel_cpu/src/nodes/interpolate.cpp index 3b7c9a81645d3c..60a1259f43aba4 100644 --- a/src/plugins/intel_cpu/src/nodes/interpolate.cpp +++ b/src/plugins/intel_cpu/src/nodes/interpolate.cpp @@ -1521,7 +1521,6 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi depthwise_inj_idx++; post_ops_data_offset += depthwise_injectors[depthwise_inj_idx]->memoryStep(); - // depthwise_inj_idx++; } else if (post_op.is_quantization()) { bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize; bool do_rounding = do_dequantization || dst_prc == Precision::FP32 || i != p.len() - 1; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 81416f7e8dc999..21e63f357ec745 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -45,6 +45,11 @@ using namespace Xbyak; namespace ov { namespace intel_cpu { + +#ifdef SNIPPETS_DEBUG_INFO +DebugInfo g_debug_err_handler; +#endif + namespace node { namespace { @@ -462,61 +467,29 @@ void Snippet::SnippetJitExecutor::update_ptrs(jit_snippets_call_args& call_args, } } -// void Snippet::SnippetJitExecutor::schedule_6d(const std::vector& inMemPtrs, const std::vector& outMemPtrs) { -// const auto& dom = exec_domain; -// // < N, C, H, W > < 1, 1, N, C*H*W> -// parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4], -// [&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) { -// int64_t 
indexes[] = {d0, d1, d2, d3, d4}; -// jit_snippets_call_args call_args; -// update_ptrs(call_args, inMemPtrs, outMemPtrs); - -// schedule.get_callable()(indexes, &call_args); -// }); -// } - void Snippet::SnippetJitExecutor::schedule_6d(const std::vector& inMemPtrs, const std::vector& outMemPtrs) { const auto& dom = exec_domain; // < N, C, H, W > < 1, 1, N, C*H*W> - const auto callable = schedule.get_callable(); - - __sighandler_t signal_handler = [](int signal) { - std::cerr << "Segfault was caught by the signal handler.\n"; - // auto k = dynamic_cast(g_debug_err_handler); - // k->print_debug_info(); - g_debug_err_handler->print_debug_info(); - // if (k) { - // k->print_debug_info(); - // } else { - // std::cout << "is not a kernelemitter, k:" << k << std::endl; - // } - // auto k1 = dynamic_cast(g_debug_err_handler); - // if (k1) { - // std::cout << "is a jit_emitter, k1:" << k1 << std::endl; - // k1->print_debug_info(); - // } - // g_debug_err_handler->print_debug_info(); - // struct sigaction new_handler{}; - // new_handler.sa_handler = SIG_DFL; - // sigaction(SIGSEGV, &new_handler, nullptr); - }; - struct sigaction new_handler{}; - new_handler.sa_handler = signal_handler; - sigaction(SIGSEGV, &new_handler, nullptr); - - int64_t indexes[] = {0, 0, 0, 0, 0}; - jit_snippets_call_args call_args; - update_ptrs(call_args, inMemPtrs, outMemPtrs); - callable(indexes, &call_args); - OPENVINO_ASSERT(g_debug_err_handler, "Debug handler was not set"); - OPENVINO_THROW("EXPERIMENT FINISHED"); -// parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4], -// [&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) { -// int64_t indexes[] = {d0, d1, d2, d3, d4}; -// jit_snippets_call_args call_args; -// update_ptrs(call_args, inMemPtrs, outMemPtrs); -// callable(indexes, &call_args); -// }); + parallel_for5d(dom[0], dom[1], dom[2], dom[3], dom[4], + [&](int64_t d0, int64_t d1, int64_t d2, int64_t d3, int64_t d4) { + int64_t indexes[] = {d0, d1, d2, d3, d4}; + 
jit_snippets_call_args call_args; + update_ptrs(call_args, inMemPtrs, outMemPtrs); + + __sighandler_t signal_handler = [](int signal) { + std::cerr << "Segfault was caught by the signal handler.\n"; + if (ov::intel_cpu::g_debug_err_handler.e_type == EmitterType::Load) { + ov::intel_cpu::g_debug_err_handler.load_emitter->print_debug_info(); + } else if (ov::intel_cpu::g_debug_err_handler.e_type == EmitterType::Store) { + ov::intel_cpu::g_debug_err_handler.store_emitter->print_debug_info(); + } + }; + struct sigaction new_handler{}; + new_handler.sa_handler = signal_handler; + sigaction(SIGSEGV, &new_handler, nullptr); + + schedule.get_callable()(indexes, &call_args); + }); } void Snippet::SnippetJitExecutor::schedule_nt(const std::vector& inMemPtrs, const std::vector& outMemPtrs) { @@ -536,6 +509,18 @@ void Snippet::SnippetJitExecutor::schedule_nt(const std::vector& inMe tmp /= work_size[j]; } + __sighandler_t signal_handler = [](int signal) { + std::cerr << "Segfault was caught by the signal handler.\n"; + if (ov::intel_cpu::g_debug_err_handler.e_type == EmitterType::Load) { + ov::intel_cpu::g_debug_err_handler.load_emitter->print_debug_info(); + } else if (ov::intel_cpu::g_debug_err_handler.e_type == EmitterType::Store) { + ov::intel_cpu::g_debug_err_handler.store_emitter->print_debug_info(); + } + }; + struct sigaction new_handler{}; + new_handler.sa_handler = signal_handler; + sigaction(SIGSEGV, &new_handler, nullptr); + schedule.get_callable()(indexes.data(), &call_args); } });