Skip to content

Commit

Permalink
explicit emitters created, as dynamic binding of virtual function is…
Browse files Browse the repository at this point in the history
… in compilation stage
  • Loading branch information
chenhu-wang committed Sep 19, 2023
1 parent f03caa4 commit 1f7baf4
Show file tree
Hide file tree
Showing 8 changed files with 119 additions and 201 deletions.
73 changes: 0 additions & 73 deletions src/plugins/intel_cpu/src/emitters/x64/jit_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,78 +213,5 @@ void jit_emitter::emit_code(const std::vector<size_t> &in_idxs, const std::vecto
emitter_postamble();
}

void jit_emitter::internal_call_preamble() const {
// gprs
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);

h->sub(h->rsp, n_gprs_to_save * gpr_size);
for (size_t i = 0; i < n_gprs_to_save; ++i)
h->mov(h->ptr[h->rsp + i * gpr_size], gprs_to_save[i]);

// mask regs
// need preserve based on cpu capability, instead of host isa.
// in case there are possibilty that different isa emitters exist in one subgraph KernelEmitter from perf standpoint in the future.
// e.g. other emitters isa is avx512, while this emitter isa is avx2, and internal call is used. Internal call may use avx512 and spoil k-reg.
// do not care about platform w/ avx512_common but w/o avx512_core(knight landing), which is obsoleted.
if (cpu::x64::mayiuse(cpu::x64::avx512_core)) {
h->sub(h->rsp, k_mask_num * k_mask_size);
for (size_t i = 0; i < k_mask_num; ++i) {
h->kmovq(h->ptr[h->rsp + i * k_mask_size], Opmask(static_cast<int>(i)));
}
}

// vector regs
// 1. Caller obligation to save vector registers as callee may use them.
// 2. There is an implicit assumption that the host code uses the same
// `isa` as the injector. Once the assumption is wrong, `vecs_count` and
// `vlen` should be replaced with `host_isa::vlen` and
// `host_isa::vecs_count`.
h->sub(h->rsp, get_max_vecs_count() * get_vec_length());
for (size_t i = 0; i < get_max_vecs_count(); ++i) {
push_vec(h->ptr[h->rsp + i * get_vec_length()], i);
}
}

void jit_emitter::internal_call_postamble() const {
// restore vector registers
for (int i = static_cast<int>(get_max_vecs_count()) - 1; i >= 0; --i) {
pop_vec(static_cast<size_t>(i), h->ptr[h->rsp + i * get_vec_length()]);
}
h->add(h->rsp, (get_max_vecs_count()) * get_vec_length());

// restore k reg
if (cpu::x64::mayiuse(cpu::x64::avx512_core)) {
for (int i = k_mask_num - 1; i >= 0; --i) {
h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
}
h->add(h->rsp, k_mask_num * k_mask_size);
}

// restore gpr registers
Xbyak::Operand gprs_to_save[] = {h->r8, h->r9, h->r10, h->r11, h->r12, h->r13, h->r14, h->r15,
h->rax, h->rbx, h->rcx, h->rdx, h->rdi, h->rsi, h->rbp};
size_t n_gprs_to_save = sizeof(gprs_to_save) / sizeof(gprs_to_save[0]);
for (int i = n_gprs_to_save - 1; i >= 0; --i)
h->mov(gprs_to_save[i], h->ptr[h->rsp + i * gpr_size]);
h->add(h->rsp, n_gprs_to_save * gpr_size);
}

// Emits the code that rounds rsp down to a multiple of 16 before an internal call.
// An additional 16 bytes are reserved and the applied offset (rsp & 0xf) is stored at the new
// top of the stack, so the callee can use arbitrary regs. rbx is overwritten as scratch.
void jit_emitter::internal_call_rsp_align() const {
    const auto& stack_ptr = h->rsp;
    const auto& scratch = h->rbx;

    h->mov(scratch, stack_ptr);
    h->and_(scratch, 0xf);               // scratch = low four bits of rsp
    h->sub(stack_ptr, scratch);          // clear those bits in rsp
    h->sub(stack_ptr, 0x10);             // room for the saved offset
    h->mov(h->ptr[stack_ptr], scratch);  // keep the offset for internal_call_rsp_restore()
}

// Emits the code that undoes internal_call_rsp_align(): reads the offset stored at the top
// of the stack into rbx, releases the 16-byte slot and adds the offset back to rsp.
void jit_emitter::internal_call_rsp_restore() const {
    const auto& stack_ptr = h->rsp;
    const auto& scratch = h->rbx;

    h->mov(scratch, h->ptr[stack_ptr]);
    h->add(stack_ptr, 0x10);
    h->add(stack_ptr, scratch);
}

} // namespace intel_cpu
} // namespace ov
52 changes: 46 additions & 6 deletions src/plugins/intel_cpu/src/emitters/x64/jit_emitter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,38 @@

#include <set>

#define SNIPPETS_DEBUG_INFO

namespace ov {
namespace intel_cpu {

#ifdef SNIPPETS_DEBUG_INFO

class LoopBeginEmitter;
class LoopEndEmitter;
class jit_load_emitter;
class jit_store_emitter;

// Kind of emitter recorded in DebugInfo::e_type.
// EmitterNum is listed last, so its value equals the number of kinds declared before it;
// DebugInfo uses it as the initial value of e_type.
enum EmitterType {
LoopBegin,
LoopEnd,
Load,
Store,
EmitterNum,
};

// Debug record about an emitter: a kind tag plus one typed pointer slot per emitter kind.
// Initially e_type is EmitterNum and every pointer slot is nullptr.
// NOTE(review): presumably only the slot matching e_type is meaningful at any time —
// confirm against the code that fills and reads g_debug_err_handler.
struct DebugInfo {
EmitterType e_type = EmitterNum;
LoopBeginEmitter* loop_begin_emitter = nullptr;
LoopEndEmitter* loop_end_emitter = nullptr;
jit_load_emitter* load_emitter = nullptr;
jit_store_emitter* store_emitter = nullptr;
};

extern DebugInfo g_debug_err_handler;

#endif

enum emitter_in_out_map {
vec_to_vec,
vec_to_gpr,
Expand Down Expand Up @@ -43,7 +72,6 @@ class jit_emitter : public ov::snippets::Emitter {
virtual size_t get_inputs_num() const = 0;
virtual size_t aux_vecs_count() const;
emitter_in_out_map get_in_out_type() const;
virtual void print_debug_info() const {std::cerr << "Debug info was not specified.\n"; }

/**
* @brief Returns supported precisions.
Expand Down Expand Up @@ -106,6 +134,23 @@ class jit_emitter : public ov::snippets::Emitter {
mutable std::vector<size_t> aux_vec_idxs;
mutable std::vector<size_t> aux_gpr_idxs;

#ifdef SNIPPETS_DEBUG_INFO
/**
 * @brief Emits JIT code that, when executed, records this emitter in g_debug_err_handler.
 *
 * The generated code stores `this` into the g_debug_err_handler pointer slot that corresponds
 * to emitter_type and then stores emitter_type into g_debug_err_handler.e_type.
 * r14 and r15 are used as scratch and are preserved around the sequence with push/pop.
 *
 * @param p_emitter    Unused. It is received by value, so its own address is a slot in the
 *                     host call frame of this function and must not be baked into generated
 *                     code; the destination slot is derived from emitter_type instead.
 * @param emitter_type Kind of this emitter; selects the pointer slot to fill. For a value
 *                     without a dedicated slot (e.g. EmitterNum) only e_type is updated.
 */
virtual void build_debug_info(jit_emitter* p_emitter, EmitterType emitter_type) const {
    (void)p_emitter;

    // The generated code stores exactly four bytes into e_type.
    static_assert(sizeof(EmitterType) == sizeof(uint32_t), "e_type is written with a 32-bit store");

    // Address of the global slot that the generated code has to fill.
    uint64_t slot_addr = 0;
    switch (emitter_type) {
    case LoopBegin:
        slot_addr = reinterpret_cast<uint64_t>(&g_debug_err_handler.loop_begin_emitter);
        break;
    case LoopEnd:
        slot_addr = reinterpret_cast<uint64_t>(&g_debug_err_handler.loop_end_emitter);
        break;
    case Load:
        slot_addr = reinterpret_cast<uint64_t>(&g_debug_err_handler.load_emitter);
        break;
    case Store:
        slot_addr = reinterpret_cast<uint64_t>(&g_debug_err_handler.store_emitter);
        break;
    default:
        break;
    }

    h->push(h->r15);
    h->push(h->r14);

    if (slot_addr != 0) {
        // NOTE(review): the raw value of `this` (a jit_emitter*) is stored into a slot typed as a
        // pointer to the concrete emitter class; this presumes both pointers share the same
        // address (jit_emitter being the primary base) — confirm.
        h->mov(h->r15, slot_addr);
        h->mov(h->r14, reinterpret_cast<uint64_t>(this));
        h->mov(h->qword[h->r15], h->r14);
    }

    // A dword store keeps the write within the bounds of the enum object.
    h->mov(h->r15, reinterpret_cast<uint64_t>(&g_debug_err_handler.e_type));
    h->mov(h->dword[h->r15], static_cast<uint32_t>(emitter_type));

    h->pop(h->r14);
    h->pop(h->r15);
}
#endif

static constexpr int k_mask_size = 8;
static constexpr int k_mask_num = 8;
static constexpr int gpr_size = 8;
Expand Down Expand Up @@ -133,11 +178,6 @@ class jit_emitter : public ov::snippets::Emitter {
}
}

virtual void internal_call_preamble() const;
virtual void internal_call_postamble() const;
virtual void internal_call_rsp_align() const;
virtual void internal_call_rsp_restore() const;

private:
mutable std::vector<size_t> preserved_vec_idxs;
mutable std::vector<size_t> preserved_gpr_idxs;
Expand Down
19 changes: 19 additions & 0 deletions src/plugins/intel_cpu/src/emitters/x64/jit_load_store_emitters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ size_t jit_load_emitter::aux_gprs_count() const {
}

void jit_load_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
build_debug_info(g_debug_err_handler.load_emitter, EmitterType::Load);
const int offset = in_idxs.size() == 2 ? in_idxs[1] : 0;
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(Reg64(in_idxs[0]), static_cast<int>(out_idxs[0]), offset);
Expand Down Expand Up @@ -587,6 +588,14 @@ void jit_load_emitter::register_table_entries() {
}
}

// Writes a one-line notice to stderr saying that the segfault was attributed to jit_load_emitter.
void jit_load_emitter::print_debug_info() const {
    auto& err = std::cerr;
    err << "Segfault happens in jit_load_emitter.";
    err << "\n";
    // Per-emitter details are not printed yet; disabled candidates:
    // err << "Emitter name:" << name_ << "\n";
    // err << "load_num_:" << load_num_ << "\n";
    // err << "src_prc_:" << src_prc_ << "\n";
    // err << "dst_prc_:" << dst_prc_ << "\n";
}

/// STORE ///
jit_store_emitter::jit_store_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa,
Precision src_prc, Precision dst_prc, int store_num, arithmetic_mode mode, Precision exec_prc,
Expand Down Expand Up @@ -647,6 +656,8 @@ void jit_store_emitter::emit_data() const {
}

void jit_store_emitter::emit_impl(const std::vector<size_t> &in_idxs, const std::vector<size_t> &out_idxs) const {
build_debug_info(g_debug_err_handler.store_emitter, EmitterType::Store);
// h->int_(11);
const int offset = in_idxs.size() == 2 ? in_idxs[1] : 0;
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(static_cast<int>(in_idxs[0]), Reg64(out_idxs[0]), offset);
Expand Down Expand Up @@ -1251,5 +1262,13 @@ void jit_store_emitter::register_table_entries() {
}
}

// Writes a one-line notice to stderr saying that the segfault was attributed to jit_store_emitter.
void jit_store_emitter::print_debug_info() const {
    auto& err = std::cerr;
    err << "Segfault happens in jit_store_emitter.";
    err << "\n";
    // Per-emitter details are not printed yet; disabled candidates:
    // err << "Emitter name:" << name_ << "\n";
    // err << "store_num_:" << store_num_ << "\n";
    // err << "src_prc_:" << src_prc_ << "\n";
    // err << "dst_prc_:" << dst_prc_ << "\n";
}

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ class jit_load_emitter : public jit_emitter {

size_t get_inputs_num() const override;

void print_debug_info() const;

private:
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const Xbyak::Reg64 &reg_src, const int out_vec_idx, const int offset) const;
Expand Down Expand Up @@ -99,6 +101,8 @@ class jit_load_emitter : public jit_emitter {
InferenceEngine::Precision dst_prc_;
bool is_fill_;
std::string fill_value_;

std::shared_ptr<snippets::op::Kernel> m_load_node;
};

class jit_store_emitter : public jit_emitter {
Expand Down Expand Up @@ -134,6 +138,8 @@ class jit_store_emitter : public jit_emitter {
return uni_vcvtneps2bf16_;
}

void print_debug_info() const;

private:
template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const int in_vec_idx, const Xbyak::Reg64 &reg_dst, const int offset) const;
Expand Down Expand Up @@ -171,6 +177,8 @@ class jit_store_emitter : public jit_emitter {
mutable bool data_reg_updated = false;
mutable int data_idx = 0;
mutable int aux_src_idx = 0;

std::shared_ptr<snippets::op::Kernel> m_store_node;
};

} // namespace intel_cpu
Expand Down
74 changes: 7 additions & 67 deletions src/plugins/intel_cpu/src/emitters/x64/jit_snippets_emitters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,6 @@ using namespace dnnl::impl::cpu::x64;
namespace ov {
namespace intel_cpu {

jit_emitter* g_debug_err_handler = nullptr;

static void assign_emitter(KernelEmitter* error_emitter) {
g_debug_err_handler = error_emitter;
}

using jit_generator = dnnl::impl::cpu::x64::jit_generator;
using cpu_isa_t = dnnl::impl::cpu::x64::cpu_isa_t;
using ExpressionPtr = ov::snippets::lowered::ExpressionPtr;
Expand Down Expand Up @@ -58,12 +52,6 @@ jit_container_emitter::jit_container_emitter(jit_generator* h, cpu_isa_t isa, co
in_out_type_ = emitter_in_out_map::gpr_to_gpr;
}

void jit_container_emitter::print_debug_info() const {
// std::cerr << "Emitter type name:" << get_type_name(this) << "\n";
// std::cerr << "Mapped node friendly name:" << m_kernel_node->get_friendly_name() << "\n";
std::cerr << "Debug info for jit_container_emitter was printed successfully\n";
}

void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool,
snippets::lowered::LinearIR::container& expressions) const {
if (expressions.empty())
Expand Down Expand Up @@ -345,62 +333,14 @@ void KernelEmitter::emit_impl(const std::vector<size_t>& in,
std::vector<Reg64> data_ptr_regs;
transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs);

// init_data_pointers(reg_indexes, reg_const_params, data_ptr_regs);
// h->nop();
// h->nop();
// h->nop();

// h->push(reg_const_params);
// h->push(reg_indexes);

// h->mov(reg_indexes, reinterpret_cast<uint64_t>(&g_debug_err_handler));
// h->mov(reg_const_params, reinterpret_cast<uint64_t>(this));
// h->mov(h->qword[reg_indexes], reg_const_params);

// h->pop(reg_indexes);
// h->pop(reg_const_params);
// h->int_(11);

//
internal_call_preamble();

h->push(h->rax);
h->push(abi_param1);
h->mov(h->rax, reinterpret_cast<size_t>(&assign_emitter));
h->mov(abi_param1, reinterpret_cast<size_t>(this));

internal_call_rsp_align();
h->call(h->rax);
internal_call_rsp_restore();

h->pop(abi_param1);
h->pop(h->rax);

internal_call_postamble();
h->int_(11);
//

// h->nop();
// h->nop();
// h->nop();

init_data_pointers(reg_indexes, reg_const_params, data_ptr_regs);
for (const auto& expression : body) {
const auto& emitter = expression->get_emitter();
std::vector<size_t> in_regs, out_regs;
std::tie(in_regs, out_regs) = expression->get_reg_info();
emitter->emit_code(in_regs, out_regs, vec_regs_pool, gp_regs_pool);
}
h->postamble();
////////////////////////////////////////////////////////////////////////////////////////////////
// h->preamble();

// Reg64 reg_indexes = Reg64(static_cast<int>(reg_indexes_idx));
// Reg64 reg_const_params = Reg64(static_cast<int>(reg_const_params_idx));
// std::vector<Reg64> data_ptr_regs;
// transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs);

// init_data_pointers(reg_indexes, reg_const_params, data_ptr_regs);
// for (const auto& expression : body) {
// const auto& emitter = expression->get_emitter();
// std::vector<size_t> in_regs, out_regs;
// std::tie(in_regs, out_regs) = expression->get_reg_info();
// emitter->emit_code(in_regs, out_regs, vec_regs_pool, gp_regs_pool);
// }
// h->postamble();
}

LoopBeginEmitter::LoopBeginEmitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_emitter(h, isa) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
namespace ov {
namespace intel_cpu {

extern jit_emitter* g_debug_err_handler;

#define SNIPPETS_MAX_SNIPPETS_DIMS 12
#define SNIPPETS_MAX_HARNESS_DIMS 5
#define SNIPPETS_MAX_TILE_RANK 2
Expand Down Expand Up @@ -126,6 +124,7 @@ class LoopBeginEmitter : public jit_emitter {
const std::vector<size_t> &out) const;
// todo: it is purely virtual in the base class, but do we need it?
size_t get_inputs_num() const override {return 0;}
void print_debug_info() const;

private:
using jit_emitter::emit_code;
Expand All @@ -148,6 +147,7 @@ class LoopEndEmitter : public jit_emitter {
const std::vector<size_t> &out) const;
// todo: it is purely virtual in the base class, but do we need it?
size_t get_inputs_num() const override {return 0;}
void print_debug_info() const;

private:
using jit_emitter::emit_code;
Expand Down
1 change: 0 additions & 1 deletion src/plugins/intel_cpu/src/nodes/interpolate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1521,7 +1521,6 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi

depthwise_inj_idx++;
post_ops_data_offset += depthwise_injectors[depthwise_inj_idx]->memoryStep();
// depthwise_inj_idx++;
} else if (post_op.is_quantization()) {
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || dst_prc == Precision::FP32 || i != p.len() - 1;
Expand Down
Loading

0 comments on commit 1f7baf4

Please sign in to comment.