diff --git a/CodeGen/include/Luau/AssemblyBuilderA64.h b/CodeGen/include/Luau/AssemblyBuilderA64.h
index a86403d4c..bea70fd00 100644
--- a/CodeGen/include/Luau/AssemblyBuilderA64.h
+++ b/CodeGen/include/Luau/AssemblyBuilderA64.h
@@ -211,7 +211,6 @@ class AssemblyBuilderA64
     void placeSR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift = 0, int N = 0);
     void placeSR2(const char* name, RegisterA64 dst, RegisterA64 src, uint8_t op, uint8_t op2 = 0);
     void placeR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, uint8_t op2);
-    void placeR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t sizes, uint8_t op, uint8_t op2);
     void placeR1(const char* name, RegisterA64 dst, RegisterA64 src, uint32_t op);
     void placeI12(const char* name, RegisterA64 dst, RegisterA64 src1, int src2, uint8_t op);
     void placeI16(const char* name, RegisterA64 dst, int src, uint8_t op, int shift = 0);
@@ -230,6 +229,7 @@ class AssemblyBuilderA64
     void placeBM(const char* name, RegisterA64 dst, RegisterA64 src1, uint32_t src2, uint8_t op);
     void placeBFM(const char* name, RegisterA64 dst, RegisterA64 src1, int src2, uint8_t op, int immr, int imms);
     void placeER(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift);
+    void placeVR(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint16_t op, uint8_t op2);
 
     void place(uint32_t word);
 
diff --git a/CodeGen/src/AssemblyBuilderA64.cpp b/CodeGen/src/AssemblyBuilderA64.cpp
index 96d17192f..ffb0a774e 100644
--- a/CodeGen/src/AssemblyBuilderA64.cpp
+++ b/CodeGen/src/AssemblyBuilderA64.cpp
@@ -63,13 +63,22 @@ AssemblyBuilderA64::~AssemblyBuilderA64()
 
 void AssemblyBuilderA64::mov(RegisterA64 dst, RegisterA64 src)
 {
-    CODEGEN_ASSERT(dst.kind == KindA64::w || dst.kind == KindA64::x || dst == sp);
-    CODEGEN_ASSERT(dst.kind == src.kind || (dst.kind == KindA64::x && src == sp) || (dst == sp && src.kind == KindA64::x));
+    if (dst.kind != KindA64::q)
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::w || dst.kind == KindA64::x || dst == sp);
+        CODEGEN_ASSERT(dst.kind == src.kind || (dst.kind == KindA64::x && src == sp) || (dst == sp && src.kind == KindA64::x));
 
-    if (dst == sp || src == sp)
-        placeR1("mov", dst, src, 0b00'100010'0'000000000000);
+        if (dst == sp || src == sp)
+            placeR1("mov", dst, src, 0b00'100010'0'000000000000);
+        else
+            placeSR2("mov", dst, src, 0b01'01010);
+    }
     else
-        placeSR2("mov", dst, src, 0b01'01010);
+    {
+        CODEGEN_ASSERT(dst.kind == src.kind);
+
+        placeR1("mov", dst, src, 0b10'01110'10'1'00000'00011'1 | (src.index << 6));
+    }
 }
 
 void AssemblyBuilderA64::mov(RegisterA64 dst, int src)
@@ -575,12 +584,18 @@ void AssemblyBuilderA64::fadd(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
 
         placeR3("fadd", dst, src1, src2, 0b11110'01'1, 0b0010'10);
     }
-    else
+    else if (dst.kind == KindA64::s)
     {
-        CODEGEN_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
+        CODEGEN_ASSERT(src1.kind == KindA64::s && src2.kind == KindA64::s);
 
         placeR3("fadd", dst, src1, src2, 0b11110'00'1, 0b0010'10);
     }
+    else
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
+
+        placeVR("fadd", dst, src1, src2, 0b0'01110'0'0'1, 0b11010'1);
+    }
 }
 
 void AssemblyBuilderA64::fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
@@ -591,12 +606,18 @@ void AssemblyBuilderA64::fdiv(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
 
         placeR3("fdiv", dst, src1, src2, 0b11110'01'1, 0b0001'10);
     }
-    else
+    else if (dst.kind == KindA64::s)
    {
-        CODEGEN_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
+        CODEGEN_ASSERT(src1.kind == KindA64::s && src2.kind == KindA64::s);
 
         placeR3("fdiv", dst, src1, src2, 0b11110'00'1, 0b0001'10);
     }
+    else
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
+
+        placeVR("fdiv", dst, src1, src2, 0b1'01110'00'1, 0b11111'1);
+    }
 }
 
 void AssemblyBuilderA64::fmul(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
@@ -607,12 +628,18 @@ void AssemblyBuilderA64::fmul(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
 
         placeR3("fmul", dst, src1, src2, 0b11110'01'1, 0b0000'10);
     }
-    else
+    else if (dst.kind == KindA64::s)
     {
-        CODEGEN_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
+        CODEGEN_ASSERT(src1.kind == KindA64::s && src2.kind == KindA64::s);
 
         placeR3("fmul", dst, src1, src2, 0b11110'00'1, 0b0000'10);
     }
+    else
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
+
+        placeVR("fmul", dst, src1, src2, 0b1'01110'00'1, 0b11011'1);
+    }
 }
 
 void AssemblyBuilderA64::fneg(RegisterA64 dst, RegisterA64 src)
@@ -623,12 +650,18 @@ void AssemblyBuilderA64::fneg(RegisterA64 dst, RegisterA64 src)
 
         placeR1("fneg", dst, src, 0b000'11110'01'1'0000'10'10000);
     }
-    else
+    else if (dst.kind == KindA64::s)
     {
-        CODEGEN_ASSERT(dst.kind == KindA64::s && src.kind == KindA64::s);
+        CODEGEN_ASSERT(src.kind == KindA64::s);
 
         placeR1("fneg", dst, src, 0b000'11110'00'1'0000'10'10000);
     }
+    else
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::q && src.kind == KindA64::q);
+
+        placeR1("fneg", dst, src, 0b011'01110'1'0'10000'01111'10);
+    }
 }
 
 void AssemblyBuilderA64::fsqrt(RegisterA64 dst, RegisterA64 src)
@@ -646,12 +679,18 @@ void AssemblyBuilderA64::fsub(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
 
         placeR3("fsub", dst, src1, src2, 0b11110'01'1, 0b0011'10);
     }
-    else
+    else if (dst.kind == KindA64::s)
     {
-        CODEGEN_ASSERT(dst.kind == KindA64::s && src1.kind == KindA64::s && src2.kind == KindA64::s);
+        CODEGEN_ASSERT(src1.kind == KindA64::s && src2.kind == KindA64::s);
 
         placeR3("fsub", dst, src1, src2, 0b11110'00'1, 0b0011'10);
     }
+    else
+    {
+        CODEGEN_ASSERT(dst.kind == KindA64::q && src1.kind == KindA64::q && src2.kind == KindA64::q);
+
+        placeVR("fsub", dst, src1, src2, 0b0'01110'10'1, 0b11010'1);
+    }
 }
 
 void AssemblyBuilderA64::ins_4s(RegisterA64 dst, RegisterA64 src, uint8_t index)
@@ -952,18 +991,6 @@ void AssemblyBuilderA64::placeR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, uint8_t op2)
     commit();
 }
 
-void AssemblyBuilderA64::placeR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t sizes, uint8_t op, uint8_t op2)
-{
-    if (logText)
-        log(name, dst, src1, src2);
-
-    CODEGEN_ASSERT(dst.kind == KindA64::w || dst.kind == KindA64::x || dst.kind == KindA64::d || dst.kind == KindA64::q);
-    CODEGEN_ASSERT(dst.kind == src1.kind && dst.kind == src2.kind);
-
-    place(dst.index | (src1.index << 5) | (op2 << 10) | (src2.index << 16) | (op << 21) | (sizes << 29));
-    commit();
-}
-
 void AssemblyBuilderA64::placeR1(const char* name, RegisterA64 dst, RegisterA64 src, uint32_t op)
 {
     if (logText)
@@ -1226,6 +1253,17 @@ void AssemblyBuilderA64::placeER(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift)
     commit();
 }
 
+void AssemblyBuilderA64::placeVR(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint16_t op, uint8_t op2)
+{
+    if (logText)
+        logAppend(" %-12sv%d.4s,v%d.4s,v%d.4s\n", name, dst.index, src1.index, src2.index);
+
+    CODEGEN_ASSERT(dst.kind == KindA64::q && dst.kind == src1.kind && dst.kind == src2.kind);
+
+    place(dst.index | (src1.index << 5) | (op2 << 10) | (src2.index << 16) | (op << 21) | (1 << 30));
+    commit();
+}
+
 void AssemblyBuilderA64::place(uint32_t word)
 {
     CODEGEN_ASSERT(codePos < codeEnd);
diff --git a/CodeGen/src/IrLoweringA64.cpp b/CodeGen/src/IrLoweringA64.cpp
index 681c56ec8..9d9df188d 100644
--- a/CodeGen/src/IrLoweringA64.cpp
+++ b/CodeGen/src/IrLoweringA64.cpp
@@ -12,6 +12,7 @@
 #include "lgc.h"
 
 LUAU_DYNAMIC_FASTFLAGVARIABLE(LuauCodeGenFixBufferLenCheckA64, false)
+LUAU_FASTFLAGVARIABLE(LuauCodeGenVectorA64, false)
 
 namespace Luau
 {
@@ -673,15 +674,26 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
+        if (FFlag::LuauCodeGenVectorA64)
+        {
+            build.fadd(inst.regA64, regOp(inst.a), regOp(inst.b));
 
-        for (uint8_t i = 0; i < 3; i++)
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fadd(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fadd(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -689,15 +701,26 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
+        if (FFlag::LuauCodeGenVectorA64)
+        {
+            build.fsub(inst.regA64, regOp(inst.a), regOp(inst.b));
 
-        for (uint8_t i = 0; i < 3; i++)
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fsub(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fsub(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -705,15 +728,26 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
+        if (FFlag::LuauCodeGenVectorA64)
+        {
+            build.fmul(inst.regA64, regOp(inst.a), regOp(inst.b));
 
-        for (uint8_t i = 0; i < 3; i++)
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fmul(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fmul(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -721,15 +755,26 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a, inst.b});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
-        RegisterA64 tempb = regs.allocTemp(KindA64::s);
+        if (FFlag::LuauCodeGenVectorA64)
+        {
+            build.fdiv(inst.regA64, regOp(inst.a), regOp(inst.b));
 
-        for (uint8_t i = 0; i < 3; i++)
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.dup_4s(tempb, regOp(inst.b), i);
-            build.fdiv(tempa, tempa, tempb);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+            RegisterA64 tempb = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.dup_4s(tempb, regOp(inst.b), i);
+                build.fdiv(tempa, tempa, tempb);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
@@ -737,13 +782,24 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regA64 = regs.allocReuse(KindA64::q, index, {inst.a});
 
-        RegisterA64 tempa = regs.allocTemp(KindA64::s);
+        if (FFlag::LuauCodeGenVectorA64)
+        {
+            build.fneg(inst.regA64, regOp(inst.a));
 
-        for (uint8_t i = 0; i < 3; i++)
+            RegisterA64 tempw = regs.allocTemp(KindA64::w);
+            build.mov(tempw, LUA_TVECTOR);
+            build.ins_4s(inst.regA64, tempw, 3);
+        }
+        else
         {
-            build.dup_4s(tempa, regOp(inst.a), i);
-            build.fneg(tempa, tempa);
-            build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            RegisterA64 tempa = regs.allocTemp(KindA64::s);
+
+            for (uint8_t i = 0; i < 3; i++)
+            {
+                build.dup_4s(tempa, regOp(inst.a), i);
+                build.fneg(tempa, tempa);
+                build.ins_4s(inst.regA64, i, castReg(KindA64::q, tempa), 0);
+            }
         }
         break;
     }
diff --git a/tests/AssemblyBuilderA64.test.cpp b/tests/AssemblyBuilderA64.test.cpp
index 6657d8891..320a7a6ad 100644
--- a/tests/AssemblyBuilderA64.test.cpp
+++ b/tests/AssemblyBuilderA64.test.cpp
@@ -218,6 +218,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Moves")
 {
     SINGLE_COMPARE(mov(x0, x1), 0xAA0103E0);
     SINGLE_COMPARE(mov(w0, w1), 0x2A0103E0);
+    SINGLE_COMPARE(mov(q0, q1), 0x4EA11C20);
 
     SINGLE_COMPARE(movz(x0, 42), 0xD2800540);
     SINGLE_COMPARE(movz(w0, 42), 0x52800540);
@@ -501,6 +502,15 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "PrePostIndexing")
     SINGLE_COMPARE(str(q0, mem(x1, 1, AddressKindA64::post)), 0x3C801420);
 }
 
+TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "SIMDMath")
+{
+    SINGLE_COMPARE(fadd(q0, q1, q2), 0x4E22D420);
+    SINGLE_COMPARE(fsub(q0, q1, q2), 0x4EA2D420);
+    SINGLE_COMPARE(fmul(q0, q1, q2), 0x6E22DC20);
+    SINGLE_COMPARE(fdiv(q0, q1, q2), 0x6E22FC20);
+    SINGLE_COMPARE(fneg(q0, q1), 0x6EA0F820);
+}
+
 TEST_CASE("LogTest")
 {
     AssemblyBuilderA64 build(/* logText= */ true);
@@ -552,6 +562,7 @@ TEST_CASE("LogTest")
     build.ins_4s(q31, 1, q29, 2);
     build.dup_4s(s29, q31, 2);
     build.dup_4s(q29, q30, 0);
+    build.fmul(q0, q1, q2);
 
     build.setLabel(l);
     build.ret();
@@ -594,6 +605,7 @@
  ins         v31.s[1],v29.s[2]
  dup         s29,v31.s[2]
  dup         v29.4s,v30.s[0]
+ fmul        v0.4s,v1.4s,v2.4s
 .L1:
  ret
 )";
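
Reviewer note, not part of the patch: the new SIMDMath test constants can be re-derived by hand from the bit layout that placeVR assembles. Below is a minimal standalone sketch that assumes only the bitfield expression from placeVR above; `encodeVR` is a hypothetical stand-in written for illustration, not an API in this diff.

```cpp
// Mirrors the word built by AssemblyBuilderA64::placeVR:
// Rd | Rn<<5 | op2<<10 | Rm<<16 | op<<21 | 1<<30 (bit 30 is Q=1, selecting the 128-bit .4s form).
#include <cassert>
#include <cstdint>

static uint32_t encodeVR(uint8_t rd, uint8_t rn, uint8_t rm, uint16_t op, uint8_t op2)
{
    return rd | (rn << 5) | (op2 << 10) | (rm << 16) | (op << 21) | (1u << 30);
}

int main()
{
    // Opcode pairs are copied from the fadd/fsub/fmul/fdiv changes above;
    // the expected words are the SIMDMath test constants.
    assert(encodeVR(0, 1, 2, 0b0'01110'0'0'1, 0b11010'1) == 0x4E22D420);  // fadd v0.4s,v1.4s,v2.4s
    assert(encodeVR(0, 1, 2, 0b0'01110'10'1, 0b11010'1) == 0x4EA2D420);   // fsub v0.4s,v1.4s,v2.4s
    assert(encodeVR(0, 1, 2, 0b1'01110'00'1, 0b11011'1) == 0x6E22DC20);   // fmul v0.4s,v1.4s,v2.4s
    assert(encodeVR(0, 1, 2, 0b1'01110'00'1, 0b11111'1) == 0x6E22FC20);   // fdiv v0.4s,v1.4s,v2.4s
    return 0;  // fneg is a one-source op and goes through placeR1, so it is not covered here
}
```

Compiled with C++14 or later (for the digit separators) this runs the same arithmetic placeVR performs, which is a quick way to sanity-check the opcode tables against the ARM A64 reference.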