Skip to content

Commit

Permalink
GS:SW: Split g_const into separate structs for 128 and 256 bit vectors
Browse files Browse the repository at this point in the history
Allows more code to access members with 8-bit offsets
  • Loading branch information
TellowKrinkle committed Oct 7, 2024
1 parent 4a88111 commit 0d434d6
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 47 deletions.
13 changes: 7 additions & 6 deletions pcsx2/GS/Renderers/SW/GSDrawScanline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@

#if MULTI_ISA_COMPILE_ONCE
// Defined here for lack of a better home
constexpr GSScanlineConstantData g_const;
constexpr GSScanlineConstantData256B g_const_256b;
constexpr GSScanlineConstantData128B g_const_128b;
#endif

MULTI_ISA_UNSHARED_IMPL;
Expand Down Expand Up @@ -206,10 +207,10 @@ void GSDrawScanline::CSetupPrim(const GSVertexSW* vertex, const u16* index, cons
constexpr int vlen = sizeof(VectorF) / sizeof(float);

#if _M_SSE >= 0x501
const GSVector8* shift = (GSVector8*)g_const.m_shift_256b;
const GSVector8* shift = (GSVector8*)g_const_256b.m_shift;
const GSVector4 step_shift = GSVector4::broadcast32(&shift[0]);
#else
const GSVector4* shift = (GSVector4*)g_const.m_shift_128b;
const GSVector4* shift = (GSVector4*)g_const_128b.m_shift;
const GSVector4 step_shift = shift[0];
#endif

Expand Down Expand Up @@ -494,7 +495,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV
constexpr int vlen = sizeof(VectorF) / sizeof(float);

#if _M_SSE < 0x501
const GSVector4i* const_test = (GSVector4i*)g_const.m_test_128b;
const GSVector4i* const_test = (GSVector4i*)g_const_128b.m_test;
#endif
VectorI test;
VectorF z0, z1;
Expand All @@ -514,7 +515,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV
steps = pixels + skip - vlen;
left -= skip;
#if _M_SSE >= 0x501
test = GSVector8i::i8to32(g_const.m_test_256b[skip]) | GSVector8i::i8to32(g_const.m_test_256b[15 + (steps & (steps >> 31))]);
test = GSVector8i::i8to32(g_const_256b.m_test[skip]) | GSVector8i::i8to32(g_const_256b.m_test[15 + (steps & (steps >> 31))]);
#else
test = const_test[skip] | const_test[7 + (steps & (steps >> 31))];
#endif
Expand Down Expand Up @@ -1755,7 +1756,7 @@ __ri void GSDrawScanline::CDrawScanline(int pixels, int left, int top, const GSV
if (!sel.notest)
{
#if _M_SSE >= 0x501
test = GSVector8i::i8to32(g_const.m_test_256b[15 + (steps & (steps >> 31))]);
test = GSVector8i::i8to32(g_const_256b.m_test[15 + (steps & (steps >> 31))]);
#else
test = const_test[7 + (steps & (steps >> 31))];
#endif
Expand Down
24 changes: 13 additions & 11 deletions pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ using namespace Xbyak;
#define _rip_local_d_p(x) _rip_local_d(x)
#endif

// File-local alias for the constant table that matches this translation unit's
// vector width: the 256-bit table when USING_YMM is set, the 128-bit table
// otherwise. Code in this file can therefore refer to `g_const` and its
// width-independent member names without its own #if on the vector width.
#if USING_YMM
static constexpr const GSScanlineConstantData256B& g_const = g_const_256b;
#else
static constexpr const GSScanlineConstantData128B& g_const = g_const_128b;
#endif

template <typename A, typename B>
static bool IsInRipRelativeRange(A* a, B* b)
{
Expand Down Expand Up @@ -667,13 +673,13 @@ void GSDrawScanlineCodeGenerator::Init()
if (isXmm)
{
shl(a1.cvt32(), 4); // * sizeof(m_test[0])
movdqa(_test, ptr[a1 + _m_const + offsetof(GSScanlineConstantData, m_test_128b[0])]);
por(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData, m_test_128b[7])]);
movdqa(_test, ptr[a1 + _m_const + offsetof(GSScanlineConstantData128B, m_test[0])]);
por(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]);
}
else
{
pmovsxbd(_test, ptr[a1 * 8 + _m_const + offsetof(GSScanlineConstantData, m_test_256b[0])]);
pmovsxbd(xym0, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData, m_test_256b[15])]);
pmovsxbd(_test, ptr[a1 * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[0])]);
pmovsxbd(xym0, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[15])]);
por(_test, xym0);
shl(a1.cvt32(), 5); // * sizeof(m_test[0])
}
Expand Down Expand Up @@ -1052,9 +1058,9 @@ void GSDrawScanlineCodeGenerator::Step()
cdqe();

#if USING_XMM
movdqa(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData, m_test_128b[7])]);
movdqa(_test, ptr[rax + _m_const + offsetof(GSScanlineConstantData128B, m_test[7])]);
#else
pmovsxbd(_test, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData, m_test_256b[15])]);
pmovsxbd(_test, ptr[rax * 8 + _m_const + offsetof(GSScanlineConstantData256B, m_test[15])]);
#endif
}
}
Expand Down Expand Up @@ -1594,11 +1600,7 @@ static int log2_coeff_offset(int i)
{
// Yay, you can't offsetof with non-constant array indices
uptr base = reinterpret_cast<uptr>(&g_const);
#if USING_XMM
uptr target = reinterpret_cast<uptr>(&g_const.m_log2_coef_128b[i]);
#else
uptr target = reinterpret_cast<uptr>(&g_const.m_log2_coef_256b[i]);
#endif
uptr target = reinterpret_cast<uptr>(&g_const.m_log2_coef[i]);
return target - base;
};

Expand Down
58 changes: 37 additions & 21 deletions pcsx2/GS/Renderers/SW/GSScanlineEnvironment.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,20 @@ struct alignas(32) GSScanlineLocalData // per prim variables, each thread has it
const GSScanlineGlobalData* gd;
};

// Scalar coefficients shared by the 128-bit and 256-bit constant tables: each
// table's constructor broadcasts entry n across every lane of m_log2_coef[n].
// NOTE(review): presumably the polynomial coefficients of a log2 approximation;
// confirm against the code that consumes m_log2_coef.
//
// Declared `inline` rather than `static` so the array has a single program-wide
// definition. The constexpr constructors that index it are implicitly inline and
// appear in every translation unit including this header; with internal linkage
// each copy would refer to a different array object.
namespace GSScanlineConstantData
{
	inline constexpr float log2_coef[] = {
		0.204446009836232697516f,
		-1.04913055217340124191f,
		2.28330284476918490682f,
		1.0f,
	};
} // namespace GSScanlineConstantData

// Constants shared by all threads (to reduce cache misses)
struct GSScanlineConstantData : public GSAlignedClass<32>
struct alignas(64) GSScanlineConstantData256B
{
alignas(32) u8 m_test_256b[16][8] = {
alignas(32) u8 m_test[16][8] = {
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
Expand All @@ -264,7 +274,7 @@ struct GSScanlineConstantData : public GSAlignedClass<32>
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
};
alignas(32) float m_shift_256b[9][8] = {
alignas(32) float m_shift[9][8] = {
{ 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f , 8.0f},
{ 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f , 6.0f , 7.0f},
{ -1.0f , 0.0f , 1.0f , 2.0f , 3.0f , 4.0f , 5.0f , 6.0f},
Expand All @@ -275,9 +285,24 @@ struct GSScanlineConstantData : public GSAlignedClass<32>
{ -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f , 1.0f},
{ -7.0f , -6.0f , -5.0f , -4.0f , -3.0f , -2.0f , -1.0f , 0.0f},
};
alignas(32) float m_log2_coef_256b[4][8] = {};
alignas(32) float m_log2_coef[4][8] = {};

constexpr GSScanlineConstantData256B()
{
using namespace GSScanlineConstantData;
for (size_t n = 0; n < std::size(log2_coef); ++n)
{
for (size_t i = 0; i < 8; ++i)
{
m_log2_coef[n][i] = log2_coef[n];
}
}
}
};

alignas(16) u32 m_test_128b[8][4] = {
struct alignas(64) GSScanlineConstantData128B
{
alignas(16) u32 m_test[8][4] = {
{0x00000000, 0x00000000, 0x00000000, 0x00000000},
{0xffffffff, 0x00000000, 0x00000000, 0x00000000},
{0xffffffff, 0xffffffff, 0x00000000, 0x00000000},
Expand All @@ -287,34 +312,25 @@ struct GSScanlineConstantData : public GSAlignedClass<32>
{0x00000000, 0x00000000, 0x00000000, 0xffffffff},
{0x00000000, 0x00000000, 0x00000000, 0x00000000},
};
alignas(16) float m_shift_128b[5][4] = {
alignas(16) float m_shift[5][4] = {
{ 4.0f , 4.0f , 4.0f , 4.0f},
{ 0.0f , 1.0f , 2.0f , 3.0f},
{ -1.0f , 0.0f , 1.0f , 2.0f},
{ -2.0f , -1.0f , 0.0f , 1.0f},
{ -3.0f , -2.0f , -1.0f , 0.0f},
};
alignas(16) float m_log2_coef_128b[4][4] = {};
alignas(16) float m_log2_coef[4][4] = {};

constexpr GSScanlineConstantData()
constexpr GSScanlineConstantData128B()
{
constexpr float log2_coef[] = {
0.204446009836232697516f,
-1.04913055217340124191f,
2.28330284476918490682f,
1.0f
};

using namespace GSScanlineConstantData;
for (size_t n = 0; n < std::size(log2_coef); ++n)
{
for (size_t i = 0; i < 4; ++i)
{
m_log2_coef_128b[n][i] = log2_coef[n];
m_log2_coef_256b[n][i] = log2_coef[n];
m_log2_coef_256b[n][i + 4] = log2_coef[n];
}
m_log2_coef[n][i] = log2_coef[n];
}
}
};

extern const GSScanlineConstantData g_const;
extern const GSScanlineConstantData256B g_const_256b;
extern const GSScanlineConstantData128B g_const_128b;
18 changes: 9 additions & 9 deletions pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ void GSSetupPrimCodeGenerator::Generate()
{

if (isXmm)
mov(rax, (size_t)g_const.m_shift_128b);
mov(rax, (size_t)g_const_128b.m_shift);
else
mov(rax, (size_t)g_const.m_shift_256b);
mov(rax, (size_t)g_const_256b.m_shift);

for (int i = 0; i < (m_sel.notest ? 2 : many_regs ? 9 : 5); i++)
{
Expand Down Expand Up @@ -253,7 +253,7 @@ void GSSetupPrimCodeGenerator::Depth_YMM()
if (i < 4 || many_regs)
vmulps(ymm0, Ymm(4 + i), ymm1);
else
vmulps(ymm0, ymm1, ptr[g_const.m_shift_256b[i + 1]]);
vmulps(ymm0, ymm1, ptr[g_const_256b.m_shift[i + 1]]);
cvttps2dq(ymm0, ymm0);
pshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
Expand Down Expand Up @@ -281,7 +281,7 @@ void GSSetupPrimCodeGenerator::Depth_YMM()
if (i < 4 || many_regs)
vmulps(ymm1, Ymm(4 + i), ymm0);
else
vmulps(ymm1, ymm0, ptr[g_const.m_shift_256b[i + 1]]);
vmulps(ymm1, ymm0, ptr[g_const_256b.m_shift[i + 1]]);
movaps(_rip_local_di(i, z), ymm1);
}
}
Expand Down Expand Up @@ -356,7 +356,7 @@ void GSSetupPrimCodeGenerator::Texture()
if (i < 4 || many_regs)
THREEARG(mulps, xym2, XYm(4 + i), xym1);
else
vmulps(ymm2, ymm1, ptr[g_const.m_shift_256b[i + 1]]);
vmulps(ymm2, ymm1, ptr[g_const_256b.m_shift[i + 1]]);

if (m_sel.fst)
{
Expand Down Expand Up @@ -424,7 +424,7 @@ void GSSetupPrimCodeGenerator::Color()
if (i < 4 || many_regs)
THREEARG(mulps, xym0, XYm(4 + i), xym2);
else
vmulps(ymm0, ymm2, ptr[g_const.m_shift_256b[i + 1]]);
vmulps(ymm0, ymm2, ptr[g_const_256b.m_shift[i + 1]]);
cvttps2dq(xym0, xym0);
packssdw(xym0, xym0);

Expand All @@ -433,7 +433,7 @@ void GSSetupPrimCodeGenerator::Color()
if (i < 4 || many_regs)
THREEARG(mulps, xym1, XYm(4 + i), xym3);
else
vmulps(ymm1, ymm3, ptr[g_const.m_shift_256b[i + 1]]);
vmulps(ymm1, ymm3, ptr[g_const_256b.m_shift[i + 1]]);
cvttps2dq(xym1, xym1);
packssdw(xym1, xym1);

Expand All @@ -460,7 +460,7 @@ void GSSetupPrimCodeGenerator::Color()
if (i < 4 || many_regs)
THREEARG(mulps, xym0, XYm(4 + i), xym2);
else
vmulps(ymm0, ymm2, ptr[g_const.m_shift_256b[i + 1]]);
vmulps(ymm0, ymm2, ptr[g_const_256b.m_shift[i + 1]]);
cvttps2dq(xym0, xym0);
packssdw(xym0, xym0);

Expand All @@ -469,7 +469,7 @@ void GSSetupPrimCodeGenerator::Color()
if (i < 4 || many_regs)
THREEARG(mulps, xym1, XYm(4 + i), xym3);
else
vmulps(ymm1, ymm3, ptr[g_const.m_shift_256b[i + 1]]);
vmulps(ymm1, ymm3, ptr[g_const_256b.m_shift[i + 1]]);
cvttps2dq(xym1, xym1);
packssdw(xym1, xym1);

Expand Down

0 comments on commit 0d434d6

Please sign in to comment.