Skip to content

Commit

Permalink
R5900: Improve the EE cache performance with SIMD
Browse files Browse the repository at this point in the history
  • Loading branch information
F0bes committed Dec 26, 2024
1 parent 6a0f811 commit cc5e30a
Show file tree
Hide file tree
Showing 4 changed files with 224 additions and 93 deletions.
147 changes: 91 additions & 56 deletions pcsx2/COP0.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,116 +230,145 @@ void MapTLB(const tlbs& t, int i)
u32 saddr, eaddr;

COP0_LOG("MAP TLB %d: 0x%08X-> [0x%08X 0x%08X] S=%d G=%d ASID=%d Mask=0x%03X EntryLo0 PFN=%x EntryLo0 Cache=%x EntryLo1 PFN=%x EntryLo1 Cache=%x VPN2=%x",
i, t.VPN2, t.PFN0, t.PFN1, t.S >> 31, t.G, t.ASID,
t.Mask, t.EntryLo0 >> 6, (t.EntryLo0 & 0x38) >> 3, t.EntryLo1 >> 6, (t.EntryLo1 & 0x38) >> 3, t.VPN2);
i, t.VPN2(), t.PFN0(), t.PFN1(), t.isSPR() >> 31, t.isGlobal(), t.EntryHi.ASID,
t.Mask(), t.EntryLo0.PFN, t.EntryLo0.C, t.EntryLo1.PFN, t.EntryLo1.C, t.VPN2());

// According to the manual
// 'It [SPR] must be mapped into a contiguous 16 KB of virtual address space that is
// aligned on a 16KB boundary. Results are not guaranteed if this restriction is not followed.'
// Assume that the game isn't doing anything less-than-ideal with the scratchpad mapping and map it directly to eeMem->Scratch.
if (t.S)
if (t.isSPR())
{
if (t.VPN2 != 0x70000000)
Console.Warning("COP0: Mapping Scratchpad to non-default address 0x%08X", t.VPN2);
if (t.VPN2() != 0x70000000)
Console.Warning("COP0: Mapping Scratchpad to non-default address 0x%08X", t.VPN2());

vtlb_VMapBuffer(t.VPN2, eeMem->Scratch, Ps2MemSize::Scratch);
vtlb_VMapBuffer(t.VPN2(), eeMem->Scratch, Ps2MemSize::Scratch);
}
else
{
if (t.EntryLo0 & 0x2)
if (t.EntryLo0.V)
{
mask = ((~t.Mask) << 1) & 0xfffff;
saddr = t.VPN2 >> 12;
eaddr = saddr + t.Mask + 1;
mask = ((~t.Mask()) << 1) & 0xfffff;
saddr = t.VPN2() >> 12;
eaddr = saddr + t.Mask() + 1;

for (addr = saddr; addr < eaddr; addr++)
{
if ((addr & mask) == ((t.VPN2 >> 12) & mask))
if ((addr & mask) == ((t.VPN2() >> 12) & mask))
{ //match
memSetPageAddr(addr << 12, t.PFN0 + ((addr - saddr) << 12));
memSetPageAddr(addr << 12, t.PFN0() + ((addr - saddr) << 12));
Cpu->Clear(addr << 12, 0x400);
}
}
}

if (t.EntryLo1 & 0x2)
if (t.EntryLo1.V)
{
mask = ((~t.Mask) << 1) & 0xfffff;
saddr = (t.VPN2 >> 12) + t.Mask + 1;
eaddr = saddr + t.Mask + 1;
mask = ((~t.Mask()) << 1) & 0xfffff;
saddr = (t.VPN2() >> 12) + t.Mask() + 1;
eaddr = saddr + t.Mask() + 1;

for (addr = saddr; addr < eaddr; addr++)
{
if ((addr & mask) == ((t.VPN2 >> 12) & mask))
if ((addr & mask) == ((t.VPN2() >> 12) & mask))
{ //match
memSetPageAddr(addr << 12, t.PFN1 + ((addr - saddr) << 12));
memSetPageAddr(addr << 12, t.PFN1() + ((addr - saddr) << 12));
Cpu->Clear(addr << 12, 0x400);
}
}
}
}
}

// Converts a raw COP0 PageMask register value into a byte-offset mask.
//
// The set bits above bit 12 are counted; each one widens the mask by one bit
// on top of a 12-bit base, so the result is (2^(12 + bits)) - 1
// (0xFFF when no mask bits are set).
//
// An odd bit count, or more than 12 set bits, trips the assertion below.
__inline u32 ConvertPageMask(const u32 PageMask)
{
	const u32 mask = std::popcount(PageMask >> 13);

	pxAssertMsg(!((mask & 1) || mask > 12), "Invalid page mask for this TLB entry. EE cache doesn't know what to do here.");

	// Shift an unsigned 1: PageMask >> 13 has at most 19 set bits, so the shift
	// count tops out at 31. With a signed literal, (1 << 31) - 1 would overflow
	// whenever the assertion above is compiled out; unsigned arithmetic keeps the
	// result well-defined for every input and identical for all valid ones.
	return (u32{1} << (12 + mask)) - 1;
}

// Removes the address mappings described by TLB entry `t` and drops its record
// from the cachedTlbs list.
//
// NOTE(review): this text is a rendered diff without +/- markers, so every changed
// statement appears twice in a row: first reading plain fields (t.S, t.VPN2, t.Mask,
// `t.EntryLo0 & 0x2`), then using accessors/bit-fields (t.isSPR(), t.VPN2(), t.Mask(),
// t.EntryLo0.V). Presumably the accessor form is the post-commit code — confirm
// against the repository before editing.
void UnmapTLB(const tlbs& t, int i)
{
//Console.WriteLn("Clear TLB %d: %08x-> [%08x %08x] S=%d G=%d ASID=%d Mask= %03X", i,t.VPN2,t.PFN0,t.PFN1,t.S,t.G,t.ASID,t.Mask);
u32 mask, addr;
u32 saddr, eaddr;

// Scratchpad entry: unmap 0x4000 bytes at the entry's virtual address and return.
// The early return means the cachedTlbs cleanup at the bottom is not reached for these entries.
if (t.S)
if (t.isSPR())
{
vtlb_VMapUnmap(t.VPN2, 0x4000);
vtlb_VMapUnmap(t.VPN2(), 0x4000);
return;
}

// First half of the pair (EntryLo0), only if its V bit is set.
if (t.EntryLo0 & 0x2)
if (t.EntryLo0.V)
{
// Walk the 4 KB page indices [saddr, eaddr), i.e. Mask + 1 pages starting at VPN2 >> 12.
mask = ((~t.Mask) << 1) & 0xfffff;
saddr = t.VPN2 >> 12;
eaddr = saddr + t.Mask + 1;
mask = ((~t.Mask()) << 1) & 0xfffff;
saddr = t.VPN2() >> 12;
eaddr = saddr + t.Mask() + 1;
// Console.WriteLn("Clear TLB: %08x ~ %08x",saddr,eaddr-1);
for (addr = saddr; addr < eaddr; addr++)
{
if ((addr & mask) == ((t.VPN2 >> 12) & mask))
if ((addr & mask) == ((t.VPN2() >> 12) & mask))
{ //match
// Clear the page mapping, then call Cpu->Clear on the same address.
memClearPageAddr(addr << 12);
Cpu->Clear(addr << 12, 0x400);
}
}
}

// Second half of the pair (EntryLo1): same walk, starting Mask + 1 pages after the first half.
if (t.EntryLo1 & 0x2)
if (t.EntryLo1.V)
{
mask = ((~t.Mask) << 1) & 0xfffff;
saddr = (t.VPN2 >> 12) + t.Mask + 1;
eaddr = saddr + t.Mask + 1;
mask = ((~t.Mask()) << 1) & 0xfffff;
saddr = (t.VPN2() >> 12) + t.Mask() + 1;
eaddr = saddr + t.Mask() + 1;
// Console.WriteLn("Clear TLB: %08x ~ %08x",saddr,eaddr-1);
for (addr = saddr; addr < eaddr; addr++)
{
if ((addr & mask) == ((t.VPN2 >> 12) & mask))
if ((addr & mask) == ((t.VPN2() >> 12) & mask))
{ //match
memClearPageAddr(addr << 12);
Cpu->Clear(addr << 12, 0x400);
}
}
}

// Remove the first cachedTlbs record whose PFN0, PFN1 and converted page mask equal
// this entry's: later records are shifted down one slot and count is decremented.
// Note that this loop's `i` shadows the `int i` parameter.
// NOTE(review): the match does not compare the virtual address or the cache flags, so
// two entries sharing PFNs and page mask are indistinguishable here, and an entry that
// was never added can still remove another entry's record — confirm this is acceptable.
for (size_t i = 0; i < cachedTlbs.count; i++)
{
if (cachedTlbs.PFN0s[i] == t.PFN0() && cachedTlbs.PFN1s[i] == t.PFN1() && cachedTlbs.PageMasks[i] == ConvertPageMask(t.PageMask.UL))
{
for (size_t j = i; j < cachedTlbs.count - 1; j++)
{
cachedTlbs.CacheEnabled0[j] = cachedTlbs.CacheEnabled0[j + 1];
cachedTlbs.CacheEnabled1[j] = cachedTlbs.CacheEnabled1[j + 1];
cachedTlbs.PFN0s[j] = cachedTlbs.PFN0s[j + 1];
cachedTlbs.PFN1s[j] = cachedTlbs.PFN1s[j + 1];
cachedTlbs.PageMasks[j] = cachedTlbs.PageMasks[j + 1];
}
cachedTlbs.count--;
break;
}
}
}

// Loads TLB slot `i` from the current COP0 PageMask/EntryHi/EntryLo0/EntryLo1 registers,
// records the entry in cachedTlbs when it qualifies, then maps it with MapTLB.
//
// NOTE(review): this text is a rendered diff without +/- markers. The first group of
// assignments (plain u32 fields plus the derived Mask/nMask/VPN2/ASID/G/PFN0/PFN1/S
// fields) and the second group (the `.UL` assignments) are two versions of the same step;
// presumably the `.UL` group is the post-commit code — confirm against the repository.
void WriteTLB(int i)
{
tlb[i].PageMask = cpuRegs.CP0.n.PageMask;
tlb[i].EntryHi = cpuRegs.CP0.n.EntryHi;
tlb[i].EntryLo0 = cpuRegs.CP0.n.EntryLo0;
tlb[i].EntryLo1 = cpuRegs.CP0.n.EntryLo1;

tlb[i].Mask = (cpuRegs.CP0.n.PageMask >> 13) & 0xfff;
tlb[i].nMask = (~tlb[i].Mask) & 0xfff;
tlb[i].VPN2 = ((cpuRegs.CP0.n.EntryHi >> 13) & (~tlb[i].Mask)) << 13;
tlb[i].ASID = cpuRegs.CP0.n.EntryHi & 0xfff;
tlb[i].G = cpuRegs.CP0.n.EntryLo0 & cpuRegs.CP0.n.EntryLo1 & 0x1;
tlb[i].PFN0 = (((cpuRegs.CP0.n.EntryLo0 >> 6) & 0xFFFFF) & (~tlb[i].Mask)) << 12;
tlb[i].PFN1 = (((cpuRegs.CP0.n.EntryLo1 >> 6) & 0xFFFFF) & (~tlb[i].Mask)) << 12;
tlb[i].S = cpuRegs.CP0.n.EntryLo0 & 0x80000000;
// Store the raw register words; the derived values are computed by the tlbs accessors.
tlb[i].PageMask.UL = cpuRegs.CP0.n.PageMask;
tlb[i].EntryHi.UL = cpuRegs.CP0.n.EntryHi;
tlb[i].EntryLo0.UL = cpuRegs.CP0.n.EntryLo0;
tlb[i].EntryLo1.UL = cpuRegs.CP0.n.EntryLo1;

// Record the entry in cachedTlbs when it is not a scratchpad entry and at least one
// half is both valid (V) and cached.
if (!tlb[i].isSPR() && ((tlb[i].EntryLo0.V && tlb[i].EntryLo0.isCached()) || (tlb[i].EntryLo1.V && tlb[i].EntryLo1.isCached())))
{
// Append at the current end of the list.
// NOTE(review): there is no check that count is below the 48-element capacity of the
// cachedTlbs arrays; this appears to rely on callers running UnmapTLB on the slot first — confirm.
const size_t idx = cachedTlbs.count;
// Per-half flags are stored as all-ones or zero; they reflect isCached() only, not the V bit.
cachedTlbs.CacheEnabled0[idx] = tlb[i].EntryLo0.isCached() ? ~0 : 0;
cachedTlbs.CacheEnabled1[idx] = tlb[i].EntryLo1.isCached() ? ~0 : 0;
cachedTlbs.PFN1s[idx] = tlb[i].PFN1();
cachedTlbs.PFN0s[idx] = tlb[i].PFN0();
cachedTlbs.PageMasks[idx] = ConvertPageMask(tlb[i].PageMask.UL);

cachedTlbs.count++;
}

MapTLB(tlb[i], i);
}
Expand All @@ -357,10 +386,16 @@ namespace COP0 {

int i = cpuRegs.CP0.n.Index & 0x3f;

cpuRegs.CP0.n.PageMask = tlb[i].PageMask;
cpuRegs.CP0.n.EntryHi = tlb[i].EntryHi & ~(tlb[i].PageMask | 0x1f00);
cpuRegs.CP0.n.EntryLo0 = (tlb[i].EntryLo0 & ~1) | ((tlb[i].EntryHi >> 12) & 1);
cpuRegs.CP0.n.EntryLo1 = (tlb[i].EntryLo1 & ~1) | ((tlb[i].EntryHi >> 12) & 1);
cpuRegs.CP0.n.PageMask = tlb[i].PageMask.UL;
cpuRegs.CP0.n.EntryHi = tlb[i].EntryHi.UL & ~(tlb[i].PageMask.UL | 0x1f00);
/*
* TEST THIS??
cpuRegs.CP0.n.EntryLo0 = (tlb[i].EntryLo0 & ~1) | ((tlb[i].EntryHi.UL >> 12) & 1);
cpuRegs.CP0.n.EntryLo1 = (tlb[i].EntryLo1 & ~1) | ((tlb[i].EntryHi.UL >> 12) & 1);
*/
cpuRegs.CP0.n.EntryLo0 = tlb[i].EntryLo0.UL;
cpuRegs.CP0.n.EntryLo1 = tlb[i].EntryLo1.UL;

}

void TLBWI()
Expand All @@ -374,10 +409,10 @@ namespace COP0 {
cpuRegs.CP0.n.EntryLo0, cpuRegs.CP0.n.EntryLo1);

UnmapTLB(tlb[j], j);
tlb[j].PageMask = cpuRegs.CP0.n.PageMask;
tlb[j].EntryHi = cpuRegs.CP0.n.EntryHi;
tlb[j].EntryLo0 = cpuRegs.CP0.n.EntryLo0;
tlb[j].EntryLo1 = cpuRegs.CP0.n.EntryLo1;
tlb[j].PageMask.UL = cpuRegs.CP0.n.PageMask;
tlb[j].EntryHi.UL = cpuRegs.CP0.n.EntryHi;
tlb[j].EntryLo0.UL = cpuRegs.CP0.n.EntryLo0;
tlb[j].EntryLo1.UL = cpuRegs.CP0.n.EntryLo1;
WriteTLB(j);
}

Expand All @@ -394,10 +429,10 @@ namespace COP0 {
//if (j > 48) return;

UnmapTLB(tlb[j], j);
tlb[j].PageMask = cpuRegs.CP0.n.PageMask;
tlb[j].EntryHi = cpuRegs.CP0.n.EntryHi;
tlb[j].EntryLo0 = cpuRegs.CP0.n.EntryLo0;
tlb[j].EntryLo1 = cpuRegs.CP0.n.EntryLo1;
tlb[j].PageMask.UL = cpuRegs.CP0.n.PageMask;
tlb[j].EntryHi.UL = cpuRegs.CP0.n.EntryHi;
tlb[j].EntryLo0.UL = cpuRegs.CP0.n.EntryLo0;
tlb[j].EntryLo1.UL = cpuRegs.CP0.n.EntryLo1;
WriteTLB(j);
}

Expand All @@ -422,7 +457,7 @@ namespace COP0 {
cpuRegs.CP0.n.Index = 0xFFFFFFFF;
for (i = 0; i < 48; i++)
{
if (tlb[i].VPN2 == ((~tlb[i].Mask) & (EntryHi32.s.VPN2)) && ((tlb[i].G & 1) || ((tlb[i].ASID & 0xff) == EntryHi32.s.ASID)))
if (tlb[i].VPN2() == ((~tlb[i].Mask()) & (EntryHi32.s.VPN2)) && ((tlb[i].isGlobal()) || ((tlb[i].EntryHi.ASID & 0xff) == EntryHi32.s.ASID)))
{
cpuRegs.CP0.n.Index = i;
break;
Expand Down
3 changes: 3 additions & 0 deletions pcsx2/R5900.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ u32 EEoCycle;

alignas(16) cpuRegistersPack _cpuRegistersPack;
alignas(16) tlbs tlb[48];
cachedTlbs_t cachedTlbs;

R5900cpu *Cpu = NULL;

static constexpr uint eeWaitCycles = 3072;
Expand All @@ -59,6 +61,7 @@ void cpuReset()
std::memset(&cpuRegs, 0, sizeof(cpuRegs));
std::memset(&fpuRegs, 0, sizeof(fpuRegs));
std::memset(&tlb, 0, sizeof(tlb));
cachedTlbs.count = 0;

cpuRegs.pc = 0xbfc00000; //set pc reg to stack
cpuRegs.CP0.n.Config = 0x440;
Expand Down
85 changes: 76 additions & 9 deletions pcsx2/R5900.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

#include "common/Pcsx2Defs.h"

#include <array>

// --------------------------------------------------------------------------------------
// EE Bios function name tables.
// --------------------------------------------------------------------------------------
Expand Down Expand Up @@ -160,17 +162,69 @@ struct fpuRegisters {
u32 ACCflag; // an internal accumulator overflow flag
};

// Bit-field view of the COP0 PageMask register, overlaid on the raw 32-bit word UL.
// The field positions below assume the compiler allocates bit-fields starting from the
// least significant bit (bit-field layout is implementation-defined).
union PageMask_t
{
struct
{
// 13 unnamed low bits.
u32 : 13;
// 12-bit mask field; corresponds to (UL >> 13) & 0xfff under the layout assumption above.
u32 Mask : 12;
// 7 unnamed high bits.
u32 : 7;
};
// Raw register value.
u32 UL;

// Bitwise complement of Mask, limited to 12 bits.
constexpr u32 nMask() const { return ~Mask & 0xfff; };
};

// Bit-field view of the COP0 EntryHi register, overlaid on the raw 32-bit word UL.
// The field positions below assume the compiler allocates bit-fields starting from the
// least significant bit (bit-field layout is implementation-defined).
union EntryHi_t
{
struct
{
// Low 8 bits: ASID.
u32 ASID:8;
// 5 unnamed bits.
u32 : 5;
// Upper 19 bits: VPN2; corresponds to UL >> 13 under the layout assumption above.
u32 VPN2:19;
};
// Raw register value.
u32 UL;
};

// Bit-field view of a COP0 EntryLo register (used for both EntryLo0 and EntryLo1),
// overlaid on the raw 32-bit word UL.
// The field positions below assume the compiler allocates bit-fields starting from the
// least significant bit (bit-field layout is implementation-defined).
union EntryLo_t
{
struct
{
// Global bit; tlbs::isGlobal() requires it to be set in both EntryLo0 and EntryLo1.
u32 G:1;
// Valid bit; MapTLB/UnmapTLB only process a half when it is set.
u32 V:1;
// Presumably the dirty bit — not read anywhere in the visible code; confirm.
u32 D:1;
// 3-bit cache mode; the value 0x3 is what isCached() treats as cached.
u32 C:3;
// 20-bit page frame number.
u32 PFN:20;
// 5 unnamed bits.
u32 : 5;
u32 S : 1; // Only used in EntryLo0
};
// Raw register value.
u32 UL;

// True when the cache mode field C equals 0x3.
constexpr bool isCached() const { return C == 0x3; }
};

// One TLB entry.
//
// NOTE(review): this text is a rendered diff without +/- markers. The plain u32 members
// directly below (PageMask/EntryHi through S) and the typed members plus accessors after
// them are two versions of the same struct; presumably the typed members and accessors
// are the post-commit code — confirm against the repository.
struct tlbs
{
u32 PageMask,EntryHi;
u32 EntryLo0,EntryLo1;
u32 Mask, nMask;
u32 G;
u32 ASID;
u32 VPN2;
u32 PFN0;
u32 PFN1;
u32 S;
// Raw register contents, each stored as a union of bit-fields and a 32-bit word.
PageMask_t PageMask;
EntryHi_t EntryHi;
EntryLo_t EntryLo0;
EntryLo_t EntryLo1;

// (((cpuRegs.CP0.n.EntryLo0 >> 6) & 0xFFFFF) & (~tlb[i].Mask())) << 12;
// Frame number of each half with the Mask bits cleared, shifted left by 12.
constexpr u32 PFN0() const { return (EntryLo0.PFN & ~Mask()) << 12; }
constexpr u32 PFN1() const { return (EntryLo1.PFN & ~Mask()) << 12; }
// EntryHi.VPN2 with the Mask bits cleared, shifted left by 13.
constexpr u32 VPN2() const {return ((EntryHi.VPN2) & (~Mask())) << 13; }
// The 12-bit Mask field of PageMask.
constexpr u32 Mask() const { return PageMask.Mask; }
// True only when the G bit is set in both EntryLo0 and EntryLo1.
constexpr bool isGlobal() const { return EntryLo0.G && EntryLo1.G; }
// True when the S bit of EntryLo0 is set (scratchpad entry).
constexpr bool isSPR() const { return EntryLo0.S; }

// Two entries are equal when all four raw register words match.
constexpr bool operator==(const tlbs& other) const
{
return PageMask.UL == other.PageMask.UL &&
EntryHi.UL == other.EntryHi.UL &&
EntryLo0.UL == other.EntryLo0.UL &&
EntryLo1.UL == other.EntryLo1.UL;
}
};

#ifndef _PC_
Expand Down Expand Up @@ -211,6 +265,19 @@ struct cpuRegistersPack
alignas(16) extern cpuRegistersPack _cpuRegistersPack;
alignas(16) extern tlbs tlb[48];

// List of TLB entries that have at least one valid, cached half, stored as parallel
// arrays (one array per field, same index = same entry). WriteTLB appends a record,
// UnmapTLB removes one, and cpuReset sets count back to 0.
// Each array has 48 elements and is 16-byte aligned; presumably 48 matches the size of
// tlb[] and the alignment is there for SIMD loads — confirm against the code that reads it.
struct cachedTlbs_t
{
// Number of populated records; valid indices are [0, count).
u32 count;

// Offset mask for each entry, as returned by ConvertPageMask.
alignas(16) std::array<u32, 48> PageMasks;
// tlbs::PFN1() of each entry.
alignas(16) std::array<u32, 48> PFN1s;
// All-ones when EntryLo1 is cached, otherwise 0.
alignas(16) std::array<u32, 48> CacheEnabled1;
// tlbs::PFN0() of each entry.
alignas(16) std::array<u32, 48> PFN0s;
// All-ones when EntryLo0 is cached, otherwise 0.
alignas(16) std::array<u32, 48> CacheEnabled0;
};

extern cachedTlbs_t cachedTlbs;

static cpuRegisters& cpuRegs = _cpuRegistersPack.cpuRegs;
static fpuRegisters& fpuRegs = _cpuRegistersPack.fpuRegs;

Expand Down
Loading

0 comments on commit cc5e30a

Please sign in to comment.