Skip to content

Commit

Permalink
n64: improve and extend cache coherency checks
Browse files Browse the repository at this point in the history
To make sure all possible errors are intercepted, the checks are now
performed by the RDRAM module whenever an RDRAM read/write behind
a cacheline happens.

CPU writes to the cache now track dirtiness at the byte level rather
than at the whole-cacheline level, so that hardware accessing memory does
not trigger a false positive for false-shared variables. An initial
round of testing has shown that the check would trigger far too often
otherwise.

Error reporting is also much improved to provide more context to
analyze the issue, including tracking the PC at which the hardware DMA
was triggered.
  • Loading branch information
rasky committed Dec 2, 2023
1 parent 9a8e7f8 commit 6a3f9c0
Show file tree
Hide file tree
Showing 23 changed files with 180 additions and 86 deletions.
7 changes: 4 additions & 3 deletions ares/n64/ai/ai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ auto AI::sample(f64& left, f64& right) -> void {

if(io.dmaLength[0] && io.dmaEnable) {
io.dmaAddress[0].bit(13,23) += io.dmaAddressCarry;
auto data = rdram.ram.read<Word>(io.dmaAddress[0]);
auto data = rdram.ram.read<Word>(io.dmaAddress[0], "AI");
auto l = s16(data >> 16);
auto r = s16(data >> 0);
left = l / 32768.0;
Expand All @@ -50,8 +50,9 @@ auto AI::sample(f64& left, f64& right) -> void {
}
if(!io.dmaLength[0]) {
if(--io.dmaCount) {
io.dmaAddress[0] = io.dmaAddress[1];
io.dmaLength [0] = io.dmaLength [1];
io.dmaAddress[0] = io.dmaAddress[1];
io.dmaLength [0] = io.dmaLength [1];
io.dmaOriginPc[0] = io.dmaOriginPc[1];
mi.raise(MI::IRQ::AI);
}
}
Expand Down
1 change: 1 addition & 0 deletions ares/n64/ai/ai.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ struct AI : Thread, Memory::RCP<AI> {
n1 dmaAddressCarry;
n18 dmaLength[2];
n2 dmaCount;
u64 dmaOriginPc[2];
n14 dacRate;
n4 bitRate;
} io;
Expand Down
1 change: 1 addition & 0 deletions ares/n64/ai/io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ auto AI::writeWord(u32 address, u32 data_, Thread& thread) -> void {
if(io.dmaCount < 2) {
if(io.dmaCount == 0) mi.raise(MI::IRQ::AI);
io.dmaLength[io.dmaCount] = length;
io.dmaOriginPc[io.dmaCount] = cpu.ipu.pc;
io.dmaCount++;
}
}
Expand Down
3 changes: 2 additions & 1 deletion ares/n64/cartridge/flash.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ auto Cartridge::Flash::writeWord(u32 address, u64 data) -> void {
}
if(mode == Mode::Write) {
for(u32 index = 0; index < 128; index += 2) {
u16 half = rdram.ram.read<Half>(source + index);
// FIXME: this is obviously wrong, the flash can't access RDRAM
u16 half = rdram.ram.read<Half>(source + index, "Flash");
Memory::Writable::write<Half>(offset + index, half);
}
}
Expand Down
4 changes: 3 additions & 1 deletion ares/n64/cpu/cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,9 +184,11 @@ struct CPU : Thread {
template<u32 Size> auto write(u32 address, u64 data) -> void;

bool valid;
bool dirty;
u16 dirty;
u32 tag;
u16 index;
u64 fillpc;
u64 dirtypc;
union {
u8 bytes[16];
u16 halfs[8];
Expand Down
12 changes: 7 additions & 5 deletions ares/n64/cpu/dcache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ auto CPU::DataCache::Line::hit(u32 address) const -> bool {

auto CPU::DataCache::Line::fill(u32 address) -> void {
cpu.step(40 * 2);
valid = 1;
dirty = 0;
tag = address & ~0x0000'0fff;
valid = 1;
dirty = 0;
tag = address & ~0x0000'0fff;
fillpc = cpu.ipu.pc;
cpu.busReadBurst<DCache>(tag | index, words);
}

Expand Down Expand Up @@ -41,7 +42,8 @@ auto CPU::DataCache::Line::write(u32 address, u64 data) -> void {
words[address >> 2 & 2 | 0] = data >> 32;
words[address >> 2 & 2 | 1] = data >> 0;
}
dirty = 1;
dirty |= ((1 << Size) - 1) << (address & 0xF);
dirtypc = cpu.ipu.pc;
}

template<u32 Size>
Expand All @@ -60,7 +62,7 @@ auto CPU::DataCache::readDebug(u32 vaddr, u32 address) -> u8 {
auto& line = this->line(vaddr);
if(!line.hit(address)) {
Thread dummyThread{};
return bus.read<Byte>(address, dummyThread);
return bus.read<Byte>(address, dummyThread, "Ares Debugger");
}
return line.read<Byte>(address);
}
Expand Down
10 changes: 5 additions & 5 deletions ares/n64/cpu/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ auto CPU::devirtualizeDebug(u64 vaddr) -> u64 {

template<u32 Size>
inline auto CPU::busWrite(u32 address, u64 data) -> void {
bus.write<Size>(address, data, *this);
bus.write<Size>(address, data, *this, "CPU");
}

template<u32 Size>
Expand All @@ -156,7 +156,7 @@ inline auto CPU::busWriteBurst(u32 address, u32 *data) -> void {

template<u32 Size>
inline auto CPU::busRead(u32 address) -> u64 {
return bus.read<Size>(address, *this);
return bus.read<Size>(address, *this, "CPU");
}

template<u32 Size>
Expand Down Expand Up @@ -239,17 +239,17 @@ auto CPU::readDebug(u64 vaddr) -> u8 {
case Context::Segment::Mapped:
if(auto match = tlb.load(vaddr, true)) {
if(match.cache) return dcache.readDebug(vaddr, match.address & context.physMask);
return bus.read<Byte>(match.address & context.physMask, dummyThread);
return bus.read<Byte>(match.address & context.physMask, dummyThread, "Ares Debugger");
}
return 0;
case Context::Segment::Cached:
return dcache.readDebug(vaddr, vaddr & 0x1fff'ffff);
case Context::Segment::Cached32:
return dcache.readDebug(vaddr, vaddr & 0xffff'ffff);
case Context::Segment::Direct:
return bus.read<Byte>(vaddr & 0x1fff'ffff, dummyThread);
return bus.read<Byte>(vaddr & 0x1fff'ffff, dummyThread, "Ares Debugger");
case Context::Segment::Direct32:
return bus.read<Byte>(vaddr & 0xffff'ffff, dummyThread);
return bus.read<Byte>(vaddr & 0xffff'ffff, dummyThread, "Ares Debugger");
}

unreachable;
Expand Down
2 changes: 1 addition & 1 deletion ares/n64/cpu/recompiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ auto CPU::Recompiler::emit(u32 vaddr, u32 address, bool singleInstruction) -> Bl
Thread thread;
bool hasBranched = 0;
while(true) {
u32 instruction = bus.read<Word>(address, thread);
u32 instruction = bus.read<Word>(address, thread, "Ares Recompiler");
if(callInstructionPrologue) {
mov32(reg(1), imm(instruction));
call(&CPU::instructionPrologue);
Expand Down
46 changes: 21 additions & 25 deletions ares/n64/memory/bus.hpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
template<u32 Size>
inline auto Bus::read(u32 address, Thread& thread) -> u64 {
inline auto Bus::read(u32 address, Thread& thread, const char *peripheral) -> u64 {
static constexpr u64 unmapped = 0;
static_assert(Size == Byte || Size == Half || Size == Word || Size == Dual);

if(address <= 0x007f'ffff) return rdram.ram.read<Size>(address);
if(address <= 0x03ef'ffff) return unmapped;
if(address <= 0x03ef'ffff) return rdram.ram.read<Size>(address, peripheral);
if(address <= 0x03ff'ffff) return rdram.read<Size>(address, thread);
if(address <= 0x0407'ffff) return rsp.read<Size>(address, thread);
if(address <= 0x040f'ffff) return rsp.status.read<Size>(address, thread);
Expand All @@ -27,16 +26,18 @@ template<u32 Size>
inline auto Bus::readBurst(u32 address, u32 *data, Thread& thread) -> void {
static_assert(Size == DCache || Size == ICache);

if(address <= 0x03ef'ffff) return rdram.ram.readBurst<Size>(address, data, "CPU");
if(address <= 0x03ff'ffff) {
data[0] = read<Word>(address | 0x0, thread);
data[1] = read<Word>(address | 0x4, thread);
data[2] = read<Word>(address | 0x8, thread);
data[3] = read<Word>(address | 0xc, thread);
// FIXME: not hardware validated, no idea of the behavior
data[0] = rdram.readWord(address | 0x0, thread);
data[1] = 0;
data[2] = 0;
data[3] = 0;
if constexpr(Size == ICache) {
data[4] = read<Word>(address | 0x10, thread);
data[5] = read<Word>(address | 0x14, thread);
data[6] = read<Word>(address | 0x18, thread);
data[7] = read<Word>(address | 0x1c, thread);
data[4] = 0;
data[5] = 0;
data[6] = 0;
data[7] = 0;
}
return;
}
Expand All @@ -46,15 +47,14 @@ inline auto Bus::readBurst(u32 address, u32 *data, Thread& thread) -> void {
}

template<u32 Size>
inline auto Bus::write(u32 address, u64 data, Thread& thread) -> void {
inline auto Bus::write(u32 address, u64 data, Thread& thread, const char *peripheral) -> void {
static_assert(Size == Byte || Size == Half || Size == Word || Size == Dual);
if constexpr(Accuracy::CPU::Recompiler) {
cpu.recompiler.invalidate(address + 0); if constexpr(Size == Dual)
cpu.recompiler.invalidate(address + 4);
}

if(address <= 0x007f'ffff) return rdram.ram.write<Size>(address, data);
if(address <= 0x03ef'ffff) return;
if(address <= 0x03ef'ffff) return rdram.ram.write<Size>(address, data, peripheral);
if(address <= 0x03ff'ffff) return rdram.write<Size>(address, data, thread);
if(address <= 0x0407'ffff) return rsp.write<Size>(address, data, thread);
if(address <= 0x040f'ffff) return rsp.status.write<Size>(address, data, thread);
Expand All @@ -76,21 +76,17 @@ inline auto Bus::write(u32 address, u64 data, Thread& thread) -> void {
template<u32 Size>
inline auto Bus::writeBurst(u32 address, u32 *data, Thread& thread) -> void {
static_assert(Size == DCache || Size == ICache);
if constexpr(Accuracy::CPU::Recompiler) {
cpu.recompiler.invalidateRange(address, address + (Size == DCache ? 16 : 32));
}

if(address <= 0x03ef'ffff) return rdram.ram.writeBurst<Size>(address, data, "CPU");
if(address <= 0x03ff'ffff) {
write<Word>(address | 0x0, data[0], thread);
write<Word>(address | 0x4, data[1], thread);
write<Word>(address | 0x8, data[2], thread);
write<Word>(address | 0xc, data[3], thread);
if constexpr(Size == ICache) {
write<Word>(address | 0x10, data[4], thread);
write<Word>(address | 0x14, data[5], thread);
write<Word>(address | 0x18, data[6], thread);
write<Word>(address | 0x1c, data[7], thread);
}
// FIXME: not hardware validated, but a good guess
rdram.writeWord(address | 0x0, data[0], thread);
return;
}

debug(unusual, "[Bus::readBurst] CPU frozen because of cached write to non-RDRAM area: 0x", hex(address, 8L));
debug(unusual, "[Bus::writeBurst] CPU frozen because of cached write to non-RDRAM area: 0x", hex(address, 8L));
cpu.scc.sysadFrozen = true;
}
4 changes: 2 additions & 2 deletions ares/n64/memory/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ namespace Memory {

struct Bus {
//bus.hpp
template<u32 Size> auto read(u32 address, Thread& thread) -> u64;
template<u32 Size> auto write(u32 address, u64 data, Thread& thread) -> void;
template<u32 Size> auto read(u32 address, Thread& thread, const char *peripheral) -> u64;
template<u32 Size> auto write(u32 address, u64 data, Thread& thread, const char *peripheral) -> void;

template<u32 Size> auto readBurst(u32 address, u32* data, Thread& thread) -> void;
template<u32 Size> auto writeBurst(u32 address, u32* data, Thread& thread) -> void;
Expand Down
2 changes: 1 addition & 1 deletion ares/n64/n64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ namespace ares::Nintendo64 {
#include <n64/pif/pif.hpp>
#include <n64/ri/ri.hpp>
#include <n64/si/si.hpp>
#include <n64/rdram/rdram.hpp>
#include <n64/cpu/cpu.hpp>
#include <n64/rdram/rdram.hpp>
#include <n64/rsp/rsp.hpp>
#include <n64/rdp/rdp.hpp>
#include <n64/memory/bus.hpp>
Expand Down
22 changes: 3 additions & 19 deletions ares/n64/pi/dma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,7 @@ auto PI::dmaRead() -> void {

u32 lastCacheline = 0xffff'ffff;
for(u32 address = 0; address < io.readLength; address += 2) {
if (system.homebrewMode && ((io.dramAddress + address) & ~15) != lastCacheline) {
lastCacheline = address & ~15;
auto& line = cpu.dcache.line(io.dramAddress + address);
if (line.hit(io.dramAddress) && line.dirty) {
debug(unusual, "PI DMA reading from cached memory ", hex((io.dramAddress + address) | 0x80000000), " (missing cache writeback?)");
}
}
u16 data = rdram.ram.read<Half>(io.dramAddress + address);
u16 data = rdram.ram.read<Half>(io.dramAddress + address, "PI DMA");
busWrite<Half>(io.pbusAddress + address, data);
}
}
Expand Down Expand Up @@ -49,17 +42,8 @@ auto PI::dmaWrite() -> void {
cpu.recompiler.invalidateRange(io.dramAddress, cur_len);
}

u32 lastCacheline = 0xffff'ffff;
for (u32 i = 0; i < cur_len; i++) {
if (system.homebrewMode && (io.dramAddress & ~15) != lastCacheline) {
lastCacheline = io.dramAddress & ~15;
auto& line = cpu.dcache.line(io.dramAddress);
if (line.hit(io.dramAddress)) {
debug(unusual, "PI DMA writing to cached memory ", hex(io.dramAddress | 0x80000000), " (missing cache invalidation?)");
}
}
rdram.ram.write<Byte>(io.dramAddress++, mem[i]);
}
for (u32 i = 0; i < cur_len; i++)
rdram.ram.write<Byte>(io.dramAddress++, mem[i], "PI DMA");
io.dramAddress = (io.dramAddress + 7) & ~7;

first_block = false;
Expand Down
2 changes: 2 additions & 0 deletions ares/n64/pi/io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ auto PI::ioWrite(u32 address, u32 data_) -> void {
//PI_READ_LENGTH
io.readLength = n24(data);
io.dmaBusy = 1;
io.originPc = cpu.ipu.pc;
queue.insert(Queue::PI_DMA_Read, dmaDuration(true));
dmaRead();
}
Expand All @@ -106,6 +107,7 @@ auto PI::ioWrite(u32 address, u32 data_) -> void {
//PI_WRITE_LENGTH
io.writeLength = n24(data);
io.dmaBusy = 1;
io.originPc = cpu.ipu.pc;
queue.insert(Queue::PI_DMA_Write, dmaDuration(false));
dmaWrite();
}
Expand Down
1 change: 1 addition & 0 deletions ares/n64/pi/pi.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ struct PI : Memory::RCP<PI> {
n32 readLength;
n32 writeLength;
n32 busLatch;
u64 originPc;
} io;

struct BSD {
Expand Down
4 changes: 2 additions & 2 deletions ares/n64/pif/io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ auto PIF::dmaRead(u32 address, u32 ramAddress) -> void {
intA(Read, Size64);
for(u32 offset = 0; offset < 64; offset += 4) {
u32 data = readInt(address + offset);
rdram.ram.write<Word>(ramAddress + offset, data);
rdram.ram.write<Word>(ramAddress + offset, data, "SI DMA");
}
}

auto PIF::dmaWrite(u32 address, u32 ramAddress) -> void {
for(u32 offset = 0; offset < 64; offset += 4) {
u32 data = rdram.ram.read<Word>(ramAddress + offset);
u32 data = rdram.ram.read<Word>(ramAddress + offset, "SI DMA");
writeInt(address + offset, data);
}
intA(Write, Size64);
Expand Down
54 changes: 51 additions & 3 deletions ares/n64/rdram/debugger.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ auto RDRAM::Debugger::load(Node::Object parent) -> void {
}

memory.ram->setRead([&](u32 address) -> u8 {
return rdram.ram.read<Byte>(address);
return rdram.ram.read<Byte>(address, "Ares Debugger");
});
memory.ram->setWrite([&](u32 address, u8 data) -> void {
return rdram.ram.write<Byte>(address, data);
return rdram.ram.write<Byte>(address, data, "Ares Debugger");
});

memory.dcache = parent->append<Node::Debugger::Memory>("DCache");
Expand All @@ -25,7 +25,7 @@ auto RDRAM::Debugger::load(Node::Object parent) -> void {
if(line.hit(address)) {
line.write<Byte>(address, data);
} else {
rdram.ram.write<Byte>(address, data);
rdram.ram.write<Byte>(address, data, "Ares Debugger");
}
});

Expand Down Expand Up @@ -59,3 +59,51 @@ auto RDRAM::Debugger::io(bool mode, u32 chipID, u32 address, u32 data) -> void {
tracer.io->notify(message);
}
}

//Builds the peripheral-specific trailer appended to a cache coherency report.
//
//peripheral: the label the accessing unit passed along with its RDRAM access
//            (e.g. "CPU", "RSP DMA", "PI DMA", "AI").
//returns:    one tab-indented, newline-terminated line naming the program
//            counter recorded for that peripheral, or an empty string when no
//            origin is tracked for the given label.
auto RDRAM::Debugger::cacheErrorContext(string peripheral) -> string {
  if(peripheral == "CPU") {
    return { "\tCurrent CPU PC: ", hex(cpu.ipu.pc, 16L), "\n" };
  }
  if(peripheral == "RSP DMA") {
    //originCpu selects whether the recorded PC is printed as a CPU PC or an RSP PC
    if(rsp.dma.current.originCpu) {
      return { "\tRSP DMA started at CPU PC: ", hex(rsp.dma.current.originPc, 16L), "\n" };
    } else {
      return { "\tRSP DMA started at RSP PC: ", hex(rsp.dma.current.originPc, 3L), "\n" };
    }
  }
  if(peripheral == "PI DMA") {
    return { "\tPI DMA started at CPU PC: ", hex(pi.io.originPc, 16L), "\n" };
  }
  //AI::sample tags its RDRAM fetches as "AI", so matching only "AI DMA" would
  //never report the recorded origin PC; the longer label is still accepted for
  //any call site that uses it.
  if(peripheral == "AI" || peripheral == "AI DMA") {
    return { "\tAI DMA started at CPU PC: ", hex(ai.io.dmaOriginPc[0], 16L), "\n" };
  }
  return "";
}

//Cache coherency check for an RDRAM read issued by `peripheral`.
//
//address:    RDRAM address being read.
//size:       width of the access; used as the number of per-byte dirty bits to
//            test, starting at the address' offset inside its 16-byte line.
//peripheral: label of the accessing unit, printed at the start of the report.
//
//Does nothing unless homebrew mode is enabled. Emits an "unusual" debug message
//when the data cache holds the line and at least one of the accessed bytes is
//marked dirty there.
//
//NOTE(review): the check only runs when the access falls in a different
//16-byte line than the previously checked read, so later accesses to other
//(possibly dirty) bytes of the same line are not examined -- confirm this is
//the intended trade-off.
auto RDRAM::Debugger::readWord(u32 address, int size, const char *peripheral) -> void {
  if(!system.homebrewMode) return;

  const auto cacheline = address & ~15;
  if(cacheline == lastReadCacheline) return;
  lastReadCacheline = cacheline;

  auto& line = cpu.dcache.line(address);
  const auto byteBits = (1 << size) - 1;          //one bit per accessed byte
  const u16 touched = byteBits << (address & 0xF); //aligned to the offset within the line
  if(!line.hit(address)) return;
  if(!(line.dirty & touched)) return;

  string report = { peripheral, " reading from RDRAM address ", hex(address), " which is modified in the cache (missing cache writeback?)\n"};
  const string filledAt = { "\tCacheline was loaded at CPU PC: ", hex(line.fillpc, 16L), "\n" };
  const string dirtiedAt = { "\tCacheline was last written at CPU PC: ", hex(line.dirtypc, 16L), "\n" };
  report.append(filledAt);
  report.append(dirtiedAt);
  report.append(cacheErrorContext(peripheral));
  debug(unusual, report);
}

//Cache coherency check for an RDRAM write issued by `peripheral`.
//
//address:    RDRAM address being written.
//size:       width of the access (not consulted by this check).
//value:      data being written (not consulted by this check).
//peripheral: label of the accessing unit, printed at the start of the report.
//
//Does nothing unless homebrew mode is enabled. Emits an "unusual" debug message
//when the data cache currently holds the line containing `address`, whether or
//not that line is dirty. The check is skipped while consecutive writes stay
//inside the same 16-byte line as the previously checked write.
auto RDRAM::Debugger::writeWord(u32 address, int size, u64 value, const char *peripheral) -> void {
  if(!system.homebrewMode) return;

  const auto cacheline = address & ~15;
  if(cacheline == lastWrittenCacheline) return;
  lastWrittenCacheline = cacheline;

  auto& line = cpu.dcache.line(address);
  if(!line.hit(address)) return;

  string report = { peripheral, " writing to RDRAM address ", hex(address), " which is cached (missing cache invalidation?)\n"};
  const string filledAt = { "\tCacheline was loaded at CPU PC: ", hex(line.fillpc, 16L), "\n" };
  report.append(filledAt);
  report.append(cacheErrorContext(peripheral));
  debug(unusual, report);
}
Loading

0 comments on commit 6a3f9c0

Please sign in to comment.