From 411449cd51773bc7efd3f561541dd4ffaee8b359 Mon Sep 17 00:00:00 2001 From: squidbus <175574877+squidbus@users.noreply.github.com> Date: Mon, 9 Sep 2024 03:23:16 -0700 Subject: [PATCH] Initial support for compiling on ARM64. (#788) --- CMakeLists.txt | 31 ++++++++++++-- externals/CMakeLists.txt | 1 - src/common/arch.h | 10 +++++ src/common/assert.cpp | 7 ++++ src/common/rdtsc.h | 17 ++++++++ src/core/address_space.cpp | 39 +++++++++++++----- src/core/address_space.h | 3 +- .../libraries/kernel/thread_management.cpp | 3 ++ src/core/linker.cpp | 7 ++++ src/core/module.cpp | 3 ++ src/core/tls.cpp | 41 +++++++++++++++++-- src/video_core/page_manager.cpp | 29 ++++++++++--- 12 files changed, 166 insertions(+), 25 deletions(-) create mode 100644 src/common/arch.h diff --git a/CMakeLists.txt b/CMakeLists.txt index d1413b15697..a24d6df7dfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,22 @@ endif() option(ENABLE_QT_GUI "Enable the Qt GUI. If not selected then the emulator uses a minimal SDL-based UI instead" OFF) +# First, determine whether to use CMAKE_OSX_ARCHITECTURES or CMAKE_SYSTEM_PROCESSOR. +if (APPLE AND CMAKE_OSX_ARCHITECTURES) + set(BASE_ARCHITECTURE "${CMAKE_OSX_ARCHITECTURES}") +else() + set(BASE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") +endif() + +# Next, match common architecture strings down to a known common value. +if (BASE_ARCHITECTURE MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + set(ARCHITECTURE "x86_64") +elseif (BASE_ARCHITECTURE MATCHES "(aarch64)|(AARCH64)|(arm64)|(ARM64)") + set(ARCHITECTURE "arm64") +else() + message(FATAL_ERROR "Unsupported CPU architecture: ${BASE_ARCHITECTURE}") +endif() + # This function should be passed a list of all files in a target. It will automatically generate file groups # following the directory hierarchy, so that the layout of the files in IDEs matches the one in the filesystem. 
function(create_target_directory_groups target_name) @@ -309,6 +325,7 @@ set(COMMON src/common/logging/backend.cpp src/common/logging/text_formatter.h src/common/logging/types.h src/common/alignment.h + src/common/arch.h src/common/assert.cpp src/common/assert.h src/common/bit_field.h @@ -358,8 +375,6 @@ set(CORE src/core/aerolib/stubs.cpp src/core/aerolib/aerolib.h src/core/address_space.cpp src/core/address_space.h - src/core/cpu_patches.cpp - src/core/cpu_patches.h src/core/crypto/crypto.cpp src/core/crypto/crypto.h src/core/crypto/keys.h @@ -417,6 +432,12 @@ set(CORE src/core/aerolib/stubs.cpp src/core/virtual_memory.h ) +if (ARCHITECTURE STREQUAL "x86_64") + set(CORE ${CORE} + src/core/cpu_patches.cpp + src/core/cpu_patches.h) +endif() + set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/profile.h src/shader_recompiler/recompiler.cpp @@ -678,8 +699,10 @@ if (APPLE) target_link_libraries(shadps4 PRIVATE ${MOLTENVK}) endif() - # Reserve system-managed memory space. - target_link_options(shadps4 PRIVATE -Wl,-no_pie,-no_fixup_chains,-no_huge,-pagezero_size,0x4000,-segaddr,TCB_SPACE,0x4000,-segaddr,GUEST_SYSTEM,0x400000,-image_base,0x20000000000) + if (ARCHITECTURE STREQUAL "x86_64") + # Reserve system-managed memory space. 
+ target_link_options(shadps4 PRIVATE -Wl,-no_pie,-no_fixup_chains,-no_huge,-pagezero_size,0x4000,-segaddr,TCB_SPACE,0x4000,-segaddr,GUEST_SYSTEM,0x400000,-image_base,0x20000000000) + endif() # Replacement for std::chrono::time_zone target_link_libraries(shadps4 PRIVATE date::date-tz) diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index b3ba2134a3b..fe4ac5e9e8d 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -43,7 +43,6 @@ else() endif() if (NOT TARGET FFmpeg::ffmpeg) - set(ARCHITECTURE "x86_64") add_subdirectory(ffmpeg-core) add_library(FFmpeg::ffmpeg ALIAS ffmpeg) endif() diff --git a/src/common/arch.h b/src/common/arch.h new file mode 100644 index 00000000000..b22366cb7c9 --- /dev/null +++ b/src/common/arch.h @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#if defined(__x86_64__) || defined(_M_X64) +#define ARCH_X86_64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#define ARCH_ARM64 1 +#endif diff --git a/src/common/assert.cpp b/src/common/assert.cpp index 78c6ec07543..be0feb71dac 100644 --- a/src/common/assert.cpp +++ b/src/common/assert.cpp @@ -1,10 +1,17 @@ // SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/arch.h" #include "common/assert.h" #include "common/logging/backend.h" +#if defined(ARCH_X86_64) #define Crash() __asm__ __volatile__("int $3") +#elif defined(ARCH_ARM64) +#define Crash() __asm__ __volatile__("brk 0") +#else +#error "Missing Crash() implementation for target CPU architecture." 
+#endif void assert_fail_impl() { Common::Log::Stop(); diff --git a/src/common/rdtsc.h b/src/common/rdtsc.h index 3180273e50b..4e4d5843696 100644 --- a/src/common/rdtsc.h +++ b/src/common/rdtsc.h @@ -3,6 +3,8 @@ #pragma once +#include "common/arch.h" + #ifdef _MSC_VER #include #endif @@ -13,15 +15,20 @@ namespace Common { #ifdef _MSC_VER __forceinline static u64 FencedRDTSC() { +#ifdef ARCH_X86_64 _mm_lfence(); _ReadWriteBarrier(); const u64 result = __rdtsc(); _mm_lfence(); _ReadWriteBarrier(); return result; +#else +#error "Missing FencedRDTSC() implementation for target CPU architecture." +#endif } #else static inline u64 FencedRDTSC() { +#ifdef ARCH_X86_64 u64 eax; u64 edx; asm volatile("lfence\n\t" @@ -29,6 +36,16 @@ static inline u64 FencedRDTSC() { "lfence\n\t" : "=a"(eax), "=d"(edx)); return (edx << 32) | eax; +#elif defined(ARCH_ARM64) + u64 ret; + asm volatile("isb\n\t" + "mrs %0, cntvct_el0\n\t" + "isb\n\t" + : "=r"(ret)::"memory"); + return ret; +#else +#error "Missing FencedRDTSC() implementation for target CPU architecture." +#endif } #endif diff --git a/src/core/address_space.cpp b/src/core/address_space.cpp index 0dd7a76f2ec..3950bd5fe57 100644 --- a/src/core/address_space.cpp +++ b/src/core/address_space.cpp @@ -3,11 +3,13 @@ #include #include "common/alignment.h" +#include "common/arch.h" #include "common/assert.h" #include "common/error.h" #include "core/address_space.h" #include "core/libraries/kernel/memory_management.h" #include "core/memory.h" +#include "libraries/error_codes.h" #ifdef _WIN32 #include @@ -15,9 +17,8 @@ #include #include #endif -#include "libraries/error_codes.h" -#ifdef __APPLE__ +#if defined(__APPLE__) && defined(ARCH_X86_64) // Reserve space for the system address space using a zerofill section. 
asm(".zerofill GUEST_SYSTEM,GUEST_SYSTEM,__guest_system,0xFBFC00000"); #endif @@ -308,12 +309,12 @@ struct AddressSpace::Impl { constexpr int protection_flags = PROT_READ | PROT_WRITE; constexpr int base_map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE; -#ifdef __APPLE__ - // On ARM64 Macs, we run into limitations due to the commpage from 0xFC0000000 - 0xFFFFFFFFF - // and the GPU carveout region from 0x1000000000 - 0x6FFFFFFFFF. We can allocate the system - // managed region, as well as system reserved if reduced in size slightly, but we cannot map - // the user region where we want, so we must let the OS put it wherever possible and hope - // the game won't rely on its location. +#if defined(__APPLE__) && defined(ARCH_X86_64) + // On ARM64 Macs under Rosetta 2, we run into limitations due to the commpage from + // 0xFC0000000 - 0xFFFFFFFFF and the GPU carveout region from 0x1000000000 - 0x6FFFFFFFFF. + // We can allocate the system managed region, as well as system reserved if reduced in size + // slightly, but we cannot map the user region where we want, so we must let the OS put it + // wherever possible and hope the game won't rely on its location. system_managed_base = reinterpret_cast( mmap(reinterpret_cast(SYSTEM_MANAGED_MIN), system_managed_size, protection_flags, base_map_flags | MAP_FIXED, -1, 0)); @@ -325,12 +326,22 @@ struct AddressSpace::Impl { protection_flags, base_map_flags, -1, 0)); #else const auto virtual_size = system_managed_size + system_reserved_size + user_size; +#if defined(ARCH_X86_64) const auto virtual_base = reinterpret_cast(mmap(reinterpret_cast(SYSTEM_MANAGED_MIN), virtual_size, protection_flags, base_map_flags | MAP_FIXED, -1, 0)); system_managed_base = virtual_base; system_reserved_base = reinterpret_cast(SYSTEM_RESERVED_MIN); user_base = reinterpret_cast(USER_MIN); +#else + // Map memory wherever possible and instruction translation can handle offsetting to the + // base. 
+ const auto virtual_base = reinterpret_cast( + mmap(nullptr, virtual_size, protection_flags, base_map_flags, -1, 0)); + system_managed_base = virtual_base; + system_reserved_base = virtual_base + SYSTEM_RESERVED_MIN - SYSTEM_MANAGED_MIN; + user_base = virtual_base + USER_MIN - SYSTEM_MANAGED_MIN; +#endif #endif if (system_managed_base == MAP_FAILED || system_reserved_base == MAP_FAILED || user_base == MAP_FAILED) { @@ -430,9 +441,11 @@ struct AddressSpace::Impl { if (write) { flags |= PROT_WRITE; } +#ifdef ARCH_X86_64 if (execute) { flags |= PROT_EXEC; } +#endif int ret = mprotect(reinterpret_cast(virtual_addr), size, flags); ASSERT_MSG(ret == 0, "mprotect failed: {}", strerror(errno)); } @@ -463,8 +476,14 @@ AddressSpace::~AddressSpace() = default; void* AddressSpace::Map(VAddr virtual_addr, size_t size, u64 alignment, PAddr phys_addr, bool is_exec) { - return impl->Map(virtual_addr, phys_addr, size, - is_exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE); +#if ARCH_X86_64 + const auto prot = is_exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE; +#else + // On non-native architectures, we can simplify things by ignoring the execute flag for the + // canonical copy of the memory and rely on the JIT to map translated code as executable. 
+ constexpr auto prot = PAGE_READWRITE; +#endif + return impl->Map(virtual_addr, phys_addr, size, prot); } void* AddressSpace::MapFile(VAddr virtual_addr, size_t size, size_t offset, u32 prot, diff --git a/src/core/address_space.h b/src/core/address_space.h index 2a3488d572d..3233c758812 100644 --- a/src/core/address_space.h +++ b/src/core/address_space.h @@ -4,6 +4,7 @@ #pragma once #include +#include "common/arch.h" #include "common/enum.h" #include "common/types.h" @@ -23,7 +24,7 @@ constexpr VAddr CODE_BASE_OFFSET = 0x100000000ULL; constexpr VAddr SYSTEM_MANAGED_MIN = 0x00000400000ULL; constexpr VAddr SYSTEM_MANAGED_MAX = 0x07FFFFBFFFULL; constexpr VAddr SYSTEM_RESERVED_MIN = 0x07FFFFC000ULL; -#ifdef __APPLE__ +#if defined(__APPLE__) && defined(ARCH_X86_64) // Can only comfortably reserve the first 0x7C0000000 of system reserved space. constexpr VAddr SYSTEM_RESERVED_MAX = 0xFBFFFFFFFULL; #else diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 80328dc1160..4a835847192 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -6,6 +6,7 @@ #include #include "common/alignment.h" +#include "common/arch.h" #include "common/assert.h" #include "common/error.h" #include "common/logging/log.h" @@ -989,7 +990,9 @@ static void cleanup_thread(void* arg) { static void* run_thread(void* arg) { auto* thread = static_cast(arg); Common::SetCurrentThreadName(thread->name.c_str()); +#ifdef ARCH_X86_64 Core::InitializeThreadPatchStack(); +#endif auto* linker = Common::Singleton::Instance(); linker->InitTlsForThread(false); void* ret = nullptr; diff --git a/src/core/linker.cpp b/src/core/linker.cpp index 0d76f4b9ee5..e8aab673d75 100644 --- a/src/core/linker.cpp +++ b/src/core/linker.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/alignment.h" +#include "common/arch.h" #include "common/assert.h" #include "common/config.h" 
#include "common/logging/log.h" @@ -27,6 +28,7 @@ static PS4_SYSV_ABI void ProgramExitFunc() { } static void RunMainEntry(VAddr addr, EntryParams* params, ExitFunc exit_func) { +#ifdef ARCH_X86_64 // reinterpret_cast(addr)(params, exit_func); // can't be used, stack has to have // a specific layout asm volatile("andq $-16, %%rsp\n" // Align to 16 bytes @@ -46,6 +48,9 @@ static void RunMainEntry(VAddr addr, EntryParams* params, ExitFunc exit_func) { : : "r"(addr), "r"(params), "r"(exit_func) : "rax", "rsi", "rdi"); +#else + UNIMPLEMENTED_MSG("Missing RunMainEntry() implementation for target CPU architecture."); +#endif } Linker::Linker() : memory{Memory::Instance()} {} @@ -85,7 +90,9 @@ void Linker::Execute() { // Init primary thread. Common::SetCurrentThreadName("GAME_MainThread"); +#ifdef ARCH_X86_64 InitializeThreadPatchStack(); +#endif Libraries::Kernel::pthreadInitSelfMainThread(); InitTlsForThread(true); diff --git a/src/core/module.cpp b/src/core/module.cpp index f48848bbd33..c28ac10618a 100644 --- a/src/core/module.cpp +++ b/src/core/module.cpp @@ -3,6 +3,7 @@ #include #include "common/alignment.h" +#include "common/arch.h" #include "common/assert.h" #include "common/logging/log.h" #ifdef ENABLE_QT_GUI @@ -134,9 +135,11 @@ void Module::LoadModuleToMemory(u32& max_tls_index) { LOG_INFO(Core_Linker, "segment_mode ..........: {}", segment_mode); add_segment(elf_pheader[i]); +#ifdef ARCH_X86_64 if (elf_pheader[i].p_flags & PF_EXEC) { PatchInstructions(segment_addr, segment_file_size, c); } +#endif break; } case PT_DYNAMIC: diff --git a/src/core/tls.cpp b/src/core/tls.cpp index 4a0cdb0dcdb..eb07e7a72c3 100644 --- a/src/core/tls.cpp +++ b/src/core/tls.cpp @@ -2,23 +2,28 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include "common/arch.h" #include "common/assert.h" #include "common/types.h" #include "core/tls.h" #ifdef _WIN32 #include -#elif defined(__APPLE__) +#elif defined(__APPLE__) && defined(ARCH_X86_64) #include #include #include #include +#elif 
!defined(ARCH_X86_64) +#include #endif namespace Core { #ifdef _WIN32 +// Windows + static DWORD slot = 0; static std::once_flag slot_alloc_flag; @@ -40,7 +45,9 @@ Tcb* GetTcbBase() { return reinterpret_cast(TlsGetValue(GetTcbKey())); } -#elif defined(__APPLE__) +#elif defined(__APPLE__) && defined(ARCH_X86_64) + +// Apple x86_64 // Reserve space in the 32-bit address range for allocating TCB pages. asm(".zerofill TCB_SPACE,TCB_SPACE,__guest_system,0x3FC000"); @@ -132,7 +139,9 @@ Tcb* GetTcbBase() { return tcb; } -#else +#elif defined(ARCH_X86_64) + +// Other POSIX x86_64 void SetTcbBase(void* image_address) { asm volatile("wrgsbase %0" ::"r"(image_address) : "memory"); @@ -144,6 +153,32 @@ Tcb* GetTcbBase() { return tcb; } +#else + +// POSIX non-x86_64 +// Just sets up a simple thread-local variable to store it, then instruction translation can point +// code to it. + +static pthread_key_t slot = 0; +static std::once_flag slot_alloc_flag; + +static void AllocTcbKey() { + ASSERT(pthread_key_create(&slot, nullptr) == 0); +} + +pthread_key_t GetTcbKey() { + std::call_once(slot_alloc_flag, &AllocTcbKey); + return slot; +} + +void SetTcbBase(void* image_address) { + ASSERT(pthread_setspecific(GetTcbKey(), image_address) == 0); +} + +Tcb* GetTcbBase() { + return static_cast(pthread_getspecific(GetTcbKey())); +} + #endif } // namespace Core diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp index 18b8aee2180..d62077b048d 100644 --- a/src/video_core/page_manager.cpp +++ b/src/video_core/page_manager.cpp @@ -3,6 +3,7 @@ #include #include "common/alignment.h" +#include "common/arch.h" #include "common/assert.h" #include "common/error.h" #include "video_core/page_manager.h" @@ -159,6 +160,27 @@ struct PageManager::Impl { int uffd; }; #else + +#if defined(__APPLE__) + +#if defined(ARCH_X86_64) +#define IS_WRITE_ERROR(ctx) ((ctx)->uc_mcontext->__es.__err & 0x2) +#elif defined(ARCH_ARM64) +#define IS_WRITE_ERROR(ctx) ((ctx)->uc_mcontext->__es.__esr & 
0x40) +#endif + +#else + +#if defined(ARCH_X86_64) +#define IS_WRITE_ERROR(ctx) ((ctx)->uc_mcontext.gregs[REG_ERR] & 0x2) +#endif + +#endif + +#ifndef IS_WRITE_ERROR +#error "Missing IS_WRITE_ERROR() implementation for target OS and CPU architecture." +#endif + struct PageManager::Impl { Impl(Vulkan::Rasterizer* rasterizer_) { rasterizer = rasterizer_; @@ -194,12 +216,7 @@ struct PageManager::Impl { static void GuestFaultSignalHandler(int sig, siginfo_t* info, void* raw_context) { ucontext_t* ctx = reinterpret_cast<ucontext_t*>(raw_context); const VAddr address = reinterpret_cast<VAddr>(info->si_addr); -#ifdef __APPLE__ - const u32 err = ctx->uc_mcontext->__es.__err; -#else - const greg_t err = ctx->uc_mcontext.gregs[REG_ERR]; -#endif - if (err & 0x2) { + if (IS_WRITE_ERROR(ctx)) { const VAddr addr_aligned = Common::AlignDown(address, PAGESIZE); rasterizer->InvalidateMemory(addr_aligned, PAGESIZE); } else {