From f1d7e1144a6cad5e70ca19fd44c693f42148e913 Mon Sep 17 00:00:00 2001 From: notaz Date: Sat, 18 Jan 2025 02:20:51 +0200 Subject: [PATCH 01/20] try some more CIs --- .github/workflows/ci-libretro.yml | 33 ++++++++++++++++++++++++++++ .github/workflows/ci-linux-arm64.yml | 4 ++-- .github/workflows/ci-linux-armhf.yml | 4 ++-- .github/workflows/ci-linux.yml | 2 +- jni/Android.mk | 7 +++--- 5 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/ci-libretro.yml diff --git a/.github/workflows/ci-libretro.yml b/.github/workflows/ci-libretro.yml new file mode 100644 index 000000000..e1d608f0b --- /dev/null +++ b/.github/workflows/ci-libretro.yml @@ -0,0 +1,33 @@ +name: CI (Linux) +on: [push, pull_request] +jobs: + build-libretro: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: make + run: make -j$(getconf _NPROCESSORS_ONLN) -f Makefile.libretro + + build-libretro-win32: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install dependencies + run: | + sudo apt-get update -qq + sudo apt-get install -y gcc-mingw-w64 + - name: make + run: make -j$(getconf _NPROCESSORS_ONLN) -f Makefile.libretro platform=win32 CC=x86_64-w64-mingw32-gcc + + build-libretro-android: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: make + run: $(ANDROID_NDK_HOME)/ndk-build -j$(getconf _NPROCESSORS_ONLN) --no-print-directory -C jni/ diff --git a/.github/workflows/ci-linux-arm64.yml b/.github/workflows/ci-linux-arm64.yml index 249928f37..398c5186e 100644 --- a/.github/workflows/ci-linux-arm64.yml +++ b/.github/workflows/ci-linux-arm64.yml @@ -1,7 +1,7 @@ name: CI (Linux arm64) on: [push, pull_request] jobs: - build-linux: + build-linux-arm64: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -19,4 +19,4 @@ jobs: - name: configure run: DUMP_CONFIG_LOG=1 CROSS_COMPILE=aarch64-linux-gnu- 
SDL_CONFIG=usr/bin/sdl-config PATH=$PATH:usr/bin CFLAGS='-Iusr/include/ -Iusr/include/SDL' LDFLAGS='-Lusr/lib/aarch64-linux-gnu/ -Llib/aarch64-linux-gnu/ -Wl,-rpath-link=lib/aarch64-linux-gnu/,-rpath-link=usr/lib/aarch64-linux-gnu/,-rpath-link=usr/lib/aarch64-linux-gnu/pulseaudio/' ./configure - name: make - run: make + run: make -j$(getconf _NPROCESSORS_ONLN) diff --git a/.github/workflows/ci-linux-armhf.yml b/.github/workflows/ci-linux-armhf.yml index 6e37781b7..154a8bd19 100644 --- a/.github/workflows/ci-linux-armhf.yml +++ b/.github/workflows/ci-linux-armhf.yml @@ -1,7 +1,7 @@ name: CI (Linux armhf) on: [push, pull_request] jobs: - build-linux: + build-linux-armhf: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -19,4 +19,4 @@ jobs: - name: configure run: DUMP_CONFIG_LOG=1 CROSS_COMPILE=arm-linux-gnueabihf- SDL_CONFIG=usr/bin/sdl-config PATH=$PATH:usr/bin CFLAGS='-Iusr/include/ -Iusr/include/SDL' LDFLAGS='-Lusr/lib/arm-linux-gnueabihf/ -Llib/arm-linux-gnueabihf/ -Wl,-rpath-link=lib/arm-linux-gnueabihf/,-rpath-link=usr/lib/arm-linux-gnueabihf/,-rpath-link=usr/lib/arm-linux-gnueabihf/pulseaudio/' ./configure - name: make - run: make + run: make -j$(getconf _NPROCESSORS_ONLN) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 7ab7d3603..595f3ed33 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -14,4 +14,4 @@ jobs: - name: configure run: DUMP_CONFIG_LOG=1 ./configure - name: make - run: make + run: make -j$(getconf _NPROCESSORS_ONLN) diff --git a/jni/Android.mk b/jni/Android.mk index 440c6b880..8439f3d77 100644 --- a/jni/Android.mk +++ b/jni/Android.mk @@ -1,4 +1,5 @@ LOCAL_PATH := $(call my-dir) +include $(CLEAR_VARS) $(shell cd "$(LOCAL_PATH)" && ((git describe --always || echo) | sed -e 's/.*/#define REV "\0"/' > ../frontend/revision.h_)) $(shell cd "$(LOCAL_PATH)" && (diff -q ../frontend/revision.h_ ../frontend/revision.h > /dev/null 2>&1 || cp ../frontend/revision.h_ 
../frontend/revision.h)) @@ -22,6 +23,8 @@ DYNAREC_DIR := $(ROOT_DIR)/libpcsxcore/new_dynarec DEPS_DIR := $(ROOT_DIR)/deps LIBRETRO_COMMON := $(DEPS_DIR)/libretro-common EXTRA_INCLUDES := +COREFLAGS := +SOURCES_ASM := # core SOURCES_C := $(CORE_DIR)/cdriso.c \ @@ -98,7 +101,6 @@ SOURCES_C += \ $(LCHDR_ZSTD)/decompress/zstd_ddict.c \ $(LCHDR_ZSTD)/decompress/zstd_decompress_block.c \ $(LCHDR_ZSTD)/decompress/zstd_decompress.c -SOURCES_ASM := EXTRA_INCLUDES += $(LCHDR)/include $(LCHDR_LZMA)/include $(LCHDR_ZSTD) COREFLAGS += -DHAVE_CHD -DZ7_ST -DZSTD_DISABLE_ASM ifeq (,$(call gte,$(APP_PLATFORM_LEVEL),18)) @@ -245,14 +247,13 @@ ifneq ($(GIT_VERSION)," unknown") COREFLAGS += -DGIT_VERSION=\"$(GIT_VERSION)\" endif -include $(CLEAR_VARS) LOCAL_MODULE := retro LOCAL_SRC_FILES := $(SOURCES_C) $(SOURCES_ASM) LOCAL_CFLAGS := $(COREFLAGS) LOCAL_C_INCLUDES := $(ROOT_DIR)/include LOCAL_C_INCLUDES += $(DEPS_DIR)/crypto LOCAL_C_INCLUDES += $(EXTRA_INCLUDES) -LOCAL_LDFLAGS += -Wl,-version-script=$(FRONTEND_DIR)/libretro-version-script +LOCAL_LDFLAGS := -Wl,-version-script=$(FRONTEND_DIR)/libretro-version-script LOCAL_LDFLAGS += -Wl,--script=$(FRONTEND_DIR)/libretro-extern.T LOCAL_LDFLAGS += -Wl,--gc-sections LOCAL_LDLIBS := -lz -llog From 40575606e57a917ba88ae45a0d267dfe2419efac Mon Sep 17 00:00:00 2001 From: notaz Date: Sat, 18 Jan 2025 02:27:42 +0200 Subject: [PATCH 02/20] attempt to fix CI --- .github/workflows/ci-libretro.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-libretro.yml b/.github/workflows/ci-libretro.yml index e1d608f0b..b33407980 100644 --- a/.github/workflows/ci-libretro.yml +++ b/.github/workflows/ci-libretro.yml @@ -1,4 +1,4 @@ -name: CI (Linux) +name: CI (libretro) on: [push, pull_request] jobs: build-libretro: @@ -29,5 +29,5 @@ jobs: - uses: actions/checkout@v4 with: submodules: true - - name: make - run: $(ANDROID_NDK_HOME)/ndk-build -j$(getconf _NPROCESSORS_ONLN) --no-print-directory -C jni/ + - name: 
ndk-build + run: ${ANDROID_NDK_HOME}/ndk-build -j$(getconf _NPROCESSORS_ONLN) --no-print-directory -C jni/ From 97fa754ea10367f512b08c11c6f661d0beb1934f Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 17 Jan 2025 01:25:17 +0100 Subject: [PATCH 03/20] lightrec: Add LIGHTREC_DEBUG functionality When set, Lightrec will be built with the debug log level, the built-in disassembler for PSX code and JIT code (requires binutils), and support for outputting data to the big-ass debugger (if the proper environment variables are set). Signed-off-by: Paul Cercueil --- Makefile | 11 ++- frontend/main.c | 2 + include/lightrec/lightrec-config.h | 2 +- jni/Android.mk | 2 + libpcsxcore/lightrec/plugin.c | 122 ++++++++++++++++++++++++++++- 5 files changed, 136 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 7b9ac5ffb..ec07225ec 100644 --- a/Makefile +++ b/Makefile @@ -118,13 +118,22 @@ endif ifeq "$(DYNAREC)" "lightrec" CFLAGS += -Ideps/lightning/include -Ideps/lightrec -Iinclude/lightning -Iinclude/lightrec \ -DLIGHTREC -DLIGHTREC_STATIC +ifeq ($(LIGHTREC_DEBUG),1) +deps/lightrec/%.o: CFLAGS += -DLOG_LEVEL=DEBUG_L +libpcsxcore/lightrec/plugin.o: CFLAGS += -DLIGHTREC_DEBUG=1 +frontend/main.o: CFLAGS += -DLIGHTREC_DEBUG=1 +deps/lightning/%.o: CFLAGS += -DDISASSEMBLER=1 -DBINUTILS_2_38=1 -DBINUTILS_2_29=1 \ + -DHAVE_DISASSEMBLE_INIT_FOR_TARGET=1 -DPACKAGE_VERSION=1 +LDFLAGS += -lopcodes -lbfd +endif LIGHTREC_CUSTOM_MAP ?= 0 LIGHTREC_CUSTOM_MAP_OBJ ?= libpcsxcore/lightrec/mem.o LIGHTREC_THREADED_COMPILER ?= 0 LIGHTREC_CODE_INV ?= 0 CFLAGS += -DLIGHTREC_CUSTOM_MAP=$(LIGHTREC_CUSTOM_MAP) \ -DLIGHTREC_CODE_INV=$(LIGHTREC_CODE_INV) \ - -DLIGHTREC_ENABLE_THREADED_COMPILER=$(LIGHTREC_THREADED_COMPILER) + -DLIGHTREC_ENABLE_THREADED_COMPILER=$(LIGHTREC_THREADED_COMPILER) \ + -DLIGHTREC_ENABLE_DISASSEMBLER=$(or $(LIGHTREC_DEBUG),0) ifeq ($(LIGHTREC_CUSTOM_MAP),1) LDLIBS += -lrt OBJS += $(LIGHTREC_CUSTOM_MAP_OBJ) diff --git a/frontend/main.c b/frontend/main.c index
2dd5ca4ce..b2a4fea45 100644 --- a/frontend/main.c +++ b/frontend/main.c @@ -737,7 +737,9 @@ int main(int argc, char *argv[]) else menu_loop(); +#ifndef LIGHTREC_DEBUG pl_start_watchdog(); +#endif while (!g_emu_want_quit) { diff --git a/include/lightrec/lightrec-config.h b/include/lightrec/lightrec-config.h index 3d4b81e6b..8453fe460 100644 --- a/include/lightrec/lightrec-config.h +++ b/include/lightrec/lightrec-config.h @@ -8,7 +8,7 @@ #define ENABLE_THREADED_COMPILER LIGHTREC_ENABLE_THREADED_COMPILER #define ENABLE_FIRST_PASS 1 -#define ENABLE_DISASSEMBLER 0 +#define ENABLE_DISASSEMBLER LIGHTREC_ENABLE_DISASSEMBLER #define ENABLE_CODE_BUFFER 1 #define HAS_DEFAULT_ELM 1 diff --git a/jni/Android.mk b/jni/Android.mk index 8439f3d77..0fe5cb95b 100644 --- a/jni/Android.mk +++ b/jni/Android.mk @@ -134,6 +134,7 @@ HAVE_ARI64=0 HAVE_LIGHTREC=0 LIGHTREC_CUSTOM_MAP=0 LIGHTREC_THREADED_COMPILER=0 +LIGHTREC_DEBUG=0 HAVE_GPU_NEON=0 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) HAVE_ARI64=1 @@ -154,6 +155,7 @@ else endif COREFLAGS += -DLIGHTREC_CUSTOM_MAP=$(LIGHTREC_CUSTOM_MAP) COREFLAGS += -DLIGHTREC_ENABLE_THREADED_COMPILER=$(LIGHTREC_THREADED_COMPILER) + COREFLAGS += -DLIGHTREC_ENABLE_DISASSEMBLER=$(or $(LIGHTREC_DEBUG),0) ifeq ($(HAVE_ARI64),1) SOURCES_C += $(DYNAREC_DIR)/new_dynarec.c \ diff --git a/libpcsxcore/lightrec/plugin.c b/libpcsxcore/lightrec/plugin.c index d62f35bdf..0ca44e443 100644 --- a/libpcsxcore/lightrec/plugin.c +++ b/libpcsxcore/lightrec/plugin.c @@ -72,6 +72,9 @@ static bool use_lightrec_interpreter; static bool block_stepping; //static bool use_pcsx_interpreter; #define use_pcsx_interpreter 0 +static bool ram_disabled; +static bool lightrec_debug, lightrec_very_debug; +static u32 lightrec_begin_cycles; extern u32 lightrec_hacks; @@ -340,6 +343,8 @@ static void lightrec_enable_ram(struct lightrec_state *state, bool enable) memcpy(psxM, cache_buf, sizeof(cache_buf)); else memcpy(cache_buf, psxM, sizeof(cache_buf)); + + ram_disabled = !enable; } static bool 
lightrec_can_hw_direct(u32 kaddr, bool is_write, u8 size) @@ -465,6 +470,16 @@ static int lightrec_plugin_init(void) use_lightrec_interpreter = !!getenv("LIGHTREC_INTERPRETER"); +#ifdef LIGHTREC_DEBUG + char *cycles = getenv("LIGHTREC_BEGIN_CYCLES"); + + lightrec_very_debug = !!getenv("LIGHTREC_VERY_DEBUG"); + lightrec_debug = lightrec_very_debug || !!getenv("LIGHTREC_DEBUG"); + + if (cycles) + lightrec_begin_cycles = (unsigned int) strtol(cycles, NULL, 0); +#endif + lightrec_state = lightrec_init(LIGHTREC_PROG_NAME, lightrec_map, ARRAY_SIZE(lightrec_map), &lightrec_ops); @@ -481,6 +496,104 @@ static int lightrec_plugin_init(void) return 0; } +static u32 do_calculate_hash(const void *buffer, u32 count, u32 needle, bool le) +{ + unsigned int i; + const u32 *data = (const u32 *) buffer; + u32 hash = needle; + + count /= 4; + for(i = 0; i < count; ++i) { + hash += le ? LE32TOH(data[i]) : data[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } + + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + + return hash; +} + +static u32 hash_calculate_le(const void *buffer, u32 count) +{ + return do_calculate_hash(buffer, count, 0xffffffff, true); +} + +u32 hash_calculate(const void *buffer, u32 count) +{ + return do_calculate_hash(buffer, count, 0xffffffff, false); +} + +static u32 hash_calculate_ram(const void *buffer, u32 ram_size) +{ + u32 hash; + + if (ram_disabled) + hash = hash_calculate_le(cache_buf, sizeof(cache_buf)); + else + hash = hash_calculate_le(buffer, sizeof(cache_buf)); + + return do_calculate_hash(buffer + sizeof(cache_buf), + ram_size - sizeof(cache_buf), + hash, true); +} + +static const char * const mips_regs[] = { + "zero", + "at", + "v0", "v1", + "a0", "a1", "a2", "a3", + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", + "t8", "t9", + "k0", "k1", + "gp", "sp", "fp", "ra", + "lo", "hi", +}; + +static void print_for_big_ass_debugger(void) +{ + struct lightrec_registers *regs; + 
unsigned int i; + + regs = lightrec_get_registers(lightrec_state); + + printf("CYCLE 0x%08x PC 0x%08x", psxRegs.cycle, psxRegs.pc); + + if (lightrec_very_debug) + printf(" RAM 0x%08x SCRATCH 0x%08x HW 0x%08x", + hash_calculate_ram(psxM, 0x200000), + hash_calculate_le(psxH, 0x400), + hash_calculate_le(psxH + 0x1000, 0x2000)); + + printf(" CP0 0x%08x CP2D 0x%08x CP2C 0x%08x INT 0x%04x INTCYCLE 0x%08x GPU 0x%08x", + hash_calculate(regs->cp0, sizeof(regs->cp0)), + hash_calculate(regs->cp2d, sizeof(regs->cp2d)), + hash_calculate(regs->cp2c, sizeof(regs->cp2c)), + psxRegs.interrupt, + hash_calculate(psxRegs.intCycle, sizeof(psxRegs.intCycle)), + LE32TOH(HW_GPU_STATUS)); + + if (lightrec_very_debug) { + for (i = 0; i < 32; i++) + printf(" CP2D%u 0x%08x", i, regs->cp2d[i]); + for (i = 0; i < 32; i++) + printf(" CP2C%u 0x%08x", i, regs->cp2c[i]); + } + + if (lightrec_very_debug) + for (i = 0; i < 34; i++) + printf(" %s 0x%08x", mips_regs[i], regs->gpr[i]); + else + printf(" GPR 0x%08x", + hash_calculate(regs->gpr, sizeof(regs->gpr))); + printf("\n"); + + fflush(stdout); +} + static void lightrec_plugin_sync_regs_to_pcsx(bool need_cp2); static void lightrec_plugin_sync_regs_from_pcsx(bool need_cp2); @@ -488,6 +601,7 @@ static void lightrec_plugin_execute_internal(bool block_only) { struct lightrec_registers *regs; u32 flags, cycles_pcsx; + u32 old_pc = psxRegs.pc; regs = lightrec_get_registers(lightrec_state); gen_interupt((psxCP0Regs *)regs->cp0); @@ -522,6 +636,8 @@ static void lightrec_plugin_execute_internal(bool block_only) if (flags & LIGHTREC_EXIT_SEGFAULT) { fprintf(stderr, "Exiting at cycle 0x%08x\n", psxRegs.cycle); + if (lightrec_debug) + print_for_big_ass_debugger(); exit(1); } @@ -542,6 +658,10 @@ static void lightrec_plugin_execute_internal(bool block_only) } } + if (lightrec_debug && psxRegs.cycle >= lightrec_begin_cycles && psxRegs.pc != old_pc) { + print_for_big_ass_debugger(); + } + if ((regs->cp0[13] & regs->cp0[12] & 0x300) && (regs->cp0[12] & 0x1)) { /* 
Handle software interrupts */ regs->cp0[13] &= ~0x7c; @@ -552,7 +672,7 @@ static void lightrec_plugin_execute_internal(bool block_only) static void lightrec_plugin_execute(psxRegisters *regs) { while (!regs->stop) - lightrec_plugin_execute_internal(false); + lightrec_plugin_execute_internal(lightrec_very_debug); } static void lightrec_plugin_execute_block(psxRegisters *regs, From 746146baf4cb036a63de67549f0bbf838ee291fa Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 17 Jan 2025 01:49:07 +0100 Subject: [PATCH 04/20] Disable mult/div and memset optimizations when debugging Lightrec These optimizations cause Lightrec's interpreter and dynarec to have different behaviours. Signed-off-by: Paul Cercueil --- Makefile | 3 ++- include/lightrec/lightrec-config.h | 4 ++-- jni/Android.mk | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index ec07225ec..e119c55c5 100644 --- a/Makefile +++ b/Makefile @@ -133,7 +133,8 @@ LIGHTREC_CODE_INV ?= 0 CFLAGS += -DLIGHTREC_CUSTOM_MAP=$(LIGHTREC_CUSTOM_MAP) \ -DLIGHTREC_CODE_INV=$(LIGHTREC_CODE_INV) \ -DLIGHTREC_ENABLE_THREADED_COMPILER=$(LIGHTREC_THREADED_COMPILER) \ - -DLIGHTREC_ENABLE_DISASSEMBLER=$(or $(LIGHTREC_DEBUG),0) + -DLIGHTREC_ENABLE_DISASSEMBLER=$(or $(LIGHTREC_DEBUG),0) \ + -DLIGHTREC_NO_DEBUG=$(if $(LIGHTREC_DEBUG),0,1) ifeq ($(LIGHTREC_CUSTOM_MAP),1) LDLIBS += -lrt OBJS += $(LIGHTREC_CUSTOM_MAP_OBJ) diff --git a/include/lightrec/lightrec-config.h b/include/lightrec/lightrec-config.h index 8453fe460..534472efb 100644 --- a/include/lightrec/lightrec-config.h +++ b/include/lightrec/lightrec-config.h @@ -14,14 +14,14 @@ #define HAS_DEFAULT_ELM 1 #define OPT_REMOVE_DIV_BY_ZERO_SEQ 1 -#define OPT_REPLACE_MEMSET 1 +#define OPT_REPLACE_MEMSET LIGHTREC_NO_DEBUG #define OPT_DETECT_IMPOSSIBLE_BRANCHES 1 #define OPT_HANDLE_LOAD_DELAYS 1 #define OPT_TRANSFORM_OPS 1 #define OPT_LOCAL_BRANCHES 1 #define OPT_SWITCH_DELAY_SLOTS 1 #define OPT_FLAG_IO 1 -#define OPT_FLAG_MULT_DIV 1 +#define 
OPT_FLAG_MULT_DIV LIGHTREC_NO_DEBUG #define OPT_EARLY_UNLOAD 1 #define OPT_PRELOAD_PC 1 diff --git a/jni/Android.mk b/jni/Android.mk index 0fe5cb95b..222e870bc 100644 --- a/jni/Android.mk +++ b/jni/Android.mk @@ -134,7 +134,6 @@ HAVE_ARI64=0 HAVE_LIGHTREC=0 LIGHTREC_CUSTOM_MAP=0 LIGHTREC_THREADED_COMPILER=0 -LIGHTREC_DEBUG=0 HAVE_GPU_NEON=0 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) HAVE_ARI64=1 @@ -156,6 +155,7 @@ endif COREFLAGS += -DLIGHTREC_CUSTOM_MAP=$(LIGHTREC_CUSTOM_MAP) COREFLAGS += -DLIGHTREC_ENABLE_THREADED_COMPILER=$(LIGHTREC_THREADED_COMPILER) COREFLAGS += -DLIGHTREC_ENABLE_DISASSEMBLER=$(or $(LIGHTREC_DEBUG),0) + COREFLAGS += -DLIGHTREC_NO_DEBUG=$(if $(LIGHTREC_DEBUG),0,1) ifeq ($(HAVE_ARI64),1) SOURCES_C += $(DYNAREC_DIR)/new_dynarec.c \ From ea9687abb2dee446f628457f47a56873b7afe948 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 17 Jan 2025 01:50:05 +0100 Subject: [PATCH 05/20] lightrec: Add the big-ass debugger This debugger has this name because it doesn't even try to be subtle - it will run the dynarec and interpreter versions of Lightrec in parallel, comparing their behaviour at every exit point, and reporting any issue as soon as it appears. By default, the emulator will print a checksum of the registers after each exit point. When a mismatch is found, it is advised to restart debugging with the LIGHTREC_VERY_DEBUG=1 environment variable set, and to set the LIGHTREC_BEGIN_CYCLES environment variable to the cycle value of the last known state. When the "very debug" mode is used, the interpreter and dynarec will exit after each single block, and the emulator will compute a checksum of the whole RAM and scratchpad and print all registers. This two-level debugging makes it possible to find a mismatch point very quickly, and then to fine-tune until the exact breaking point is found.
Signed-off-by: Paul Cercueil --- libpcsxcore/lightrec/big_ass_debugger.py | 112 +++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100755 libpcsxcore/lightrec/big_ass_debugger.py diff --git a/libpcsxcore/lightrec/big_ass_debugger.py b/libpcsxcore/lightrec/big_ass_debugger.py new file mode 100755 index 000000000..ed07e7d38 --- /dev/null +++ b/libpcsxcore/lightrec/big_ass_debugger.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 + +from time import sleep +from sys import argv +from os import environ +import subprocess + +def get_next_line(p): + line = "" + + while line[0:5] != "CYCLE": + line = p.readline().decode() + + if (len(line) == 0): + sleep(0.001) + elif line[0:5] != "CYCLE": + print(line[:-1]) + + return line + +def print_differences(inter, dynarec): + inter_array = inter.split(" ") + inter_dict = dict(zip(inter_array[::2], inter_array[1::2])) + dynarec_array = dynarec.split(" ") + dynarec_dict = dict(zip(dynarec_array[::2], dynarec_array[1::2])) + + diff = dict([(k, (inter_dict[k], dynarec_dict[k])) for k in inter_dict.keys() if inter_dict[k] != dynarec_dict[k]]) + + print("\nDifferences:") + print("{:15}{:15}{:15}".format("", "Interpreter", "Dynarec")) + for k in diff: + print("{:15}{:15}{:15}".format(k, diff[k][0], diff[k][1])) + +def print_mismatch(inter, dynarec, oldline): + print("\nMismatch!") + print(inter + " - Interpreter") + print(dynarec + " - Dynarec") + print("State before the mismatch:") + print(oldline) + print_differences(inter, dynarec) + +def read_loop(p1, p2): + oldline = "" + + while True: + line1 = get_next_line(p1) + line2 = get_next_line(p2) + + if line1 != line2: + # TODO: Proper matching + + # Lightrec might be lagging behind + #if line1[0:16] != line2[0:16]: + if line1[6:16] != line2[6:16]: + cycle1 = int(line1[6:16], 16) + cycle2 = int(line2[6:16], 16) + + if cycle1 < cycle2: + print(line2[:-1] + " - Dynarec") + + while cycle1 < cycle2: + print(line1[:-1] + " - Interpreter lagging behind") + 
print_differences(line1[:-1], line2[:-1]) + line1 = get_next_line(p1) + cycle1 = int(line1[6:16], 16) + + while cycle1 > cycle2: + print(line2[:-1] + " - Dynarec lagging behind") + print_differences(line1[:-1], line2[:-1]) + line2 = get_next_line(p2) + cycle2 = int(line2[6:16], 16) + + if line1 != line2: + print_mismatch(line1[:-1], line2[:-1], oldline) + break + + if cycle2 < cycle1: + print(line1[:-1] + " - Interpreter") + + while cycle1 > cycle2: + print(line2[:-1] + " - Dynarec lagging behind") + print_differences(line1[:-1], line2[:-1]) + line2 = get_next_line(p2) + cycle2 = int(line2[6:16], 16) + + while cycle1 < cycle2: + print(line1[:-1] + " - Interpreter lagging behind") + print_differences(line1[:-1], line2[:-1]) + line1 = get_next_line(p1) + cycle1 = int(line1[6:16], 16) + + if line1 != line2: + print_mismatch(line1[:-1], line2[:-1], oldline) + break + + if line1 == line2: + oldline = line1[:-1] + print(oldline[:16] + " - Match") + continue + + print_mismatch(line1[:-1], line2[:-1], oldline) + break + else: + oldline = line1[:-1] + +def main(): + with subprocess.Popen(['./pcsx'] + argv[1:], env={ **environ, 'LIGHTREC_DEBUG': '1', 'LIGHTREC_INTERPRETER': '1' }, stdout=subprocess.PIPE, bufsize=1) as fifo_int: + with subprocess.Popen(['./pcsx'] + argv[1:], env={ **environ, 'LIGHTREC_DEBUG': '1' }, stdout=subprocess.PIPE, bufsize=1) as fifo_jit: + read_loop(fifo_int.stdout, fifo_jit.stdout) + +if __name__ == '__main__': + main() From d7aae17c8da2b92768c787a3eaf16e387933fbf7 Mon Sep 17 00:00:00 2001 From: notaz Date: Sun, 19 Jan 2025 01:42:35 +0200 Subject: [PATCH 06/20] cdrom: disable some hack for xa --- libpcsxcore/cdrom.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libpcsxcore/cdrom.c b/libpcsxcore/cdrom.c index 0856e154f..e834275c0 100644 --- a/libpcsxcore/cdrom.c +++ b/libpcsxcore/cdrom.c @@ -1323,7 +1323,8 @@ static void cdrReadInterruptSetResult(unsigned char result) cdr.Irq1Pending = result; // F1 2000 timing hack :( // 
compensate for some csum func @80014380 taking too long - psxRegs.intCycle[PSXINT_CDREAD].sCycle += cdReadTime / 10; + if (!cdr.AdpcmActive) + psxRegs.intCycle[PSXINT_CDREAD].sCycle += cdReadTime / 10; return; } SetResultSize(1); From 12882b5af2c33559763018078d9e42bf58a2fcbc Mon Sep 17 00:00:00 2001 From: notaz Date: Sun, 19 Jan 2025 22:58:28 +0200 Subject: [PATCH 07/20] cdrom: drop the propagation thing It does more harm than good by causing instability. Maybe make it optional someday. --- libpcsxcore/cdrom.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libpcsxcore/cdrom.c b/libpcsxcore/cdrom.c index e834275c0..68748168e 100644 --- a/libpcsxcore/cdrom.c +++ b/libpcsxcore/cdrom.c @@ -663,6 +663,8 @@ static int msfiEq(const u8 *a, const u8 *b) void cdrPlayReadInterrupt(void) { + // this works but causes instability for timing sensitive games +#if 0 int hit = cdra_prefetch(cdr.SetSectorPlay[0], cdr.SetSectorPlay[1], cdr.SetSectorPlay[2]); if (!hit && cdr.PhysCdPropagations < 75/2) { // this propagates the real cdrom delays to the emulated game @@ -670,7 +672,7 @@ void cdrPlayReadInterrupt(void) cdr.PhysCdPropagations++; return; } - +#endif cdr.LastReadSeekCycles = psxRegs.cycle; if (cdr.Reading) { From 11d2721a6ff90624c97c8266d7ee30284275ae55 Mon Sep 17 00:00:00 2001 From: notaz Date: Mon, 20 Jan 2025 03:06:41 +0200 Subject: [PATCH 08/20] cdrom: try different seek timing unclear if it's better, might need to revert --- libpcsxcore/cdrom.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/libpcsxcore/cdrom.c b/libpcsxcore/cdrom.c index 68748168e..d0e2d4c08 100644 --- a/libpcsxcore/cdrom.c +++ b/libpcsxcore/cdrom.c @@ -580,18 +580,26 @@ static boolean canDoTurbo(void) static int cdrSeekTime(unsigned char *target) { - int diff = msf2sec(cdr.SetSectorPlay) - msf2sec(target); - int seekTime = abs(diff) * (cdReadTime / 2000); + int diff = abs((int)msf2sec(cdr.SetSectorPlay) - (int)msf2sec(target)); + int seekTime = 
diff * (cdReadTime / 2000); int cyclesSinceRS = psxRegs.cycle - cdr.LastReadSeekCycles; seekTime = MAX_VALUE(seekTime, 20000); + // sled seek? + if (diff >= 7200) + seekTime = PSXCLK / 7 + diff * 64; + // add *something* as rotation time until the target sector + if (cyclesSinceRS >= cdReadTime) + seekTime += (8 - ((cyclesSinceRS >> 18) & 7)) * (cdReadTime / 2); + // Transformers Beast Wars Transmetals does Setloc(x),SeekL,Setloc(x),ReadN // and then wants some slack time if (cdr.DriveState == DRIVESTATE_PAUSED || cyclesSinceRS < cdReadTime *3/2) seekTime += cdReadTime; - seekTime = MIN_VALUE(seekTime, PSXCLK * 2 / 3); - CDR_LOG("seek: %.2f %.2f (%.2f) st %d di %d\n", (float)seekTime / PSXCLK, + //seekTime = MIN_VALUE(seekTime, PSXCLK * 2 / 3); + CDR_LOG("seek: %02d:%02d:%02d %.2f %.2f (%.2f) st %d di %d\n", + target[0], target[1], target[2], (float)seekTime / PSXCLK, (float)seekTime / cdReadTime, (float)cyclesSinceRS / cdReadTime, cdr.DriveState, diff); return seekTime; From bd7eb07e41ebaf1b9eed155919c9061384b10853 Mon Sep 17 00:00:00 2001 From: notaz Date: Mon, 20 Jan 2025 03:08:29 +0200 Subject: [PATCH 09/20] standalone: allow to enable slowboot in config at least --- frontend/menu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/frontend/menu.c b/frontend/menu.c index 773ad606c..462d78c53 100644 --- a/frontend/menu.c +++ b/frontend/menu.c @@ -415,6 +415,7 @@ static const struct { CE_CONFIG_VAL(FractionalFramerate), CE_CONFIG_VAL(PreciseExceptions), CE_CONFIG_VAL(TurboCD), + CE_CONFIG_VAL(SlowBoot), CE_INTVAL(region), CE_INTVAL_V(g_scaler, 3), CE_INTVAL(g_gamma), From f79ea77933ffe03ac9e0a5cae78a037e1938cb62 Mon Sep 17 00:00:00 2001 From: notaz Date: Tue, 21 Jan 2025 01:12:35 +0200 Subject: [PATCH 10/20] spu: rework reverb according to nocash docs rpi4 performs about the same --- plugins/dfsound/externals.h | 76 +++---- plugins/dfsound/registers.c | 64 +++--- plugins/dfsound/reverb.c | 395 +++++++++++------------------------- 3 files changed, 184 
insertions(+), 351 deletions(-) diff --git a/plugins/dfsound/externals.h b/plugins/dfsound/externals.h index 6dbbac67a..6fdef9db1 100644 --- a/plugins/dfsound/externals.h +++ b/plugins/dfsound/externals.h @@ -128,43 +128,45 @@ typedef struct int VolLeft; int VolRight; - int FB_SRC_A; // (offset) - int FB_SRC_B; // (offset) - int IIR_ALPHA; // (coef.) - int ACC_COEF_A; // (coef.) - int ACC_COEF_B; // (coef.) - int ACC_COEF_C; // (coef.) - int ACC_COEF_D; // (coef.) - int IIR_COEF; // (coef.) - int FB_ALPHA; // (coef.) - int FB_X; // (coef.) - int IIR_DEST_A0; // (offset) - int IIR_DEST_A1; // (offset) - int ACC_SRC_A0; // (offset) - int ACC_SRC_A1; // (offset) - int ACC_SRC_B0; // (offset) - int ACC_SRC_B1; // (offset) - int IIR_SRC_A0; // (offset) - int IIR_SRC_A1; // (offset) - int IIR_DEST_B0; // (offset) - int IIR_DEST_B1; // (offset) - int ACC_SRC_C0; // (offset) - int ACC_SRC_C1; // (offset) - int ACC_SRC_D0; // (offset) - int ACC_SRC_D1; // (offset) - int IIR_SRC_B1; // (offset) - int IIR_SRC_B0; // (offset) - int MIX_DEST_A0; // (offset) - int MIX_DEST_A1; // (offset) - int MIX_DEST_B0; // (offset) - int MIX_DEST_B1; // (offset) - int IN_COEF_L; // (coef.) - int IN_COEF_R; // (coef.) 
- - int dirty; // registers changed - - // MIX_DEST_xx - FB_SRC_x - int FB_SRC_A0, FB_SRC_A1, FB_SRC_B0, FB_SRC_B1; + // directly from nocash docs + //int dAPF1; // 1DC0 disp Reverb APF Offset 1 + //int dAPF2; // 1DC2 disp Reverb APF Offset 2 + int vIIR; // 1DC4 volume Reverb Reflection Volume 1 + int vCOMB1; // 1DC6 volume Reverb Comb Volume 1 + int vCOMB2; // 1DC8 volume Reverb Comb Volume 2 + int vCOMB3; // 1DCA volume Reverb Comb Volume 3 + int vCOMB4; // 1DCC volume Reverb Comb Volume 4 + int vWALL; // 1DCE volume Reverb Reflection Volume 2 + int vAPF1; // 1DD0 volume Reverb APF Volume 1 + int vAPF2; // 1DD2 volume Reverb APF Volume 2 + int mLSAME; // 1DD4 src/dst Reverb Same Side Reflection Address 1 Left + int mRSAME; // 1DD6 src/dst Reverb Same Side Reflection Address 1 Right + int mLCOMB1; // 1DD8 src Reverb Comb Address 1 Left + int mRCOMB1; // 1DDA src Reverb Comb Address 1 Right + int mLCOMB2; // 1DDC src Reverb Comb Address 2 Left + int mRCOMB2; // 1DDE src Reverb Comb Address 2 Right + int dLSAME; // 1DE0 src Reverb Same Side Reflection Address 2 Left + int dRSAME; // 1DE2 src Reverb Same Side Reflection Address 2 Right + int mLDIFF; // 1DE4 src/dst Reverb Different Side Reflect Address 1 Left + int mRDIFF; // 1DE6 src/dst Reverb Different Side Reflect Address 1 Right + int mLCOMB3; // 1DE8 src Reverb Comb Address 3 Left + int mRCOMB3; // 1DEA src Reverb Comb Address 3 Right + int mLCOMB4; // 1DEC src Reverb Comb Address 4 Left + int mRCOMB4; // 1DEE src Reverb Comb Address 4 Right + int dLDIFF; // 1DF0 src Reverb Different Side Reflect Address 2 Left + int dRDIFF; // 1DF2 src Reverb Different Side Reflect Address 2 Right + int mLAPF1; // 1DF4 src/dst Reverb APF Address 1 Left + int mRAPF1; // 1DF6 src/dst Reverb APF Address 1 Right + int mLAPF2; // 1DF8 src/dst Reverb APF Address 2 Left + int mRAPF2; // 1DFA src/dst Reverb APF Address 2 Right + int vLIN; // 1DFC volume Reverb Input Volume Left + int vRIN; // 1DFE volume Reverb Input Volume Right + + 
// subtracted offsets + int mLAPF1_dAPF1, mRAPF1_dAPF1, mLAPF2_dAPF2, mRAPF2_dAPF2; + + int dirty; // registers changed + } REVERBInfo; /////////////////////////////////////////////////////////// diff --git a/plugins/dfsound/registers.c b/plugins/dfsound/registers.c index 6d72d3ca1..75e7d7a11 100644 --- a/plugins/dfsound/registers.c +++ b/plugins/dfsound/registers.c @@ -278,38 +278,38 @@ void CALLBACK SPUwriteRegister(unsigned long reg, unsigned short val, ReverbOn(16,24,val); break; //-------------------------------------------------// - case H_Reverb+0 : goto rvbd; - case H_Reverb+2 : goto rvbd; - case H_Reverb+4 : spu.rvb->IIR_ALPHA=(short)val; break; - case H_Reverb+6 : spu.rvb->ACC_COEF_A=(short)val; break; - case H_Reverb+8 : spu.rvb->ACC_COEF_B=(short)val; break; - case H_Reverb+10 : spu.rvb->ACC_COEF_C=(short)val; break; - case H_Reverb+12 : spu.rvb->ACC_COEF_D=(short)val; break; - case H_Reverb+14 : spu.rvb->IIR_COEF=(short)val; break; - case H_Reverb+16 : spu.rvb->FB_ALPHA=(short)val; break; - case H_Reverb+18 : spu.rvb->FB_X=(short)val; break; - case H_Reverb+20 : goto rvbd; - case H_Reverb+22 : goto rvbd; - case H_Reverb+24 : goto rvbd; - case H_Reverb+26 : goto rvbd; - case H_Reverb+28 : goto rvbd; - case H_Reverb+30 : goto rvbd; - case H_Reverb+32 : goto rvbd; - case H_Reverb+34 : goto rvbd; - case H_Reverb+36 : goto rvbd; - case H_Reverb+38 : goto rvbd; - case H_Reverb+40 : goto rvbd; - case H_Reverb+42 : goto rvbd; - case H_Reverb+44 : goto rvbd; - case H_Reverb+46 : goto rvbd; - case H_Reverb+48 : goto rvbd; - case H_Reverb+50 : goto rvbd; - case H_Reverb+52 : goto rvbd; - case H_Reverb+54 : goto rvbd; - case H_Reverb+56 : goto rvbd; - case H_Reverb+58 : goto rvbd; - case H_Reverb+60 : spu.rvb->IN_COEF_L=(short)val; break; - case H_Reverb+62 : spu.rvb->IN_COEF_R=(short)val; break; + case H_Reverb + 0x00 : goto rvbd; + case H_Reverb + 0x02 : goto rvbd; + case H_Reverb + 0x04 : spu.rvb->vIIR = (signed short)val; break; + case H_Reverb + 0x06 : 
spu.rvb->vCOMB1 = (signed short)val; break; + case H_Reverb + 0x08 : spu.rvb->vCOMB2 = (signed short)val; break; + case H_Reverb + 0x0a : spu.rvb->vCOMB3 = (signed short)val; break; + case H_Reverb + 0x0c : spu.rvb->vCOMB4 = (signed short)val; break; + case H_Reverb + 0x0e : spu.rvb->vWALL = (signed short)val; break; + case H_Reverb + 0x10 : spu.rvb->vAPF1 = (signed short)val; break; + case H_Reverb + 0x12 : spu.rvb->vAPF2 = (signed short)val; break; + case H_Reverb + 0x14 : goto rvbd; + case H_Reverb + 0x16 : goto rvbd; + case H_Reverb + 0x18 : goto rvbd; + case H_Reverb + 0x1a : goto rvbd; + case H_Reverb + 0x1c : goto rvbd; + case H_Reverb + 0x1e : goto rvbd; + case H_Reverb + 0x20 : goto rvbd; + case H_Reverb + 0x22 : goto rvbd; + case H_Reverb + 0x24 : goto rvbd; + case H_Reverb + 0x26 : goto rvbd; + case H_Reverb + 0x28 : goto rvbd; + case H_Reverb + 0x2a : goto rvbd; + case H_Reverb + 0x2c : goto rvbd; + case H_Reverb + 0x2e : goto rvbd; + case H_Reverb + 0x30 : goto rvbd; + case H_Reverb + 0x32 : goto rvbd; + case H_Reverb + 0x34 : goto rvbd; + case H_Reverb + 0x36 : goto rvbd; + case H_Reverb + 0x38 : goto rvbd; + case H_Reverb + 0x3a : goto rvbd; + case H_Reverb + 0x3c : spu.rvb->vLIN = (signed short)val; break; + case H_Reverb + 0x3e : spu.rvb->vRIN = (signed short)val; break; } return; diff --git a/plugins/dfsound/reverb.c b/plugins/dfsound/reverb.c index c0ecea1da..8d31c35b1 100644 --- a/plugins/dfsound/reverb.c +++ b/plugins/dfsound/reverb.c @@ -21,6 +21,7 @@ #include "stdafx.h" #include "spu.h" +#include #define _IN_REVERB @@ -42,130 +43,136 @@ INLINE void StartREVERB(int ch) //////////////////////////////////////////////////////////////////////// -INLINE int rvb2ram_offs(int curr, int space, int iOff) +INLINE int rvb_wrap(int ofs, int space) { - iOff += curr; - if (iOff >= 0x40000) iOff -= space; - return iOff; +#if 0 + int mask = (0x3ffff - ofs) >> 31; + ofs = ofs - (space & mask); +#else + if (ofs >= 0x40000) + ofs -= space; +#endif + //assert(ofs 
>= 0x40000 - space); + //assert(ofs < 0x40000); + return ofs; +} + +INLINE int rvb2ram_offs(int curr, int space, int ofs) +{ + ofs += curr; + return rvb_wrap(ofs, space); } // get_buffer content helper: takes care about wraps #define g_buffer(var) \ - ((int)(signed short)LE16TOH(spu.spuMem[rvb2ram_offs(curr_addr, space, rvb->var)])) + ((int)(signed short)LE16TOH(spuMem[rvb2ram_offs(curr_addr, space, var)])) // saturate iVal and store it as var -#define s_buffer(var, iVal) \ +#define s_buffer_w(var, iVal) \ ssat32_to_16(iVal); \ - spu.spuMem[rvb2ram_offs(curr_addr, space, rvb->var)] = HTOLE16(iVal) - -#define s_buffer1(var, iVal) \ - ssat32_to_16(iVal); \ - spu.spuMem[rvb2ram_offs(curr_addr, space, rvb->var + 1)] = HTOLE16(iVal) + spuMem[rvb2ram_offs(curr_addr, space, var)] = HTOLE16(iVal) //////////////////////////////////////////////////////////////////////// -// portions based on spu2-x from PCSX2 +// from nocash psx-spx static void MixREVERB(int *SSumLR, int *RVB, int ns_to, int curr_addr) { + unsigned short *spuMem = spu.spuMem; const REVERBInfo *rvb = spu.rvb; - int IIR_ALPHA = rvb->IIR_ALPHA; - int IIR_COEF = rvb->IIR_COEF; int space = 0x40000 - rvb->StartAddr; - int l, r, ns; + int mlsame_m2o = rvb->mLSAME + space - 1; + int mrsame_m2o = rvb->mRSAME + space - 1; + int mldiff_m2o = rvb->mLDIFF + space - 1; + int mrdiff_m2o = rvb->mRDIFF + space - 1; + int vCOMB1 = rvb->vCOMB1, vCOMB2 = rvb->vCOMB2; + int vCOMB3 = rvb->vCOMB3, vCOMB4 = rvb->vCOMB4; + int vAPF1 = rvb->vAPF1, vAPF2 = rvb->vAPF2; + int vIIR = rvb->vIIR; + int vWALL = rvb->vWALL; + int ns; + + if (mlsame_m2o >= space) mlsame_m2o -= space; + if (mrsame_m2o >= space) mrsame_m2o -= space; + if (mldiff_m2o >= space) mldiff_m2o -= space; + if (mrdiff_m2o >= space) mrdiff_m2o -= space; for (ns = 0; ns < ns_to * 2; ) { - int ACC0, ACC1, FB_A0, FB_A1, FB_B0, FB_B1; - int mix_dest_a0, mix_dest_a1, mix_dest_b0, mix_dest_b1; - - int input_L = RVB[ns] * rvb->IN_COEF_L; - int input_R = RVB[ns+1] * 
rvb->IN_COEF_R; - - int IIR_INPUT_A0 = ((g_buffer(IIR_SRC_A0) * IIR_COEF) + input_L) >> 15; - int IIR_INPUT_A1 = ((g_buffer(IIR_SRC_A1) * IIR_COEF) + input_R) >> 15; - int IIR_INPUT_B0 = ((g_buffer(IIR_SRC_B0) * IIR_COEF) + input_L) >> 15; - int IIR_INPUT_B1 = ((g_buffer(IIR_SRC_B1) * IIR_COEF) + input_R) >> 15; - - int iir_dest_a0 = g_buffer(IIR_DEST_A0); - int iir_dest_a1 = g_buffer(IIR_DEST_A1); - int iir_dest_b0 = g_buffer(IIR_DEST_B0); - int iir_dest_b1 = g_buffer(IIR_DEST_B1); - - int IIR_A0 = iir_dest_a0 + ((IIR_INPUT_A0 - iir_dest_a0) * IIR_ALPHA >> 15); - int IIR_A1 = iir_dest_a1 + ((IIR_INPUT_A1 - iir_dest_a1) * IIR_ALPHA >> 15); - int IIR_B0 = iir_dest_b0 + ((IIR_INPUT_B0 - iir_dest_b0) * IIR_ALPHA >> 15); - int IIR_B1 = iir_dest_b1 + ((IIR_INPUT_B1 - iir_dest_b1) * IIR_ALPHA >> 15); + int Lin = RVB[ns] * rvb->vLIN; + int Rin = RVB[ns+1] * rvb->vRIN; + int mlsame_m2 = g_buffer(mlsame_m2o) << 15; // -1 + int mrsame_m2 = g_buffer(mrsame_m2o) << 15; + int mldiff_m2 = g_buffer(mldiff_m2o) << 15; + int mrdiff_m2 = g_buffer(mrdiff_m2o) << 15; + int Lout, Rout; + + mlsame_m2 += ((Lin + g_buffer(rvb->dLSAME) * vWALL - mlsame_m2) >> 15) * vIIR; + mrsame_m2 += ((Rin + g_buffer(rvb->dRSAME) * vWALL - mrsame_m2) >> 15) * vIIR; + mldiff_m2 += ((Lin + g_buffer(rvb->dLDIFF) * vWALL - mldiff_m2) >> 15) * vIIR; + mrdiff_m2 += ((Rin + g_buffer(rvb->dRDIFF) * vWALL - mrdiff_m2) >> 15) * vIIR; + mlsame_m2 >>= 15; s_buffer_w(rvb->mLSAME, mlsame_m2); + mrsame_m2 >>= 15; s_buffer_w(rvb->mRSAME, mrsame_m2); + mldiff_m2 >>= 15; s_buffer_w(rvb->mLDIFF, mldiff_m2); + mrdiff_m2 >>= 15; s_buffer_w(rvb->mRDIFF, mrdiff_m2); + + Lout = vCOMB1 * g_buffer(rvb->mLCOMB1) + vCOMB2 * g_buffer(rvb->mLCOMB2) + + vCOMB3 * g_buffer(rvb->mLCOMB3) + vCOMB4 * g_buffer(rvb->mLCOMB4); + Rout = vCOMB1 * g_buffer(rvb->mRCOMB1) + vCOMB2 * g_buffer(rvb->mRCOMB2) + + vCOMB3 * g_buffer(rvb->mRCOMB3) + vCOMB4 * g_buffer(rvb->mRCOMB4); preload(SSumLR + ns + 64*2/4 - 4); - s_buffer1(IIR_DEST_A0, IIR_A0); - 
s_buffer1(IIR_DEST_A1, IIR_A1); - s_buffer1(IIR_DEST_B0, IIR_B0); - s_buffer1(IIR_DEST_B1, IIR_B1); + Lout -= vAPF1 * g_buffer(rvb->mLAPF1_dAPF1); Lout >>= 15; + Rout -= vAPF1 * g_buffer(rvb->mRAPF1_dAPF1); Rout >>= 15; + s_buffer_w(rvb->mLAPF1, Lout); + s_buffer_w(rvb->mRAPF1, Rout); + Lout = Lout * vAPF1 + (g_buffer(rvb->mLAPF1_dAPF1) << 15); + Rout = Rout * vAPF1 + (g_buffer(rvb->mRAPF1_dAPF1) << 15); preload(RVB + ns + 64*2/4 - 4); - ACC0 = (g_buffer(ACC_SRC_A0) * rvb->ACC_COEF_A + - g_buffer(ACC_SRC_B0) * rvb->ACC_COEF_B + - g_buffer(ACC_SRC_C0) * rvb->ACC_COEF_C + - g_buffer(ACC_SRC_D0) * rvb->ACC_COEF_D) >> 15; - ACC1 = (g_buffer(ACC_SRC_A1) * rvb->ACC_COEF_A + - g_buffer(ACC_SRC_B1) * rvb->ACC_COEF_B + - g_buffer(ACC_SRC_C1) * rvb->ACC_COEF_C + - g_buffer(ACC_SRC_D1) * rvb->ACC_COEF_D) >> 15; - - FB_A0 = g_buffer(FB_SRC_A0); - FB_A1 = g_buffer(FB_SRC_A1); - FB_B0 = g_buffer(FB_SRC_B0); - FB_B1 = g_buffer(FB_SRC_B1); - - mix_dest_a0 = ACC0 - ((FB_A0 * rvb->FB_ALPHA) >> 15); - mix_dest_a1 = ACC1 - ((FB_A1 * rvb->FB_ALPHA) >> 15); - - mix_dest_b0 = FB_A0 + (((ACC0 - FB_A0) * rvb->FB_ALPHA - FB_B0 * rvb->FB_X) >> 15); - mix_dest_b1 = FB_A1 + (((ACC1 - FB_A1) * rvb->FB_ALPHA - FB_B1 * rvb->FB_X) >> 15); + Lout -= vAPF2 * g_buffer(rvb->mLAPF2_dAPF2); Lout >>= 15; + Rout -= vAPF2 * g_buffer(rvb->mRAPF2_dAPF2); Rout >>= 15; + s_buffer_w(rvb->mLAPF2, Lout); + s_buffer_w(rvb->mRAPF2, Rout); + Lout = Lout * vAPF2 + (g_buffer(rvb->mLAPF2_dAPF2) << 15); + Rout = Rout * vAPF2 + (g_buffer(rvb->mRAPF2_dAPF2) << 15); - s_buffer(MIX_DEST_A0, mix_dest_a0); - s_buffer(MIX_DEST_A1, mix_dest_a1); - s_buffer(MIX_DEST_B0, mix_dest_b0); - s_buffer(MIX_DEST_B1, mix_dest_b1); + Lout = ((Lout >> 15) * rvb->VolLeft) >> 15; + Rout = ((Rout >> 15) * rvb->VolRight) >> 15; - l = (mix_dest_a0 + mix_dest_b0) / 2; - r = (mix_dest_a1 + mix_dest_b1) / 2; - - l = (l * rvb->VolLeft) >> 15; // 15? 
- r = (r * rvb->VolRight) >> 15; - - SSumLR[ns++] += l; - SSumLR[ns++] += r; - SSumLR[ns++] += l; - SSumLR[ns++] += r; + SSumLR[ns++] += Lout; + SSumLR[ns++] += Rout; + SSumLR[ns++] += Lout; + SSumLR[ns++] += Rout; curr_addr++; - if (curr_addr >= 0x40000) curr_addr = rvb->StartAddr; + curr_addr = rvb_wrap(curr_addr, space); } } static void MixREVERB_off(int *SSumLR, int ns_to, int curr_addr) { const REVERBInfo *rvb = spu.rvb; + unsigned short *spuMem = spu.spuMem; int space = 0x40000 - rvb->StartAddr; - int l, r, ns; + int Lout, Rout, ns; for (ns = 0; ns < ns_to * 2; ) { preload(SSumLR + ns + 64*2/4 - 4); - l = (g_buffer(MIX_DEST_A0) + g_buffer(MIX_DEST_B0)) / 2; - r = (g_buffer(MIX_DEST_A1) + g_buffer(MIX_DEST_B1)) / 2; + // todo: is this missing COMB and APF1? + Lout = g_buffer(rvb->mLAPF2_dAPF2); + Rout = g_buffer(rvb->mLAPF2_dAPF2); - l = (l * rvb->VolLeft) >> 15; - r = (r * rvb->VolRight) >> 15; + Lout = (Lout * rvb->VolLeft) >> 15; + Rout = (Rout * rvb->VolRight) >> 15; - SSumLR[ns++] += l; - SSumLR[ns++] += r; - SSumLR[ns++] += l; - SSumLR[ns++] += r; + SSumLR[ns++] += Lout; + SSumLR[ns++] += Rout; + SSumLR[ns++] += Lout; + SSumLR[ns++] += Rout; curr_addr++; if (curr_addr >= 0x40000) curr_addr = rvb->StartAddr; @@ -199,30 +206,30 @@ static void REVERBPrep(void) t -= space; \ rvb->d = t - prep_offs(IIR_SRC_A0, 32); - prep_offs(IIR_SRC_A1, 34); - prep_offs(IIR_SRC_B0, 36); - prep_offs(IIR_SRC_B1, 38); - prep_offs(IIR_DEST_A0, 20); - prep_offs(IIR_DEST_A1, 22); - prep_offs(IIR_DEST_B0, 36); - prep_offs(IIR_DEST_B1, 38); - prep_offs(ACC_SRC_A0, 24); - prep_offs(ACC_SRC_A1, 26); - prep_offs(ACC_SRC_B0, 28); - prep_offs(ACC_SRC_B1, 30); - prep_offs(ACC_SRC_C0, 40); - prep_offs(ACC_SRC_C1, 42); - prep_offs(ACC_SRC_D0, 44); - prep_offs(ACC_SRC_D1, 46); - prep_offs(MIX_DEST_A0, 52); - prep_offs(MIX_DEST_A1, 54); - prep_offs(MIX_DEST_B0, 56); - prep_offs(MIX_DEST_B1, 58); - prep_offs2(FB_SRC_A0, 52, 0); - prep_offs2(FB_SRC_A1, 54, 0); - prep_offs2(FB_SRC_B0, 56, 2); - 
prep_offs2(FB_SRC_B1, 58, 2); + prep_offs(mLSAME, 0x14); + prep_offs(mRSAME, 0x16); + prep_offs(mLCOMB1, 0x18); + prep_offs(mRCOMB1, 0x1a); + prep_offs(mLCOMB2, 0x1c); + prep_offs(mRCOMB2, 0x1e); + prep_offs(dLSAME, 0x20); + prep_offs(dRSAME, 0x22); + prep_offs(mLDIFF, 0x24); + prep_offs(mRDIFF, 0x26); + prep_offs(mLCOMB3, 0x28); + prep_offs(mRCOMB3, 0x2a); + prep_offs(mLCOMB4, 0x2c); + prep_offs(mRCOMB4, 0x2e); + prep_offs(dLDIFF, 0x30); + prep_offs(dRDIFF, 0x32); + prep_offs(mLAPF1, 0x34); + prep_offs(mRAPF1, 0x36); + prep_offs(mLAPF2, 0x38); + prep_offs(mRAPF2, 0x3a); + prep_offs2(mLAPF1_dAPF1, 0x34, 0); + prep_offs2(mRAPF1_dAPF1, 0x36, 0); + prep_offs2(mLAPF2_dAPF2, 0x38, 2); + prep_offs2(mRAPF2_dAPF2, 0x3a, 2); #undef prep_offs #undef prep_offs2 @@ -245,180 +252,4 @@ INLINE void REVERBDo(int *SSumLR, int *RVB, int ns_to, int curr_addr) #endif -/* ------------------------------------------------------------------------------ -PSX reverb hardware notes -by Neill Corlett ------------------------------------------------------------------------------ - -Yadda yadda disclaimer yadda probably not perfect yadda well it's okay anyway -yadda yadda. - ------------------------------------------------------------------------------ - -Basics ------- - -- The reverb buffer is 22khz 16-bit mono PCM. -- It starts at the reverb address given by 1DA2, extends to - the end of sound RAM, and wraps back to the 1DA2 address. - -Setting the address at 1DA2 resets the current reverb work address. - -This work address ALWAYS increments every 1/22050 sec., regardless of -whether reverb is enabled (bit 7 of 1DAA set). - -And the contents of the reverb buffer ALWAYS play, scaled by the -"reverberation depth left/right" volumes (1D84/1D86). -(which, by the way, appear to be scaled so 3FFF=approx. 1.0, 4000=-1.0) - ------------------------------------------------------------------------------ - -Register names --------------- - -These are probably not their real names. 
-These are probably not even correct names. -We will use them anyway, because we can. - -1DC0: FB_SRC_A (offset) -1DC2: FB_SRC_B (offset) -1DC4: IIR_ALPHA (coef.) -1DC6: ACC_COEF_A (coef.) -1DC8: ACC_COEF_B (coef.) -1DCA: ACC_COEF_C (coef.) -1DCC: ACC_COEF_D (coef.) -1DCE: IIR_COEF (coef.) -1DD0: FB_ALPHA (coef.) -1DD2: FB_X (coef.) -1DD4: IIR_DEST_A0 (offset) -1DD6: IIR_DEST_A1 (offset) -1DD8: ACC_SRC_A0 (offset) -1DDA: ACC_SRC_A1 (offset) -1DDC: ACC_SRC_B0 (offset) -1DDE: ACC_SRC_B1 (offset) -1DE0: IIR_SRC_A0 (offset) -1DE2: IIR_SRC_A1 (offset) -1DE4: IIR_DEST_B0 (offset) -1DE6: IIR_DEST_B1 (offset) -1DE8: ACC_SRC_C0 (offset) -1DEA: ACC_SRC_C1 (offset) -1DEC: ACC_SRC_D0 (offset) -1DEE: ACC_SRC_D1 (offset) -1DF0: IIR_SRC_B1 (offset) -1DF2: IIR_SRC_B0 (offset) -1DF4: MIX_DEST_A0 (offset) -1DF6: MIX_DEST_A1 (offset) -1DF8: MIX_DEST_B0 (offset) -1DFA: MIX_DEST_B1 (offset) -1DFC: IN_COEF_L (coef.) -1DFE: IN_COEF_R (coef.) - -The coefficients are signed fractional values. --32768 would be -1.0 - 32768 would be 1.0 (if it were possible... the highest is of course 32767) - -The offsets are (byte/8) offsets into the reverb buffer. -i.e. you multiply them by 8, you get byte offsets. -You can also think of them as (samples/4) offsets. -They appear to be signed. They can be negative. -None of the documented presets make them negative, though. - -Yes, 1DF0 and 1DF2 appear to be backwards. Not a typo. - ------------------------------------------------------------------------------ - -What it does ------------- - -We take all reverb sources: -- regular channels that have the reverb bit on -- cd and external sources, if their reverb bits are on -and mix them into one stereo 44100hz signal. - -Lowpass/downsample that to 22050hz. The PSX uses a proper bandlimiting -algorithm here, but I haven't figured out the hysterically exact specifics. 
-I use an 8-tap filter with these coefficients, which are nice but probably -not the real ones: - -0.037828187894 -0.157538631280 -0.321159685278 -0.449322115345 -0.449322115345 -0.321159685278 -0.157538631280 -0.037828187894 - -So we have two input samples (INPUT_SAMPLE_L, INPUT_SAMPLE_R) every 22050hz. - -* IN MY EMULATION, I divide these by 2 to make it clip less. - (and of course the L/R output coefficients are adjusted to compensate) - The real thing appears to not do this. - -At every 22050hz tick: -- If the reverb bit is enabled (bit 7 of 1DAA), execute the reverb - steady-state algorithm described below -- AFTERWARDS, retrieve the "wet out" L and R samples from the reverb buffer - (This part may not be exactly right and I guessed at the coefs. TODO: check later.) - L is: 0.333 * (buffer[MIX_DEST_A0] + buffer[MIX_DEST_B0]) - R is: 0.333 * (buffer[MIX_DEST_A1] + buffer[MIX_DEST_B1]) -- Advance the current buffer position by 1 sample - -The wet out L and R are then upsampled to 44100hz and played at the -"reverberation depth left/right" (1D84/1D86) volume, independent of the main -volume. - ------------------------------------------------------------------------------ - -Reverb steady-state -------------------- - -The reverb steady-state algorithm is fairly clever, and of course by -"clever" I mean "batshit insane". - -buffer[x] is relative to the current buffer position, not the beginning of -the buffer. Note that all buffer offsets must wrap around so they're -contained within the reverb work area. - -Clipping is performed at the end... maybe also sooner, but definitely at -the end. 
- -IIR_INPUT_A0 = buffer[IIR_SRC_A0] * IIR_COEF + INPUT_SAMPLE_L * IN_COEF_L; -IIR_INPUT_A1 = buffer[IIR_SRC_A1] * IIR_COEF + INPUT_SAMPLE_R * IN_COEF_R; -IIR_INPUT_B0 = buffer[IIR_SRC_B0] * IIR_COEF + INPUT_SAMPLE_L * IN_COEF_L; -IIR_INPUT_B1 = buffer[IIR_SRC_B1] * IIR_COEF + INPUT_SAMPLE_R * IN_COEF_R; - -IIR_A0 = IIR_INPUT_A0 * IIR_ALPHA + buffer[IIR_DEST_A0] * (1.0 - IIR_ALPHA); -IIR_A1 = IIR_INPUT_A1 * IIR_ALPHA + buffer[IIR_DEST_A1] * (1.0 - IIR_ALPHA); -IIR_B0 = IIR_INPUT_B0 * IIR_ALPHA + buffer[IIR_DEST_B0] * (1.0 - IIR_ALPHA); -IIR_B1 = IIR_INPUT_B1 * IIR_ALPHA + buffer[IIR_DEST_B1] * (1.0 - IIR_ALPHA); - -buffer[IIR_DEST_A0 + 1sample] = IIR_A0; -buffer[IIR_DEST_A1 + 1sample] = IIR_A1; -buffer[IIR_DEST_B0 + 1sample] = IIR_B0; -buffer[IIR_DEST_B1 + 1sample] = IIR_B1; - -ACC0 = buffer[ACC_SRC_A0] * ACC_COEF_A + - buffer[ACC_SRC_B0] * ACC_COEF_B + - buffer[ACC_SRC_C0] * ACC_COEF_C + - buffer[ACC_SRC_D0] * ACC_COEF_D; -ACC1 = buffer[ACC_SRC_A1] * ACC_COEF_A + - buffer[ACC_SRC_B1] * ACC_COEF_B + - buffer[ACC_SRC_C1] * ACC_COEF_C + - buffer[ACC_SRC_D1] * ACC_COEF_D; - -FB_A0 = buffer[MIX_DEST_A0 - FB_SRC_A]; -FB_A1 = buffer[MIX_DEST_A1 - FB_SRC_A]; -FB_B0 = buffer[MIX_DEST_B0 - FB_SRC_B]; -FB_B1 = buffer[MIX_DEST_B1 - FB_SRC_B]; - -buffer[MIX_DEST_A0] = ACC0 - FB_A0 * FB_ALPHA; -buffer[MIX_DEST_A1] = ACC1 - FB_A1 * FB_ALPHA; -buffer[MIX_DEST_B0] = (FB_ALPHA * ACC0) - FB_A0 * (FB_ALPHA^0x8000) - FB_B0 * FB_X; -buffer[MIX_DEST_B1] = (FB_ALPHA * ACC1) - FB_A1 * (FB_ALPHA^0x8000) - FB_B1 * FB_X; - ------------------------------------------------------------------------------ -*/ - // vim:shiftwidth=1:expandtab From c6809aec4a68d45e31ef4bc8b6bcb8e0ec160684 Mon Sep 17 00:00:00 2001 From: notaz Date: Tue, 21 Jan 2025 22:51:15 +0200 Subject: [PATCH 11/20] cpu: make sure config is applied when core is changed --- frontend/libretro.c | 1 - frontend/menu.c | 1 - libpcsxcore/lightrec/plugin.c | 5 +++-- libpcsxcore/misc.c | 11 ++++++----- libpcsxcore/new_dynarec/emu_if.c | 3 
+++ libpcsxcore/psxinterpreter.c | 1 + libpcsxcore/psxinterpreter.h | 1 + 7 files changed, 14 insertions(+), 9 deletions(-) diff --git a/frontend/libretro.c b/frontend/libretro.c index 86c336735..14ba8fc32 100644 --- a/frontend/libretro.c +++ b/frontend/libretro.c @@ -2311,7 +2311,6 @@ static void update_variables(bool in_flight) prev_cpu->Notify(R3000ACPU_NOTIFY_BEFORE_SAVE, NULL); prev_cpu->Shutdown(); psxCpu->Init(); - psxCpu->Reset(); psxCpu->Notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL); } } diff --git a/frontend/menu.c b/frontend/menu.c index 462d78c53..ec803abe3 100644 --- a/frontend/menu.c +++ b/frontend/menu.c @@ -2739,7 +2739,6 @@ void menu_prepare_emu(void) prev_cpu->Notify(R3000ACPU_NOTIFY_BEFORE_SAVE, NULL); prev_cpu->Shutdown(); psxCpu->Init(); - psxCpu->Reset(); psxCpu->Notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL); } diff --git a/libpcsxcore/lightrec/plugin.c b/libpcsxcore/lightrec/plugin.c index 0ca44e443..a04d42d9d 100644 --- a/libpcsxcore/lightrec/plugin.c +++ b/libpcsxcore/lightrec/plugin.c @@ -78,6 +78,7 @@ static u32 lightrec_begin_cycles; extern u32 lightrec_hacks; +static void lightrec_plugin_apply_config(); extern void lightrec_code_inv(void *ptr, uint32_t len); enum my_cp2_opcodes { @@ -493,6 +494,7 @@ static int lightrec_plugin_init(void) #ifndef _WIN32 signal(SIGPIPE, exit); #endif + lightrec_plugin_apply_config(); return 0; } @@ -726,6 +728,7 @@ static void lightrec_plugin_apply_config() cycles_per_op_old = cycles_per_op; lightrec_set_cycles_per_opcode(lightrec_state, cycles_per_op); + lightrec_set_unsafe_opt_flags(lightrec_state, lightrec_hacks); intApplyConfig(); } @@ -756,8 +759,6 @@ static void lightrec_plugin_reset(void) regs->cp0[12] = 0x10900000; // COP0 enabled | BEV = 1 | TS = 1 regs->cp0[15] = 0x00000002; // PRevID = Revision ID, same as R3000A - - lightrec_set_unsafe_opt_flags(lightrec_state, lightrec_hacks); } static void lightrec_plugin_sync_regs_from_pcsx(bool need_cp2) diff --git a/libpcsxcore/misc.c b/libpcsxcore/misc.c index 
34745adda..357a20a0c 100644 --- a/libpcsxcore/misc.c +++ b/libpcsxcore/misc.c @@ -820,6 +820,12 @@ int LoadState(const char *file) { psxHwFreeze(f, 0); psxRcntFreeze(f, 0); mdecFreeze(f, 0); + + if (Config.HLE != oldhle) { + // at least ari64 drc compiles differently so hard reset + psxCpu->Shutdown(); + psxCpu->Init(); + } ndrc_freeze(f, 0); padFreeze(f, 0); @@ -827,11 +833,6 @@ int LoadState(const char *file) { if (Config.HLE) psxBiosCheckExe(biosBranchCheckOld, 0x60, 1); - if (Config.HLE != oldhle) { - // at least ari64 drc compiles differently so hard reset - psxCpu->Shutdown(); - psxCpu->Init(); - } psxCpu->Notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL); result = 0; diff --git a/libpcsxcore/new_dynarec/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c index 4c21b002f..618cafd83 100644 --- a/libpcsxcore/new_dynarec/emu_if.c +++ b/libpcsxcore/new_dynarec/emu_if.c @@ -570,6 +570,9 @@ static int ari64_init() zeromem_ptr = zero_mem; scratch_buf_ptr = scratch_buf; // for gte_neon.S + ndrc_g.cycle_multiplier_old = Config.cycle_multiplier; + ndrc_g.hacks_old = ndrc_g.hacks | ndrc_g.hacks_pergame; + ari64_apply_config(); ari64_thread_init(); return 0; diff --git a/libpcsxcore/psxinterpreter.c b/libpcsxcore/psxinterpreter.c index 7e7325586..e0edecd64 100644 --- a/libpcsxcore/psxinterpreter.c +++ b/libpcsxcore/psxinterpreter.c @@ -1166,6 +1166,7 @@ void (*psxCP2[64])(struct psxCP2Regs *regs) = { /////////////////////////////////////////// static int intInit() { + intApplyConfig(); return 0; } diff --git a/libpcsxcore/psxinterpreter.h b/libpcsxcore/psxinterpreter.h index bc219a49d..e49498986 100644 --- a/libpcsxcore/psxinterpreter.h +++ b/libpcsxcore/psxinterpreter.h @@ -2,6 +2,7 @@ #define __PSXINTERPRETER_H__ struct psxRegisters; +struct psxCP2Regs; // get an opcode without triggering exceptions or affecting cache u32 intFakeFetch(u32 pc); From e45d2bc4c3440ee8792502bb0a96feec91e26fde Mon Sep 17 00:00:00 2001 From: notaz Date: Tue, 21 Jan 2025 23:12:57 +0200 Subject: [PATCH 12/20] 
db: update libretro/pcsx_rearmed#515 note that lightrec already runs more cycles as it doesn't do any gte stall things --- libpcsxcore/database.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libpcsxcore/database.c b/libpcsxcore/database.c index eced23031..d9ec221cb 100644 --- a/libpcsxcore/database.c +++ b/libpcsxcore/database.c @@ -154,6 +154,9 @@ cycle_multiplier_overrides[] = { 153, { "SLUS00943" } }, /* Sol Divide: FMV timing */ { 200, { "SLUS01519", "SCPS45260", "SLPS01463" } }, + /* Legend of Legaia - some attack moves lag and cause a/v desync */ + { 160, { "SCUS94254", "SCUS94366", "SCES01752" } }, + { 160, { "SCES01944", "SCES01945", "SCES01946", "SCES01947" } }, }; static const struct From 89874a81d87c5f2fda85fc47bd1a0cf2c681a5d0 Mon Sep 17 00:00:00 2001 From: notaz Date: Wed, 22 Jan 2025 03:10:57 +0200 Subject: [PATCH 13/20] drc: rework vsync --- libpcsxcore/new_dynarec/new_dynarec.c | 107 +++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 1 deletion(-) diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 4052f7486..7942cd0c6 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -253,7 +253,7 @@ static struct decoded_insn u_char is_delay_load:1; // is_load + MFC/CFC u_char is_exception:1; // unconditional, also interp. 
fallback u_char may_except:1; // might generate an exception - u_char ls_type:2; // load/store type (ls_width_type) + u_char ls_type:2; // load/store type (ls_width_type LS_*) } dops[MAXBLOCK]; enum ls_width_type { @@ -309,6 +309,7 @@ static struct compile_info static u_int expirep; static u_int stop_after_jal; static u_int f1_hack; + static u_int vsync_hack; #ifdef STAT_PRINT static int stat_bc_direct; static int stat_bc_pre; @@ -5543,6 +5544,52 @@ static void rjump_assemble(int i, const struct regstat *i_regs) #endif } +static void vsync_hack_assemble(int i, int ld_ofs, int cc) +{ + int sp = get_reg(branch_regs[i].regmap, 29); + int ro = get_reg(branch_regs[i].regmap, ROREG); + int cycles = CLOCK_ADJUST(9+5) * 16; + void *t_exit[3], *loop_target, *t_loop_break; + int j; + if (sp < 0 || (ram_offset && ro < 0)) + return; + assem_debug("; vsync hack\n"); + host_tempreg_acquire(); + emit_cmpimm(cc, -cycles); + t_exit[0] = out; + emit_jge(0); + emit_cmpimm(sp, RAM_SIZE); + t_exit[1] = out; + emit_jno(0); + if (ro >= 0) { + emit_addimm(sp, ld_ofs, HOST_TEMPREG); + emit_ldr_dualindexed(ro, HOST_TEMPREG, HOST_TEMPREG); + } + else + emit_readword_indexed(ld_ofs, sp, HOST_TEMPREG); + emit_cmpimm(HOST_TEMPREG, 17); + t_exit[2] = out; + emit_jl(0); + + assem_debug("1:\n"); + loop_target = out; + emit_addimm(HOST_TEMPREG, -16, HOST_TEMPREG); + emit_addimm(cc, cycles, cc); + emit_cmpimm(HOST_TEMPREG, 17); + t_loop_break = out; + emit_jl(DJT_2); + emit_cmpimm(cc, -cycles); + emit_jl(loop_target); + + assem_debug("2:\n"); + set_jump_target(t_loop_break, out); + do_store_word(sp, ld_ofs, HOST_TEMPREG, ro, 1); + + for (j = 0; j < ARRAY_SIZE(t_exit); j++) + set_jump_target(t_exit[j], out); + host_tempreg_release(); +} + static void cjump_assemble(int i, const struct regstat *i_regs) { const signed char *i_regmap = i_regs->regmap; @@ -5556,6 +5603,7 @@ static void cjump_assemble(int i, const struct regstat *i_regs) int internal=internal_branch(cinfo[i].ba); 
if(i==(cinfo[i].ba-start)>>2) assem_debug("idle loop\n"); if(!match) invert=1; + if (vsync_hack && (vsync_hack >> 16) == i) invert=1; #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if(i>(cinfo[i].ba-start)>>2) invert=1; #endif @@ -5690,6 +5738,8 @@ static void cjump_assemble(int i, const struct regstat *i_regs) } if(invert) { if(taken) set_jump_target(taken, out); + if (vsync_hack && (vsync_hack >> 16) == i) + vsync_hack_assemble(i, vsync_hack & 0xffff, cc); #ifdef CORTEX_A8_BRANCH_PREDICTION_HACK if (match && (!internal || !dops[(cinfo[i].ba-start)>>2].is_ds)) { if(adj) { @@ -6600,9 +6650,55 @@ static void force_intcall(int i) cinfo[i].ba = -1; } +static noinline void do_vsync(int i) +{ + // lui a0, x; addiu a0, x; jal puts + u32 addr = (cinfo[i].imm << 16) + (signed short)cinfo[i+1].imm; + char *str = NULL; + int j, t, jals_cnt = 0; + + if (!is_ram_addr(addr)) + return; + str = (char *)psxM + (addr & 0x1fffff); + if (!str || strncmp(str, "VSync: timeout", 14)) + return; + // jal clearPad, jal clearRCnt; j return; nop + for (j = i+2; j < slen; j++) { + if (dops[j].itype == SHIFTIMM || dops[j].itype == IMM16 || dops[j].itype == ALU) + continue; + if (dops[j].opcode == 0x03) { + jals_cnt++; continue; + } + break; + } + if (j >= slen || jals_cnt != 3 || dops[j++].opcode != 0x02) + return; + for (; j < slen; j++) + if (dops[j].itype != SHIFTIMM && dops[j].itype != IMM16) + break; + if (j >= slen || dops[j].opcode != 0x23) // lw x, condition + return; + j += 2; + if (dops[j].opcode != 0 || dops[j].opcode2 != 0x2A) // slt x, y + return; + if (dops[++j].opcode != 0x05) // bnez x, loop + return; + t = (cinfo[j].ba - start) / 4; + if (t < 0 || t >= slen) + return; + // lw x, d(sp) + if (dops[t].opcode != 0x23 || dops[t].rs1 != 29 || (u32)cinfo[t].imm >= 1024) + return; + if (dops[t+2].opcode != 0x09 || cinfo[t+2].imm != -1) // addiu x, -1 + return; + SysPrintf("vsync @%08x\n", start + t*4); + vsync_hack = (j << 16) | (cinfo[t].imm & 0xffff); +} + static int apply_hacks(void) { 
int i; + vsync_hack = 0; if (HACK_ENABLED(NDHACK_NO_COMPAT_HACKS)) return 0; /* special hack(s) */ @@ -6616,6 +6712,13 @@ static int apply_hacks(void) SysPrintf("PE2 hack @%08x\n", start + (i+3)*4); dops[i + 3].itype = NOP; } + // see also: psxBiosCheckExe() + if (i > 1 && dops[i].opcode == 0x0f && dops[i].rt1 == 4 + && dops[i+1].opcode == 0x09 && dops[i+1].rt1 == 4 && dops[i+1].rs1 == 4 + && dops[i+2].opcode == 0x03) + { + do_vsync(i); + } } if (source[0] == 0x3c05edb8 && source[1] == 0x34a58320) { @@ -6640,6 +6743,7 @@ static int apply_hacks(void) return 1; } } +#if 0 // alt vsync, not used if (Config.HLE) { if (start <= psxRegs.biosBranchCheck && psxRegs.biosBranchCheck < start + i*4) @@ -6652,6 +6756,7 @@ static int apply_hacks(void) } } } +#endif return 0; } From 14c9acee3b0ac7cc96cb8e9922139a3dce83cce7 Mon Sep 17 00:00:00 2001 From: notaz Date: Thu, 23 Jan 2025 01:11:27 +0200 Subject: [PATCH 14/20] drc: generate diff-able debug output --- libpcsxcore/new_dynarec/assem_arm.c | 69 ++++++++++++++------------- libpcsxcore/new_dynarec/assem_arm64.c | 24 +++++----- libpcsxcore/new_dynarec/new_dynarec.c | 14 ++++-- 3 files changed, 59 insertions(+), 48 deletions(-) diff --git a/libpcsxcore/new_dynarec/assem_arm.c b/libpcsxcore/new_dynarec/assem_arm.c index 5b1d6fdb8..b08104624 100644 --- a/libpcsxcore/new_dynarec/assem_arm.c +++ b/libpcsxcore/new_dynarec/assem_arm.c @@ -82,23 +82,24 @@ void invalidate_addr_r9(); void invalidate_addr_r10(); void invalidate_addr_r12(); -const u_int invalidate_addr_reg[16] = { - (int)invalidate_addr_r0, - (int)invalidate_addr_r1, - (int)invalidate_addr_r2, - (int)invalidate_addr_r3, - (int)invalidate_addr_r4, - (int)invalidate_addr_r5, - (int)invalidate_addr_r6, - (int)invalidate_addr_r7, - (int)invalidate_addr_r8, - (int)invalidate_addr_r9, - (int)invalidate_addr_r10, +const void *invalidate_addr_reg[16] = { + invalidate_addr_r0, + invalidate_addr_r1, + invalidate_addr_r2, + invalidate_addr_r3, + invalidate_addr_r4, + 
invalidate_addr_r5, + invalidate_addr_r6, + invalidate_addr_r7, + invalidate_addr_r8, + invalidate_addr_r9, + invalidate_addr_r10, 0, - (int)invalidate_addr_r12, + invalidate_addr_r12, 0, 0, - 0}; + 0 +}; /* Linker */ @@ -987,7 +988,7 @@ static int can_jump_or_call(const void *a) static void emit_call(const void *a_) { int a = (int)a_; - assem_debug("bl %x (%x+%x)%s\n",a,(int)out,a-(int)out-8,func_name(a_)); + assem_debug("bl %p%s\n", log_addr(a), func_name(a_)); u_int offset=genjmp(a); output_w32(0xeb000000|offset); } @@ -995,7 +996,7 @@ static void emit_call(const void *a_) static void emit_jmp(const void *a_) { int a = (int)a_; - assem_debug("b %x (%x+%x)%s\n",a,(int)out,a-(int)out-8,func_name(a_)); + assem_debug("b %p%s\n", log_addr(a_), func_name(a_)); u_int offset=genjmp(a); output_w32(0xea000000|offset); } @@ -1003,7 +1004,7 @@ static void emit_jmp(const void *a_) static void emit_jne(const void *a_) { int a = (int)a_; - assem_debug("bne %x\n",a); + assem_debug("bne %p\n", log_addr(a_)); u_int offset=genjmp(a); output_w32(0x1a000000|offset); } @@ -1011,7 +1012,7 @@ static void emit_jne(const void *a_) static void emit_jeq(const void *a_) { int a = (int)a_; - assem_debug("beq %x\n",a); + assem_debug("beq %p\n", log_addr(a_)); u_int offset=genjmp(a); output_w32(0x0a000000|offset); } @@ -1019,7 +1020,7 @@ static void emit_jeq(const void *a_) static void emit_js(const void *a_) { int a = (int)a_; - assem_debug("bmi %x\n",a); + assem_debug("bmi %p\n", log_addr(a_)); u_int offset=genjmp(a); output_w32(0x4a000000|offset); } @@ -1027,7 +1028,7 @@ static void emit_js(const void *a_) static void emit_jns(const void *a_) { int a = (int)a_; - assem_debug("bpl %x\n",a); + assem_debug("bpl %p\n", log_addr(a_)); u_int offset=genjmp(a); output_w32(0x5a000000|offset); } @@ -1035,7 +1036,7 @@ static void emit_jns(const void *a_) static void emit_jl(const void *a_) { int a = (int)a_; - assem_debug("blt %x\n",a); + assem_debug("blt %p\n", log_addr(a_)); u_int offset=genjmp(a); 
output_w32(0xba000000|offset); } @@ -1043,7 +1044,7 @@ static void emit_jl(const void *a_) static void emit_jge(const void *a_) { int a = (int)a_; - assem_debug("bge %x\n",a); + assem_debug("bge %p\n", log_addr(a_)); u_int offset=genjmp(a); output_w32(0xaa000000|offset); } @@ -1051,7 +1052,7 @@ static void emit_jge(const void *a_) static void emit_jo(const void *a_) { int a = (int)a_; - assem_debug("bvs %x\n",a); + assem_debug("bvs %p\n", log_addr(a_)); u_int offset=genjmp(a); output_w32(0x6a000000|offset); } @@ -1059,7 +1060,7 @@ static void emit_jo(const void *a_) static void emit_jno(const void *a_) { int a = (int)a_; - assem_debug("bvc %x\n",a); + assem_debug("bvc %p\n", log_addr(a_)); u_int offset=genjmp(a); output_w32(0x7a000000|offset); } @@ -1067,7 +1068,7 @@ static void emit_jno(const void *a_) static void emit_jc(const void *a_) { int a = (int)a_; - assem_debug("bcs %x\n",a); + assem_debug("bcs %p\n", log_addr(a_)); u_int offset=genjmp(a); output_w32(0x2a000000|offset); } @@ -1075,7 +1076,7 @@ static void emit_jc(const void *a_) static void emit_jcc(const void *a_) { int a = (int)a_; - assem_debug("bcc %x\n",a); + assem_debug("bcc %p\n", log_addr(a_)); u_int offset=genjmp(a); output_w32(0x3a000000|offset); } @@ -1454,9 +1455,10 @@ static void emit_ldrb_indexedsr12_reg(int base, int r, int rt) output_w32(0xe7d00000|rd_rn_rm(rt,base,r)|0x620); } -static void emit_callne(int a) +static void emit_callne(const void *a_) { - assem_debug("blne %x\n",a); + int a = (int)a_; + assem_debug("blne %p\n", log_addr(a_)); u_int offset=genjmp(a); output_w32(0x1b000000|offset); } @@ -1492,10 +1494,11 @@ static attr_unused void emit_addpl_imm(int rs,int imm,int rt) output_w32(0x52800000|rd_rn_rm(rt,rs,0)|armval); } -static void emit_jno_unlikely(int a) +static void emit_jno_unlikely(void *a_) { - //emit_jno(a); - assem_debug("addvc pc,pc,#? (%x)\n",/*a-(int)out-8,*/a); + //emit_jno(a_); + assert(a_ == NULL); + assem_debug("addvc pc,pc,#? 
(%p)\n", /*a-(int)out-8,*/ log_addr(a_)); output_w32(0x72800000|rd_rn_rm(15,15,0)); } @@ -1665,7 +1668,7 @@ static void mov_loadtype_adj(enum stub_type type,int rs,int rt) static void do_readstub(int n) { - assem_debug("do_readstub %x\n",start+stubs[n].a*4); + assem_debug("do_readstub %p\n", log_addr(start + stubs[n].a*4)); literal_pool(256); set_jump_target(stubs[n].addr, out); enum stub_type type=stubs[n].type; @@ -1836,7 +1839,7 @@ static void inline_readstub(enum stub_type type, int i, u_int addr, static void do_writestub(int n) { - assem_debug("do_writestub %x\n",start+stubs[n].a*4); + assem_debug("do_writestub %p\n", log_addr(start + stubs[n].a*4)); literal_pool(256); set_jump_target(stubs[n].addr, out); enum stub_type type=stubs[n].type; diff --git a/libpcsxcore/new_dynarec/assem_arm64.c b/libpcsxcore/new_dynarec/assem_arm64.c index 5d6a78292..9f2f66af6 100644 --- a/libpcsxcore/new_dynarec/assem_arm64.c +++ b/libpcsxcore/new_dynarec/assem_arm64.c @@ -971,7 +971,7 @@ static int can_jump_or_call(const void *a) static void emit_call(const void *a) { intptr_t diff = (u_char *)a - out; - assem_debug("bl %p (%p+%lx)%s\n", a, out, diff, func_name(a)); + assem_debug("bl %p%s\n", log_addr(a), func_name(a)); assert(!(diff & 3)); if (-134217728 <= diff && diff <= 134217727) output_w32(0x94000000 | ((diff >> 2) & 0x03ffffff)); @@ -981,77 +981,77 @@ static void emit_call(const void *a) static void emit_jmp(const void *a) { - assem_debug("b %p (%p+%lx)%s\n", a, out, (u_char *)a - out, func_name(a)); + assem_debug("b %p%s\n", log_addr(a), func_name(a)); u_int offset = genjmp(a); output_w32(0x14000000 | offset); } static void emit_jne(const void *a) { - assem_debug("bne %p\n", a); + assem_debug("bne %p\n", log_addr(a)); u_int offset = genjmpcc(a); output_w32(0x54000000 | (offset << 5) | COND_NE); } static void emit_jeq(const void *a) { - assem_debug("beq %p\n", a); + assem_debug("beq %p\n", log_addr(a)); u_int offset = genjmpcc(a); output_w32(0x54000000 | (offset << 5) | 
COND_EQ); } static void emit_js(const void *a) { - assem_debug("bmi %p\n", a); + assem_debug("bmi %p\n", log_addr(a)); u_int offset = genjmpcc(a); output_w32(0x54000000 | (offset << 5) | COND_MI); } static void emit_jns(const void *a) { - assem_debug("bpl %p\n", a); + assem_debug("bpl %p\n", log_addr(a)); u_int offset = genjmpcc(a); output_w32(0x54000000 | (offset << 5) | COND_PL); } static void emit_jl(const void *a) { - assem_debug("blt %p\n", a); + assem_debug("blt %p\n", log_addr(a)); u_int offset = genjmpcc(a); output_w32(0x54000000 | (offset << 5) | COND_LT); } static void emit_jge(const void *a) { - assem_debug("bge %p\n", a); + assem_debug("bge %p\n", log_addr(a)); u_int offset = genjmpcc(a); output_w32(0x54000000 | (offset << 5) | COND_GE); } static void emit_jo(const void *a) { - assem_debug("bvs %p\n", a); + assem_debug("bvs %p\n", log_addr(a)); u_int offset = genjmpcc(a); output_w32(0x54000000 | (offset << 5) | COND_VS); } static void emit_jno(const void *a) { - assem_debug("bvc %p\n", a); + assem_debug("bvc %p\n", log_addr(a)); u_int offset = genjmpcc(a); output_w32(0x54000000 | (offset << 5) | COND_VC); } static void emit_jc(const void *a) { - assem_debug("bcs %p\n", a); + assem_debug("bcs %p\n", log_addr(a)); u_int offset = genjmpcc(a); output_w32(0x54000000 | (offset << 5) | COND_CS); } static void emit_cb(u_int isnz, u_int is64, const void *a, u_int r) { - assem_debug("cb%sz %s,%p\n", isnz?"n":"", is64?regname64[r]:regname[r], a); + assem_debug("cb%sz %s,%p\n", isnz?"n":"", is64?regname64[r]:regname[r], log_addr(a)); u_int offset = genjmpcc(a); is64 = is64 ? 0x80000000 : 0; isnz = isnz ? 
0x01000000 : 0; diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 7942cd0c6..9b090e4b3 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -57,6 +57,7 @@ static Jit g_jit; //#define DISASM //#define ASSEM_PRINT +//#define ASSEM_PRINT_ADDRS //#define REGMAP_PRINT // with DISASM only //#define INV_DEBUG_W //#define STAT_PRINT @@ -66,6 +67,12 @@ static Jit g_jit; #else #define assem_debug(...) #endif +#ifdef ASSEM_PRINT_ADDRS +#define log_addr(a) (a) +#else +// for diff-able output +#define log_addr(a) ((u_long)(a) <= 1024u ? (void *)(a) : (void *)0xadd0l) +#endif //#define inv_debug printf #define inv_debug(...) @@ -9130,7 +9137,7 @@ static int noinline new_recompile_block(u_int addr) u_int state_rflags = 0; int i; - assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, out); + assem_debug("NOTCOMPILED: addr = %x -> %p\n", addr, log_addr(out)); if (addr & 3) { if (addr != hack_addr) { @@ -9455,7 +9462,8 @@ static int noinline new_recompile_block(u_int addr) /* Pass 9 - Linker */ for(i=0;i %8x\n",link_addr[i].addr,link_addr[i].target); + assem_debug("link: %p -> %08x\n", + log_addr(link_addr[i].addr), link_addr[i].target); literal_pool(64); if (!link_addr[i].internal) { @@ -9510,7 +9518,7 @@ static int noinline new_recompile_block(u_int addr) { if ((i == 0 || dops[i].bt) && instr_addr[i]) { - assem_debug("%p (%d) <- %8x\n", instr_addr[i], i, start + i*4); + assem_debug("%p (%d) <- %8x\n", log_addr(instr_addr[i]), i, start + i*4); u_int vaddr = start + i*4; literal_pool(256); From cf8401830df21650d93b38d2bcaf58dcc80c29d4 Mon Sep 17 00:00:00 2001 From: notaz Date: Wed, 22 Jan 2025 03:11:12 +0200 Subject: [PATCH 15/20] drc: detect unoptimized stack reloads --- libpcsxcore/new_dynarec/new_dynarec.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 9b090e4b3..86333a860 100644 --- 
a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -7434,6 +7434,17 @@ static noinline void pass2a_unneeded_other(void) break; } } + // rm redundant stack loads (unoptimized code, assuming no io mem access through sp) + if (i > 0 && dops[i].is_load && dops[i].rs1 == 29 && dops[i].ls_type == LS_32 + && dops[i-1].is_store && dops[i-1].rs1 == 29 && dops[i-1].ls_type == LS_32 + && dops[i-1].rs2 == dops[i].rt1 && !dops[i-1].is_ds && i < slen - 1 + && dops[i+1].rs1 != dops[i].rt1 && dops[i+1].rs2 != dops[i].rt1 + && !dops[i].bt && cinfo[i].imm == cinfo[i-1].imm) + { + cinfo[i].imm = 0; + memset(&dops[i], 0, sizeof(dops[i])); + dops[i].itype = NOP; + } } } From 21e24294686e369064a34e9ec807cc9caaf1aa3e Mon Sep 17 00:00:00 2001 From: notaz Date: Wed, 22 Jan 2025 23:26:54 +0200 Subject: [PATCH 16/20] drc: handle gte stalls closer to the interpreter --- libpcsxcore/gte.c | 4 +-- libpcsxcore/gte.h | 1 - libpcsxcore/new_dynarec/linkage_arm.S | 13 --------- libpcsxcore/new_dynarec/linkage_arm64.S | 13 --------- libpcsxcore/new_dynarec/new_dynarec.c | 36 ++++++------------------- 5 files changed, 10 insertions(+), 57 deletions(-) diff --git a/libpcsxcore/gte.c b/libpcsxcore/gte.c index 991a4452c..bdc8fa521 100644 --- a/libpcsxcore/gte.c +++ b/libpcsxcore/gte.c @@ -283,8 +283,8 @@ const unsigned char gte_cycletab[64] = { 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 39, }; -// warning: called by the dynarec -int gteCheckStallRaw(u32 op_cycles, psxRegisters *regs) { +// warning: ari64 drc stores it's negative cycles in gteBusyCycle +static int gteCheckStallRaw(u32 op_cycles, psxRegisters *regs) { u32 left = regs->gteBusyCycle - regs->cycle; int stall = 0; diff --git a/libpcsxcore/gte.h b/libpcsxcore/gte.h index f1dcc66a6..70ec9fe1d 100644 --- a/libpcsxcore/gte.h +++ b/libpcsxcore/gte.h @@ -69,7 +69,6 @@ struct psxCP2Regs; extern const unsigned char gte_cycletab[64]; -int gteCheckStallRaw(u32 op_cycles, psxRegisters *regs); void gteCheckStall(u32 
op); u32 MFC2(struct psxCP2Regs *regs, int reg); diff --git a/libpcsxcore/new_dynarec/linkage_arm.S b/libpcsxcore/new_dynarec/linkage_arm.S index 39afc88e7..9ac9e05dd 100644 --- a/libpcsxcore/new_dynarec/linkage_arm.S +++ b/libpcsxcore/new_dynarec/linkage_arm.S @@ -31,7 +31,6 @@ #define ndrc_get_addr_ht_param ESYM(ndrc_get_addr_ht_param) #define ndrc_write_invalidate_one ESYM(ndrc_write_invalidate_one) #define gen_interupt ESYM(gen_interupt) -#define gteCheckStallRaw ESYM(gteCheckStallRaw) #define psxException ESYM(psxException) #define execI ESYM(execI) #endif @@ -637,18 +636,6 @@ FUNCTION(rcnt2_read_count_m1): lsr r0, #16 @ /= 8 bx lr -FUNCTION(call_gteStall): - /* r0 = op_cycles, r1 = cycles */ - ldr r2, [fp, #LO_last_count] - str lr, [fp, #LO_saved_lr] - add r1, r1, r2 - str r1, [fp, #LO_cycle] - add r1, fp, #LO_psxRegs - bl gteCheckStallRaw - ldr lr, [fp, #LO_saved_lr] - add r10, r10, r0 - bx lr - #ifdef HAVE_ARMV6 FUNCTION(get_reg): diff --git a/libpcsxcore/new_dynarec/linkage_arm64.S b/libpcsxcore/new_dynarec/linkage_arm64.S index fb961cca8..47aa39c75 100644 --- a/libpcsxcore/new_dynarec/linkage_arm64.S +++ b/libpcsxcore/new_dynarec/linkage_arm64.S @@ -29,7 +29,6 @@ #define ndrc_add_jump_out ESYM(ndrc_add_jump_out) #define ndrc_get_addr_ht ESYM(ndrc_get_addr_ht) #define gen_interupt ESYM(gen_interupt) -#define gteCheckStallRaw ESYM(gteCheckStallRaw) #define psxException ESYM(psxException) #define execI ESYM(execI) #endif @@ -382,18 +381,6 @@ jump_handle_swx_interp: /* almost never happens */ bl execI b jump_to_new_pc -FUNCTION(call_gteStall): - /* w0 = op_cycles, w1 = cycles */ - ldr w2, [rFP, #LO_last_count] - str lr, [rFP, #LO_saved_lr] - add w1, w1, w2 - str w1, [rFP, #LO_cycle] - add x1, rFP, #LO_psxRegs - bl gteCheckStallRaw - ldr lr, [rFP, #LO_saved_lr] - add rCC, rCC, w0 - ret - #ifdef DRC_DBG #undef do_insn_cmp FUNCTION(do_insn_cmp_arm64): diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index 
86333a860..d14505155 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -406,7 +406,6 @@ void jump_overflow_ds(u_int u0, u_int u1, u_int pc); void jump_addrerror (u_int cause, u_int addr, u_int pc); void jump_addrerror_ds(u_int cause, u_int addr, u_int pc); void jump_to_new_pc(); -void call_gteStall(); void new_dyna_leave(); void *ndrc_get_addr_ht(u_int vaddr, struct ht_entry *ht); @@ -1296,7 +1295,6 @@ static const struct { FUNCNAME(jump_overflow_ds), FUNCNAME(jump_addrerror), FUNCNAME(jump_addrerror_ds), - FUNCNAME(call_gteStall), FUNCNAME(new_dyna_leave), FUNCNAME(pcsx_mtc0), FUNCNAME(pcsx_mtc0_ds), @@ -3671,11 +3669,7 @@ static void rfe_assemble(int i, const struct regstat *i_regs) static int cop2_is_stalling_op(int i, int *cycles) { - if (dops[i].opcode == 0x3a) { // SWC2 - *cycles = 0; - return 1; - } - if (dops[i].itype == COP2 && (dops[i].opcode2 == 0 || dops[i].opcode2 == 2)) { // MFC2/CFC2 + if (dops[i].itype == COP2 || dops[i].itype == C2LS) { *cycles = 0; return 1; } @@ -3709,7 +3703,7 @@ static void emit_log_gte_stall(int i, int stall, u_int reglist) static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u_int reglist) { - int j = i, other_gte_op_cycles = -1, stall = -MAXBLOCK, cycles_passed; + int j = i, cycles, other_gte_op_cycles = -1, stall = -MAXBLOCK, cycles_passed; int rtmp = reglist_find_free(reglist); if (HACK_ENABLED(NDHACK_NO_STALLS)) @@ -3733,17 +3727,11 @@ static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u if (other_gte_op_cycles >= 0) stall = other_gte_op_cycles - cycles_passed; else if (cycles_passed >= 44) - stall = 0; // can't stall + stall = 0; // can't possibly stall if (stall == -MAXBLOCK && rtmp >= 0) { // unknown stall, do the expensive runtime check assem_debug("; cop2_do_stall_check\n"); -#if 0 // too slow - save_regs(reglist); - emit_movimm(gte_cycletab[op], 0); - emit_addimm(HOST_CCREG, cinfo[i].ccadj, 1); - 
emit_far_call(call_gteStall); - restore_regs(reglist); -#else + // busy - (cc + adj) -> busy - adj - cc host_tempreg_acquire(); emit_readword(&psxRegs.gteBusyCycle, rtmp); emit_addimm(rtmp, -cinfo[i].ccadj, rtmp); @@ -3752,7 +3740,6 @@ static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u emit_cmovb_reg(rtmp, HOST_CCREG); //emit_log_gte_stall(i, 0, reglist); host_tempreg_release(); -#endif } else if (stall > 0) { //emit_log_gte_stall(i, stall, reglist); @@ -3760,7 +3747,8 @@ static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u } // save gteBusyCycle, if needed - if (gte_cycletab[op] == 0) + cycles = gte_cycletab[op]; + if (cycles == 0) return; other_gte_op_cycles = -1; for (j = i + 1; j < slen; j++) { @@ -3777,20 +3765,12 @@ static void cop2_do_stall_check(u_int op, int i, const struct regstat *i_regs, u // will handle stall when assembling that op return; cycles_passed = cinfo[min(j, slen -1)].ccadj - cinfo[i].ccadj; - if (cycles_passed >= 44) + if (cycles_passed >= cycles) return; assem_debug("; save gteBusyCycle\n"); host_tempreg_acquire(); -#if 0 - emit_readword(&last_count, HOST_TEMPREG); - emit_add(HOST_TEMPREG, HOST_CCREG, HOST_TEMPREG); - emit_addimm(HOST_TEMPREG, cinfo[i].ccadj, HOST_TEMPREG); - emit_addimm(HOST_TEMPREG, gte_cycletab[op]), HOST_TEMPREG); - emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle); -#else - emit_addimm(HOST_CCREG, cinfo[i].ccadj + gte_cycletab[op], HOST_TEMPREG); + emit_addimm(HOST_CCREG, cinfo[i].ccadj + cycles, HOST_TEMPREG); emit_writeword(HOST_TEMPREG, &psxRegs.gteBusyCycle); -#endif host_tempreg_release(); } From aab00414900cfc9caf582c042d8ffae40631cf2e Mon Sep 17 00:00:00 2001 From: notaz Date: Thu, 23 Jan 2025 00:18:09 +0200 Subject: [PATCH 17/20] drc: rearrange stop_after_jal stuff, limit NI prints --- libpcsxcore/new_dynarec/new_dynarec.c | 54 +++++++++++++++------------ 1 file changed, 31 insertions(+), 23 deletions(-) diff --git 
a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index d14505155..fda54348d 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -315,6 +315,7 @@ static struct compile_info static void *copy; static u_int expirep; static u_int stop_after_jal; + static u_int ni_count; static u_int f1_hack; static u_int vsync_hack; #ifdef STAT_PRINT @@ -6326,6 +6327,7 @@ void new_dynarec_clear_full(void) expirep = EXPIRITY_OFFSET; literalcount=0; stop_after_jal=0; + ni_count=0; inv_code_start=inv_code_end=~0; hack_addr=0; f1_hack=0; @@ -6911,7 +6913,7 @@ static void disassemble_one(int i, u_int src) default: break; } - if (type == INTCALL) + if (type == INTCALL && ni_count < 64) SysPrintf("NI %08x @%08x (%08x)\n", src, start + i*4, start); dops[i].itype = type; dops[i].opcode2 = op2; @@ -7060,7 +7062,7 @@ static void disassemble_one(int i, u_int src) static noinline void pass1_disassemble(u_int pagelimit) { - int i, j, done = 0, ni_count = 0; + int i, j, done = 0; int ds_next = 0; for (i = 0; !done; i++) @@ -7180,7 +7182,17 @@ static noinline void pass1_disassemble(u_int pagelimit) /* Is this the end of the block? 
*/ if (i > 0 && dops[i-1].is_ujump) { - if (dops[i-1].rt1 == 0) { // not jal + // Don't recompile stuff that's already compiled + if (check_addr(start + i*4+4)) { + done = 1; + continue; + } + // Don't get too close to the limit + if (i > MAXBLOCK - 64) + done = 2; + if (dops[i-1].opcode2 == 0x08 || dops[i-1].rs1 == 31) // JR; JALR x, lr + done = 2; + else if (dops[i-1].itype != RJUMP && dops[i-1].rt1 == 0) { // not JAL(R) int found_bbranch = 0, t = (cinfo[i-1].ba - start) / 4; if ((u_int)(t - i) < 64 && start + (t+64)*4 < pagelimit) { // scan for a branch back to i+1 @@ -7200,29 +7212,26 @@ static noinline void pass1_disassemble(u_int pagelimit) done = 2; } else { - if(stop_after_jal) done=1; - // Stop on BREAK - if((source[i+1]&0xfc00003f)==0x0d) done=1; + // jal(r) - continue or perf may suffer for platforms without + // runtime block linking (like in crash3) + if (stop_after_jal) + done = 2; } - // Don't recompile stuff that's already compiled - if(check_addr(start+i*4+4)) done=1; - // Don't get too close to the limit - if (i > MAXBLOCK - 64) - done = 1; } if (dops[i].itype == HLECALL) done = 1; - else if (dops[i].itype == INTCALL) + else if (dops[i].itype == INTCALL) { + ni_count++; done = 2; + } else if (dops[i].is_exception) - done = stop_after_jal ? 1 : 2; + done = 2; if (done == 2) { // Does the block continue due to a branch? 
- for(j=i-1;j>=0;j--) - { - if(cinfo[j].ba==start+i*4) done=j=0; // Branch into delay slot - if(cinfo[j].ba==start+i*4+4) done=j=0; - if(cinfo[j].ba==start+i*4+8) done=j=0; + for (j = i-1; j >= 0; j--) { + if (cinfo[j].ba == start+i*4) done=j=0; // Branch into delay slot + if (cinfo[j].ba == start+i*4+4) done=j=0; + if (cinfo[j].ba == start+i*4+8) done=j=0; } } //assert(i 8 || dops[i].opcode == 0x11)) { - done=stop_after_jal=1; - SysPrintf("Disabled speculative precompilation\n"); - } + } + if (ni_count > 32 && !stop_after_jal) { + stop_after_jal = 1; + SysPrintf("Disabled speculative precompilation\n"); } while (i > 0 && dops[i-1].is_jump) i--; From 555d3b51cd4f189006adef2d493fe0dde5c44393 Mon Sep 17 00:00:00 2001 From: notaz Date: Fri, 24 Jan 2025 00:19:14 +0200 Subject: [PATCH 18/20] drc: implement block linking on platforms that lacked it ... and likely break some of those platforms that I can't test :( --- libpcsxcore/new_dynarec/assem_arm.c | 37 +++++++-------- libpcsxcore/new_dynarec/assem_arm.h | 1 + libpcsxcore/new_dynarec/assem_arm64.c | 34 +++++-------- libpcsxcore/new_dynarec/assem_arm64.h | 5 +- libpcsxcore/new_dynarec/linkage_arm.S | 25 ++++------ libpcsxcore/new_dynarec/linkage_arm64.S | 28 +++++++++-- libpcsxcore/new_dynarec/new_dynarec.c | 50 +++++++++++++++----- libpcsxcore/new_dynarec/new_dynarec_config.h | 6 +++ 8 files changed, 111 insertions(+), 75 deletions(-) diff --git a/libpcsxcore/new_dynarec/assem_arm.c b/libpcsxcore/new_dynarec/assem_arm.c index b08104624..5caa536ea 100644 --- a/libpcsxcore/new_dynarec/assem_arm.c +++ b/libpcsxcore/new_dynarec/assem_arm.c @@ -28,6 +28,10 @@ #include "pcnt.h" #include "arm_features.h" +#ifdef TC_WRITE_OFFSET +#error "not implemented" +#endif + #ifdef DRC_DBG #pragma GCC diagnostic ignored "-Wunused-function" #pragma GCC diagnostic ignored "-Wunused-variable" @@ -103,11 +107,19 @@ const void *invalidate_addr_reg[16] = { /* Linker */ +static void set_jump_target_far1(u_int *insn, void *target) +{ + u_int 
ni = *insn & 0xff000000; + ni |= (((u_int)target - (u_int)insn - 8u) << 6) >> 8; + assert((ni & 0x0e000000) == 0x0a000000); + *insn = ni; +} + static void set_jump_target(void *addr, void *target_) { - u_int target = (u_int)target_; - u_char *ptr = addr; - u_int *ptr2=(u_int *)ptr; + const u_int target = (u_int)target_; + const u_char *ptr = addr; + u_int *ptr2 = (u_int *)ptr; if(ptr[3]==0xe2) { assert((target-(u_int)ptr2-8)<1024); assert(((uintptr_t)addr&3)==0); @@ -130,8 +142,7 @@ static void set_jump_target(void *addr, void *target_) else *ptr2=(0x7A000000)|(((target-(u_int)ptr2-8)<<6)>>8); } else { - assert((ptr[3]&0x0e)==0xa); - *ptr2=(*ptr2&0xFF000000)|(((target-(u_int)ptr2-8)<<6)>>8); + set_jump_target_far1(ptr2, target_); } } @@ -190,20 +201,6 @@ static void *find_extjump_insn(void *stub) return *l_ptr; } -// find where external branch is liked to using addr of it's stub: -// get address that insn one after stub loads (dyna_linker arg1), -// treat it as a pointer to branch insn, -// return addr where that branch jumps to -#if 0 -static void *get_pointer(void *stub) -{ - //printf("get_pointer(%x)\n",(int)stub); - int *i_ptr=find_extjump_insn(stub); - assert((*i_ptr&0x0f000000)==0x0a000000); // b - return (u_char *)i_ptr+((*i_ptr<<8)>>6)+8; -} -#endif - // Allocate a specific ARM register. 
static void alloc_arm_reg(struct regstat *cur,int i,signed char reg,int hr) { @@ -1586,7 +1583,7 @@ static void literal_pool_jumpover(int n) set_jump_target(jaddr, out); } -// parsed by get_pointer, find_extjump_insn +// parsed by find_extjump_insn, check_extjump2 static void emit_extjump(u_char *addr, u_int target) { u_char *ptr=(u_char *)addr; diff --git a/libpcsxcore/new_dynarec/assem_arm.h b/libpcsxcore/new_dynarec/assem_arm.h index b35587672..74b1657fc 100644 --- a/libpcsxcore/new_dynarec/assem_arm.h +++ b/libpcsxcore/new_dynarec/assem_arm.h @@ -36,6 +36,7 @@ extern char *invc_ptr; +// note: max due to branch encoding: arm 32M, arm64 128M #define TARGET_SIZE_2 24 // 2^24 = 16 megabytes struct tramp_insns diff --git a/libpcsxcore/new_dynarec/assem_arm64.c b/libpcsxcore/new_dynarec/assem_arm64.c index 9f2f66af6..8f174fde4 100644 --- a/libpcsxcore/new_dynarec/assem_arm64.c +++ b/libpcsxcore/new_dynarec/assem_arm64.c @@ -24,14 +24,24 @@ #include "arm_features.h" /* Linker */ +static void set_jump_target_far1(u_int *insn_, void *target) +{ + u_int *insn = NDRC_WRITE_OFFSET(insn_); + u_int in = *insn & 0xfc000000; + intptr_t offset = (u_char *)target - (u_char *)insn_; + assert(in == 0x14000000); + assert(-134217728 <= offset && offset < 134217728); + in |= (offset >> 2) & 0x3ffffff; + *insn = in; +} + static void set_jump_target(void *addr, void *target) { u_int *ptr = NDRC_WRITE_OFFSET(addr); intptr_t offset = (u_char *)target - (u_char *)addr; if ((*ptr&0xFC000000) == 0x14000000) { // b - assert(offset>=-134217728LL&&offset<134217728LL); - *ptr=(*ptr&0xFC000000)|((offset>>2)&0x3ffffff); + set_jump_target_far1(addr, target); } else if ((*ptr&0xff000000) == 0x54000000 // b.cond || (*ptr&0x7e000000) == 0x34000000) { // cbz/cbnz @@ -61,24 +71,6 @@ static void *find_extjump_insn(void *stub) return ptr + offset / 4; } -#if 0 -// find where external branch is liked to using addr of it's stub: -// get address that the stub loads (dyna_linker arg1), -// treat it as a 
pointer to branch insn, -// return addr where that branch jumps to -static void *get_pointer(void *stub) -{ - int *i_ptr = find_extjump_insn(stub); - if ((*i_ptr&0xfc000000) == 0x14000000) // b - return i_ptr + ((signed int)(*i_ptr<<6)>>6); - if ((*i_ptr&0xff000000) == 0x54000000 // b.cond - || (*i_ptr&0x7e000000) == 0x34000000) // cbz/cbnz - return i_ptr + ((signed int)(*i_ptr<<8)>>13); - assert(0); - return NULL; -} -#endif - // Allocate a specific ARM register. static void alloc_arm_reg(struct regstat *cur,int i,signed char reg,int hr) { @@ -1365,7 +1357,7 @@ static void literal_pool_jumpover(int n) { } -// parsed by get_pointer, find_extjump_insn +// parsed by find_extjump_insn, check_extjump2 static void emit_extjump(u_char *addr, u_int target) { assert(((addr[3]&0xfc)==0x14) || ((addr[3]&0xff)==0x54)); // b or b.cond diff --git a/libpcsxcore/new_dynarec/assem_arm64.h b/libpcsxcore/new_dynarec/assem_arm64.h index f8ee042f3..948b91d88 100644 --- a/libpcsxcore/new_dynarec/assem_arm64.h +++ b/libpcsxcore/new_dynarec/assem_arm64.h @@ -1,8 +1,9 @@ #define HOST_IMM8 1 /* calling convention: - r0 -r17: caller-save - r19-r29: callee-save */ + x0 -x17: caller-save + x18 : caller-save (platform reg) + x19-x29: callee-save */ #define HOST_REGS 29 #define EXCLUDE_REG -1 diff --git a/libpcsxcore/new_dynarec/linkage_arm.S b/libpcsxcore/new_dynarec/linkage_arm.S index 9ac9e05dd..7976cb7d6 100644 --- a/libpcsxcore/new_dynarec/linkage_arm.S +++ b/libpcsxcore/new_dynarec/linkage_arm.S @@ -26,7 +26,7 @@ #ifdef __MACH__ #define dynarec_local ESYM(dynarec_local) -#define ndrc_add_jump_out ESYM(ndrc_add_jump_out) +#define ndrc_patch_link ESYM(ndrc_patch_link) #define ndrc_get_addr_ht ESYM(ndrc_get_addr_ht) #define ndrc_get_addr_ht_param ESYM(ndrc_get_addr_ht_param) #define ndrc_write_invalidate_one ESYM(ndrc_write_invalidate_one) @@ -148,15 +148,15 @@ DRC_VAR(mini_ht, 256) FUNCTION(dyna_linker): /* r0 = virtual target address */ /* r1 = pointer to an instruction to patch */ 
-#ifndef NO_WRITE_EXEC +#if 1 ldr r7, [r1] mov r4, r0 add r6, r7, #2 mov r5, r1 lsl r6, r6, #8 /* must not compile - that might expire the caller block */ - ldr r0, [fp, #LO_hash_table_ptr] - mov r1, r4 + ldr r0, [fp, #LO_hash_table_ptr] + mov r1, r4 mov r2, #0 /* ndrc_compile_mode=ndrc_cm_no_compile */ bl ndrc_get_addr_ht_param @@ -166,22 +166,15 @@ FUNCTION(dyna_linker): teq r0, r6 bxeq r0 /* Stale i-cache */ mov r0, r4 - mov r1, r6 - bl ndrc_add_jump_out - - sub r2, r8, r5 - and r1, r7, #0xff000000 - lsl r2, r2, #6 - sub r1, r1, #2 - add r1, r1, r2, lsr #8 - str r1, [r5] + mov r1, r5 + mov r2, r6 + mov r3, r8 + bl ndrc_patch_link bx r8 0: mov r0, r4 -#else - /* XXX: should be able to do better than this... */ #endif - ldr r1, [fp, #LO_hash_table_ptr] + ldr r1, [fp, #LO_hash_table_ptr] bl ndrc_get_addr_ht bx r0 .size dyna_linker, .-dyna_linker diff --git a/libpcsxcore/new_dynarec/linkage_arm64.S b/libpcsxcore/new_dynarec/linkage_arm64.S index 47aa39c75..730f9cacc 100644 --- a/libpcsxcore/new_dynarec/linkage_arm64.S +++ b/libpcsxcore/new_dynarec/linkage_arm64.S @@ -26,7 +26,7 @@ #ifdef __MACH__ #define dynarec_local ESYM(dynarec_local) -#define ndrc_add_jump_out ESYM(ndrc_add_jump_out) +#define ndrc_patch_link ESYM(ndrc_patch_link) #define ndrc_get_addr_ht ESYM(ndrc_get_addr_ht) #define gen_interupt ESYM(gen_interupt) #define psxException ESYM(psxException) @@ -90,8 +90,30 @@ DRC_VAR(mini_ht, 256) .align 2 FUNCTION(dyna_linker): - /* r0 = virtual target address */ - /* r1 = instruction to patch */ + /* w0 = virtual target address */ + /* x1 = instruction to patch */ +#if 1 + mov w19, w0 + mov x20, x1 + /* must not compile - that might expire the caller block */ + ldr x0, [rFP, #LO_hash_table_ptr] + mov w1, w19 + mov w2, #0 /* ndrc_compile_mode=ndrc_cm_no_compile */ + bl ndrc_get_addr_ht_param + cbz x0, 0f + + ldr w2, [x20] + mov x3, x0 + sbfiz x2, x2, 2, 26 + add x2, x2, x20 + mov x1, x20 + mov w0, w19 + mov x19, x3 + bl ndrc_patch_link + br x19 +0: + mov w0, w19 
+#endif ldr x1, [rFP, #LO_hash_table_ptr] bl ndrc_get_addr_ht br x0 diff --git a/libpcsxcore/new_dynarec/new_dynarec.c b/libpcsxcore/new_dynarec/new_dynarec.c index fda54348d..437d17648 100644 --- a/libpcsxcore/new_dynarec/new_dynarec.c +++ b/libpcsxcore/new_dynarec/new_dynarec.c @@ -410,7 +410,6 @@ void jump_to_new_pc(); void new_dyna_leave(); void *ndrc_get_addr_ht(u_int vaddr, struct ht_entry *ht); -void ndrc_add_jump_out(u_int vaddr, void *src); void ndrc_write_invalidate_one(u_int addr); static void ndrc_write_invalidate_many(u_int addr, u_int end); @@ -1717,15 +1716,15 @@ void new_dynarec_invalidate_all_pages(void) } // Add an entry to jump_out after making a link -// src should point to code by emit_extjump() -void ndrc_add_jump_out(u_int vaddr, void *src) +// stub should point to stub code by emit_extjump() +static void ndrc_add_jump_out(u_int vaddr, void *stub) { - inv_debug("ndrc_add_jump_out: %p -> %x\n", src, vaddr); + inv_debug("ndrc_add_jump_out: %p -> %x\n", stub, vaddr); u_int page = get_page(vaddr); struct jump_info *ji; stat_inc(stat_links); - check_extjump2(src); + check_extjump2(stub); ji = jumps[page]; if (ji == NULL) { ji = malloc(sizeof(*ji) + sizeof(ji->e[0]) * 16); @@ -1738,10 +1737,30 @@ void ndrc_add_jump_out(u_int vaddr, void *src) } jumps[page] = ji; ji->e[ji->count].target_vaddr = vaddr; - ji->e[ji->count].stub = src; + ji->e[ji->count].stub = stub; ji->count++; } +void ndrc_patch_link(u_int vaddr, void *insn, void *stub, void *target) +{ + void *insn_end = (char *)insn + 4; + + //start_tcache_write(insn, insn_end); + mprotect_w_x(insn, insn_end, 0); + + assert(target != stub); + set_jump_target_far1(insn, target); + ndrc_add_jump_out(vaddr, stub); + +#if defined(__aarch64__) || defined(NO_WRITE_EXEC) + // arm64: no syscall concerns, dyna_linker lacks stale detection + // w^x: have to do costly permission switching anyway + new_dyna_clear_cache(NDRC_WRITE_OFFSET(insn), NDRC_WRITE_OFFSET(insn_end)); +#endif + //end_tcache_write(insn, 
insn_end); + mprotect_w_x(insn, insn_end, 1); +} + /* Register allocation */ static void alloc_set(struct regstat *cur, int reg, int hr) @@ -6287,11 +6306,11 @@ static noinline void new_dynarec_test(void) SysPrintf("(%p) testing if we can run recompiled code @%p...\n", new_dynarec_test, out); - ((volatile u_int *)NDRC_WRITE_OFFSET(out))[0]++; // make the cache dirty for (i = 0; i < ARRAY_SIZE(ret); i++) { out = ndrc->translation_cache; beginning = start_block(); + ((volatile u_int *)NDRC_WRITE_OFFSET(out))[0]++; // make the cache dirty emit_movimm(DRC_TEST_VAL + i, 0); // test emit_ret(); literal_pool(0); @@ -6406,6 +6425,8 @@ void new_dynarec_init(void) void *mw = mmap(NULL, sizeof(*ndrc), PROT_READ | PROT_WRITE, (flags = MAP_SHARED), fd, 0); assert(mw != MAP_FAILED); + #endif + #if defined(NO_WRITE_EXEC) || defined(TC_WRITE_OFFSET) prot = PROT_READ | PROT_EXEC; #endif ndrc = mmap((void *)desired_addr, sizeof(*ndrc), prot, flags, fd, 0); @@ -6418,14 +6439,17 @@ void new_dynarec_init(void) #endif #endif #else - #ifndef NO_WRITE_EXEC ndrc = (struct ndrc_mem *)((size_t)(ndrc_bss + align) & ~align); + #ifndef NO_WRITE_EXEC // not all systems allow execute in data segment by default // size must be 4K aligned for 3DS? if (mprotect(ndrc, sizeof(*ndrc), PROT_READ | PROT_WRITE | PROT_EXEC) != 0) SysPrintf("mprotect(%p) failed: %s\n", ndrc, strerror(errno)); #endif + #ifdef TC_WRITE_OFFSET + #error "misconfiguration detected" + #endif #endif out = ndrc->translation_cache; new_dynarec_clear_full(); @@ -6473,17 +6497,17 @@ void new_dynarec_cleanup(void) static u_int *get_source_start(u_int addr, u_int *limit) { - if (addr < 0x00800000 - || (0x80000000 <= addr && addr < 0x80800000) - || (0xa0000000 <= addr && addr < 0xa0800000)) + if (addr < 0x00800000u + || (0x80000000u <= addr && addr < 0x80800000u) + || (0xa0000000u <= addr && addr < 0xa0800000u)) { // used for BIOS calls mostly? 
*limit = (addr & 0xa0600000) + 0x00200000; return (u_int *)(psxM + (addr & 0x1fffff)); } else if ( - /* (0x9fc00000 <= addr && addr < 0x9fc80000) ||*/ - (0xbfc00000 <= addr && addr < 0xbfc80000)) + (0x9fc00000u <= addr && addr < 0x9fc80000u) || + (0xbfc00000u <= addr && addr < 0xbfc80000u)) { // BIOS. The multiplier should be much higher as it's uncached 8bit mem // XXX: disabled as this introduces differences from the interpreter diff --git a/libpcsxcore/new_dynarec/new_dynarec_config.h b/libpcsxcore/new_dynarec/new_dynarec_config.h index 9687aa975..0781e47d7 100644 --- a/libpcsxcore/new_dynarec/new_dynarec_config.h +++ b/libpcsxcore/new_dynarec/new_dynarec_config.h @@ -6,6 +6,12 @@ #define USE_MINI_HT 1 //#define REG_PREFETCH 1 +// options: +//#define NO_WRITE_EXEC 1 +//#define BASE_ADDR_DYNAMIC 1 +//#define TC_WRITE_OFFSET 1 +//#define NDRC_CACHE_FLUSH_ALL 1 + #if defined(__MACH__) || defined(HAVE_LIBNX) #define NO_WRITE_EXEC 1 #endif From 60afad4737f00b7efff92d2f6c6faa8d2712f667 Mon Sep 17 00:00:00 2001 From: notaz Date: Fri, 24 Jan 2025 01:51:59 +0200 Subject: [PATCH 19/20] hle: assorted fixes like switching bios<->hle, w^x dynarec --- libpcsxcore/new_dynarec/emu_if.c | 26 ++++++++++++++++++++++++-- libpcsxcore/psxbios.c | 29 ++++++++++++++++++++++------- libpcsxcore/psxbios.h | 1 + libpcsxcore/psxmem.c | 3 ++- libpcsxcore/r3000a.c | 13 ++++++++++++- libpcsxcore/r3000a.h | 1 + 6 files changed, 62 insertions(+), 11 deletions(-) diff --git a/libpcsxcore/new_dynarec/emu_if.c b/libpcsxcore/new_dynarec/emu_if.c index 618cafd83..69e2bc9b4 100644 --- a/libpcsxcore/new_dynarec/emu_if.c +++ b/libpcsxcore/new_dynarec/emu_if.c @@ -13,6 +13,7 @@ #include "../psxinterpreter.h" #include "../psxcounters.h" #include "../psxevents.h" +#include "../psxbios.h" #include "../r3000a.h" #include "../gte_arm.h" #include "../gte_neon.h" @@ -352,11 +353,27 @@ static noinline void ari64_execute_threaded_slow(struct psxRegisters *regs, //ari64_notify(R3000ACPU_NOTIFY_BEFORE_SAVE, NULL); 
psxInt.Notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL); - do + psxCpu = &psxInt; + for (;;) { psxInt.ExecuteBlock(regs, block_caller); + + if (ndrc_g.thread.busy_addr == ~0u) + break; + if (block_caller == EXEC_CALLER_HLE) { + if (!psxBiosSoftcallEnded()) + continue; + break; + } + else if (block_caller == EXEC_CALLER_BOOT) { + if (!psxExecuteBiosEnded()) + continue; + break; + } + if (regs->stop) + break; } - while (!regs->stop && ndrc_g.thread.busy_addr != ~0u && block_caller == EXEC_CALLER_OTHER); + psxCpu = &psxRec; psxInt.Notify(R3000ACPU_NOTIFY_BEFORE_SAVE, NULL); //ari64_notify(R3000ACPU_NOTIFY_AFTER_LOAD, NULL); @@ -402,7 +419,12 @@ static void ari64_execute_threaded_block(struct psxRegisters *regs, regs->stop++; regs->next_interupt = regs->cycle + 1; + ari64_execute_threaded_once(regs, caller); + if (regs->cpuInRecursion) { + // must sync since we are returning to compiled code + ari64_thread_sync(); + } if (caller == EXEC_CALLER_BOOT) regs->stop--; diff --git a/libpcsxcore/psxbios.c b/libpcsxcore/psxbios.c index 0e8087543..d008cc9b2 100644 --- a/libpcsxcore/psxbios.c +++ b/libpcsxcore/psxbios.c @@ -404,9 +404,22 @@ static void mips_return_void_c(u32 cycle) static int returned_from_exception(void) { // 0x80000080 means it took another exception just after return - return pc0 == k0 || pc0 == 0x80000080; + return pc0 == k0 || pc0 == 0x80000080 +#ifdef LIGHTREC + // lightrec doesn't return at 0x80000080, so look + // for the next block too + || pc0 == A_EXCEPTION +#endif + ; +} + +int psxBiosSoftcallEnded(void) +{ + return pc0 == 0x80001000 || returned_from_exception(); } +// TODO: get rid of this softCall() thing as recursive cpu calls cause +// complications with dynarecs static inline void softCall(u32 pc) { u32 sra = ra; u32 ssr = psxRegs.CP0.n.SR; @@ -419,14 +432,15 @@ static inline void softCall(u32 pc) { psxRegs.cpuInRecursion++; psxCpu->Notify(R3000ACPU_NOTIFY_AFTER_LOAD, PTR_1); - while (pc0 != 0x80001000 && ++lim < 0x100000) + while 
(!psxBiosSoftcallEnded() && ++lim < 0x100000) psxCpu->ExecuteBlock(&psxRegs, EXEC_CALLER_HLE); psxCpu->Notify(R3000ACPU_NOTIFY_BEFORE_SAVE, PTR_1); psxRegs.cpuInRecursion--; - if (lim == 0x100000) - PSXBIOS_LOG("softCall @%x hit lim\n", pc); + if (pc0 != 0x80001000) + log_unhandled("%s @%x did not return (@%x cnt=%d)\n", + __func__, pc, pc0, lim); ra = sra; psxRegs.CP0.n.SR |= ssr & 0x404; } @@ -444,14 +458,15 @@ static inline void softCallInException(u32 pc) { psxRegs.cpuInRecursion++; psxCpu->Notify(R3000ACPU_NOTIFY_AFTER_LOAD, PTR_1); - while (!returned_from_exception() && pc0 != 0x80001000 && ++lim < 0x100000) + while (!psxBiosSoftcallEnded() && ++lim < 0x100000) psxCpu->ExecuteBlock(&psxRegs, EXEC_CALLER_HLE); psxCpu->Notify(R3000ACPU_NOTIFY_BEFORE_SAVE, PTR_1); psxRegs.cpuInRecursion--; - if (lim == 0x100000) - PSXBIOS_LOG("softCallInException @%x hit lim\n", pc); + if (pc0 != 0x80001000 && !psxBiosSoftcallEnded()) + log_unhandled("%s @%x did not return (@%x cnt=%d)\n", + __func__, pc, pc0, lim); if (pc0 == 0x80001000) ra = sra; } diff --git a/libpcsxcore/psxbios.h b/libpcsxcore/psxbios.h index c8c07ff7c..1f26693a0 100644 --- a/libpcsxcore/psxbios.h +++ b/libpcsxcore/psxbios.h @@ -42,6 +42,7 @@ void psxBiosCnfLoaded(u32 tcb_cnt, u32 evcb_cnt, u32 sp); void psxBiosSetupBootState(void); void psxBiosCheckExe(u32 t_addr, u32 t_size, int loading_state); void psxBiosCheckBranch(void); +int psxBiosSoftcallEnded(void); extern void (*biosA0[256])(); extern void (**biosB0)(); diff --git a/libpcsxcore/psxmem.c b/libpcsxcore/psxmem.c index e08bd895f..13301992c 100644 --- a/libpcsxcore/psxmem.c +++ b/libpcsxcore/psxmem.c @@ -270,7 +270,6 @@ void psxMemReset() { if (f == NULL) { SysMessage(_("Could not open BIOS:\"%s\". 
Enabling HLE Bios!\n"), bios); - memset(psxR, 0, 0x80000); } else { if (fread(psxR, 1, 0x80000, f) == 0x80000) { Config.HLE = FALSE; @@ -280,6 +279,8 @@ void psxMemReset() { fclose(f); } } + if (Config.HLE) + memset(psxR, 0, 0x80000); } void psxMemShutdown() { diff --git a/libpcsxcore/r3000a.c b/libpcsxcore/r3000a.c index c282422d5..cfd1ab096 100644 --- a/libpcsxcore/r3000a.c +++ b/libpcsxcore/r3000a.c @@ -62,6 +62,8 @@ int psxInit() { void psxReset() { boolean introBypassed = FALSE; + boolean oldhle = Config.HLE; + psxMemReset(); memset(&psxRegs, 0, sizeof(psxRegs)); @@ -75,6 +77,11 @@ void psxReset() { psxRegs.CP0.n.SR &= ~(1u << 22); // RAM exception vector } + if (Config.HLE != oldhle) { + // at least ari64 drc compiles differently so hard reset + psxCpu->Shutdown(); + psxCpu->Init(); + } psxCpu->ApplyConfig(); psxCpu->Reset(); @@ -174,11 +181,15 @@ void psxJumpTest() { } } +int psxExecuteBiosEnded(void) { + return (psxRegs.pc & 0xff800000) == 0x80000000; +} + void psxExecuteBios() { int i; for (i = 0; i < 5000000; i++) { psxCpu->ExecuteBlock(&psxRegs, EXEC_CALLER_BOOT); - if ((psxRegs.pc & 0xff800000) == 0x80000000) + if (psxExecuteBiosEnded()) break; } if (psxRegs.pc != 0x80030000) diff --git a/libpcsxcore/r3000a.h b/libpcsxcore/r3000a.h index 2889885fe..df7882ce7 100644 --- a/libpcsxcore/r3000a.h +++ b/libpcsxcore/r3000a.h @@ -234,6 +234,7 @@ void psxShutdown(); void psxException(u32 code, enum R3000Abdt bdt, psxCP0Regs *cp0); void psxBranchTest(); void psxExecuteBios(); +int psxExecuteBiosEnded(void); void psxJumpTest(); void irq10Interrupt(); From 16b099b434b1971f71c1023b56833721004d8104 Mon Sep 17 00:00:00 2001 From: notaz Date: Sat, 25 Jan 2025 02:52:07 +0200 Subject: [PATCH 20/20] standalone: fix missed y scaling --- frontend/plugin_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/plugin_lib.c b/frontend/plugin_lib.c index e12a79815..9b2fee345 100644 --- a/frontend/plugin_lib.c +++ b/frontend/plugin_lib.c @@ -336,7 +336,7 
@@ static void pl_vout_flip(const void *vram, int stride, int bgr24, // offset xoffs = x * pl_vout_scale_w; - doffs = xoffs + y * dstride; + doffs = xoffs + y * pl_vout_scale_h * dstride; if (dims_changed) flip_clear_counter = 3;