From 4bccf990f3495fcc9d8ad10f974acb6fbc1699cf Mon Sep 17 00:00:00 2001
From: DH
Date: Fri, 1 Nov 2024 09:51:50 +0300
Subject: [PATCH] gpu: reduce cpu usage on cache commands

---
 rpcsx/gpu/Device.cpp        | 18 +++++++++++++++---
 rpcsx/gpu/DeviceContext.hpp | 11 +++++++----
 rpcsx/iodev/dce.cpp         | 24 ++++++++++++++++++++----
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/rpcsx/gpu/Device.cpp b/rpcsx/gpu/Device.cpp
index f5dfd44..4f8ee3d 100644
--- a/rpcsx/gpu/Device.cpp
+++ b/rpcsx/gpu/Device.cpp
@@ -210,7 +210,14 @@ Device::Device() : vkContext(createVkContext(this)) {
 
   cacheUpdateThread = std::jthread([this](const std::stop_token &stopToken) {
     auto &sched = graphicsPipes[0].scheduler;
+    std::uint32_t prevIdleValue = 0;
     while (!stopToken.stop_requested()) {
+      if (gpuCacheCommandIdle.wait(prevIdleValue) != std::errc{}) {
+        continue;
+      }
+
+      prevIdleValue = gpuCacheCommandIdle.load(std::memory_order::acquire);
+
       for (int vmId = 0; vmId < kMaxProcessCount; ++vmId) {
         auto page = gpuCacheCommand[vmId].load(std::memory_order::relaxed);
         if (page == 0) {
@@ -996,11 +1003,16 @@ static void notifyPageChanges(Device *device, int vmId, std::uint32_t firstPage,
       (static_cast<std::uint64_t>(pageCount - 1) << 32) | firstPage;
 
   while (true) {
-    for (std::size_t i = 0; i < std::size(device->cacheCommands); ++i) {
+    for (std::size_t i = 0; i < std::size(device->cpuCacheCommands); ++i) {
       std::uint64_t expCommand = 0;
-      if (device->cacheCommands[vmId][i].compare_exchange_strong(
-              expCommand, command, std::memory_order::acquire,
+      if (device->cpuCacheCommands[vmId][i].compare_exchange_strong(
+              expCommand, command, std::memory_order::release,
               std::memory_order::relaxed)) {
+        device->cpuCacheCommandsIdle[vmId].fetch_add(
+            1, std::memory_order::release);
+        device->cpuCacheCommandsIdle[vmId].notify_one();
+
+        while (device->cpuCacheCommands[vmId][i].load(std::memory_order::acquire) != 0) {}
         return;
       }
     }
diff --git a/rpcsx/gpu/DeviceContext.hpp b/rpcsx/gpu/DeviceContext.hpp
index 70c36ac..dc2f37d 100644
--- a/rpcsx/gpu/DeviceContext.hpp
+++ b/rpcsx/gpu/DeviceContext.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include "orbis/utils/SharedAtomic.hpp"
 #include <atomic>
 #include <cstdint>
 
@@ -66,10 +67,12 @@ enum {
 struct DeviceContext {
   static constexpr auto kMaxProcessCount = 6;
 
-  PadState kbPadState;
-  std::atomic<std::uint64_t> cacheCommands[kMaxProcessCount][4];
-  std::atomic<std::uint32_t> gpuCacheCommand[kMaxProcessCount];
-  std::atomic<std::uint8_t> *cachePages[kMaxProcessCount];
+  PadState kbPadState{};
+  std::atomic<std::uint64_t> cpuCacheCommands[kMaxProcessCount][4]{};
+  orbis::shared_atomic32 cpuCacheCommandsIdle[kMaxProcessCount]{};
+  orbis::shared_atomic32 gpuCacheCommand[kMaxProcessCount]{};
+  orbis::shared_atomic32 gpuCacheCommandIdle{};
+  std::atomic<std::uint8_t> *cachePages[kMaxProcessCount]{};
   volatile std::uint32_t flipBuffer[kMaxProcessCount];
   volatile std::uint64_t flipArg[kMaxProcessCount];
diff --git a/rpcsx/iodev/dce.cpp b/rpcsx/iodev/dce.cpp
index 0ca39c9..80ccf04 100644
--- a/rpcsx/iodev/dce.cpp
+++ b/rpcsx/iodev/dce.cpp
@@ -137,15 +137,26 @@ static void runBridge(int vmId) {
     auto gpu = amdgpu::DeviceCtl{orbis::g_context.gpuDevice};
     auto &gpuCtx = gpu.getContext();
     std::vector<std::uint64_t> fetchedCommands;
-    fetchedCommands.reserve(std::size(gpuCtx.cacheCommands));
+    fetchedCommands.reserve(std::size(gpuCtx.cpuCacheCommands));
+
+    std::vector<std::atomic<std::uint64_t> *> fetchedAtomics;
+    std::uint32_t prevIdleValue = 0;
 
     while (true) {
-      for (auto &command : gpuCtx.cacheCommands) {
-        std::uint64_t value = command[vmId].load(std::memory_order::relaxed);
+      if (gpuCtx.cpuCacheCommandsIdle[vmId].wait(prevIdleValue) !=
+          std::errc{}) {
+        continue;
+      }
+
+      prevIdleValue =
+          gpuCtx.cpuCacheCommandsIdle[vmId].load(std::memory_order::acquire);
+
+      for (auto &command : gpuCtx.cpuCacheCommands[vmId]) {
+        std::uint64_t value = command.load(std::memory_order::relaxed);
         if (value != 0) {
           fetchedCommands.push_back(value);
-          command[vmId].store(0, std::memory_order::relaxed);
+          fetchedAtomics.push_back(&command);
         }
       }
 
@@ -187,7 +198,12 @@
         }
       }
 
+      for (auto fetchedAtomic : fetchedAtomics) {
+        fetchedAtomic->store(0, std::memory_order::release);
+      }
+
       fetchedCommands.clear();
+      fetchedAtomics.clear();
     }
   }}.detach();
 }
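
A minimal sketch of the handshake this patch switches to: instead of both sides busy-polling the command slots, the producer bumps a shared counter and notifies it, and the consumer sleeps on that counter until it changes. The sketch uses C++20 std::atomic wait/notify_one as a stand-in for orbis::shared_atomic32, which additionally works across processes and reports the wakeup result as std::errc (hence the `!= std::errc{}` checks in the patch). The names commandSlots, idleCounter, publish, and consumeLoop are illustrative only, not rpcsx identifiers.

// Minimal sketch (not rpcsx code) of the wait/notify handshake adopted above,
// using C++20 std::atomic wait/notify in place of orbis::shared_atomic32.
#include <atomic>
#include <cstdint>

static std::atomic<std::uint64_t> commandSlots[4]{}; // producer -> consumer mailbox
static std::atomic<std::uint32_t> idleCounter{0};    // bumped once per published command

// Producer side (cf. notifyPageChanges): claim a free slot, then wake the
// consumer instead of leaving it to poll.
void publish(std::uint64_t command) {
  while (true) {
    for (auto &slot : commandSlots) {
      std::uint64_t expected = 0;
      if (slot.compare_exchange_strong(expected, command,
                                       std::memory_order::release,
                                       std::memory_order::relaxed)) {
        idleCounter.fetch_add(1, std::memory_order::release);
        idleCounter.notify_one();
        // Wait until the consumer drains the slot, mirroring the completion
        // spin the patch adds after publishing a command.
        while (slot.load(std::memory_order::acquire) != 0) {
        }
        return;
      }
    }
  }
}

// Consumer side (cf. runBridge / cacheUpdateThread): sleep until the counter
// moves, then drain every non-empty slot. shared_atomic32::wait can also fail
// with an std::errc, which the patch handles by continuing the loop.
void consumeLoop() {
  std::uint32_t prevIdleValue = 0;
  while (true) {
    idleCounter.wait(prevIdleValue, std::memory_order::acquire); // blocks while unchanged
    prevIdleValue = idleCounter.load(std::memory_order::acquire);

    for (auto &slot : commandSlots) {
      if (std::uint64_t value = slot.load(std::memory_order::relaxed); value != 0) {
        // ... process value ...
        slot.store(0, std::memory_order::release); // release the waiting producer
      }
    }
  }
}

In the patch this pattern appears in both directions: notifyPageChanges publishes into cpuCacheCommands and wakes runBridge's bridge thread through cpuCacheCommandsIdle, while cacheUpdateThread sleeps on gpuCacheCommandIdle instead of spinning over gpuCacheCommand.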