From 9ce2caecfc85e4fda50fed3350304b75b223b06b Mon Sep 17 00:00:00 2001
From: "Dr. Moritz Lehmann" <dr.moritz.lehmann@gmail.com>
Date: Thu, 2 May 2024 20:51:32 +0200
Subject: [PATCH] Fixed terrible performance on ARM GPUs by macro-replacing
 fused-multiply-add (fma) with a*b+c

---
 README.md      | 5 ++---
 src/opencl.hpp | 3 +++
 2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 00ae9f44..ca0e7ef6 100644
--- a/README.md
+++ b/README.md
@@ -153,6 +153,7 @@ The fastest and most memory efficient lattice Boltzmann CFD software, running on
   - fixed that voxelization failed in Intel OpenCL CPU Runtime due to array out-of-bounds access
   - fixed that voxelization did not always produce binary identical results in multi-GPU compared to single-GPU
   - fixed that velocity voxelization failed for free surface simulations
+  - fixed terrible performance on ARM GPUs by macro-replacing fused-multiply-add (`fma`) with `a*b+c`
   - fixed that <kbd>Y</kbd>/<kbd>Z</kbd> keys were incorrect for `QWERTY` keyboard layout in Linux
   - fixed that free camera movement speed in help overlay was not updated in stationary image when scrolling
   - fixed that cursor would sometimes flicker when scrolling on trackpads with Linux-X11 interactive graphics
@@ -552,9 +553,7 @@ Colors: 🔴 AMD, 🔵 Intel, 🟢 Nvidia, ⚪ Apple, 🟡 ARM, 🟤 Glenfly
 | 🔵&nbsp;UHD&nbsp;Graphics&nbsp;P630              |               0.46 |          51 |           42 |              177 (65%) |               288 (53%) |               137 (25%) |
 | 🔵&nbsp;HD&nbsp;Graphics&nbsp;5500               |               0.35 |           3 |           26 |               75 (45%) |               192 (58%) |               108 (32%) |
 | 🔵&nbsp;HD&nbsp;Graphics&nbsp;4600               |               0.38 |           2 |           26 |              105 (63%) |               115 (35%) |                34 (10%) |
-| 🟡&nbsp;Mali-G610&nbsp;MP4 (Orange&nbsp;Pi&nbsp;5&nbsp;Plus) |   0.06 |          16 |           34 |               43 (19%) |                59 (13%) |                19 ( 4%) |
-| 🟡&nbsp;Mali-G72&nbsp;MP18 (Samsung&nbsp;S9+)    |               0.24 |           4 |           29 |               14 ( 7%) |                17 ( 5%) |                12 ( 3%) |
-| 🟡&nbsp;Qualcomm&nbsp;Adreno&nbsp;530 (LG&nbsp;G6) |             0.33 |           2 |           30 |                1 ( 1%) |                 1 ( 0%) |                 1 ( 0%) |
+| 🟡&nbsp;Mali-G72&nbsp;MP18 (Samsung&nbsp;S9+)    |               0.24 |           4 |           29 |              110 (59%) |               230 (62%) |                21 ( 6%) |
 |                                                  |                    |             |              |                        |                         |                         |
 | 🔴&nbsp;2x&nbsp;EPYC&nbsp;9654                   |              29.49 |        1536 |          922 |             1381 (23%) |              1814 (15%) |              1801 (15%) |
 | 🔵&nbsp;2x&nbsp;Xeon&nbsp;CPU&nbsp;Max&nbsp;9480 |              13.62 |         256 |          614 |             2037 (51%) |              1520 (19%) |              1464 (18%) |
diff --git a/src/opencl.hpp b/src/opencl.hpp
index 593ef82c..60001034 100644
--- a/src/opencl.hpp
+++ b/src/opencl.hpp
@@ -25,6 +25,7 @@ struct Device_Info {
 	uint clock_frequency=0u; // in MHz
 	bool is_cpu=false, is_gpu=false;
 	bool intel_gpu_above_4gb_patch = false; // memory allocations greater than 4GB need to be specifically enabled on Intel GPUs
+	bool arm_fma_patch = false;  // ARM GPUs have terrible fma performance, so replace with a*b+c
 	uint is_fp64_capable=0u, is_fp32_capable=0u, is_fp16_capable=0u, is_int64_capable=0u, is_int32_capable=0u, is_int16_capable=0u, is_int8_capable=0u;
 	uint cores=0u; // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
 	float tflops=0.0f; // estimated device FP32 floating point performance in TeraFLOPs/s
@@ -77,6 +78,7 @@ struct Device_Info {
 			}
 		}
 		intel_gpu_above_4gb_patch = (intel==8.0f)&&(memory>4096); // enable memory allocations greater than 4GB for Intel GPUs with >4GB VRAM
+		arm_fma_patch = contains(to_lower(vendor), "arm"); // enable for all ARM GPUs
 	}
 	inline Device_Info() {}; // default constructor
 };
@@ -174,6 +176,7 @@ class Device {
 		"\n	#ifdef cl_khr_int64_base_atomics"
 		"\n	#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" // make sure cl_khr_int64_base_atomics extension is enabled
 		"\n	#endif"
+		+(info.arm_fma_patch ? "\n #define fma(a, b, c) ((a)*(b)+(c))" : "") // ARM GPUs have terrible fma performance, so replace with a*b+c
 	;}
 public:
 	Device_Info info;