From 55523db842fa9c8a69aa06a07656b800914a0e16 Mon Sep 17 00:00:00 2001
From: tyoeer <tyoeer@users.noreply.github.com>
Date: Mon, 23 Dec 2024 20:49:58 +0100
Subject: [PATCH 1/3] Add f32x4 arithmetic instructions to Pulley:

Adds float SIMD instructions on 4 lanes (f32x4) for subtraction, multiplication, and negation.
`vtrunc32x4` and `vmuli32x4` were used as a basis for how to organize things.
---
 .../codegen/src/isa/pulley_shared/lower.isle  |  3 ++
 pulley/src/interp.rs                          | 29 +++++++++++++++++++
 pulley/src/lib.rs                             |  6 ++++
 3 files changed, 38 insertions(+)
diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle
index b62d44810eb1..fd4a1d6d57c1 100644
--- a/cranelift/codegen/src/isa/pulley_shared/lower.isle
+++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle
@@ -1157,12 +1157,14 @@
 
 (rule (lower (has_type $F32 (fsub a b))) (pulley_fsub32 a b))
 (rule (lower (has_type $F64 (fsub a b))) (pulley_fsub64 a b))
+(rule (lower (has_type $F32X4 (fsub a b))) (pulley_vsub32x4 a b))
 (rule (lower (has_type $F64X2 (fsub a b))) (pulley_vsubf64x2 a b))
 
 ;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $F32 (fmul a b))) (pulley_fmul32 a b))
 (rule (lower (has_type $F64 (fmul a b))) (pulley_fmul64 a b))
+(rule (lower (has_type $F32X4 (fmul a b))) (pulley_vmul32x4 a b))
 (rule (lower (has_type $F64X2 (fmul a b))) (pulley_vmulf64x2 a b))
 
 ;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1233,6 +1235,7 @@
 
 (rule (lower (has_type $F32 (fneg a))) (pulley_fneg32 a))
 (rule (lower (has_type $F64 (fneg a))) (pulley_fneg64 a))
+(rule (lower (has_type $F32X4 (fneg a))) (pulley_vnegf32x4 a))
 (rule (lower (has_type $F64X2 (fneg a))) (pulley_vnegf64x2 a))
 
 ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
index 31cb072c55c6..4cd810cef528 100644
--- a/pulley/src/interp.rs
+++ b/pulley/src/interp.rs
@@ -2980,6 +2980,16 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    fn vsub32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = *a - b;
+        }
+        self.state[operands.dst].set_f32x4(a);
+        ControlFlow::Continue(())
+    }
+
     fn fmul32(&mut self, operands: BinaryOperands<FReg>) -> ControlFlow<Done> {
         let a = self.state[operands.src1].get_f32();
         let b = self.state[operands.src2].get_f32();
@@ -2987,6 +2997,16 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    fn vmul32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+        let mut a = self.state[operands.src1].get_f32x4();
+        let b = self.state[operands.src2].get_f32x4();
+        for (a, b) in a.iter_mut().zip(b) {
+            *a = *a * b;
+        }
+        self.state[operands.dst].set_f32x4(a);
+        ControlFlow::Continue(())
+    }
+
     fn fdiv32(&mut self, operands: BinaryOperands<FReg>) -> ControlFlow<Done> {
         let a = self.state[operands.src1].get_f32();
         let b = self.state[operands.src2].get_f32();
@@ -3162,6 +3182,15 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
+    fn vnegf32x4(&mut self, dst: VReg, src: VReg) -> ControlFlow<Done> {
+        let mut a = self.state[src].get_f32x4();
+        for elem in a.iter_mut() {
+            *elem = -*elem;
+        }
+        self.state[dst].set_f32x4(a);
+        ControlFlow::Continue(())
+    }
+
     fn fabs32(&mut self, dst: FReg, src: FReg) -> ControlFlow<Done> {
         let a = self.state[src].get_f32();
         self.state[dst].set_f32(a.wasm_abs());
diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
index 447264b6a37f..51de25ca5f4f 100644
--- a/pulley/src/lib.rs
+++ b/pulley/src/lib.rs
@@ -811,8 +811,12 @@ macro_rules! for_each_extended_op {
             fadd32 = Fadd32 { operands: BinaryOperands<FReg> };
             /// `low32(dst) = low32(src1) - low32(src2)`
             fsub32 = Fsub32 { operands: BinaryOperands<FReg> };
+            /// `low128(dst) = low128(src1) - low128(src2)`
+            vsub32x4 = Vsub32x4 { operands: BinaryOperands<VReg> };
             /// `low32(dst) = low32(src1) * low32(src2)`
             fmul32 = Fmul32 { operands: BinaryOperands<FReg> };
+            /// `low128(dst) = low128(src1) * low128(src2)`
+            vmul32x4 = Vmul32x4 { operands: BinaryOperands<VReg> };
             /// `low32(dst) = low32(src1) / low32(src2)`
             fdiv32 = Fdiv32 { operands: BinaryOperands<FReg> };
             /// `low128(dst) = low128(src1) / low128(src2)`
@@ -849,6 +853,8 @@ macro_rules! for_each_extended_op {
             vsqrt64x2 = Vsqrt64x2 { dst: VReg, src: VReg };
             /// `low32(dst) = -low32(src)`
             fneg32 = Fneg32 { dst: FReg, src: FReg };
+            /// `low128(dst) = -low128(src)`
+            vnegf32x4 = Vnegf32x4 { dst: VReg, src: VReg };
             /// `low32(dst) = |low32(src)|`
             fabs32 = Fabs32 { dst: FReg, src: FReg };
 

From d910101a67fff3df9db92e20b3ae7f9d026cfc74 Mon Sep 17 00:00:00 2001
From: tyoeer <tyoeer@users.noreply.github.com>
Date: Mon, 23 Dec 2024 21:11:21 +0100
Subject: [PATCH 2/3] Mark `simd_f32x4_arith.wast` as passing for Pulley:

To be exact: `spec_testsuite/simd_f32x4_arith.wast` has been removed from the should-fail list for Pulley.
---
 crates/wast-util/src/lib.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
index f8e15777a481..657303433c54 100644
--- a/crates/wast-util/src/lib.rs
+++ b/crates/wast-util/src/lib.rs
@@ -408,7 +408,6 @@ impl WastTest {
                 "spec_testsuite/proposals/relaxed-simd/relaxed_madd_nmadd.wast",
                 "spec_testsuite/proposals/memory64/relaxed_madd_nmadd.wast",
                 "spec_testsuite/proposals/memory64/i32x4_relaxed_trunc.wast",
-                "spec_testsuite/simd_f32x4_arith.wast",
                 "spec_testsuite/simd_f32x4_cmp.wast",
                 "spec_testsuite/simd_f32x4_pmin_pmax.wast",
                 "spec_testsuite/simd_f64x2_cmp.wast",

From e6ca3d16d3ef57979fc152a70ac5d6ddfe64d5b9 Mon Sep 17 00:00:00 2001
From: tyoeer <tyoeer@users.noreply.github.com>
Date: Thu, 2 Jan 2025 14:07:05 +0100
Subject: [PATCH 3/3] Rename 2 f32x4 arithmetic instructions to contain
 "f32x4":

Specifically:
- "vsub32x4" -> "vsubf32x4"
- "vmul32x4" -> "vmulf32x4"
---
 cranelift/codegen/src/isa/pulley_shared/lower.isle | 4 ++--
 pulley/src/interp.rs                               | 4 ++--
 pulley/src/lib.rs                                  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.isle b/cranelift/codegen/src/isa/pulley_shared/lower.isle
index fd4a1d6d57c1..72ee094e0b1e 100644
--- a/cranelift/codegen/src/isa/pulley_shared/lower.isle
+++ b/cranelift/codegen/src/isa/pulley_shared/lower.isle
@@ -1157,14 +1157,14 @@
 
 (rule (lower (has_type $F32 (fsub a b))) (pulley_fsub32 a b))
 (rule (lower (has_type $F64 (fsub a b))) (pulley_fsub64 a b))
-(rule (lower (has_type $F32X4 (fsub a b))) (pulley_vsub32x4 a b))
+(rule (lower (has_type $F32X4 (fsub a b))) (pulley_vsubf32x4 a b))
 (rule (lower (has_type $F64X2 (fsub a b))) (pulley_vsubf64x2 a b))
 
 ;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type $F32 (fmul a b))) (pulley_fmul32 a b))
 (rule (lower (has_type $F64 (fmul a b))) (pulley_fmul64 a b))
-(rule (lower (has_type $F32X4 (fmul a b))) (pulley_vmul32x4 a b))
+(rule (lower (has_type $F32X4 (fmul a b))) (pulley_vmulf32x4 a b))
 (rule (lower (has_type $F64X2 (fmul a b))) (pulley_vmulf64x2 a b))
 
 ;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs
index 4cd810cef528..8c9a8dbc1d81 100644
--- a/pulley/src/interp.rs
+++ b/pulley/src/interp.rs
@@ -2980,7 +2980,7 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
-    fn vsub32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+    fn vsubf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
         let mut a = self.state[operands.src1].get_f32x4();
         let b = self.state[operands.src2].get_f32x4();
         for (a, b) in a.iter_mut().zip(b) {
@@ -2997,7 +2997,7 @@ impl ExtendedOpVisitor for Interpreter<'_> {
         ControlFlow::Continue(())
     }
 
-    fn vmul32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
+    fn vmulf32x4(&mut self, operands: BinaryOperands<VReg>) -> ControlFlow<Done> {
         let mut a = self.state[operands.src1].get_f32x4();
         let b = self.state[operands.src2].get_f32x4();
         for (a, b) in a.iter_mut().zip(b) {
diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs
index 51de25ca5f4f..236346345264 100644
--- a/pulley/src/lib.rs
+++ b/pulley/src/lib.rs
@@ -812,11 +812,11 @@ macro_rules! for_each_extended_op {
             /// `low32(dst) = low32(src1) - low32(src2)`
             fsub32 = Fsub32 { operands: BinaryOperands<FReg> };
             /// `low128(dst) = low128(src1) - low128(src2)`
-            vsub32x4 = Vsub32x4 { operands: BinaryOperands<VReg> };
+            vsubf32x4 = Vsubf32x4 { operands: BinaryOperands<VReg> };
             /// `low32(dst) = low32(src1) * low32(src2)`
             fmul32 = Fmul32 { operands: BinaryOperands<FReg> };
             /// `low128(dst) = low128(src1) * low128(src2)`
-            vmul32x4 = Vmul32x4 { operands: BinaryOperands<VReg> };
+            vmulf32x4 = Vmulf32x4 { operands: BinaryOperands<VReg> };
             /// `low32(dst) = low32(src1) / low32(src2)`
             fdiv32 = Fdiv32 { operands: BinaryOperands<FReg> };
             /// `low128(dst) = low128(src1) / low128(src2)`