From 22ab7d51dae68087f033534cd49d5924d5daaefd Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Mon, 22 Jul 2024 07:37:05 -0700
Subject: [PATCH] fix: roll back loop vectorization for now

---
 Project.toml                 |  2 --
 src/LuxLib.jl                |  4 +---
 src/impl/activation.jl       |  2 +-
 src/impl/affine_normalize.jl | 30 +++++++++++++++++-------------
 src/impl/dropout.jl          |  8 ++++----
 src/impl/normalization.jl    |  2 +-
 6 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/Project.toml b/Project.toml
index 0e58bdd9..f24133b6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,7 +12,6 @@ FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
 LuxDeviceUtils = "34f89e08-e1d5-43b4-8944-0b49ac560553"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
@@ -51,7 +50,6 @@ FastClosures = "0.3.2"
 ForwardDiff = "0.10.36"
 KernelAbstractions = "0.9.22"
 LinearAlgebra = "1.10"
-LoopVectorization = "0.12.171"
 LuxCore = "0.1.13"
 LuxDeviceUtils = "0.1.26"
 LuxTestUtils = "0.1.18"

diff --git a/src/LuxLib.jl b/src/LuxLib.jl
index e0355008..292202ff 100644
--- a/src/LuxLib.jl
+++ b/src/LuxLib.jl
@@ -8,13 +8,11 @@ using FastClosures: @closure
 using ForwardDiff: ForwardDiff
 using KernelAbstractions: KernelAbstractions, @kernel, @Const, @index
 using LinearAlgebra: LinearAlgebra, BLAS, mul!
-using LoopVectorization: @turbo
 using LuxCore: LuxCore
 using LuxDeviceUtils: get_device_type, LuxAMDGPUDevice, LuxCUDADevice, LuxCPUDevice,
                       AbstractLuxGPUDevice, AbstractLuxDevice
 using Markdown: @doc_str
-using NNlib: NNlib, ConvDims, conv, conv!, relu, gelu, sigmoid_fast, swish, σ, ∇conv_data,
-             ∇conv_filter
+using NNlib: NNlib, ConvDims, conv, conv!, relu, gelu, σ, ∇conv_data, ∇conv_filter
 using Random: Random, AbstractRNG, rand!
 using Reexport: @reexport
 using Statistics: Statistics, mean, var
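The change repeated throughout this patch: every @turbo loop (LoopVectorization.jl) becomes a plain Julia loop annotated with Base's @simd ivdep, which removes the package dependency entirely. The ivdep flag promises the compiler that iterations neither alias nor depend on one another, so it is applied only where each index writes its own slot. A minimal standalone sketch of the substitution; vscale! is an illustrative name, not part of LuxLib:

    # before the rollback: @turbo for i in eachindex(y, x); y[i] = a * x[i]; end
    function vscale!(y::AbstractVector, x::AbstractVector, a::Real)
        @simd ivdep for i in eachindex(y, x)  # ivdep: iterations touch disjoint slots
            @inbounds y[i] = a * x[i]
        end
        return y
    end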
diff --git a/src/impl/activation.jl b/src/impl/activation.jl
index 0b83e03f..77016c99 100644
--- a/src/impl/activation.jl
+++ b/src/impl/activation.jl
@@ -21,7 +21,7 @@ end
 function _fast_activation!(
         ::LoopedArrayOp, y::AbstractArray, σ::F, x::AbstractArray) where {F}
-    @turbo for I in eachindex(y, x)
+    @simd ivdep for I in eachindex(y, x)
         @inbounds y[I] = σ(x[I])
     end
 end

diff --git a/src/impl/affine_normalize.jl b/src/impl/affine_normalize.jl
index 1698e2ae..11be7a0e 100644
--- a/src/impl/affine_normalize.jl
+++ b/src/impl/affine_normalize.jl
@@ -58,11 +58,13 @@ end
 function __affine_normalize_gn_impl!(::LoopedArrayOp, y::AbstractArray{<:Number, 4},
         f::F, x::AbstractArray{<:Number, 4}, μ, σ², ::Nothing, ::Nothing,
         ϵ::Real) where {F}
-    @turbo for L in axes(y, 4), K in axes(y, 3)
+    for L in axes(y, 4), K in axes(y, 3)
         @inbounds _sc = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
         @inbounds _bc = -μ[1, 1, K, L] * _sc
-        for J in axes(y, 2), I in axes(y, 1)
-            @inbounds y[I, J, K, L] = muladd(x[I, J, K, L], _sc, _bc)
+        for J in axes(y, 2)
+            @simd ivdep for I in axes(y, 1)
+                @inbounds y[I, J, K, L] = muladd(x[I, J, K, L], _sc, _bc)
+            end
         end
     end
     _fast_activation!(f, y) # NOTE: don't fuse into the above loop
@@ -71,12 +73,12 @@ end
 function __affine_normalize_gn_impl!(::LoopedArrayOp, y::AbstractArray{<:Number, 4},
         f::F, x::AbstractArray{<:Number, 4}, μ, σ², scale::AbstractArray{<:Number, 4},
         bias::AbstractArray{<:Number, 4}, ϵ::Real) where {F}
-    @turbo for L in axes(y, 4), K in axes(y, 3)
+    for L in axes(y, 4), K in axes(y, 3)
         @inbounds idenom = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
         for J in axes(y, 2)
             @inbounds _sc = scale[1, J, K, 1] * idenom
             @inbounds _bc = muladd(-μ[1, 1, K, L], _sc, bias[1, J, K, 1])
-            for I in axes(y, 1)
+            @simd ivdep for I in axes(y, 1)
                 @inbounds y[I, J, K, L] = muladd(x[I, J, K, L], _sc, _bc)
             end
         end
@@ -180,15 +182,17 @@ function ∇affine_normalize_gn_impl(::LoopedArrayOp, ∂y, x, μ, σ², ::Nothi
     ∂x, ∂μ, ∂σ² = similar(x), zero.(μ), zero.(σ²)
     half = eltype(∂σ²)(0.5)
 
-    @turbo for L in axes(∂y, 4), K in axes(∂y, 3)
+    for L in axes(∂y, 4), K in axes(∂y, 3)
         @inbounds idenom = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
         idenom² = idenom^2
 
-        for J in axes(∂y, 2), I in axes(∂y, 1)
-            @inbounds xμ = x[I, J, K, L] - μ[1, 1, K, L]
+        for J in axes(∂y, 2)
+            @simd for I in axes(∂y, 1)
+                @inbounds xμ = x[I, J, K, L] - μ[1, 1, K, L]
 
-            @inbounds ∂x[I, J, K, L] = ∂y[I, J, K, L] * idenom
-            @inbounds ∂μ[1, 1, K, L] -= ∂x[I, J, K, L]
-            @inbounds ∂σ²[1, 1, K, L] -= ∂x[I, J, K, L] * xμ * half * idenom²
+                @inbounds ∂x[I, J, K, L] = ∂y[I, J, K, L] * idenom
+                @inbounds ∂μ[1, 1, K, L] -= ∂x[I, J, K, L]
+                @inbounds ∂σ²[1, 1, K, L] -= ∂x[I, J, K, L] * xμ * half * idenom²
+            end
         end
     end
 
@@ -199,12 +203,12 @@ function ∇affine_normalize_gn_impl(::LoopedArrayOp, ∂y, x, μ, σ², scale,
     ∂x, ∂μ, ∂σ², ∂sc, ∂b = similar(x), zero.(μ), zero.(σ²), zero.(scale), zero.(bias)
     half = eltype(∂σ²)(0.5)
 
-    @turbo for L in axes(∂y, 4), K in axes(∂y, 3)
+    for L in axes(∂y, 4), K in axes(∂y, 3)
         @inbounds idenom = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
         idenom² = idenom^2
         for J in axes(∂y, 2)
             @inbounds _sc = scale[1, J, K, 1] * idenom
-            for I in axes(∂y, 1)
+            @simd for I in axes(∂y, 1)
                 @inbounds xμ = x[I, J, K, L] - μ[1, 1, K, L]
 
                 @inbounds ∂x[I, J, K, L] = ∂y[I, J, K, L] * _sc
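Why the nests above change shape rather than only swapping macros: @turbo can vectorize a whole loop nest, while Base's @simd annotates a single innermost loop, so the I loop is split out and the per-column scale/shift are hoisted by hand. The forward kernels keep ivdep because every iteration writes a distinct y[I, J, K, L]; the backward kernels use plain @simd, since each iteration also accumulates into the shared ∂μ[1, 1, K, L] and ∂σ²[1, 1, K, L] entries and the iterations are therefore not independent. A distilled sketch of the forward shape, as a hypothetical 2-D affine! rather than the LuxLib API:

    function affine!(y::AbstractMatrix, x::AbstractMatrix,
            sc::AbstractVector, b::AbstractVector)
        for J in axes(y, 2)
            @inbounds _sc, _bc = sc[J], b[J]  # hoisted: invariant over the inner loop
            @simd ivdep for I in axes(y, 1)   # distinct y[I, J] per iteration
                @inbounds y[I, J] = muladd(x[I, J], _sc, _bc)
            end
        end
        return y
    end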
diff --git a/src/impl/dropout.jl b/src/impl/dropout.jl
index ac96a69d..3ae38fdf 100644
--- a/src/impl/dropout.jl
+++ b/src/impl/dropout.jl
@@ -14,7 +14,7 @@ end
         ::LoopedArrayOp,
         noise::AbstractArray, p::Real, x::AbstractArray, α::Real, A::Real, B::Real)
     res = similar(x, promote_type(typeof(p), typeof(α)))
-    @turbo for i in eachindex(noise)
+    @simd ivdep for i in eachindex(noise)
         @inbounds res[i] = muladd(ifelse(noise[i] > p, x[i], α), A, B)
     end
     return res
@@ -32,7 +32,7 @@ function CRC.rrule(::typeof(_alpha_dropout_kernel), ::LoopedArrayOp, noise::Abst
         p::Real, x::AbstractArray, α::Real, A::Real, B::Real)
     _cond = similar(noise, Bool)
     y = similar(x, promote_type(typeof(p), typeof(α), typeof(A), typeof(B), eltype(x)))
-    @turbo for i in eachindex(noise)
+    @simd ivdep for i in eachindex(noise)
         @inbounds _cond[i] = noise[i] > p
         @inbounds y[i] = muladd(ifelse(_cond[i], x[i], α), A, B)
     end
@@ -41,7 +41,7 @@ function CRC.rrule(::typeof(_alpha_dropout_kernel), ::LoopedArrayOp, noise::Abst
     _∇alpha_dropout_kernel = let _cond = _cond, proj_x = proj_x, x = x, noise = noise
         Δ -> begin
             ∂x = similar(x)
-            @turbo for i in eachindex(noise)
+            @simd ivdep for i in eachindex(noise)
                 @inbounds ∂x[i] = _cond[i] * Δ[i] * A
             end
             return (ntuple(Returns(∂∅), 4)..., proj_x(∂x), ntuple(Returns(∂∅), 3)...)
@@ -87,7 +87,7 @@ EnzymeRules.inactive_noinl(::typeof(_alpha_dropout_noise), ::Any...) = nothing
     rand!(rng, y)
     opmode = internal_operation_mode(y)
     if opmode isa LoopedArrayOp
-        @turbo for i in eachindex(y)
+        @simd ivdep for i in eachindex(y)
             @inbounds y[i] = (y[i] > p) * invp
         end
     else

diff --git a/src/impl/normalization.jl b/src/impl/normalization.jl
index 2bf09c9a..a603cbed 100644
--- a/src/impl/normalization.jl
+++ b/src/impl/normalization.jl
@@ -18,7 +18,7 @@ function __update_statistics(opmode, rμ, rσ², μ, σ², m1, m2)
     return rμ2, rσ²2
 end
 function __update_statistics!(::LoopedArrayOp, rμ2, rσ²2, rμ, rσ², μ, σ², m1, m2, m3)
-    @turbo for I in eachindex(rμ2, rσ²2)
+    @simd ivdep for I in eachindex(rμ2, rσ²2)
         @inbounds rμ2[I] = m3 * rμ[I] + m1 * μ[I]
         @inbounds rσ²2[I] = m3 * rσ²[I] + m2 * σ²[I]
     end
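Two details in the last two files are worth noting beyond the macro swap. The dropout kernels are written branch-free: ifelse evaluates both arms and selects, so the loop body stays straight-line code that vectorizes cleanly, and each index is written exactly once, which keeps ivdep valid. A distilled sketch under those assumptions (alpha_mask! is an illustrative name, not the package API):

    function alpha_mask!(res::AbstractArray, noise::AbstractArray, x::AbstractArray,
            p::Real, α::Real, A::Real, B::Real)
        @simd ivdep for i in eachindex(res, noise, x)
            # keep x[i] where the noise survives (> p), otherwise the constant α;
            # A and B then rescale and shift, all without a branch
            @inbounds res[i] = muladd(ifelse(noise[i] > p, x[i], α), A, B)
        end
        return res
    end

The running-statistics update in normalization.jl is likewise purely elementwise, rμ2[I] = m3 * rμ[I] + m1 * μ[I] and its σ² analogue, with one write per index, so ivdep remains safe there as well.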