From 22ab7d51dae68087f033534cd49d5924d5daaefd Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Mon, 22 Jul 2024 07:37:05 -0700
Subject: [PATCH] fix: roll back loop vectorization for now

---
 Project.toml                 |  2 --
 src/LuxLib.jl                |  4 +---
 src/impl/activation.jl       |  2 +-
 src/impl/affine_normalize.jl | 30 +++++++++++++++++-------------
 src/impl/dropout.jl          |  8 ++++----
 src/impl/normalization.jl    |  2 +-
 6 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/Project.toml b/Project.toml
index 0e58bdd9..f24133b6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,7 +12,6 @@ FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
 LuxDeviceUtils = "34f89e08-e1d5-43b4-8944-0b49ac560553"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
@@ -51,7 +50,6 @@ FastClosures = "0.3.2"
 ForwardDiff = "0.10.36"
 KernelAbstractions = "0.9.22"
 LinearAlgebra = "1.10"
-LoopVectorization = "0.12.171"
 LuxCore = "0.1.13"
 LuxDeviceUtils = "0.1.26"
 LuxTestUtils = "0.1.18"

diff --git a/src/LuxLib.jl b/src/LuxLib.jl
index e0355008..292202ff 100644
--- a/src/LuxLib.jl
+++ b/src/LuxLib.jl
@@ -8,13 +8,11 @@ using FastClosures: @closure
 using ForwardDiff: ForwardDiff
 using KernelAbstractions: KernelAbstractions, @kernel, @Const, @index
 using LinearAlgebra: LinearAlgebra, BLAS, mul!
-using LoopVectorization: @turbo
 using LuxCore: LuxCore
 using LuxDeviceUtils: get_device_type, LuxAMDGPUDevice, LuxCUDADevice, LuxCPUDevice,
                       AbstractLuxGPUDevice, AbstractLuxDevice
 using Markdown: @doc_str
-using NNlib: NNlib, ConvDims, conv, conv!, relu, gelu, sigmoid_fast, swish, σ, ∇conv_data,
-             ∇conv_filter
+using NNlib: NNlib, ConvDims, conv, conv!, relu, gelu, σ, ∇conv_data, ∇conv_filter
 using Random: Random, AbstractRNG, rand!
 using Reexport: @reexport
 using Statistics: Statistics, mean, var
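The change repeated throughout this patch: every @turbo loop (LoopVectorization.jl) becomes a plain Julia loop annotated with Base's @simd ivdep, which removes the package dependency entirely. The ivdep flag promises the compiler that iterations neither alias nor depend on one another, so it is applied only where each index writes its own slot. A minimal standalone sketch of the substitution; vscale! is an illustrative name, not part of LuxLib:

    # before the rollback: @turbo for i in eachindex(y, x); y[i] = a * x[i]; end
    function vscale!(y::AbstractVector, x::AbstractVector, a::Real)
        @simd ivdep for i in eachindex(y, x)  # ivdep: iterations touch disjoint slots
            @inbounds y[i] = a * x[i]
        end
        return y
    end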
diff --git a/src/impl/activation.jl b/src/impl/activation.jl
index 0b83e03f..77016c99 100644
--- a/src/impl/activation.jl
+++ b/src/impl/activation.jl
@@ -21,7 +21,7 @@ end
 function _fast_activation!(
         ::LoopedArrayOp, y::AbstractArray, σ::F, x::AbstractArray) where {F}
-    @turbo for I in eachindex(y, x)
+    @simd ivdep for I in eachindex(y, x)
         @inbounds y[I] = σ(x[I])
     end
 end

diff --git a/src/impl/affine_normalize.jl b/src/impl/affine_normalize.jl
index 1698e2ae..11be7a0e 100644
--- a/src/impl/affine_normalize.jl
+++ b/src/impl/affine_normalize.jl
@@ -58,11 +58,13 @@ end
 function __affine_normalize_gn_impl!(::LoopedArrayOp, y::AbstractArray{<:Number, 4},
         f::F, x::AbstractArray{<:Number, 4}, μ, σ², ::Nothing, ::Nothing,
         ϵ::Real) where {F}
-    @turbo for L in axes(y, 4), K in axes(y, 3)
+    for L in axes(y, 4), K in axes(y, 3)
         @inbounds _sc = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
         @inbounds _bc = -μ[1, 1, K, L] * _sc
-        for J in axes(y, 2), I in axes(y, 1)
-            @inbounds y[I, J, K, L] = muladd(x[I, J, K, L], _sc, _bc)
+        for J in axes(y, 2)
+            @simd ivdep for I in axes(y, 1)
+                @inbounds y[I, J, K, L] = muladd(x[I, J, K, L], _sc, _bc)
+            end
         end
     end
     _fast_activation!(f, y) # NOTE: don't fuse into the above loop
@@ -71,12 +73,12 @@ end
 function __affine_normalize_gn_impl!(::LoopedArrayOp, y::AbstractArray{<:Number, 4},
         f::F, x::AbstractArray{<:Number, 4}, μ, σ², scale::AbstractArray{<:Number, 4},
         bias::AbstractArray{<:Number, 4}, ϵ::Real) where {F}
-    @turbo for L in axes(y, 4), K in axes(y, 3)
+    for L in axes(y, 4), K in axes(y, 3)
         @inbounds idenom = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
         for J in axes(y, 2)
             @inbounds _sc = scale[1, J, K, 1] * idenom
             @inbounds _bc = muladd(-μ[1, 1, K, L], _sc, bias[1, J, K, 1])
-            for I in axes(y, 1)
+            @simd ivdep for I in axes(y, 1)
                 @inbounds y[I, J, K, L] = muladd(x[I, J, K, L], _sc, _bc)
             end
         end
@@ -180,15 +182,17 @@ function ∇affine_normalize_gn_impl(::LoopedArrayOp, ∂y, x, μ, σ², ::Nothi
     ∂x, ∂μ, ∂σ² = similar(x), zero.(μ), zero.(σ²)
     half = eltype(∂σ²)(0.5)
 
-    @turbo for L in axes(∂y, 4), K in axes(∂y, 3)
+    for L in axes(∂y, 4), K in axes(∂y, 3)
         @inbounds idenom = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
         idenom² = idenom^2
 
-        for J in axes(∂y, 2), I in axes(∂y, 1)
-            @inbounds xμ = x[I, J, K, L] - μ[1, 1, K, L]
+        for J in axes(∂y, 2)
+            @simd for I in axes(∂y, 1)
+                @inbounds xμ = x[I, J, K, L] - μ[1, 1, K, L]
 
-            @inbounds ∂x[I, J, K, L] = ∂y[I, J, K, L] * idenom
-            @inbounds ∂μ[1, 1, K, L] -= ∂x[I, J, K, L]
-            @inbounds ∂σ²[1, 1, K, L] -= ∂x[I, J, K, L] * xμ * half * idenom²
+                @inbounds ∂x[I, J, K, L] = ∂y[I, J, K, L] * idenom
+                @inbounds ∂μ[1, 1, K, L] -= ∂x[I, J, K, L]
+                @inbounds ∂σ²[1, 1, K, L] -= ∂x[I, J, K, L] * xμ * half * idenom²
+            end
         end
     end
 
@@ -199,12 +203,12 @@ function ∇affine_normalize_gn_impl(::LoopedArrayOp, ∂y, x, μ, σ², scale,
     ∂x, ∂μ, ∂σ², ∂sc, ∂b = similar(x), zero.(μ), zero.(σ²), zero.(scale), zero.(bias)
     half = eltype(∂σ²)(0.5)
 
-    @turbo for L in axes(∂y, 4), K in axes(∂y, 3)
+    for L in axes(∂y, 4), K in axes(∂y, 3)
         @inbounds idenom = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
         idenom² = idenom^2
         for J in axes(∂y, 2)
             @inbounds _sc = scale[1, J, K, 1] * idenom
-            for I in axes(∂y, 1)
+            @simd for I in axes(∂y, 1)
                 @inbounds xμ = x[I, J, K, L] - μ[1, 1, K, L]
 
                 @inbounds ∂x[I, J, K, L] = ∂y[I, J, K, L] * _sc
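Why the nests above change shape rather than only swapping macros: @turbo can vectorize a whole loop nest, while Base's @simd annotates a single innermost loop, so the I loop is split out and the per-column scale/shift are hoisted by hand. The forward kernels keep ivdep because every iteration writes a distinct y[I, J, K, L]; the backward kernels use plain @simd, since each iteration also accumulates into the shared ∂μ[1, 1, K, L] and ∂σ²[1, 1, K, L] entries and the iterations are therefore not independent. A distilled sketch of the forward shape, as a hypothetical 2-D affine! rather than the LuxLib API:

    function affine!(y::AbstractMatrix, x::AbstractMatrix,
            sc::AbstractVector, b::AbstractVector)
        for J in axes(y, 2)
            @inbounds _sc, _bc = sc[J], b[J]  # hoisted: invariant over the inner loop
            @simd ivdep for I in axes(y, 1)   # distinct y[I, J] per iteration
                @inbounds y[I, J] = muladd(x[I, J], _sc, _bc)
            end
        end
        return y
    end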
diff --git a/src/impl/dropout.jl b/src/impl/dropout.jl
index ac96a69d..3ae38fdf 100644
--- a/src/impl/dropout.jl
+++ b/src/impl/dropout.jl
@@ -14,7 +14,7 @@ end
         ::LoopedArrayOp,
         noise::AbstractArray, p::Real, x::AbstractArray, α::Real, A::Real, B::Real)
     res = similar(x, promote_type(typeof(p), typeof(α)))
-    @turbo for i in eachindex(noise)
+    @simd ivdep for i in eachindex(noise)
         @inbounds res[i] = muladd(ifelse(noise[i] > p, x[i], α), A, B)
     end
     return res
@@ -32,7 +32,7 @@ function CRC.rrule(::typeof(_alpha_dropout_kernel), ::LoopedArrayOp, noise::Abst
         p::Real, x::AbstractArray, α::Real, A::Real, B::Real)
     _cond = similar(noise, Bool)
     y = similar(x, promote_type(typeof(p), typeof(α), typeof(A), typeof(B), eltype(x)))
-    @turbo for i in eachindex(noise)
+    @simd ivdep for i in eachindex(noise)
         @inbounds _cond[i] = noise[i] > p
         @inbounds y[i] = muladd(ifelse(_cond[i], x[i], α), A, B)
     end
@@ -41,7 +41,7 @@ function CRC.rrule(::typeof(_alpha_dropout_kernel), ::LoopedArrayOp, noise::Abst
     _∇alpha_dropout_kernel = let _cond = _cond, proj_x = proj_x, x = x, noise = noise
         Δ -> begin
             ∂x = similar(x)
-            @turbo for i in eachindex(noise)
+            @simd ivdep for i in eachindex(noise)
                 @inbounds ∂x[i] = _cond[i] * Δ[i] * A
             end
             return (ntuple(Returns(∂∅), 4)..., proj_x(∂x), ntuple(Returns(∂∅), 3)...)
@@ -87,7 +87,7 @@ EnzymeRules.inactive_noinl(::typeof(_alpha_dropout_noise), ::Any...) = nothing
     rand!(rng, y)
     opmode = internal_operation_mode(y)
     if opmode isa LoopedArrayOp
-        @turbo for i in eachindex(y)
+        @simd ivdep for i in eachindex(y)
             @inbounds y[i] = (y[i] > p) * invp
         end
     else

diff --git a/src/impl/normalization.jl b/src/impl/normalization.jl
index 2bf09c9a..a603cbed 100644
--- a/src/impl/normalization.jl
+++ b/src/impl/normalization.jl
@@ -18,7 +18,7 @@ function __update_statistics(opmode, rμ, rσ², μ, σ², m1, m2)
     return rμ2, rσ²2
 end
 function __update_statistics!(::LoopedArrayOp, rμ2, rσ²2, rμ, rσ², μ, σ², m1, m2, m3)
-    @turbo for I in eachindex(rμ2, rσ²2)
+    @simd ivdep for I in eachindex(rμ2, rσ²2)
         @inbounds rμ2[I] = m3 * rμ[I] + m1 * μ[I]
         @inbounds rσ²2[I] = m3 * rσ²[I] + m2 * σ²[I]
     end
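Two details in the last two files are worth noting beyond the macro swap. The dropout kernels are written branch-free: ifelse evaluates both arms and selects, so the loop body stays straight-line code that vectorizes cleanly, and each index is written exactly once, which keeps ivdep valid. A distilled sketch under those assumptions (alpha_mask! is an illustrative name, not the package API):

    function alpha_mask!(res::AbstractArray, noise::AbstractArray, x::AbstractArray,
            p::Real, α::Real, A::Real, B::Real)
        @simd ivdep for i in eachindex(res, noise, x)
            # keep x[i] where the noise survives (> p), otherwise the constant α;
            # A and B then rescale and shift, all without a branch
            @inbounds res[i] = muladd(ifelse(noise[i] > p, x[i], α), A, B)
        end
        return res
    end

The running-statistics update in normalization.jl is likewise purely elementwise, rμ2[I] = m3 * rμ[I] + m1 * μ[I] and its σ² analogue, with one write per index, so ivdep remains safe there as well.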