This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

fix: rollback loop vectorization for now
avik-pal committed Jul 23, 2024
1 parent 11333fd commit 22ab7d5
Showing 6 changed files with 24 additions and 24 deletions.
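The rollback is mechanical across all six files: drop the LoopVectorization.jl dependency and rewrite every `@turbo` loop as a plain `for` loop, annotating the innermost loop with Base Julia's `@simd` (plus `ivdep` wherever iterations are provably independent). A minimal before/after sketch of the transformation (the elementwise function here is illustrative, not from the diff):

```julia
# Before (LoopVectorization.jl): @turbo rewrites the whole loop nest into
# explicit SIMD code at macro-expansion time.
#     using LoopVectorization: @turbo
#     @turbo for i in eachindex(y, x)
#         y[i] = tanh(x[i])
#     end

# After (Base Julia): @simd merely hints vectorization to LLVM; `ivdep`
# additionally asserts there are no loop-carried dependencies.
function elementwise_tanh!(y::AbstractVector, x::AbstractVector)
    @simd ivdep for i in eachindex(y, x)
        @inbounds y[i] = tanh(x[i])
    end
    return y
end
```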
2 changes: 0 additions & 2 deletions Project.toml
@@ -12,7 +12,6 @@ FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
LuxDeviceUtils = "34f89e08-e1d5-43b4-8944-0b49ac560553"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
@@ -51,7 +50,6 @@ FastClosures = "0.3.2"
ForwardDiff = "0.10.36"
KernelAbstractions = "0.9.22"
LinearAlgebra = "1.10"
LoopVectorization = "0.12.171"
LuxCore = "0.1.13"
LuxDeviceUtils = "0.1.26"
LuxTestUtils = "0.1.18"
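Dropping a dependency touches both TOML tables, as the two hunks above show: the UUID entry under `[deps]` and the version bound under `[compat]`. From code the same removal looks roughly like this (a sketch; depending on the Julia/Pkg version the `[compat]` entry may still need deleting by hand, as this commit does):

```julia
using Pkg
Pkg.rm("LoopVectorization")  # removes the [deps] entry and updates the manifest
```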
4 changes: 1 addition & 3 deletions src/LuxLib.jl
@@ -8,13 +8,11 @@ using FastClosures: @closure
using ForwardDiff: ForwardDiff
using KernelAbstractions: KernelAbstractions, @kernel, @Const, @index
using LinearAlgebra: LinearAlgebra, BLAS, mul!
- using LoopVectorization: @turbo
using LuxCore: LuxCore
using LuxDeviceUtils: get_device_type, LuxAMDGPUDevice, LuxCUDADevice, LuxCPUDevice,
    AbstractLuxGPUDevice, AbstractLuxDevice
using Markdown: @doc_str
- using NNlib: NNlib, ConvDims, conv, conv!, relu, gelu, sigmoid_fast, swish, σ, ∇conv_data,
-     ∇conv_filter
+ using NNlib: NNlib, ConvDims, conv, conv!, relu, gelu, σ, ∇conv_data, ∇conv_filter
using Random: Random, AbstractRNG, rand!
using Reexport: @reexport
using Statistics: Statistics, mean, var
2 changes: 1 addition & 1 deletion src/impl/activation.jl
@@ -21,7 +21,7 @@ end

function _fast_activation!(
        ::LoopedArrayOp, y::AbstractArray, σ::F, x::AbstractArray) where {F}
-    @turbo for I in eachindex(y, x)
+    @simd ivdep for I in eachindex(y, x)
        @inbounds y[I] = σ(x[I])
    end
end
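The `::LoopedArrayOp` argument is LuxLib's internal operation-mode trait: plain CPU arrays are dispatched to this scalar-loop method, while other array types take a broadcast/kernel path. A reduced sketch of that dispatch pattern (the trait hierarchy here is illustrative; only `LoopedArrayOp` appears in the diff):

```julia
abstract type AbstractArrayOpMode end
struct LoopedArrayOp <: AbstractArrayOpMode end      # plain CPU arrays: scalar loops
struct BroadcastArrayOp <: AbstractArrayOpMode end   # everything else (illustrative)

# CPU path: the @simd ivdep loop from the hunk above.
function fast_activation!(::LoopedArrayOp, y, σ::F, x) where {F}
    @simd ivdep for I in eachindex(y, x)
        @inbounds y[I] = σ(x[I])
    end
    return y
end

# Generic fallback: defer to broadcasting, which GPU array types overload.
fast_activation!(::AbstractArrayOpMode, y, σ::F, x) where {F} = (y .= σ.(x))
```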
30 changes: 17 additions & 13 deletions src/impl/affine_normalize.jl
@@ -58,11 +58,13 @@ end

function __affine_normalize_gn_impl!(::LoopedArrayOp, y::AbstractArray{<:Number, 4}, f::F,
        x::AbstractArray{<:Number, 4}, μ, σ², ::Nothing, ::Nothing, ϵ::Real) where {F}
-    @turbo for L in axes(y, 4), K in axes(y, 3)
+    for L in axes(y, 4), K in axes(y, 3)
        @inbounds _sc = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
        @inbounds _bc = -μ[1, 1, K, L] * _sc
-        for J in axes(y, 2), I in axes(y, 1)
-            @inbounds y[I, J, K, L] = muladd(x[I, J, K, L], _sc, _bc)
+        for J in axes(y, 2)
+            @simd ivdep for I in axes(y, 1)
+                @inbounds y[I, J, K, L] = muladd(x[I, J, K, L], _sc, _bc)
+            end
        end
    end
    _fast_activation!(f, y) # NOTE: don't fuse into the above loop
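The `_sc`/`_bc` pair is the usual normalization `(x − μ) / √(σ² + ϵ)` refactored into one fused multiply-add per element: with `_sc = 1/√(σ² + ϵ)` and `_bc = −μ · _sc`, the divide and subtract are hoisted out of the inner loop and computed once per `(K, L)` slice. A quick scalar check of the identity:

```julia
μ, σ², ϵ = 0.5, 2.0, 1e-5
x = 1.25
_sc = inv(sqrt(σ² + ϵ))
_bc = -μ * _sc
muladd(x, _sc, _bc) ≈ (x - μ) / sqrt(σ² + ϵ)  # true
```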
@@ -71,12 +73,12 @@
function __affine_normalize_gn_impl!(::LoopedArrayOp, y::AbstractArray{<:Number, 4}, f::F,
        x::AbstractArray{<:Number, 4}, μ, σ², scale::AbstractArray{<:Number, 4},
        bias::AbstractArray{<:Number, 4}, ϵ::Real) where {F}
-    @turbo for L in axes(y, 4), K in axes(y, 3)
+    for L in axes(y, 4), K in axes(y, 3)
        @inbounds idenom = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
        for J in axes(y, 2)
            @inbounds _sc = scale[1, J, K, 1] * idenom
            @inbounds _bc = muladd(-μ[1, 1, K, L], _sc, bias[1, J, K, 1])
-            for I in axes(y, 1)
+            @simd ivdep for I in axes(y, 1)
                @inbounds y[I, J, K, L] = muladd(x[I, J, K, L], _sc, _bc)
            end
        end
@@ -180,15 +182,17 @@ function ∇affine_normalize_gn_impl(::LoopedArrayOp, ∂y, x, μ, σ², ::Nothing, ::Nothing, ϵ)
    ∂x, ∂μ, ∂σ² = similar(x), zero.(μ), zero.(σ²)
    half = eltype(∂σ²)(0.5)

-    @turbo for L in axes(∂y, 4), K in axes(∂y, 3)
+    for L in axes(∂y, 4), K in axes(∂y, 3)
        @inbounds idenom = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
        idenom² = idenom^2
-        for J in axes(∂y, 2), I in axes(∂y, 1)
-            @inbounds xμ = x[I, J, K, L] - μ[1, 1, K, L]
+        for J in axes(∂y, 2)
+            @simd for I in axes(∂y, 1)
+                @inbounds xμ = x[I, J, K, L] - μ[1, 1, K, L]

-            @inbounds ∂x[I, J, K, L] = ∂y[I, J, K, L] * idenom
-            @inbounds ∂μ[1, 1, K, L] -= ∂x[I, J, K, L]
-            @inbounds ∂σ²[1, 1, K, L] -= ∂x[I, J, K, L] * xμ * half * idenom²
+                @inbounds ∂x[I, J, K, L] = ∂y[I, J, K, L] * idenom
+                @inbounds ∂μ[1, 1, K, L] -= ∂x[I, J, K, L]
+                @inbounds ∂σ²[1, 1, K, L] -= ∂x[I, J, K, L] * xμ * half * idenom²
+            end
        end
    end

@@ -199,12 +203,12 @@ function ∇affine_normalize_gn_impl(::LoopedArrayOp, ∂y, x, μ, σ², scale, bias, ϵ)
    ∂x, ∂μ, ∂σ², ∂sc, ∂b = similar(x), zero.(μ), zero.(σ²), zero.(scale), zero.(bias)
    half = eltype(∂σ²)(0.5)

-    @turbo for L in axes(∂y, 4), K in axes(∂y, 3)
+    for L in axes(∂y, 4), K in axes(∂y, 3)
        @inbounds idenom = @fastmath inv(sqrt(σ²[1, 1, K, L] + ϵ))
        idenom² = idenom^2
        for J in axes(∂y, 2)
            @inbounds _sc = scale[1, J, K, 1] * idenom
-            for I in axes(∂y, 1)
+            @simd for I in axes(∂y, 1)
                @inbounds xμ = x[I, J, K, L] - μ[1, 1, K, L]

                @inbounds ∂x[I, J, K, L] = ∂y[I, J, K, L] * _sc
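Note the asymmetry with the forward kernels: these backward loops use `@simd` without `ivdep`. Every iteration over `I` accumulates into the same `∂μ[1, 1, K, L]` and `∂σ²[1, 1, K, L]` entries, a loop-carried dependency that `ivdep` would wrongly declare absent. A reduced model of that reduction pattern (names illustrative):

```julia
# Elementwise writes to ∂x are independent, but ∂μ is a running reduction,
# so only plain @simd (which still permits reassociation) is safe here.
function grad_scale!(∂x::Vector{T}, ∂y::Vector{T}, idenom::T) where {T}
    ∂μ = zero(T)
    @simd for i in eachindex(∂y, ∂x)
        @inbounds ∂x[i] = ∂y[i] * idenom
        @inbounds ∂μ -= ∂x[i]
    end
    return ∂μ
end
```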
8 changes: 4 additions & 4 deletions src/impl/dropout.jl
@@ -14,7 +14,7 @@ end
        ::LoopedArrayOp, noise::AbstractArray, p::Real,
        x::AbstractArray, α::Real, A::Real, B::Real)
    res = similar(x, promote_type(typeof(p), typeof(α)))
-    @turbo for i in eachindex(noise)
+    @simd ivdep for i in eachindex(noise)
        @inbounds res[i] = muladd(ifelse(noise[i] > p, x[i], α), A, B)
    end
    return res
@@ -32,7 +32,7 @@ function CRC.rrule(::typeof(_alpha_dropout_kernel), ::LoopedArrayOp, noise::AbstractArray,
        p::Real, x::AbstractArray, α::Real, A::Real, B::Real)
    _cond = similar(noise, Bool)
    y = similar(x, promote_type(typeof(p), typeof(α), typeof(A), typeof(B), eltype(x)))
-    @turbo for i in eachindex(noise)
+    @simd ivdep for i in eachindex(noise)
        @inbounds _cond[i] = noise[i] > p
        @inbounds y[i] = muladd(ifelse(_cond[i], x[i], α), A, B)
    end
@@ -41,7 +41,7 @@ function CRC.rrule(::typeof(_alpha_dropout_kernel), ::LoopedArrayOp, noise::AbstractArray,
    _∇alpha_dropout_kernel = let _cond = _cond, proj_x = proj_x, x = x, noise = noise
        Δ -> begin
            ∂x = similar(x)
-            @turbo for i in eachindex(noise)
+            @simd ivdep for i in eachindex(noise)
                @inbounds ∂x[i] = _cond[i] * Δ[i] * A
            end
            return (ntuple(Returns(∂∅), 4)..., proj_x(∂x), ntuple(Returns(∂∅), 3)...)
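The `rrule` above follows the standard ChainRulesCore shape: the forward pass materializes the Boolean keep-mask `_cond`, and the pullback closes over it so the gradient is simply `Δ .* _cond .* A`, with `∂∅` (`NoTangent()`) for the non-differentiable arguments. A self-contained sketch of the same structure for a simpler masked-scale function (names are illustrative, not LuxLib API):

```julia
using ChainRulesCore: ChainRulesCore, NoTangent

masked_scale(x, cond, A) = ifelse.(cond, x, zero(eltype(x))) .* A

function ChainRulesCore.rrule(::typeof(masked_scale), x, cond, A)
    y = masked_scale(x, cond, A)
    # Close over the mask so the pullback never replays any randomness;
    # cond and A are treated as non-differentiable, mirroring the ∂∅ entries above.
    ∇masked_scale = let cond = cond, A = A
        Δ -> (NoTangent(), Δ .* cond .* A, NoTangent(), NoTangent())
    end
    return y, ∇masked_scale
end
```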
@@ -87,7 +87,7 @@ EnzymeRules.inactive_noinl(::typeof(_alpha_dropout_noise), ::Any...) = nothing
    rand!(rng, y)
    opmode = internal_operation_mode(y)
    if opmode isa LoopedArrayOp
-        @turbo for i in eachindex(y)
+        @simd ivdep for i in eachindex(y)
            @inbounds y[i] = (y[i] > p) * invp
        end
    else
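The kernel body is the alpha-dropout rule from the SELU paper: keep `x[i]` with probability `1 − p`, replace dropped entries with the saturation constant `α`, then apply an affine `A·z + B` chosen so the output keeps zero mean and unit variance. A self-contained sketch with the constants computed inline (in the diff they arrive precomputed as `α`, `A`, `B`; the formulas follow the alpha-dropout paper):

```julia
using Random

function alpha_dropout(rng::Random.AbstractRNG, x::AbstractArray{T}, p::Real) where {T}
    α = T(-1.7580993408473766)                 # SELU negative-saturation value
    A = T(inv(sqrt((1 - p) * (1 + p * α^2))))  # restores unit variance
    B = -A * α * T(p)                          # restores zero mean
    noise = rand(rng, T, size(x))
    res = similar(x)
    @simd ivdep for i in eachindex(noise, res)
        @inbounds res[i] = muladd(ifelse(noise[i] > p, x[i], α), A, B)
    end
    return res
end
```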
2 changes: 1 addition & 1 deletion src/impl/normalization.jl
@@ -18,7 +18,7 @@ function __update_statistics(opmode, rμ, rσ², μ, σ², m1, m2)
    return rμ2, rσ²2
end
function __update_statistics!(::LoopedArrayOp, rμ2, rσ²2, rμ, rσ², μ, σ², m1, m2, m3)
-    @turbo for I in eachindex(rμ2, rσ²2)
+    @simd ivdep for I in eachindex(rμ2, rσ²2)
        @inbounds rμ2[I] = m3 * rμ[I] + m1 * μ[I]
        @inbounds rσ²2[I] = m3 * rσ²[I] + m2 * σ²[I]
    end
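This kernel is the running-statistics exponential moving average used by batchnorm-style layers: `rμ2 = m3·rμ + m1·μ`. Presumably `m1` is the momentum, `m3 = 1 − m1`, and `m2` folds Bessel's correction into the variance update; the call site is not part of this diff, so treat those as assumptions. A self-contained sketch under that reading:

```julia
# Hypothetical wrapper (not from the diff). With momentum m1, the running
# mean/variance decay with m3 = 1 - m1; m2 = m1 * n / (n - 1) applies
# Bessel's correction to the batch variance — an assumption about the caller.
function update_running_stats!(rμ2, rσ²2, rμ, rσ², μ, σ²; momentum = 0.1f0, n = 256)
    m1 = momentum
    m2 = momentum * n / (n - 1)
    m3 = 1 - momentum
    @simd ivdep for I in eachindex(rμ2, rσ²2)
        @inbounds rμ2[I] = m3 * rμ[I] + m1 * μ[I]
        @inbounds rσ²2[I] = m3 * rσ²[I] + m2 * σ²[I]
    end
    return rμ2, rσ²2
end
```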
