Skip to content

Commit

Permalink
test: try re-enabling enzyme testing on 0.13.16 (#1042)
Browse files Browse the repository at this point in the history
* test: try re-enabling enzyme testing on 0.13.14

* fix: cache invalidation tests

* fix: more test fixes and standardize grad tests

* fix: avoid LV or Octavian with Enzyme

* fix: enzyme support for pooling

* fix: more enzyme support

* ci: temporarily disable other tests (drop me)

* test: cleanup conv tests

* ci: temporarily disable other tests (drop me)

* test: dense tests

* test: try fixing more tests

* test: workaround Enzyme warning

* test: enzyme only on linux

* fix: more BN test fixes

* test: newest release fixes more issues

* fix: print error in CI

* fix: more test fixes

* chore: apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* test: mark remaining tests as broken

* fix: bypass enzyme bmm failure

* chore: apply suggestions from code review
  • Loading branch information
avik-pal authored Nov 21, 2024
1 parent f31dc3a commit 132619c
Show file tree
Hide file tree
Showing 47 changed files with 495 additions and 837 deletions.
8 changes: 4 additions & 4 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Lux"
uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.3"
version = "1.3.4"

[deps]
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
Expand Down Expand Up @@ -69,15 +69,15 @@ LuxZygoteExt = "Zygote"
ADTypes = "1.10"
Adapt = "4.1"
ArgCheck = "2.3"
ArrayInterface = "7.10"
ArrayInterface = "7.17.1"
CUDA = "5.3.2"
ChainRulesCore = "1.24"
Compat = "4.16"
ComponentArrays = "0.15.18"
ConcreteStructs = "0.2.3"
DispatchDoctor = "0.4.12"
Enzyme = "0.13.13"
EnzymeCore = "0.8.5"
Enzyme = "0.13.16"
EnzymeCore = "0.8.6"
FastClosures = "0.3.2"
Flux = "0.14.25"
ForwardDiff = "0.10.36"
Expand Down
2 changes: 1 addition & 1 deletion docs/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ ChainRulesCore = "1.24"
ComponentArrays = "0.15.18"
Documenter = "1.4"
DocumenterVitepress = "0.1.3"
Enzyme = "0.13.13"
Enzyme = "0.13.16"
FiniteDiff = "2.23.1"
ForwardDiff = "0.10.36"
Functors = "0.5"
Expand Down
2 changes: 1 addition & 1 deletion ext/LuxEnzymeExt/training.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
function Lux.Training.compute_gradients_impl(
ad::AutoEnzyme, obj_fn::F, data, ts::TrainState) where {F}
dps = Lux.Training.dparameters(ts.cache)
dps = fmap(Utils.zero, ts.parameters; exclude=isleaf)

obj_fn_wrap, st_wrap, stats_wrap = Lux.Training.wrap_objective_function(
obj_fn, ts.model, ts.parameters, ts.states, data, True())
Expand Down
2 changes: 1 addition & 1 deletion lib/LuxCore/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ ArrayInterface = "7.9"
ChainRulesCore = "1.24"
Compat = "4.16"
DispatchDoctor = "0.4.10"
EnzymeCore = "0.8.5"
EnzymeCore = "0.8.6"
Functors = "0.5"
MLDataDevices = "1.6"
Random = "1.10"
Expand Down
2 changes: 1 addition & 1 deletion lib/LuxCore/test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
Aqua = "0.8.7"
EnzymeCore = "0.8.5"
EnzymeCore = "0.8.6"
ExplicitImports = "1.9.0"
Functors = "0.5"
MLDataDevices = "1.6"
Expand Down
6 changes: 3 additions & 3 deletions lib/LuxLib/Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.8"
version = "1.3.9"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
Expand Down Expand Up @@ -65,8 +65,8 @@ ChainRulesCore = "1.24"
Compat = "4.16"
CpuId = "0.3"
DispatchDoctor = "0.4.12"
Enzyme = "0.13.13"
EnzymeCore = "0.8.5"
Enzyme = "0.13.16"
EnzymeCore = "0.8.6"
FastClosures = "0.3.2"
ForwardDiff = "0.10.36"
Hwloc = "3.2"
Expand Down
17 changes: 15 additions & 2 deletions lib/LuxLib/ext/LuxLibTrackerExt.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
module LuxLibTrackerExt

using FastClosures: @closure
using LuxLib: LuxLib, Utils, Traits
using LuxLib: LuxLib, Utils, Impl, Traits, GenericBroadcastOp
using NNlib: NNlib
using Static: True, StaticBool
using Tracker: Tracker, TrackedArray, TrackedReal, TrackedVector
using Tracker: Tracker, TrackedArray, TrackedReal, TrackedVector, TrackedMatrix

# Generic case: hand the value straight to `Tracker.data`.
function tracker_data(x)
    return Tracker.data(x)
end

# `NNlib.BatchedAdjoint` case: apply `tracker_data` to the parent array first, then
# re-wrap the result with `NNlib.batched_adjoint`.
function tracker_data(x::NNlib.BatchedAdjoint)
    parent_data = tracker_data(parent(x))
    return NNlib.batched_adjoint(parent_data)
end
Expand Down Expand Up @@ -52,6 +52,19 @@ for op in (:batched_mul, :batched_matmul)
end
end

# Generate `Impl.matmuladd` methods for the `GenericBroadcastOp` mode. Every
# (matrix, matrix, vector) type combination below is visited; a method is only emitted
# for the combinations that `Utils.is_tracked` accepts, all others are skipped.
for AType in (:TrackedMatrix, :AbstractMatrix)
    for xType in (:TrackedMatrix, :AbstractMatrix)
        for bType in (:TrackedVector, :AbstractVector)
            if Utils.is_tracked(AType, xType, bType)
                @eval function Impl.matmuladd(
                        ::GenericBroadcastOp, A::$(AType), x::$(xType), b::$(bType))
                    return A * x .+ b
                end
            end
        end
    end
end

# NNlib: gather
Tracker.@grad_from_chainrules NNlib.gather!(
dst::AbstractArray, src::TrackedArray, idx::AbstractArray)
Expand Down
16 changes: 9 additions & 7 deletions lib/LuxLib/src/impl/batched_mul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,12 @@ function batched_matmul_cpu!(z::AbstractArray{zT, 3}, x::AbstractArray{xT, 3},
batched_matmul_loopvec_impl!(z, x, y)
return
end
# Avoid an Enzyme segfault https://github.com/EnzymeAD/Enzyme.jl/issues/1983
fallback_batched_matmul!(z, LoopedArrayOp(), x, y)
# NNlib.batched_mul!(z, x, y) # XXX: restore once the enzyme segfault is fixed
if Utils.within_enzyme_autodiff()
# XXX: https://github.com/LuxDL/Lux.jl/issues/1024
fallback_batched_matmul!(z, LoopedArrayOp(), x, y)
else
NNlib.batched_mul!(z, x, y)
end
return
end

Expand All @@ -80,10 +83,9 @@ end
function fallback_batched_matmul!(
z::AbstractArray{zT, 3}, opmode, x::AbstractArray{xT, 3},
y::AbstractArray{yT, 3}) where {zT, xT, yT}
# XXX: bring back once the enzyme segfault is fixed
# @warn "Using fallback Batched Matrix Multiply routine for $(dev) with A: size = \
# $(size(x)) eltype = $(xT) and B: size = $(size(y)) eltype = $(yT). This may be \
# slow." maxlog=1
@warn "Using fallback Batched Matrix Multiply routine for $(opmode) with A: size = \
$(size(x)) eltype = $(xT) and B: size = $(size(y)) eltype = $(yT). This may be \
slow." maxlog=1

if (size(x, 3) != size(y, 3) && size(x, 3) != 1 && size(y, 3) != 1) ||
(size(x, 2) != size(y, 1))
Expand Down
40 changes: 31 additions & 9 deletions lib/LuxLib/src/impl/batchnorm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,29 @@ function batchnorm_affine_normalize_internal!(
end

function compute_batchnorm_scale_bias!(γ′, β′, γ, β, μ, σ², ϵ)
if γ === nothing && β === nothing
@simd ivdep for J in eachindex(γ′, β′, μ, σ²)
@fastmath @inbounds γ′[J] = inv(sqrt(σ²[J] + ϵ))
@fastmath @inbounds β′[J] = -μ[J] * γ′[J]
if Utils.within_enzyme_autodiff()
if γ === nothing && β === nothing
for J in eachindex(γ′, β′, μ, σ²)
@inbounds γ′[J] = inv(sqrt(σ²[J] + ϵ))
@inbounds β′[J] = -μ[J] * γ′[J]
end
else
for J in eachindex(γ′, β′, γ, β, μ, σ²)
@inbounds γ′[J] = γ[J] / sqrt(σ²[J] + ϵ)
@inbounds β′[J] = β[J] - μ[J] * γ′[J]
end
end
else
@simd ivdep for J in eachindex(γ′, β′, γ, β, μ, σ²)
@fastmath @inbounds γ′[J] = γ[J] / sqrt(σ²[J] + ϵ)
@fastmath @inbounds β′[J] = β[J] - μ[J] * γ′[J]
if γ === nothing && β === nothing
@simd ivdep for J in eachindex(γ′, β′, μ, σ²)
@fastmath @inbounds γ′[J] = inv(sqrt(σ²[J] + ϵ))
@fastmath @inbounds β′[J] = -μ[J] * γ′[J]
end
else
@simd ivdep for J in eachindex(γ′, β′, γ, β, μ, σ²)
@fastmath @inbounds γ′[J] = γ[J] / sqrt(σ²[J] + ϵ)
@fastmath @inbounds β′[J] = β[J] - μ[J] * γ′[J]
end
end
end
end
Expand All @@ -115,7 +129,11 @@ function apply_batchnorm_scale_bias_act_cpu!(
if size(y, 1) == 1
apply_batchnorm_scale_bias_act_2d_serial_cpu!(y, γ′, β′, x, σ)
else
apply_batchnorm_scale_bias_act_3d_threaded_cpu!(y, γ′, β′, x, σ)
if Utils.within_enzyme_autodiff()
apply_batchnorm_scale_bias_act_3d_serial_cpu!(y, γ′, β′, x, σ)
else
apply_batchnorm_scale_bias_act_3d_threaded_cpu!(y, γ′, β′, x, σ)
end
end
end

Expand Down Expand Up @@ -160,7 +178,11 @@ function apply_batchnorm_scale_bias_cpu!(y::AbstractArray{yT, 3}, γ′::Abstrac
if size(y, 1) == 1
apply_batchnorm_scale_bias_2d_serial_cpu!(y, γ′, β′, x)
else
apply_batchnorm_scale_bias_3d_threaded_cpu!(y, γ′, β′, x)
if Utils.within_enzyme_autodiff()
apply_batchnorm_scale_bias_3d_serial_cpu!(y, γ′, β′, x)
else
apply_batchnorm_scale_bias_3d_threaded_cpu!(y, γ′, β′, x)
end
end
end

Expand Down
1 change: 1 addition & 0 deletions lib/LuxLib/src/impl/normalization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ reshape_norm_dims(y, x) = reshape(x, get_norm_reshape_dims(size(y), length(x)))
end

# `get_norm_reshape_dims` is only fed `size(y)` and `length(x)` at its visible call site
# (`reshape_norm_dims`), so exclude it from differentiation on both AD paths:
# non-differentiable for ChainRules and inactive for Enzyme.
CRC.@non_differentiable get_norm_reshape_dims(::Any...)
EnzymeRules.inactive(::typeof(get_norm_reshape_dims), ::Any...) = true

# Entry Points
## InstanceNorm
Expand Down
4 changes: 3 additions & 1 deletion lib/LuxLib/src/traits.jl
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,12 @@ end
module System

using ChainRulesCore: ChainRulesCore
using EnzymeCore: EnzymeCore
using Hwloc: Hwloc
using Static: static, False, True

using ..LuxLib: DISABLE_LOOP_VECTORIZATION
using ..Utils: is_extension_loaded, safe_minimum
using ..Utils: is_extension_loaded, safe_minimum, within_enzyme_autodiff

const CRC = ChainRulesCore

Expand Down Expand Up @@ -135,6 +136,7 @@ CRC.@non_differentiable explicit_blas_loaded()
use_octavian() = False()
else
# Decide whether Octavian may be used. Returns `False()` whenever
# `within_enzyme_autodiff()` is true; otherwise combines the Octavian extension check,
# `is_x86_64()`, and the Intel / AMD Ryzen hardware flags.
function use_octavian()
    if within_enzyme_autodiff()
        return False()
    end
    octavian_loaded = is_extension_loaded(Val(:Octavian))
    on_x86_64 = is_x86_64()
    supported_cpu = INTEL_HARDWARE | AMD_RYZEN_HARDWARE
    return octavian_loaded & on_x86_64 & supported_cpu
end
Expand Down
7 changes: 7 additions & 0 deletions lib/LuxLib/src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,11 @@ within_autodiff(::AbstractArray{<:ForwardDiff.Dual}) = True()

CRC.rrule(::typeof(within_autodiff), x) = True(), _ -> (∂∅, ∂∅)

# Report whether the caller is currently running inside an Enzyme autodiff call.
# When the Enzyme extension is not loaded the answer is always `false`; otherwise it is
# whatever `EnzymeCore.within_autodiff()` returns.
function within_enzyme_autodiff()
    enzyme_loaded = unsafe_known(is_extension_loaded(Val(:Enzyme)))
    return enzyme_loaded ? EnzymeCore.within_autodiff() : false
end

static_training_mode(::Nothing, args...) = within_autodiff_vararg(args...)

function static_training_mode(
Expand Down Expand Up @@ -329,6 +334,8 @@ CRC.@non_differentiable static_training_mode_check(::Any...)
@inline can_loopvec_args(args...) = false
else
# Gate for the loop-vectorized code paths. Inside an Enzyme autodiff call the answer is
# always `false`; otherwise the decision is delegated to `can_loopvec_args_check` together
# with the LoopVectorization extension flag.
@inline function can_loopvec_args(args...)
    if within_enzyme_autodiff()
        return false
    end
    loopvec_loaded = is_extension_loaded(Val(:LoopVectorization))
    return can_loopvec_args_check(loopvec_loaded, args...)
end
end
Expand Down
4 changes: 2 additions & 2 deletions lib/LuxLib/test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ BLISBLAS = "0.1"
BenchmarkTools = "1.5"
ChainRulesCore = "1.24"
ComponentArrays = "0.15.18"
Enzyme = "0.13.13"
EnzymeCore = "0.8.5"
Enzyme = "0.13.16"
EnzymeCore = "0.8.6"
ExplicitImports = "1.9.0"
ForwardDiff = "0.10.36"
Hwloc = "3.2"
Expand Down
10 changes: 6 additions & 4 deletions lib/LuxLib/test/common_ops/activation_tests.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
@testitem "Activation Functions" tags=[:misc] setup=[SharedTestSetup] begin
using Enzyme

rng = StableRNG(1234)

apply_act(f::F, x) where {F} = sum(abs2, f.(x))
Expand All @@ -8,7 +10,7 @@
@testset "$mode" for (mode, aType, ongpu, fp64) in MODES
@testset "$f: $T" for f in [identity, relu, sigmoid, sigmoid_fast, softplus,
logsigmoid, gelu, swish, lisht, tanh, tanh_fast],
T in [Float16, Float32, Float64]
T in [Float32, Float64]

!fp64 && T == Float64 && continue

Expand Down Expand Up @@ -41,9 +43,9 @@
end
@test @inferred(Zygote.gradient(apply_act_fast2, f, x)) isa Any

@test_gradients(Base.Fix1(apply_act, f), x; atol, rtol)
@test_gradients(Base.Fix1(apply_act_fast, f), x; atol, rtol)
@test_gradients(Base.Fix1(apply_act_fast2, f), x; atol, rtol)
@test_gradients(apply_act, f, x; atol, rtol)
@test_gradients(apply_act_fast, f, x; atol, rtol, skip_backends=[AutoEnzyme()])
@test_gradients(apply_act_fast2, f, x; atol, rtol)

∂x1 = Zygote.gradient(apply_act, f, x)[2]
∂x2 = Zygote.gradient(apply_act_fast, f, x)[2]
Expand Down
2 changes: 1 addition & 1 deletion lib/LuxLib/test/common_ops/bias_act_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
@testset "$act, $T, $sz" for act in [
identity, relu, sigmoid, sigmoid_fast, softplus,
logsigmoid, gelu, swish, lisht, tanh, tanh_fast],
T in [Float16, Float32, Float64],
T in [Float32, Float64],
sz in [(2, 2, 3, 4), (4, 5)]

!fp64 && T == Float64 && continue
Expand Down
30 changes: 14 additions & 16 deletions lib/LuxLib/test/common_ops/conv_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ end

calc_padding(pad, ::NTuple{N}, dilation, stride) where {N} = expand(Val(2 * N), pad)

# Scalar objective used by the gradient tests: forwards all arguments to
# `fused_conv_bias_activation` and reduces the result with `sum(abs2, ...)`.
function sumabs2conv(args...)
    conv_out = fused_conv_bias_activation(args...)
    return sum(abs2, conv_out)
end

function run_conv_testing(gen_f::Function, activation, kernel, stride, padding,
hasbias, groups, Tw, Tx, aType, mode, ongpu)
weight = convfilter(gen_f, Tw, kernel, 4 => 8; groups) |> aType
Expand All @@ -28,9 +30,8 @@ function run_conv_testing(gen_f::Function, activation, kernel, stride, padding,

generic_testing = !(mode == "amdgpu" && (Tx == Float64 || Tw == Float64))

fp16 = Tx == Float16 || Tw == Float16
atol = fp16 ? 1.0f-1 : 1.0f-3
rtol = fp16 ? 1.0f-1 : 1.0f-3
atol = 1.0f-3
rtol = 1.0f-3

if generic_testing
y_generic = LuxLib.Impl.conv(x, weight, cdims)
Expand All @@ -45,36 +46,33 @@ function run_conv_testing(gen_f::Function, activation, kernel, stride, padding,
@test @inferred(fused_conv_bias_activation(activation, weight, x, bias, cdims)) isa Any
@jet fused_conv_bias_activation(activation, weight, x, bias, cdims)

__f = (σ, w, x, b, cdims) -> sum(abs2, fused_conv_bias_activation(σ, w, x, b, cdims))

if mode != "amdgpu" && activation !== anonact && !fp16
@test @inferred(Zygote.gradient(__f, activation, weight, x, bias, cdims)) isa Any
if mode != "amdgpu" && activation !== anonact
@test @inferred(Zygote.gradient(
sumabs2conv, activation, weight, x, bias, cdims
)) isa Any
else
try
@inferred(Zygote.gradient(__f, activation, weight, x, bias, cdims))
@inferred(Zygote.gradient(sumabs2conv, activation, weight, x, bias, cdims))
@test true
catch e
e isa ErrorException || rethrow()
@test_broken false
end
end

__f_grad = let activation = activation, cdims = cdims
(w, x, b) -> __f(activation, w, x, b, cdims)
end

skip_backends = Any[AutoEnzyme()]
skip_backends = []
mp = Tx != Tw
mp && push!(skip_backends, AutoReverseDiff())
((mp && ongpu) || (mode == "amdgpu" && (Tx == Float64 || Tw == Float64))) &&
push!(skip_backends, AutoTracker())
@test_gradients(__f_grad, weight, x, bias; atol, rtol, skip_backends, soft_fail=fp16)

@test_gradients(sumabs2conv, activation, weight, x, bias, cdims; atol, rtol,
skip_backends)
end

anonact = x -> gelu(x)

const ELTYPES = [(Float16, Float16), (Float32, Float16), (Float32, Float32),
(Float32, Float64), (Float64, Float64)]
const ELTYPES = [(Float32, Float32), (Float32, Float64), (Float64, Float64)]
const ACTIVATIONS = [
identity, tanh, tanh_fast, sigmoid, sigmoid_fast, relu, gelu, swish, anonact]

Expand Down
Loading

7 comments on commit 132619c

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register subdir=lib/LuxTestUtils

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register subdir=lib/LuxLib

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/119929

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a LuxTestUtils-v1.7.0 -m "<description of version>" 132619c86d1579fbca4d4d253331d103cd528101
git push origin LuxTestUtils-v1.7.0

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/119930

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a LuxLib-v1.3.9 -m "<description of version>" 132619c86d1579fbca4d4d253331d103cd528101
git push origin LuxLib-v1.3.9

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 132619c Previous: cb0900f Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3875 ns 3875 ns 1
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4208 ns 4375 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5250 ns 5083 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4333 ns 4208 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 61892.5 ns 60144 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10542 ns 10625 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10209 ns 10666 ns 0.96
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10459 ns 11375 ns 0.92
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10417 ns 10334 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 433097 ns 421452 ns 1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1084 ns 1250 ns 0.87
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1291 ns 1292 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1292 ns 1250 ns 1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1209 ns 1167 ns 1.04
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 18531 ns 18149 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4167 ns 4167 ns 1
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 3917 ns 4042 ns 0.97
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4250 ns 4292 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4083 ns 3625 ns 1.13
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 111975 ns 109548 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57583 ns 56166 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46292 ns 46709 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 38042 ns 46334 ns 0.82
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83125 ns 82291 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37370 ns 37127 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2031625 ns 2031334 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2085958 ns 2096166.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2088333.5 ns 2086458 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2005041 ns 1997167 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 198108 ns 197158.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 143750 ns 143042 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 146063 ns 145583.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 145209 ns 146709 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144583.5 ns 149500 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166112.5 ns 166231 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1118042 ns 1138708.5 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1114250 ns 1128583 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1153000 ns 1062083.5 ns 1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1068770.5 ns 1115041.5 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 533468 ns 530934 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3584 ns 3125 ns 1.15
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3750 ns 3458 ns 1.08
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4417 ns 4292 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3958 ns 3375 ns 1.17
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 72081 ns 70464 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9000 ns 9208 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8542 ns 8917 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9041 ns 9125 ns 0.99
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8916 ns 9166 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 503190.5 ns 483194.5 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 15000 ns 15333 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15250 ns 15458 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 16708 ns 17333 ns 0.96
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 15542 ns 17062.5 ns 0.91
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 55903 ns 53962 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 214187.5 ns 214583.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 213604.5 ns 212667 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 215395.5 ns 214625 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212917 ns 225250 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 278881 ns 273370 ns 1.02
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 458 ns 1.09
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 542 ns 666 ns 0.81
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 750 ns 750 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 583 ns 500 ns 1.17
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17733 ns 17502.5 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1625 ns 1542 ns 1.05
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1500 ns 1667 ns 0.90
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1625 ns 1834 ns 0.89
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1583 ns 1375 ns 1.15
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 105125.5 ns 101667.5 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7125 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5833 ns 5917 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5250 ns 5792 ns 0.91
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10084 ns 9917 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 24106 ns 23886 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220750 ns 221417 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 228084 ns 228125 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 230459 ns 228666 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213708.5 ns 220500 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 169707.5 ns 169891 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3875 ns 3958 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3875 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3875 ns 3875 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23637 ns 23537 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16708 ns 16750 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16834 ns 17042 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16875 ns 16875 ns 1
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16625 ns 16750 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 161602 ns 159725 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 578416.5 ns 570333 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 569958 ns 574000 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 579292 ns 579125 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 578291 ns 571125 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113009 ns 113492 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1417979.5 ns 1428041 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1419167 ns 1422333 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1424875 ns 1423708 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1426416 ns 1423458 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 210883 ns 208571.5 ns 1.01
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1067000 ns 1051187.5 ns 1.02
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 958417 ns 971896 ns 0.99
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1336917 ns 1346062.5 ns 0.99
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1304396 ns 1306416 ns 1.00
lenet(28, 28, 1, 64)/forward/GPU/CUDA 271759 ns 272301 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5795104.5 ns 5990916 ns 0.97
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4601125 ns 4519875 ns 1.02
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4929084 ns 4948416.5 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5750083 ns 5523125 ns 1.04
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1068932 ns 1070952 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 583 ns 0.93
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23274 ns 23553 ns 0.99
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2084 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2166 ns 2167 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2167 ns 2167 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2208 ns 2125 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 171283 ns 168963.5 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4333 ns 3875 ns 1.12
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4125 ns 4167 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5083 ns 5250 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4292 ns 3666 ns 1.17
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 66130 ns 65091 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11625 ns 11416 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11458 ns 11292 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12458 ns 12333.5 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11709 ns 11209 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 452684.5 ns 446962.5 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6375 ns 6458.5 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6959 ns 6792 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8229.5 ns 7833.5 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6916 ns 6250 ns 1.11
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 52019 ns 52555 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16875 ns 16584 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17000 ns 17791 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18166 ns 17375 ns 1.05
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17542 ns 17125 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 301500.5 ns 308634 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 625 ns 0.87
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 666 ns 0.81
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 666 ns 583 ns 1.14
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 667 ns 625 ns 1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 32512 ns 32320 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8500 ns 8541 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8750 ns 9167 ns 0.95
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9500 ns 9500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8959 ns 9479.5 ns 0.95
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 157915 ns 159616 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64542 ns 64750 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64625 ns 64625 ns 1
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64750 ns 64292 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64875 ns 64542 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111658.5 ns 111041.5 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 279708 ns 292000 ns 0.96
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 283750 ns 292084 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 293250 ns 275666 ns 1.06
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 284521 ns 275708 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 185586.5 ns 183441 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3282500 ns 3191791 ns 1.03
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 3076875 ns 3043437.5 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 2795834 ns 3020437.5 ns 0.93
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 4063541.5 ns 4089708 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 567714 ns 601857 ns 0.94
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7638583 ns 7582625 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7366000 ns 7473208.5 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7289042 ns 7437833 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8172916 ns 8187292 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1335450 ns 1317154 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 17555833 ns 18957000 ns 0.93
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 17413291.5 ns 19047250 ns 0.91
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 17640417 ns 19104542 ns 0.92
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14085667 ns 15686625 ns 0.90
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23644667 ns 23902625 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33391375 ns 34420458 ns 0.97
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 40912708 ns 37002333 ns 1.11
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 35048479 ns 34848770.5 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1855237.5 ns 1857006 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 189754584 ns 191696375.5 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 232353000 ns 164341792 ns 1.41
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 201284750 ns 152698167 ns 1.32
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 435226125 ns 439655916 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13860033 ns 13895377 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 290571042 ns 292126520.5 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 334832916 ns 340023312 ns 0.98
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 303703583 ns 298857875 ns 1.02
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 393811604 ns 335240875 ns 1.17
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 21541 ns 22250 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22375 ns 23083 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23354 ns 23959 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24500 ns 23417 ns 1.05
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 95582 ns 96101 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103250 ns 103542 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 115312.5 ns 103541 ns 1.11
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 104625 ns 104791 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 102667 ns 113250 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 503695.5 ns 512131 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5750 ns 5834 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5791 ns 6375 ns 0.91
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7666 ns 7000 ns 1.10
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6250 ns 6125 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 68642 ns 68297.5 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14875 ns 15208 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14625 ns 15750 ns 0.93
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16250 ns 16583 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14833 ns 15062.5 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 478112.5 ns 474148.5 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3019792 ns 3053958 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2069896 ns 2089500 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2279000 ns 2270042 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4750917 ns 4804875 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 583001 ns 582756 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23604770.5 ns 23872458.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18003875 ns 18056937.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18293125 ns 17766021 ns 1.03
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35919729.5 ns 35515208 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3106744 ns 3103295.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33297687 ns 33801000 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 27474958 ns 27630916.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 29070229.5 ns 27435750 ns 1.06
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41830959 ns 41597458 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 73396 ns 74917 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75125 ns 72541 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 74875 ns 76416 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72959 ns 74375 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 103514 ns 103583 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 274208 ns 221146 ns 1.24
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 205959 ns 219166 ns 0.94
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 255333 ns 208875 ns 1.22
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 296916 ns 206542 ns 1.44
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 554316 ns 560403 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11167 ns 12166 ns 0.92
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11875 ns 12208.5 ns 0.97
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13458 ns 13167 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12458 ns 12042 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 72256.5 ns 71403 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26583.5 ns 26979.5 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26833 ns 27167 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 28084 ns 27958.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26708 ns 26459 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 483481.5 ns 472464 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11520.5 ns 12437.5 ns 0.93
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13041 ns 12979 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13750 ns 14167 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12875 ns 12125 ns 1.06
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 52959.5 ns 53400 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 25500 ns 25625 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 25542 ns 26292 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26375 ns 26416 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26542 ns 26167 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 310926 ns 306626.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 179125 ns 180729 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 182625 ns 182709 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 183958 ns 183875 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 182416 ns 180833 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 58111 ns 56252.5 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 582958 ns 593541.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 583209 ns 593916 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 610042 ns 584021 ns 1.04
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 582000 ns 582917 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 286370 ns 289288.5 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5729.5 ns 6500 ns 0.88
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6334 ns 6125 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7500 ns 7792 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6083 ns 6145.5 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 71136.5 ns 70132.5 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14167 ns 14271 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14500 ns 14916 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15667 ns 15500 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14667 ns 14000 ns 1.05
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 468005 ns 460852.5 ns 1.02
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1186749.5 ns 1175354 ns 1.01
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1247334 ns 1353000 ns 0.92
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1282666.5 ns 1269979 ns 1.01
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 841729 ns 1317500 ns 0.64
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301667 ns 302455 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4101771 ns 4288500 ns 0.96
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4417458 ns 4366958 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4790916 ns 4543917 ns 1.05
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 3731833.5 ns 4469000 ns 0.84
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1043818 ns 1030148 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1875 ns 1833 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1834 ns 1875 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23460 ns 23497 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4875 ns 4834 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4834 ns 5041 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4917 ns 4875 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4958 ns 4875 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 189873 ns 185923.5 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5792 ns 5500 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6125 ns 6167 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7187.5 ns 6459 ns 1.11
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6208 ns 5583 ns 1.11
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 55970.5 ns 55454.5 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10625 ns 10667 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11083 ns 11750 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11584 ns 11458 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11500 ns 10667 ns 1.08
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 332298.5 ns 337381 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 292 ns 375 ns 0.78
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 375 ns 333 ns 1.13
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22660 ns 22737 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2708 ns 2708 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2750 ns 3000 ns 0.92
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3000 ns 3000 ns 1
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2709 ns 2750 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 159360 ns 157057 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11292 ns 11625 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11792 ns 12250 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13250 ns 12708 ns 1.04
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12229.5 ns 11417 ns 1.07
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 57130.5 ns 56422 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24708 ns 24250 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24167 ns 25208 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25854 ns 25000 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24916.5 ns 25437.5 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 300198 ns 294376.5 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4208 ns 4167 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4125 ns 4208 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4250 ns 4167 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4208 ns 4208 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24574 ns 24716 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16166 ns 16042 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16000 ns 16417 ns 0.97
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16042 ns 16250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16375 ns 16167 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 201392 ns 193381 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5750 ns 5750 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5750 ns 6083 ns 0.95
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5875 ns 5750 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5916 ns 5833 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 33153 ns 33569 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20333 ns 20479.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20792 ns 21000 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 20917 ns 21208 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21375 ns 21104.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 175780 ns 174365.5 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 417417 ns 375416.5 ns 1.11
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 378854.5 ns 374666.5 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 487270.5 ns 488312.5 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 103917 ns 524187.5 ns 0.20
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66399.5 ns 66372.5 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 877583 ns 931978.5 ns 0.94
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 949562.5 ns 880291.5 ns 1.08
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1206625 ns 1223791.5 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 469167 ns 1351833.5 ns 0.35
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 191112 ns 192149.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 85417 ns 81312.5 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81083 ns 80750 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 84625 ns 80792 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 85417 ns 80937 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193239.5 ns 192807 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1913750 ns 1932917 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1913542 ns 1916542 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1943083.5 ns 1926479 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1906896 ns 1921042 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 406558 ns 394461 ns 1.03
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 22047.5 ns 22118 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1750 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1875 ns 1834 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 1834 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1875 ns 1792 ns 1.05
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 171306.5 ns 166019.5 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6209 ns 6250 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6625 ns 7208 ns 0.92
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 8542 ns 8166 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 7125 ns 6312.5 ns 1.13
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 60422 ns 57360.5 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9000 ns 8917 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8958 ns 9167 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9584 ns 9208 ns 1.04
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9416 ns 9250 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 313100.5 ns 301535 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 119013624.5 ns 156508063 ns 0.76
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174073709 ns 173937500 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 154836458 ns 148141208 ns 1.05
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 106465208 ns 106478500 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5473107.5 ns 5474150 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 615549000 ns 673237875 ns 0.91
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 555627500 ns 556883000 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 469486625 ns 453960458.5 ns 1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 758488604 ns 759297583 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34956527 ns 38204722 ns 0.91
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 650955333 ns 701496583 ns 0.93
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 665997520.5 ns 667076166 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 596311875 ns 586800771 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 746344250 ns 744632000 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 59041 ns 56833 ns 1.04
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47750 ns 48042 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39041 ns 47125 ns 0.83
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84708.5 ns 84541 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36941 ns 37576 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1922166 ns 1935541 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1978041 ns 1985208 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1990167 ns 1979834 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1920167 ns 1893771 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 173728 ns 174934 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 282041.5 ns 267875 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 266458 ns 288042 ns 0.93
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 273853.5 ns 270229.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 270333 ns 267250 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 135453.5 ns 128767 ns 1.05
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 674666 ns 665041 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 684354 ns 668958 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 676145.5 ns 589167 ns 1.15
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 596375 ns 596209 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 752272.5 ns 703647.5 ns 1.07
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2253417 ns 2205417 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2217895.5 ns 2188541 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2190479 ns 2100166.5 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2202416.5 ns 2225499.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 133169 ns 133307.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5479500 ns 5538625 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5506916 ns 5527958 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5588312.5 ns 5503250 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5564021 ns 5491271 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 794371.5 ns 759584.5 ns 1.05
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 646958 ns 638667 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 656500 ns 640458 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 640416 ns 648875 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 657291 ns 636167 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 47817 ns 47137 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1822375 ns 1796937.5 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1719708 ns 1724292 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1665541 ns 1720542 ns 0.97
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2108083 ns 2104520.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 227850 ns 218174.5 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58458 ns 57000 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 45083 ns 46833 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 38041 ns 47083 ns 0.81
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84958 ns 84542 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28842 ns 28335 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2030375 ns 2047750 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2084312.5 ns 2077083 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1787459 ns 2092083 ns 0.85
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2014583.5 ns 1939979 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 192397.5 ns 191381.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13382625 ns 13410020.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12433458.5 ns 12472750 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12571375 ns 12570979 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15143562.5 ns 15234500 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 514602 ns 512740.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47546916 ns 47584458 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41875708 ns 41911083 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 41161020.5 ns 41152979.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58396167 ns 58152541 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3251545 ns 3249099 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 75047125 ns 74313208.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 67897459 ns 91931958.5 ns 0.74
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90940166.5 ns 91156000 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 99460667 ns 76595709 ns 1.30
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58750 ns 57334 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46875 ns 47417 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 38333 ns 47250 ns 0.81
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 80334 ns 84375 ns 0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 46475 ns 48075 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921416 ns 1930959 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1976416 ns 1977562.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1721708.5 ns 1977250 ns 0.87
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1905000 ns 1816292 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 190253.5 ns 196217.5 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 334 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 333 ns 417 ns 0.80
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 334 ns 1.12
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 417 ns 333 ns 1.25
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 31709.5 ns 32756 ns 0.97
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6125 ns 6125 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6208 ns 6583 ns 0.94
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6583 ns 6542 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6854.5 ns 6208 ns 1.10
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 176344 ns 178147.5 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 291 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 291 ns 292 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 250 ns 1.17
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 31144 ns 31948 ns 0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2625 ns 2625 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2625 ns 2875 ns 0.91
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2833 ns 2834 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2750 ns 2625 ns 1.05
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 164923.5 ns 164100 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 285479083.5 ns 323244146 ns 0.88
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 340672292 ns 340740458 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 320528833.5 ns 314512041.5 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 267627833 ns 271130916 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7061953.5 ns 7115553 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1000752000 ns 1053603541.5 ns 0.95
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 941508917 ns 941056333 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 849741542 ns 854610104 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1162624583 ns 1162236250 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 33972568.5 ns 33945165 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1314224145.5 ns 1364084083.5 ns 0.96
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1312834041.5 ns 1705661833 ns 0.77
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1621294583 ns 1621953875 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1681368042 ns 1313183229.5 ns 1.28
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1461562.5 ns 1410000 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1416958 ns 1408291.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1414750 ns 1453645.5 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1412375 ns 1407209 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 127713.5 ns 127861 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5020125 ns 5051959 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5027042 ns 5013583.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4740833 ns 5028416.5 ns 0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5044042 ns 5027271 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 510137 ns 604299 ns 0.84
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 171071812.5 ns 161226250 ns 1.06
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 126739625 ns 131446875 ns 0.96
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 146147041 ns 127042083 ns 1.15
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 168329334 ns 155626750.5 ns 1.08
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4881506 ns 4974919.5 ns 0.98
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 622612209 ns 850481958 ns 0.73
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 538980667 ns 644255791 ns 0.84
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 504257334 ns 496077667 ns 1.02
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 656863250 ns 685984875 ns 0.96
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 16684647 ns 15948822 ns 1.05
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 8964583 ns 9064833.5 ns 0.99
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8900333 ns 8770396 ns 1.01
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 7993333 ns 7878104.5 ns 1.01
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 9790312.5 ns 10163000 ns 0.96
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1594468.5 ns 1608837.5 ns 0.99
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 36115750.5 ns 37348729 ns 0.97
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 36971083.5 ns 36970124.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 34444208 ns 33623167 ns 1.02
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 37794834 ns 38875729.5 ns 0.97
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6465190.5 ns 6455570 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47292 ns 47375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47542 ns 47750 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47584 ns 47583 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47500 ns 47625 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18793 ns 18855 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50291.5 ns 50250 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50417 ns 50750 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50833 ns 50416 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50750 ns 50292 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 231220 ns 202264 ns 1.14
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6291 ns 6375 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7084 ns 7187.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7792 ns 8417 ns 0.93
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7542 ns 6708 ns 1.12
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 106604.5 ns 108599.5 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10209 ns 9604.5 ns 1.06
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9833 ns 10209 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10270.5 ns 10292 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10459 ns 10583 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 619990 ns 610519 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5792 ns 5958 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6416 ns 6375 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7958 ns 7583 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6042 ns 5542 ns 1.09
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 121725 ns 131186.5 ns 0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13375 ns 12875 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13000 ns 13208 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13584 ns 13583 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13375 ns 12875 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 528027 ns 530393 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1000 ns 1000 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 959 ns 1167 ns 0.82
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1125 ns 1042 ns 1.08
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 31705 ns 32479.5 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7792 ns 7833.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7667 ns 8042 ns 0.95
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8209 ns 8083 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8666 ns 7916 ns 1.09
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 204125.5 ns 216406.5 ns 0.94
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23000 ns 23042 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23084 ns 23542 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23584 ns 23333 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23500 ns 23375 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18461 ns 19066 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52458 ns 52291.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52291 ns 52500 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52791 ns 53166.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52458 ns 52125 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 286087.5 ns 309714.5 ns 0.92
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1397209 ns 1413917 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1395917 ns 1401104 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1400209 ns 1457583.5 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1398500 ns 1402271 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 195540.5 ns 196285 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5008458.5 ns 5045083 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5018750 ns 4724458 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4722750 ns 5023021 ns 0.94
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4703042 ns 4706104.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 626852.5 ns 644560.5 ns 0.97
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3063416 ns 3086125.5 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2063875 ns 2087104.5 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2311417 ns 2281125 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4823500 ns 4848375 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 580360 ns 580262 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24332959 ns 24765000.5 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18875458 ns 18889791.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18989334 ns 19005084 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 36748479.5 ns 36681292 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3188758 ns 3253871.5 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 34048562.5 ns 34537875 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28257854 ns 28314500 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28468541.5 ns 27967000 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41851021 ns 41702500 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 144123292 ns 144041208 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 147912291 ns 143168583 ns 1.03
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 128219729 ns 124247521 ns 1.03
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 175666645.5 ns 173506729 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22797470 ns 22768605 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 1274551333 ns 957619479 ns 1.33
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1209986250 ns 1175957479.5 ns 1.03
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 717258459 ns 739734292 ns 0.97
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 669341542 ns 672317125 ns 1.00
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118134658 ns 118020449 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 75042 ns 73979 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 73833 ns 75750 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 75813 ns 75416 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 74125 ns 72854.5 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 248024.5 ns 300521.5 ns 0.83
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 202750 ns 287875 ns 0.70
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 283250 ns 285333 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 194000 ns 204208 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 189583 ns 287375 ns 0.66
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1272660.5 ns 1342742 ns 0.95
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35542000 ns 36185500 ns 0.98
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 36428479 ns 35466000.5 ns 1.03
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32734792 ns 32336688 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40941958 ns 40972250 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5852888 ns 5837876 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 147574354 ns 151179834 ns 0.98
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 154842271 ns 151456979 ns 1.02
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 142249771 ns 136606104 ns 1.04
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 285430916 ns 287372208 ns 0.99
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34907859 ns 34877857 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 119543458.5 ns 155986916 ns 0.77
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 173916625 ns 174507459 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 155928584 ns 148111416.5 ns 1.05
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 103545938 ns 102908562.5 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5470774 ns 5463707 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 471171395.5 ns 520380250 ns 0.91
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 467366000 ns 465489750 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 456719729 ns 439138000 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 738831458 ns 742252417 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 32277660 ns 35175845 ns 0.92
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 709159062 ns 698201250 ns 1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 654555208.5 ns 654820792 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 585803354.5 ns 571273229.5 ns 1.03
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 726547959 ns 850215250 ns 0.85
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1242646 ns 1101520.5 ns 1.13
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 968625.5 ns 970208.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 674709 ns 920500 ns 0.73
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 1941770.5 ns 1945375.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 569058 ns 580245.5 ns 0.98
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2969916 ns 2907896 ns 1.02
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2603708 ns 2595708 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 1985166.5 ns 2606333 ns 0.76
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3729625 ns 3655000 ns 1.02
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1762089 ns 1734207 ns 1.02
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 5801458 ns 6744875 ns 0.86
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 5780958 ns 6498208 ns 0.89
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 5645834 ns 6503854.5 ns 0.87
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 2921042 ns 4423604.5 ns 0.66
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7208 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5958 ns 6083 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5333 ns 5958.5 ns 0.90
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10083 ns 9959 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25119 ns 25201 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215750 ns 212291 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 258458 ns 220750 ns 1.17
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221291.5 ns 220125 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207146 ns 206792 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 264756 ns 262467.5 ns 1.01
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 308377104 ns 316552750 ns 0.97
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 231656291 ns 221682708 ns 1.04
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 224042396 ns 187257688 ns 1.20
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 307881333 ns 311596375 ns 0.99
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7678620 ns 7676203 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1097604312.5 ns 1093022833.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 920148521 ns 911616145.5 ns 1.01
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 858485833.5 ns 815656375 ns 1.05
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1150798750 ns 1161401125 ns 0.99
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26497955 ns 26547253 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4958.5 ns 5292 ns 0.94
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5583 ns 5667 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6916.5 ns 6625 ns 1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5541 ns 5125 ns 1.08
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 171524 ns 167889.5 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7542 ns 7083 ns 1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6750 ns 7375 ns 0.92
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7458 ns 7459 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7875 ns 7437.5 ns 1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 670577.5 ns 650263 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 541 ns 542 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 541 ns 709 ns 0.76
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 667 ns 0.94
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 23778 ns 23809 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 8708 ns 9041.5 ns 0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 8541.5 ns 9791 ns 0.87
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9458 ns 9208.5 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9541.5 ns 9042 ns 1.06
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 233071 ns 233459 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 353250 ns 351417 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 353208 ns 352250 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 352667 ns 353063 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 352125 ns 353333 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21348 ns 21613 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 822333 ns 791250 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 774854 ns 808979 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 777042 ns 773625 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 825999.5 ns 824084 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 286748 ns 305844 ns 0.94
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 336833 ns 314958 ns 1.07
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 335917 ns 333625 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 445708 ns 448667 ns 0.99
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 10917 ns 331833 ns 0.03289907875346937
batchedmm(16, Bsize=32)/forward/GPU/CUDA 17559 ns 17811 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 713499.5 ns 682125 ns 1.05
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 730834 ns 746791.5 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1027167 ns 1029167 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 26500 ns 700937.5 ns 0.03780650913954525
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 260521.5 ns 273907.5 ns 0.95
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 371375 ns 328083 ns 1.13
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 346250 ns 348979 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 445812.5 ns 424375 ns 1.05
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 30479 ns 370666 ns 0.0822276658770969
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22136 ns 22237 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 734062.5 ns 743604 ns 0.99
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 773750.5 ns 750229 ns 1.03
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1061729 ns 1076375 ns 0.99
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 98521 ns 822541 ns 0.12
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 220018.5 ns 220485.5 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3375 ns 3334 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3542 ns 3792 ns 0.93
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3687.5 ns 3625 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3583 ns 3583 ns 1
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 17780 ns 18068 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4125 ns 4166 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4167 ns 4542 ns 0.92
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4375 ns 4250 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4500 ns 4334 ns 1.04
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 258504 ns 278097 ns 0.93
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3750 ns 3292 ns 1.14
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3500 ns 3645.5 ns 0.96
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4917 ns 4708 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4083 ns 4042 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 200777 ns 212235.5 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8417 ns 8042 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8000 ns 8417 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8625 ns 8792 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8604.5 ns 8167 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1183716 ns 1255478 ns 0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 205708 ns 204000 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 210125 ns 211375 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 210375 ns 211042 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200375 ns 200541 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34375 ns 34367 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 650916 ns 605708.5 ns 1.07
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 666959 ns 625021 ns 1.07
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 624167 ns 620792 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 632458 ns 582583 ns 1.09
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 343648 ns 361289.5 ns 0.95
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 1000479 ns 973333 ns 1.03
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1007958 ns 950209 ns 1.06
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 974396 ns 955541 ns 1.02
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 894770.5 ns 1286000.5 ns 0.70
batchedmm(128, Bsize=128)/forward/GPU/CUDA 207021.5 ns 207830 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4512146 ns 4594084 ns 0.98
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4708729.5 ns 4500750.5 ns 1.05
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4609875 ns 4304583 ns 1.07
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 5171208.5 ns 6304625 ns 0.82
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 947853.5 ns 925479 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3333 ns 3333 ns 1
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3083 ns 3583 ns 0.86
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4333 ns 4250 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3917 ns 3541 ns 1.11
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 218377.5 ns 240989.5 ns 0.91
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7375 ns 6875 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6833 ns 7542 ns 0.91
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7458 ns 7375 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7459 ns 7042 ns 1.06
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 1012916 ns 1039649.5 ns 0.97
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1641584 ns 1636792 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1193979 ns 1175749.5 ns 1.02
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1342687.5 ns 1347167 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2486625.5 ns 2463271 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 214048 ns 213096 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12366291.5 ns 12388416 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9556958 ns 9551437.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9332500 ns 9305937.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18065166.5 ns 18088000 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1946882 ns 1951605 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17346750 ns 17398084 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14347000 ns 14348854.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14486917 ns 14347271 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21148167 ns 21112104 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 134750 ns 94729.5 ns 1.42
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 88584 ns 90667 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 92042 ns 92375 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 89042 ns 114395.5 ns 0.78
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 126624 ns 125574 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2031958 ns 2039792 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2023083.5 ns 1808208.5 ns 1.12
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1756000 ns 2033666.5 ns 0.86
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2029583 ns 2022500 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1029084 ns 1052869 ns 0.98
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 1750 ns 326041.5 ns 0.005367414884301538
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 2833 ns 344833 ns 0.008215571015535056
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 2458 ns 396416 ns 0.0062005569906360995
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 2166.5 ns 314708 ns 0.006884159284161826
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16055 ns 15677 ns 1.02
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2583 ns 701042 ns 0.00368451533574308
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2500 ns 733209 ns 0.0034096690029718677
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 2750 ns 1020500 ns 0.0026947574718275357
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2750 ns 656250 ns 0.004190476190476191
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 191618 ns 196145.5 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7416 ns 7084 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5917 ns 5541 ns 1.07
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5125 ns 6084 ns 0.84
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10166 ns 10000 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 33917 ns 34060 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 226396.5 ns 221166.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 222521 ns 220916.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221584 ns 220167 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207458 ns 217124.5 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 311723.5 ns 344547 ns 0.90
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3750 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3667 ns 3709 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3750 ns 3708 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3667 ns 3667 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22860 ns 22568 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14458 ns 14167 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14291 ns 14375 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14250 ns 14458 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14667 ns 14416 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 472859.5 ns 487124.5 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 137417 ns 97500 ns 1.41
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 96458.5 ns 93417 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 95833 ns 96687.5 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 93125 ns 91875 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 125940 ns 124929 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921458.5 ns 1940875 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1918166.5 ns 1919916.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1817687.5 ns 1931229.5 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1914458 ns 1917271.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 951464 ns 955641 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 869042 ns 854084 ns 1.02
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 815167 ns 826333 ns 0.99
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1175833 ns 1211000 ns 0.97
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 967562.5 ns 955354.5 ns 1.01
lenet(28, 28, 1, 32)/forward/GPU/CUDA 276671 ns 272141 ns 1.02
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2830583 ns 2801124.5 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2508062.5 ns 2515333 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3332875 ns 3309625 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3328000 ns 3416625 ns 0.97
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1576106.5 ns 1612126.5 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 16000 ns 17062.5 ns 0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15625 ns 16708.5 ns 0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 16458 ns 18937 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16417 ns 15167 ns 1.08
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 143900.5 ns 142123.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 255875.5 ns 223437.5 ns 1.15
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 254271 ns 215958 ns 1.18
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 216250 ns 216125 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 258021 ns 255708.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 637843.5 ns 644779 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 220792 ns 222292 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 220667 ns 221750 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 221208 ns 222542 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 222208.5 ns 220917 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 270997 ns 271274.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 504458 ns 509083 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 507416.5 ns 501292 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 499833.5 ns 496750 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 498875.5 ns 550583 ns 0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1304306.5 ns 1401190 ns 0.93
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 3459 ns 304437.5 ns 0.011361938000410594
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 3854.5 ns 331687.5 ns 0.01162087808554739
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 5375 ns 376292 ns 0.014284119779320315
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 4042 ns 321812.5 ns 0.012560108758982327
batchedmm(16, Bsize=4)/forward/GPU/CUDA 16660 ns 16554 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 7166 ns 708875 ns 0.010108975489331687
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 6458 ns 736875 ns 0.00876403731976251
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 7209 ns 1020209 ns 0.0070661991807561
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 7541.5 ns 668458 ns 0.011281935439474132
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 194930.5 ns 196065 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17666 ns 17854 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17125 ns 18520.5 ns 0.92
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19729 ns 19667 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18000 ns 16209 ns 1.11
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 146357.5 ns 146750.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 244562 ns 247604 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 237417 ns 212500 ns 1.12
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214500 ns 212917 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 225208 ns 211750.5 ns 1.06
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 894981 ns 1011803 ns 0.88
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4416 ns 4125 ns 1.07
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 3917 ns 4125 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5334 ns 5187.5 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4833 ns 4084 ns 1.18
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 187684 ns 201325 ns 0.93
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10500 ns 10667 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9708 ns 10875 ns 0.89
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11167 ns 10500 ns 1.06
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11250 ns 10375 ns 1.08
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 1024651 ns 1050725 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3209 ns 3375 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3250 ns 3625 ns 0.90
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4687.5 ns 4167 ns 1.12
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3791 ns 3291 ns 1.15
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 218725.5 ns 242454 ns 0.90
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7833 ns 7542 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7291 ns 7666 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7625 ns 7750 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7917 ns 7333 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1043721.5 ns 1067571 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23437104.5 ns 24057353.5 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 35045979.5 ns 34753459 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 41490500 ns 37792125 ns 1.10
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34913479 ns 34828583.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2126334.5 ns 1854184 ns 1.15
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 184798459 ns 187222542 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 159330000 ns 160010375 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 151477459 ns 146721854.5 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 411547250 ns 412776417 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16524151 ns 16508303 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 427197208 ns 437495583 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 252723645.5 ns 253838438 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 305721250 ns 232343979.5 ns 1.32
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 481095166 ns 483540875 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 182854.5 ns 183854 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 182791.5 ns 183625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 185292 ns 185334 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 185750 ns 184167 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 173677.5 ns 220968 ns 0.79
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 629833 ns 594000 ns 1.06
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 631375 ns 632437.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 590542 ns 586084 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 630770.5 ns 628500 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1010062 ns 1061303.5 ns 0.95
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3848041.5 ns 3892042 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 4009000 ns 3642708 ns 1.10
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3525583 ns 3572042 ns 0.99
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 4614917 ns 5353250 ns 0.86
batchedmm(128, Bsize=512)/forward/GPU/CUDA 536882 ns 549368 ns 0.98
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17371917 ns 17901624.5 ns 0.97
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17740624.5 ns 17281292 ns 1.03
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16856312.5 ns 16574875 ns 1.02
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 20403334 ns 22050250 ns 0.93
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2613028 ns 2630980 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 541 ns 0.92
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 500 ns 625 ns 0.80
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 667 ns 584 ns 1.14
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 31917 ns 31762 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9334 ns 9145.5 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8708 ns 9208 ns 0.95
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9875 ns 9417 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9417 ns 9208 ns 1.02
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 260614 ns 262912.5 ns 0.99
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 503086958 ns 505346750 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 424620083.5 ns 429818666.5 ns 0.99
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 462339520.5 ns 433256333.5 ns 1.07
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 673052062 ns 677373875 ns 0.99
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12478664.5 ns 12487373 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 1872018104.5 ns 2066713500 ns 0.91
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1625413500 ns 1635890000 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1546440125 ns 1494391792 ns 1.03
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2200566458.5 ns 2208031208.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49139909 ns 49163495.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1647791.5 ns 1632500.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1202542 ns 1173583 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1365999.5 ns 1383958 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2393042 ns 2483292 ns 0.96
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 215162 ns 214736 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12703083.5 ns 12776042 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9880000 ns 9939062.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9761146 ns 9686917 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18559417 ns 18349375 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2005712 ns 2056758 ns 0.98
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17693854 ns 17758729.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14669187.5 ns 14689958 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14767500 ns 14551125 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21469542 ns 21399666 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26208 ns 26292 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26292 ns 26333 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26292 ns 26250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23799 ns 24146 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66666 ns 66791 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66750 ns 67292 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67209 ns 68417 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 67500 ns 66709 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 380551.5 ns 391053.5 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203917 ns 204333 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209750 ns 210125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 210000 ns 209458 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199958 ns 198792 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25800 ns 26289 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 648229.5 ns 642083 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 661271 ns 624354.5 ns 1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 622750 ns 621729.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 586375 ns 627000.5 ns 0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 308724.5 ns 357106 ns 0.86
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 600291 ns 645625 ns 0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 594125 ns 636292 ns 0.93
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 544666 ns 602667 ns 0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 652208 ns 672375 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131751 ns 132245.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2235000 ns 2294979 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2235625 ns 2157208 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2300854 ns 2246208 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2253125 ns 2249458 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1127758 ns 1236985 ns 0.91
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17541 ns 17937.5 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 16958 ns 18416.5 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19917 ns 20083 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17958 ns 18895.5 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 145385 ns 145580 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 261583 ns 259583 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 260812.5 ns 261791 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220937.5 ns 219084 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 230896 ns 257520.5 ns 0.90
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 982925 ns 1034996 ns 0.95
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 542 ns 667 ns 0.81
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 667 ns 583 ns 1.14
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23015 ns 23604 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9479.5 ns 9750 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9042 ns 10292 ns 0.88
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10292 ns 10250 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9625 ns 9333 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 257388 ns 260113.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5458 ns 5083.5 ns 1.07
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5417 ns 5792 ns 0.94
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6625 ns 6833 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6083 ns 5375 ns 1.13
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 233603.5 ns 229273.5 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7083 ns 6709 ns 1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7041 ns 7667 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7833 ns 7583 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7375 ns 6937.5 ns 1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 800650 ns 777061.5 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2000 ns 1917 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2125 ns 2500 ns 0.85
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2458 ns 2208 ns 1.11
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2459 ns 2250 ns 1.09
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 17988 ns 18340 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6500 ns 6542 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6291 ns 6667 ns 0.94
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6708 ns 6666 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6542 ns 6584 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 330671 ns 320616.5 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 749709 ns 750542 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 747104 ns 746792 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 749208 ns 746916 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 751791.5 ns 750584 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 21045 ns 21795 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 791000 ns 805145.5 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 791062.5 ns 791604 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 775875 ns 772584 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 775250 ns 810645.5 ns 0.96
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 294695 ns 302046.5 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7208 ns 6959 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5958 ns 5917 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5291 ns 6000 ns 0.88
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10208 ns 10167 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32534 ns 32896 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 233291 ns 228770.5 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 267375 ns 227709 ns 1.17
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 227812.5 ns 228084 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213583 ns 225625.5 ns 0.95
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 361573 ns 359979 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10020.5 ns 10250 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10042 ns 10208 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11625 ns 11042 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10208 ns 9958 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 248981.5 ns 245976 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 26791 ns 24896 ns 1.08
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24292 ns 24000 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24750 ns 25416.5 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25000 ns 24625 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1132389 ns 1114734 ns 1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 107227250 ns 106794687 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 117058791.5 ns 118367979 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 124034229 ns 120992291 ns 1.03
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117545541.5 ns 118045833 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2659866 ns 2655666 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 393155000 ns 397097667 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 366597250 ns 368138875 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 357674666 ns 357737125 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 490403667 ns 483722209 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15157994 ns 15195689 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 758865499.5 ns 769405854 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 580033084 ns 762934333 ns 0.76
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 748265062.5 ns 748099729.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 948608916.5 ns 772112770.5 ns 1.23
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6916.5 ns 6417 ns 1.08
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7000 ns 7375 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8042 ns 8187 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7625 ns 8708.5 ns 0.88
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 242461.5 ns 243458.5 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14084 ns 13625 ns 1.03
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13500 ns 14834 ns 0.91
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14208 ns 14834 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14333 ns 14000 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1085062 ns 1081512.5 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5541 ns 5500 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6563 ns 6083.5 ns 1.08
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7666 ns 7500 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6291 ns 5625 ns 1.12
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 235371.5 ns 236881 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12542 ns 12583 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12104.5 ns 12750 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13042 ns 13000 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12750 ns 12542 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 793450.5 ns 792100 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 5125 ns 328937.5 ns 0.015580467414022421
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 5750 ns 345250 ns 0.0166545981173063
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 6333 ns 398625 ns 0.01588711194731891
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 5625 ns 315687.5 ns 0.01781825381112651
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16571 ns 17026 ns 0.97
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 15792 ns 701750 ns 0.022503740648379053
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 15417 ns 734417 ns 0.020992161129167762
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 15625 ns 1025666 ns 0.015234004052001334
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 15750 ns 663750 ns 0.023728813559322035
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 200110.5 ns 202330 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 292 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 292 ns 417 ns 0.70
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 416 ns 416 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 417 ns 292 ns 1.43
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 23594.5 ns 23795 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 5959 ns 6250 ns 0.95
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6083 ns 6750 ns 0.90
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6666 ns 6500 ns 1.03
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6834 ns 6104.5 ns 1.12
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 242427.5 ns 242897.5 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5833 ns 5875 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5834 ns 6042 ns 0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 6000 ns 5917 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6041 ns 5875 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 24342.5 ns 24778 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 20875 ns 21834 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21042 ns 21542 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21666 ns 21750 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21875 ns 21417 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 262727.5 ns 265364.5 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 185833 ns 184375 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 144916.5 ns 185000 ns 0.78
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 146875 ns 149541 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144416.5 ns 190750 ns 0.76
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167734 ns 168165 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1323750 ns 1361667 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1312209 ns 1306875.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1332875 ns 1318541.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1333770.5 ns 1332084 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1339118 ns 1372553 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24041.5 ns 24458 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22312.5 ns 22729 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 24833 ns 25000 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24667 ns 22374.5 ns 1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 351890.5 ns 355948 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 170708 ns 176958 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 177875 ns 131167 ns 1.36
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 118625 ns 126166.5 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 120020.5 ns 177542 ns 0.68
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1461877 ns 1491511 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 292 ns 417 ns 0.70
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 416 ns 333 ns 1.25
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22590 ns 23138 ns 0.98
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6250 ns 6125 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6250 ns 6917 ns 0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6750 ns 6667 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6583 ns 6250 ns 1.05
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 255552.5 ns 259300 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4291 ns 4458 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4417 ns 4875 ns 0.91
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5708 ns 5708.5 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5292 ns 4833 ns 1.09
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 256272 ns 258768.5 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10042 ns 9709 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9833 ns 10083 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10417 ns 10417 ns 1
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10333 ns 10041.5 ns 1.03
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1354208 ns 1358754 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1583 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1625 ns 1666 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1666 ns 1667 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1625 ns 1583 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22798 ns 23306 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5833 ns 5625 ns 1.04
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5709 ns 6125 ns 0.93
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6000 ns 6041 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5916 ns 5625 ns 1.05
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 274328 ns 275587 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6866624.5 ns 6813916.5 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6433708 ns 6428416 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6554499.5 ns 6554167 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7548875 ns 7571104.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 213149 ns 213811 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24100417 ns 24163500 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21294521 ns 21359167 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21070125 ns 21066083 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29826667 ns 29670209 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2116806 ns 2101483 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 37336834 ns 37462416 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 34197292 ns 45862833.5 ns 0.75
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45794042 ns 45876667 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 49624208 ns 38235959 ns 1.30
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5750 ns 5459 ns 1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5625 ns 6250 ns 0.90
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6791 ns 6958 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6667 ns 5292 ns 1.26
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 236202.5 ns 238588.5 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8084 ns 7959 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7875 ns 8334 ns 0.94
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8667 ns 8250 ns 1.05
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9167 ns 8250 ns 1.11
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1060405 ns 1068264.5 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1553542 ns 1529292 ns 1.02
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1263041.5 ns 1266666.5 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1622041 ns 1623709 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2175916 ns 2163750 ns 1.01
lenet(28, 28, 1, 128)/forward/GPU/CUDA 272178 ns 279544 ns 0.97
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7902375 ns 7968292 ns 0.99
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6258292 ns 6533250 ns 0.96
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7165958 ns 7125792 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10478104.5 ns 10479375 ns 1.00
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1852121.5 ns 1874497 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 361584 ns 320667 ns 1.13
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 370750 ns 346291 ns 1.07
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 456417 ns 428584 ns 1.06
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 24999.5 ns 345375 ns 0.0723836409699602
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46439.5 ns 46619.5 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 738895.5 ns 745958.5 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 809958 ns 791666.5 ns 1.02
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1082542 ns 1073208.5 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 76708 ns 776479 ns 0.09878953584063445
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 301861.5 ns 311670 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397459 ns 396708.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288084 ns 287917 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 212208 ns 288250 ns 0.74
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 755209 ns 753417 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43701 ns 44556 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 665625 ns 645167 ns 1.03
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 530417 ns 527667 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 473750 ns 532000 ns 0.89
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 974458 ns 974292 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 189749 ns 190424 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 649583 ns 668958 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 641833 ns 629749.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 545458.5 ns 544375 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 653167 ns 643396 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 131877 ns 132592.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2454834 ns 2485646 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2460271 ns 2448562.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2500666 ns 2450292 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2518479 ns 2461146 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1202049 ns 1408688 ns 0.85
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 3000 ns 324000.5 ns 0.009259244970300971
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 3500 ns 344459 ns 0.010160860944263323
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 3500 ns 396583 ns 0.008825390901778443
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 2708 ns 314083.5 ns 0.008621911052315705
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15904 ns 16193 ns 0.98
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 5375 ns 700875 ns 0.007668985197075085
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 5292 ns 734292 ns 0.007206942197381968
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 5666 ns 1020625 ns 0.005551500306184936
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 5750 ns 656584 ns 0.008757447638078297
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 196388 ns 201017 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1465625 ns 1461042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1502708 ns 1503750 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1496875 ns 1504625 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1444792 ns 1442917 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 40558 ns 40991 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5125396 ns 5155750 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5286583 ns 5279833.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5312375 ns 5308333.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4974792 ns 4987604 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 195790.5 ns 200839 ns 0.97
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3750 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3708 ns 3709 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3709 ns 3667 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3708 ns 3708 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 32748 ns 33187 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15083 ns 14958 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15083 ns 15395.5 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15167 ns 15375 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15375 ns 15083 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 375651.5 ns 379072.5 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71125 ns 71541 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71167 ns 71542 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71208 ns 71270.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 71083 ns 71083 ns 1
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 112958 ns 112914 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 323791 ns 325333 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 320458 ns 320729.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 326875 ns 318792 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 323000 ns 317333 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 193747 ns 193733 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1000 ns 1000 ns 1
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 958 ns 1125 ns 0.85
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1042 ns 1083 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1084 ns 1000 ns 1.08
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 23358 ns 23845 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7875 ns 7750 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7834 ns 8583 ns 0.91
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8458 ns 8500 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8833 ns 7750 ns 1.14
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 259209 ns 262768.5 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 505375 ns 456417 ns 1.11
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 484292 ns 472584 ns 1.02
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 564542 ns 554479 ns 1.02
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 215062.5 ns 550167 ns 0.39
batchedmm(128, Bsize=32)/forward/GPU/CUDA 128754 ns 128330 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1371334 ns 1408750 ns 0.97
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1393812.5 ns 1380958 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1732333 ns 1632666.5 ns 1.06
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 870083.5 ns 1597604 ns 0.54
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 276302 ns 274089 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 334 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 292 ns 417 ns 0.70
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 375 ns 333 ns 1.13
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31400 ns 31588 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6167 ns 6083 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6000 ns 6750 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6500 ns 6458 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6958 ns 6125 ns 1.14
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 263074.5 ns 263587.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1767042 ns 1767792 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1725208 ns 1726375 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1727292 ns 1725708 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1726271 ns 1773250 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168554 ns 168887 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4357521 ns 4406958 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4359541 ns 4358916 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4379875 ns 4369792 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4377583 ns 4367125 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1157059 ns 1241756.5 ns 0.93
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6666 ns 6750 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6666 ns 7000 ns 0.95
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 6916 ns 6792 ns 1.02
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 7041.5 ns 6750 ns 1.04
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20567 ns 19512 ns 1.05
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 32834 ns 51584 ns 0.64
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 51229.5 ns 48771 ns 1.05
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 33541.5 ns 33250 ns 1.01
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 51062.5 ns 52958 ns 0.96
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 209739.5 ns 210086 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 17250 ns 328750 ns 0.05247148288973384
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 17812.5 ns 344958 ns 0.05163672099212078
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 18292 ns 408250 ns 0.044805878750765464
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 17708 ns 323500 ns 0.05473879443585781
batchedmm(2, Bsize=512)/forward/GPU/CUDA 17907 ns 18058 ns 0.99
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 53208 ns 719583.5 ns 0.0739427738407009
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 52959 ns 735666.5 ns 0.07198778250742693
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 53541 ns 1034250 ns 0.051767947788252354
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 53291 ns 684646 ns 0.07783730570250903
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 344400 ns 345041 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75333 ns 75459 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 74959 ns 75292 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75292 ns 75167 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75000 ns 75333 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 47022 ns 46969 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 325292 ns 332833 ns 0.98
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 324417 ns 325833 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 343042 ns 324583 ns 1.06
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 327084 ns 323834 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 210359 ns 207979 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1488333 ns 1487708 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1527917 ns 1530375 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1521042 ns 1530750 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1466167 ns 1466417 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 51138 ns 51505.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5120375 ns 5146312.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5285750 ns 5151604.5 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5309459 ns 5003270.5 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4973917 ns 4984709 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 202631 ns 205494.5 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28167 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28125 ns 28334 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28208 ns 28333 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28209 ns 28167 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24478 ns 24407 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66208 ns 66500 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66167 ns 66375 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66250 ns 67458 ns 0.98
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66959 ns 66417 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 533201 ns 525547 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1463833 ns 1383749.5 ns 1.06
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1144583 ns 1059771 ns 1.08
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 832188 ns 1061458 ns 0.78
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2217792 ns 2248687.5 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 576305 ns 581876.5 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3077958.5 ns 3035479 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2733167 ns 2745250 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2620334 ns 2740958 ns 0.96
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3782000 ns 3811500 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2001343 ns 2064611 ns 0.97
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 7887749.5 ns 8921042 ns 0.88
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 7887771 ns 8776625 ns 0.90
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 7989000 ns 8768729.5 ns 0.91
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 4832458 ns 6359583 ns 0.76
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 134958 ns 82083.5 ns 1.64
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 78917 ns 81562.5 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 82625 ns 83125 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81250 ns 80583 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193237.5 ns 192403.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2017354.5 ns 2040625 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2006750 ns 1935354.5 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2041167 ns 2023083 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2018875 ns 2003562.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 797402 ns 805958 ns 0.99

This comment was automatically generated by workflow using github-action-benchmark.

@avik-pal
Copy link
Member Author

@avik-pal avik-pal commented on 132619c Nov 22, 2024 via email

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/119938

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.3.4 -m "<description of version>" 132619c86d1579fbca4d4d253331d103cd528101
git push origin v1.3.4

Please sign in to comment.