Skip to content

Commit

Permalink
fix: missing zero leads to NaNs (#1044)
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal authored Nov 7, 2024
1 parent 4079372 commit 40a0797
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Lux"
uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.2.1"
version = "1.2.2"

[deps]
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
Expand Down
1 change: 1 addition & 0 deletions src/extended_ops.jl
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ function ∇eachslice(Δ′, x::AbstractArray, ::Val{dims}) where {dims}
idx = findfirst(Base.Fix2(isa, AbstractArray), Δs)
idx === nothing && return zero.(x)
Δ = similar(x)
fill!(Δ, false)
for i in axes(x, dims)
Δᵢ = selectdim(Δ, dims, i)
copyto!(Δᵢ, Δs[i])
Expand Down
6 changes: 4 additions & 2 deletions src/helpers/losses.jl
Original file line number Diff line number Diff line change
Expand Up @@ -317,8 +317,10 @@ function unsafe_apply_loss(loss::BinaryFocalLoss, ŷ, y)
end

@doc doc"""
CrossEntropyLoss(; agg=mean, epsilon=nothing, dims=1,
label_smoothing::Union{Nothing, Real}=nothing)
CrossEntropyLoss(;
agg=mean, epsilon=nothing, dims=1, logits::Union{Bool, Val}=Val(false),
label_smoothing::Union{Nothing, Real}=nothing
)
Return the cross entropy loss which is used in multi-class classification tasks. The input,
$\hat{y}$, is expected to be normalized (i.e. `softmax` output) if `logits` is `false` or
Expand Down

3 comments on commit 40a0797

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/118863

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.2.2 -m "<description of version>" 40a0797924bb1b6426f82600d3cf69563133b59c
git push origin v1.2.2

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 40a0797 Previous: 900c21c Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4709 ns 4270.5 ns 1.10
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4187.5 ns 4000 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6458 ns 5875 ns 1.10
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4333 ns 4895.5 ns 0.89
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 59980 ns 59833 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10208 ns 10375 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10250 ns 9958 ns 1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10416 ns 10792 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10500 ns 10125 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 421819.5 ns 422438 ns 1.00
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3333 ns 1083 ns 3.08
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1125 ns 1000 ns 1.13
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1458 ns 1417 ns 1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3583 ns 1125 ns 3.18
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 18027 ns 18109 ns 1.00
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4208.5 ns 4166 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4000 ns 4125 ns 0.97
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4125 ns 4187.5 ns 0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3542 ns 4042 ns 0.88
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 110057.5 ns 109209 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57167 ns 57645.5 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46291 ns 47000 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 37667 ns 38125 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81562.5 ns 82084 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37973 ns 37455 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2023292 ns 1973687 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2084875 ns 2089416 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2079104.5 ns 2085625 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1989416 ns 1985813 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 196720 ns 195917 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 148250 ns 146416.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 145250 ns 147020.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 147000 ns 145667 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 146458.5 ns 145604.5 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 165401.5 ns 166391 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1109125 ns 1129209 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1002854 ns 1126375 ns 0.89
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1120896 ns 1147667 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1120062.5 ns 1104209 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 522324 ns 521058.5 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3167 ns 3416.5 ns 0.93
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3666 ns 3333 ns 1.10
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4959 ns 6333 ns 0.78
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4208 ns 3250 ns 1.29
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 67564 ns 66594 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9292 ns 8792 ns 1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9625 ns 9291 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9666 ns 9250 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9125 ns 9292 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 490212.5 ns 493812 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 16250 ns 14750 ns 1.10
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15666 ns 15458 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 17708 ns 19167 ns 0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 15458.5 ns 16437.5 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 53751 ns 53833 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 218959 ns 215416.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 263125 ns 213208.5 ns 1.23
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 214479.5 ns 214271 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 214584 ns 227104 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 270657.5 ns 271460 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 584 ns 542 ns 1.08
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 667 ns 625 ns 1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 791 ns 792 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 583 ns 0.86
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 17722 ns 17470 ns 1.01
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1625 ns 1750 ns 0.93
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1542 ns 1417 ns 1.09
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1750 ns 1709 ns 1.02
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1500 ns 1645.5 ns 0.91
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 105921 ns 101826.5 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 6708 ns 7250 ns 0.93
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5625 ns 5916 ns 0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5250 ns 5292 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9917 ns 10000 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 24704 ns 23857.5 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220833 ns 226895.5 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 228854.5 ns 230375 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 230667 ns 231584 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 214854.5 ns 258625 ns 0.83
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 173103 ns 167659 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3834 ns 3875 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3875 ns 3875 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3875 ns 3916 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3833 ns 3833 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 24587 ns 23468 ns 1.05
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16750 ns 16750 ns 1
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 17000 ns 17042 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16875 ns 17000 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16667 ns 16625 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 165551.5 ns 160597 ns 1.03
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 578833 ns 572166 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 576917 ns 575000 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 582166 ns 587458 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 576333 ns 578334 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113849.5 ns 113397 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1416354 ns 1421708 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1419000 ns 1420125 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1419292 ns 1430083 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1422125 ns 1413292 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 215768 ns 209669.5 ns 1.03
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1077667 ns 1074458 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 958083 ns 958250.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1334083.5 ns 1334396 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1294500 ns 1310875 ns 0.99
lenet(28, 28, 1, 64)/forward/GPU/CUDA 279534.5 ns 269120.5 ns 1.04
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5905708.5 ns 5769437 ns 1.02
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4464875 ns 4470625 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4947583.5 ns 4941021 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5694959 ns 5552042 ns 1.03
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1102519 ns 1066489 ns 1.03
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 542 ns 500 ns 1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23704 ns 23585 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2083 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2208 ns 2167 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2125 ns 2250 ns 0.94
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2083 ns 2125 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 173233.5 ns 169900 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6083 ns 4084 ns 1.49
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6062.5 ns 6250 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 6833 ns 7209 ns 0.95
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5625 ns 6125 ns 0.92
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 65783.5 ns 64199 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11708 ns 11083 ns 1.06
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11458 ns 11625 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12041.5 ns 12000 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10916 ns 10917 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 451682.5 ns 446167.5 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7291 ns 6042 ns 1.21
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6959 ns 7042 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7417 ns 8833 ns 0.84
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7000 ns 7250 ns 0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 52765.5 ns 51074.5 ns 1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 19292 ns 17292 ns 1.12
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16791 ns 18334 ns 0.92
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18416 ns 18083 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17417 ns 17229.5 ns 1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 307013 ns 299895.5 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 584 ns 459 ns 1.27
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 459 ns 542 ns 0.85
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 33676 ns 32630 ns 1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9209 ns 8458 ns 1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8959 ns 9041 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9458 ns 9166 ns 1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8541 ns 8459 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 163297 ns 158907 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64208 ns 64625 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64542 ns 64250 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64750 ns 65000 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64625 ns 64667 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111486 ns 111460 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 283750 ns 289667 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 272916 ns 279750 ns 0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 290209 ns 289625 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 282250 ns 281250 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 188359 ns 184453.5 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3288854 ns 3347125 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 3022250 ns 3015520.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 2788959 ns 2792979 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 3940500 ns 4064520.5 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 573457 ns 588037 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7566395.5 ns 7500166 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7439354 ns 7470229.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7359291 ns 7393937.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8218750.5 ns 8209000 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1350797 ns 1331630 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 18855500 ns 19529541 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 19172959 ns 19142959 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 19055083 ns 19022708 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 15683667 ns 15703750 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23397396 ns 23617083 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33591292 ns 33598208 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 41001500 ns 41100666 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 35016729 ns 35022333 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1854981 ns 1855178.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 188708583 ns 189352250 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 163045187.5 ns 163568208 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 158187270.5 ns 158452896 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 440633708 ns 438607167 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13912485 ns 13925600.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 290204250 ns 287704167 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 336699250 ns 337952937.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 306929083.5 ns 291466708 ns 1.05
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 360485292 ns 395696000 ns 0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 24000 ns 21334 ns 1.12
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25250 ns 24375 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 25542 ns 25771 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24187 ns 23584 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 96403.5 ns 95861 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 103208 ns 103625 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 103958 ns 103708 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 106167 ns 104625 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103500 ns 103479.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 505974.5 ns 510517.5 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6583 ns 5750 ns 1.14
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7187.5 ns 7208 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7834 ns 7666.5 ns 1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7270.5 ns 7166 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 68015.5 ns 68604 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14916 ns 14708 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16333 ns 15916 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16500 ns 16666 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14417 ns 14667 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 478278.5 ns 483804.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3017708 ns 2876500 ns 1.05
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2059209 ns 2063833 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2271084 ns 2288208 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4832125 ns 4870416 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 583236 ns 587700 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23544333.5 ns 23421375 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18008625 ns 17990750 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18182895.5 ns 18312792 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35486792 ns 35646292 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3171395 ns 3104605 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33320333 ns 33240625 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 27559042 ns 27662417 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27849959 ns 27837459 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41863187 ns 41788833 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74145.5 ns 72083 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75999.5 ns 78729 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 76208 ns 75729.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 74167 ns 72459 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 100896 ns 100762.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 218250 ns 204458 ns 1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 320708 ns 219041 ns 1.46
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 218959 ns 320458 ns 0.68
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 206479 ns 205312.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 541885.5 ns 541454.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 12667 ns 11333 ns 1.12
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12604 ns 12416 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13250 ns 13834 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12541 ns 13125 ns 0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 71532.5 ns 69856.5 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26458.5 ns 26520.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 27625 ns 27458 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27959 ns 28291 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 25084 ns 26500 ns 0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 472937 ns 473341 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 13166.5 ns 11833 ns 1.11
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12666 ns 12750 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13584 ns 14333 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12750 ns 13375 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 53054.5 ns 51587 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 26000 ns 26375 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26250 ns 26583 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26667 ns 26666 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 25937.5 ns 26417 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 305682.5 ns 302777.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 179250 ns 178666.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 183042 ns 180292 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 184000 ns 184416.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 180521 ns 179709 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 56083 ns 55677 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 583709 ns 591146.5 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 594666.5 ns 588583 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 584334 ns 593062 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 582417 ns 582708.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 284606 ns 285027 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6917 ns 5667 ns 1.22
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7084 ns 7167 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7458 ns 7895.5 ns 0.94
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6000 ns 7291 ns 0.82
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 69928 ns 69657.5 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13916 ns 14167 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14917 ns 14958 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15500 ns 15854.5 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13708.5 ns 14583 ns 0.94
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 462602.5 ns 460443 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1245291.5 ns 1194208.5 ns 1.04
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1249041 ns 1216792 ns 1.03
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1251687.5 ns 1262604 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 1317084 ns 1318166.5 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301738.5 ns 301559 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4131000 ns 4098416 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4425583 ns 4352937.5 ns 1.02
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4803083.5 ns 4631875 ns 1.04
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 4483417 ns 4436562.5 ns 1.01
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1042425 ns 1042661.5 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1792 ns 1750 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1834 ns 1833 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1833 ns 1834 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1833 ns 1875 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23347 ns 23523 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4792 ns 4792 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4959 ns 4875 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4916 ns 4916 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 188110.5 ns 187370 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7291 ns 5500 ns 1.33
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6667 ns 6334 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8500 ns 8604 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6520.5 ns 7292 ns 0.89
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 55544 ns 54466 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11750 ns 10958 ns 1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11542 ns 11792 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11459 ns 11708.5 ns 0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10500 ns 11166 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 333462 ns 330839 ns 1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 333 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 333 ns 292 ns 1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 23358 ns 22873.5 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2750 ns 2708 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2917 ns 2959 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2875 ns 3042 ns 0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2709 ns 2750 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 159055 ns 157537.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 13917 ns 10750 ns 1.29
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12167 ns 13708 ns 0.89
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 14416.5 ns 14958 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 14125 ns 14583 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 56583 ns 55574.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25542 ns 25209 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24750 ns 25250 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25292 ns 25375 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24666 ns 24979.5 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 295253 ns 292656 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4125 ns 4208 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4208 ns 4125 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4125 ns 4167 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4166 ns 4167 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24628 ns 24774 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16209 ns 16333 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16167 ns 16125 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16042 ns 16125 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16042 ns 16084 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 196022 ns 195031.5 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5625 ns 5708 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5750 ns 5750 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5750 ns 5750 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5667 ns 5709 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 33545 ns 33326 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20812.5 ns 21125 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21292 ns 20875 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21250 ns 21583 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 20875 ns 21500 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 177718.5 ns 175195.5 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 397187.5 ns 415708 ns 0.96
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 367791.5 ns 376667 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 474125.5 ns 471499.5 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 524687.5 ns 523500 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66609 ns 66680.5 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 993104.5 ns 924750.5 ns 1.07
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 895209 ns 849291 ns 1.05
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1217499.5 ns 1217521 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 1316708.5 ns 1302292 ns 1.01
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 190077.5 ns 189339 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 80792 ns 79792 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81583 ns 82667 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85208 ns 84208 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82625 ns 82833 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193329 ns 193132 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921791.5 ns 1917625.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1910583 ns 1915292 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1932750 ns 1940917 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1930937.5 ns 1896541 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 400621 ns 395963 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21903 ns 21798 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1750 ns 1792 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1833 ns 1834 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 171616 ns 167505 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 7209 ns 5834 ns 1.24
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6875 ns 7500 ns 0.92
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9083 ns 9958 ns 0.91
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8437.5 ns 6875 ns 1.23
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 61519.5 ns 58244.5 ns 1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9375 ns 9375 ns 1
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9208 ns 9333 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9542 ns 9354.5 ns 1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9083 ns 9625 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 316461 ns 302935 ns 1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 121921729.5 ns 119443416.5 ns 1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174226916 ns 173896250 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 155004041 ns 155811625 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 104742666 ns 108054541 ns 0.97
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5476999 ns 5469386 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 615745833 ns 616746166.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 553438666 ns 555745625 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 466862292 ns 468855125 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 758782292 ns 760571396 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 38219390.5 ns 34956216 ns 1.09
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 647920084 ns 648663875 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 665540396 ns 664591146 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 602573542 ns 601178041.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 736079500 ns 746069334 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58458 ns 59458 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47375 ns 47083 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 38167 ns 39166 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83791 ns 83208 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37680 ns 37582 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1919334 ns 1926708 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1969750 ns 1983042 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1964792 ns 1986937.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1887083 ns 1850250 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 174429 ns 173017.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 266604.5 ns 265187.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 266833 ns 267959 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 277917 ns 276771 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 267792 ns 266917 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 134043 ns 128834.5 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 598458 ns 604083 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 679625 ns 692833.5 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 698500 ns 705709 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 589166.5 ns 590291.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 739242.5 ns 683429 ns 1.08
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2179875 ns 2195333 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2207770.5 ns 2225625 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2230166.5 ns 2230583 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2166541.5 ns 2183333 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 133769.5 ns 133325.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5479791 ns 5480833 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5475416 ns 5508958 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5556584 ns 5585895.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5498708.5 ns 5490125 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 789077 ns 766206 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 648750 ns 646750 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 646833 ns 660250 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 648791 ns 642917 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 641042 ns 647375 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46829 ns 47306 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1820833 ns 1828875 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1718208 ns 1721042 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1663417 ns 1665209 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2109125 ns 2097000 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 220654.5 ns 223896.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57584 ns 58667 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46708 ns 47750 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 37187.5 ns 38958 ns 0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83958 ns 82750 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 29081 ns 29191 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2026792 ns 2029083.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2065000 ns 2091166 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2093333 ns 2107249.5 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2012896 ns 1994854.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 190966 ns 190986 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13377084 ns 13371291 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12431583 ns 12436583.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12650458 ns 12675625 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15142979 ns 15146959 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 518359.5 ns 517535.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47276625 ns 47259416 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41724000 ns 41746209 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 41175333 ns 41384750 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58162084 ns 58440500 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3219076.5 ns 3203835 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 73902667 ns 73984667 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 90751250 ns 91223791.5 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 90811916.5 ns 90609938 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 77025583.5 ns 77234000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58083 ns 59000 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47042 ns 47417 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 38875 ns 38917 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84333 ns 81125 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 47304 ns 47741 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1915854 ns 1911646 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1776104 ns 1970541 ns 0.90
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1975416 ns 1976417 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1882666 ns 1882083 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 193083.5 ns 195868.5 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 32433 ns 32615 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6875 ns 6500 ns 1.06
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6708 ns 6375 ns 1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6666 ns 6750 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6083 ns 6375 ns 0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 179829 ns 176818 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 31486 ns 32102 ns 0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2750 ns 2625 ns 1.05
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2875 ns 2875 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2834 ns 2916 ns 0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2542 ns 2625 ns 0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 167204 ns 164236.5 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 289802312 ns 286096229 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 340067917 ns 339570541 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 320767937.5 ns 321242167 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 269385459 ns 271493208 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 7005548 ns 7111512 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1000854291 ns 987492667 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 940227375 ns 939040416 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 867700979 ns 868433209 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1161938542 ns 1162204042 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 33905766 ns 34040446 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1313359771 ns 1310851000.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1692771333 ns 1685402625 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1614224666 ns 1648347125 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1329771896 ns 1310788750 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1466583 ns 1412625 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1408729.5 ns 1412041.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1419646 ns 1424625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1408917 ns 1408334 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 127867 ns 128501 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5153083 ns 5028875 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5008791 ns 5030104 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5035021 ns 5062042 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4998083 ns 5014021 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 635980 ns 597004.5 ns 1.07
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 168439167 ns 168008834 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 131339666.5 ns 130299417 ns 1.01
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 146925000 ns 148283479 ns 0.99
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 169743500 ns 161948354 ns 1.05
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4853143 ns 5052268 ns 0.96
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 663948875 ns 662817209 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 641456958 ns 492884417 ns 1.30
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 506284584 ns 507367709 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 678955333 ns 678320708 ns 1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 15737054 ns 17294527 ns 0.91
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 8738833 ns 8884604 ns 0.98
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8800458 ns 8801959 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 8166459 ns 8221541.5 ns 0.99
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 10174791 ns 10127167 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1595992 ns 1611762 ns 0.99
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 35758375 ns 36027125 ns 0.99
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 36866500 ns 36933063 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 34436000 ns 34547750 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 38800000 ns 38824854 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6470958 ns 6452267 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47708 ns 47375 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47792 ns 47250 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47875 ns 47542 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47500 ns 47333 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 18930 ns 19020 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50334 ns 50312.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50792 ns 50500 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50833 ns 50958.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50417 ns 50333 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 241811.5 ns 226580 ns 1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8542 ns 6542 ns 1.31
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7208 ns 7187.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8708 ns 9083 ns 0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8500 ns 8625 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 126920.5 ns 117383.5 ns 1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9958 ns 9625 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10041 ns 10208 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10125 ns 10333.5 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10167 ns 10209 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 677861 ns 723908.5 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 8666.5 ns 6083 ns 1.42
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7770.5 ns 8250 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 9167 ns 9417 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 8166 ns 8375 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 144327 ns 157024.5 ns 0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13375 ns 13292 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12959 ns 13792 ns 0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13334 ns 13708 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13083 ns 12834 ns 1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 583952.5 ns 618769 ns 0.94
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1000 ns 1042 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1042 ns 1042 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1042 ns 1083 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 32857 ns 32863 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8145.5 ns 7875 ns 1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8167 ns 8000 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8166 ns 8208 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8208 ns 8250 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 244316.5 ns 246953.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 25083.5 ns 25062.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23354.5 ns 23291.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23500 ns 23542 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23666 ns 23250 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 18385 ns 18661 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52541 ns 52625 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52250 ns 52833 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52625 ns 52875 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52417 ns 52333 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 350310 ns 364018 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1402937 ns 1403750 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1453604 ns 1451354 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1407750 ns 1407542 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1405791 ns 1406458 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 196659 ns 196760 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5009646 ns 5023250 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5009167 ns 5018687.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5033375 ns 5042125 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5002959 ns 5001750 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 686014 ns 766930 ns 0.89
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3040292 ns 3048708 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2085208.5 ns 2082646 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2277708 ns 2300125 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4555708.5 ns 4855000 ns 0.94
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 582321 ns 583278 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24327959 ns 24263250 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18894917 ns 18905459 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 19053062 ns 19193375 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 36734333 ns 36575416 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 3192062 ns 3216229 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 34053333 ns 34013563 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28350771 ns 28342229 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28305584 ns 28436750 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41495750 ns 43339875 ns 0.96
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 143539875 ns 144288959 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 141550687.5 ns 142279583 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 125980521 ns 126469000.5 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 174659959 ns 168866000 ns 1.03
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22767374 ns 22582893 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 927844458 ns 1275599313 ns 0.73
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 877900625 ns 1058487228.5 ns 0.83
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 714311334 ns 712851209 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 675349375 ns 668538250 ns 1.01
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118602048 ns 119108875 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76333.5 ns 83125 ns 0.92
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 74500 ns 76208 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 77792 ns 78125 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 73708.5 ns 72729 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 274210.5 ns 365097 ns 0.75
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 281000 ns 189959 ns 1.48
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 241792 ns 287792 ns 0.84
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 269583.5 ns 268875 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 190458.5 ns 189583.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1331419.5 ns 1559670.5 ns 0.85
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35658687.5 ns 35476167 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 35264646 ns 35447729.5 ns 0.99
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32554375 ns 32304459 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40986666 ns 40935146 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5840917 ns 5843273 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 147151041 ns 147875542 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 154097375 ns 152751312.5 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 137873062 ns 139824437 ns 0.99
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 287386750 ns 287719375 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 34877985 ns 34882914 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 120641166.5 ns 120880395.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174219083 ns 174358791 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 154681542 ns 155429791 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 102008187.5 ns 106966959 ns 0.95
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5447568 ns 5456342 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 472274750 ns 470623375 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 466169500 ns 466918000 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 452760166.5 ns 456589562.5 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 748093771 ns 742113834 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 35157294 ns 32255425 ns 1.09
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 641570625 ns 706243291.5 ns 0.91
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 654747250 ns 652697541.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 588175041.5 ns 591007625 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 851931500 ns 851805375 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1341708 ns 1320583.5 ns 1.02
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 979125 ns 965875 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 754334 ns 736687.5 ns 1.02
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2083583.5 ns 1944666.5 ns 1.07
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 566508.5 ns 564187.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2962000 ns 2971708.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2619167 ns 2620334 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2514208.5 ns 2535604 ns 0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3687791 ns 3604083.5 ns 1.02
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1777871.5 ns 1878347.5 ns 0.95
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 6635084 ns 6649958 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 6518479 ns 6493042 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 6471250 ns 6437479.5 ns 1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 4441000 ns 4435750 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7375 ns 7375 ns 1
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6167 ns 6208 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5209 ns 5375 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10167 ns 9916 ns 1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 25608 ns 25400 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212500 ns 213645.5 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 233312.5 ns 221833 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221125 ns 221250 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 206625 ns 205875 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 262108.5 ns 293719.5 ns 0.89
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 313057667 ns 301604437.5 ns 1.04
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 221340292 ns 221356625 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 217860313 ns 223278083.5 ns 0.98
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 312301166 ns 312163250 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7676793 ns 7672763 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1083402667 ns 1078062604.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 904193354 ns 896268771 ns 1.01
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 848367687.5 ns 880668729 ns 0.96
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1182818917 ns 1161143188 ns 1.02
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 26525180 ns 26517571 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6125 ns 5500 ns 1.11
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5917 ns 5750 ns 1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9208 ns 9437.5 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5604 ns 5875 ns 0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 179937.5 ns 201555 ns 0.89
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7916 ns 7500 ns 1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7125 ns 7458 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7625 ns 7750 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7292 ns 7041.5 ns 1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 676020.5 ns 699933.5 ns 0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 542 ns 500 ns 1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 458 ns 500 ns 0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24453 ns 23724.5 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9250 ns 9208 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9542 ns 9625 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9875 ns 9604.5 ns 1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9083 ns 9042 ns 1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 232408.5 ns 234828.5 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 350791 ns 351500 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 351292 ns 350896 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 353292 ns 354624.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 353625 ns 351708 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 21486 ns 20984 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 827167 ns 775417 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 819459 ns 824916 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 825000 ns 830958 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 807520.5 ns 823958 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 314809.5 ns 306663 ns 1.03
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 330771.5 ns 338083 ns 0.98
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 337249.5 ns 341500 ns 0.99
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 440687.5 ns 443667 ns 0.99
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 333208 ns 325667 ns 1.02
batchedmm(16, Bsize=32)/forward/GPU/CUDA 17997 ns 17821 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 684250 ns 696042 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 748834 ns 739416.5 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1039833 ns 1042874.5 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 691958 ns 692645.5 ns 1.00
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 295631.5 ns 273141.5 ns 1.08
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 345229 ns 358458.5 ns 0.96
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 351937.5 ns 349125 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 433625 ns 431291.5 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 372416 ns 370875 ns 1.00
batchedmm(16, Bsize=128)/forward/GPU/CUDA 22620 ns 22357.5 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 743208 ns 756625 ns 0.98
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 752416 ns 744208.5 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1072792 ns 1073250 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 821708 ns 818125.5 ns 1.00
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 254388 ns 221398.5 ns 1.15
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3562.5 ns 3459 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3500 ns 3541 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3792 ns 3792 ns 1
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3292 ns 3291 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 18327 ns 17956 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4250 ns 4208 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4416 ns 4208 ns 1.05
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4458 ns 4416 ns 1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4583 ns 4125 ns 1.11
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 288549.5 ns 275839.5 ns 1.05
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5937 ns 3792 ns 1.57
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4083 ns 3375 ns 1.21
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6604 ns 6750 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3875 ns 6625 ns 0.58
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 232718 ns 205448.5 ns 1.13
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8333.5 ns 8334 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8541 ns 8459 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8459 ns 8500 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8666 ns 8541 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1217009 ns 1183984 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 204375 ns 202625 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 210334 ns 210416 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209583 ns 209292 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 200583 ns 200000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35617 ns 34588 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 647584 ns 603792 ns 1.07
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 629687.5 ns 670625 ns 0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 633979 ns 630958 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 588542 ns 631187.5 ns 0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 351232.5 ns 352652 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 956896.5 ns 967521 ns 0.99
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 933333 ns 927063 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 970375 ns 964437.5 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 1297208.5 ns 1281853.5 ns 1.01
batchedmm(128, Bsize=128)/forward/GPU/CUDA 207957 ns 207244 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4474792 ns 4451771 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4473791 ns 4482750 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4462375 ns 4474208 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 6306417 ns 6201166 ns 1.02
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 944028 ns 945549 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3417 ns 3604.5 ns 0.95
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4104.5 ns 3167 ns 1.30
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5645.5 ns 6792 ns 0.83
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3334 ns 3167 ns 1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 234727 ns 233201 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7500 ns 1
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7583 ns 7375 ns 1.03
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7208 ns 7291 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7042 ns 7083 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 1010815 ns 1014881 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1649292 ns 1602833.5 ns 1.03
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1200166 ns 1187916 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1363417 ns 1364062 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2476209 ns 2343729.5 ns 1.06
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 214347 ns 212955.5 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12318728.5 ns 12334792 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9553209 ns 9602042 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9347916 ns 9404958 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18029249.5 ns 17966833 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1951267 ns 1949853 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17370208 ns 17347084 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14329458 ns 14365000 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14499750 ns 14512666 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21060125 ns 21005479.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 133791 ns 89791 ns 1.49
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 88333 ns 91729.5 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 91375 ns 94291 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 89500 ns 117416.5 ns 0.76
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 127052 ns 126285 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1999708 ns 2023917 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2008188 ns 2013416.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2046875 ns 2058875 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2034167 ns 2027875 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1039077 ns 1031286 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 338125.5 ns 346791.5 ns 0.98
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 347771 ns 343583.5 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 421709 ns 412250 ns 1.02
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 313021 ns 306166 ns 1.02
batchedmm(2, Bsize=4)/forward/GPU/CUDA 16444 ns 16010 ns 1.03
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 702125 ns 702291 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 732625 ns 728979.5 ns 1.01
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 1030209 ns 1025458 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 648604.5 ns 639875 ns 1.01
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 197115.5 ns 193209 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7458 ns 7292 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6042 ns 6083 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5209 ns 5334 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 10000 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35200 ns 33620 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 225833 ns 220479.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 233791 ns 231958 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 228375 ns 232041 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 218042 ns 220500 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 318050.5 ns 311751 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3667 ns 3708 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3667 ns 3708 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3709 ns 3709 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3666 ns 3667 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22861 ns 22440 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14417 ns 14500 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14375 ns 14417 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14208 ns 14167 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14375 ns 14291 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 490016.5 ns 468658 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 136520.5 ns 95166 ns 1.43
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 92583 ns 138021 ns 0.67
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 99167 ns 99167 ns 1
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 93104.5 ns 122458 ns 0.76
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 126414 ns 125691 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1919000 ns 1931875 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1919833 ns 1954979 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1938604.5 ns 1946854 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1922208 ns 1923729.5 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 960141 ns 940251.5 ns 1.02
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 876459 ns 880500 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 824583 ns 815125 ns 1.01
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1193750 ns 1172292 ns 1.02
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 974812.5 ns 960167 ns 1.02
lenet(28, 28, 1, 32)/forward/GPU/CUDA 281183 ns 270704 ns 1.04
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2826375 ns 2803000 ns 1.01
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2536917 ns 2526833 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3329042 ns 3361333 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3379375 ns 3405875 ns 0.99
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1666976 ns 1569154 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17708 ns 15146 ns 1.17
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15750 ns 18000 ns 0.88
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 18250 ns 21666 ns 0.84
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16375 ns 18125 ns 0.90
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 144691 ns 141811.5 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 227166 ns 217083 ns 1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 215750 ns 229375 ns 0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 223688 ns 257396 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 215854 ns 215833 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 652421.5 ns 635765.5 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 222542 ns 219750 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 222500 ns 221500 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 223062.5 ns 226021 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 223125 ns 223937.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 274050 ns 270450 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 554458 ns 509917 ns 1.09
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 524042 ns 557729 ns 0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 557312.5 ns 549792 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 508854 ns 555791 ns 0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1468319 ns 1308245 ns 1.12
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 327041.5 ns 333479 ns 0.98
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 336125 ns 335541.5 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 440874.5 ns 437333 ns 1.01
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 324187.5 ns 319417 ns 1.01
batchedmm(16, Bsize=4)/forward/GPU/CUDA 17320 ns 16583 ns 1.04
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 711083 ns 715333 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 735374.5 ns 730292 ns 1.01
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 1023083 ns 1025458.5 ns 1.00
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 664999.5 ns 655792 ns 1.01
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 200084.5 ns 193313 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18917 ns 17625 ns 1.07
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17750 ns 17625 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19500 ns 20437.5 ns 0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17667 ns 18000 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 151497 ns 144711.5 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 225250 ns 216667 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220666.5 ns 224083 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221708 ns 226625 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 212479 ns 223417 ns 0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1059613.5 ns 903796 ns 1.17
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6833 ns 4625 ns 1.48
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 7000 ns 6750 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7250 ns 7438 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5000 ns 6625 ns 0.75
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 253666 ns 174159.5 ns 1.46
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10270.5 ns 10437.5 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10792 ns 10750 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10750 ns 10770.5 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10250 ns 10833 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 1105579 ns 1024421 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3666 ns 3646 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3459 ns 3334 ns 1.04
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 5604 ns 5625 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3562.5 ns 3500 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 244769.5 ns 231660 ns 1.06
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7875 ns 7708 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7834 ns 7792 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7667 ns 7625 ns 1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7083.5 ns 7167 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 1111376 ns 1037611.5 ns 1.07
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23504875 ns 23838833 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 34821166 ns 33990646 ns 1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 41404562 ns 41585708 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34980521 ns 34896229 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1840452 ns 1839186 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 184644333 ns 184662833 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 159219875 ns 159634000 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 151030791 ns 151746084 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 414216333 ns 415075875 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 16518712 ns 16506413 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 428777375 ns 427351833 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 254167375 ns 251624521 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 233388208 ns 233926312.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 486791792 ns 484091542 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 184146 ns 181666 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 182083 ns 183416.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 185687.5 ns 186125 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 182813 ns 183834 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 232040.5 ns 173529.5 ns 1.34
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 586542 ns 587541 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 598833 ns 600458 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 637583 ns 632375 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 597333 ns 631354 ns 0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1147246 ns 1005977 ns 1.14
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3825646 ns 3816041.5 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3636125 ns 3637833 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3534667 ns 3539646 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 5363792 ns 5351396 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 532563 ns 554127 ns 0.96
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17318916 ns 17372333 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17179354 ns 17218458.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 17094375 ns 16979478.5 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 22003917 ns 22177625 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2614337 ns 2616933 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 583 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 500 ns 459 ns 1.09
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 33572 ns 32036 ns 1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9959 ns 9667 ns 1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9416 ns 9750 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9833.5 ns 10125 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9209 ns 9291 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 271645.5 ns 260858 ns 1.04
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 504524917 ns 506491042 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 431143271 ns 428949104 ns 1.01
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 464642083 ns 474815000 ns 0.98
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 675972708.5 ns 671461979 ns 1.01
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12484000 ns 12484614.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 2045443062.5 ns 2043435104.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1629220084 ns 1631358667 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1535256520.5 ns 1546812271 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2230955812.5 ns 2216473375.5 ns 1.01
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 48693503 ns 49204869.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1657208 ns 1642542 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1196666.5 ns 1194625 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1379438 ns 1380791 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2487729.5 ns 2487084 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 217266.5 ns 215546 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12684584 ns 12711687.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9939834 ns 9927625 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9748521 ns 9788604.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18391375 ns 18464437.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2054532 ns 1995889.5 ns 1.03
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17706187 ns 17669166.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14663771 ns 14709437.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14743729.5 ns 14807645.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21515479 ns 21465708 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26292 ns 26250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26291 ns 26291 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26209 ns 26167 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 24091.5 ns 23873 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 67083 ns 66917 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 68417 ns 67333 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67041 ns 67083 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66958 ns 66833 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 409287.5 ns 382426 ns 1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 203708 ns 203834 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 210083 ns 209542 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209542 ns 209584 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199625 ns 199584 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27233.5 ns 26132 ns 1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 647771 ns 613833.5 ns 1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 648208 ns 636667 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 633916.5 ns 671166.5 ns 0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 631916.5 ns 628229.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 356804 ns 308600 ns 1.16
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 679458.5 ns 671687.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 656792 ns 645937.5 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 649396 ns 644791.5 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 650145.5 ns 676334 ns 0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132741 ns 131667 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2242083.5 ns 2241875 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2223667 ns 2192250 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2266583 ns 2297042 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2232792 ns 2246249.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1233696 ns 1114838 ns 1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18395.5 ns 16791 ns 1.10
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17917 ns 17500 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19417 ns 20958 ns 0.93
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17917 ns 16770.5 ns 1.07
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 146461 ns 143001 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 229458 ns 230375 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 258229 ns 231791.5 ns 1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 244375 ns 266208 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 250438 ns 260728.5 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1088374.5 ns 959584 ns 1.13
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 542 ns 542 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 459 ns 500 ns 0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 23950 ns 23163 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10208.5 ns 9604.5 ns 1.06
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10084 ns 10292 ns 0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10292 ns 10625 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9625 ns 9584 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 261195 ns 255611 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6833.5 ns 5416.5 ns 1.26
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6333 ns 5750 ns 1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9166 ns 9458 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5292 ns 5708 ns 0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 236210 ns 219432 ns 1.08
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7708 ns 7833 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7167 ns 7750 ns 0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7708 ns 7709 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7375 ns 7000 ns 1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 807849 ns 764584 ns 1.06
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2250 ns 1959 ns 1.15
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 2166 ns 2083 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2333 ns 2417 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2292 ns 2208 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 17905 ns 17893 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6709 ns 6875 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6750 ns 6542 ns 1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6666 ns 6583 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6770.5 ns 6291 ns 1.08
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 334008.5 ns 320459 ns 1.04
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 749354 ns 747709 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 749125 ns 749833 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 754125 ns 754999.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 751459 ns 749375 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 22038.5 ns 21357 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 792416 ns 774854 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 808250 ns 792687.5 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 789229 ns 817042 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 798542 ns 811166 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 302658 ns 295013.5 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7458 ns 7334 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 6000 ns 1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5084 ns 5208.5 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 10166 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 34362 ns 33519 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 234042 ns 219666 ns 1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 268146 ns 268125 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 265333 ns 252000.5 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 225833 ns 213562 ns 1.06
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 364947.5 ns 354278 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 12333.5 ns 10875 ns 1.13
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12500 ns 11833 ns 1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13417 ns 12770.5 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11958 ns 12000 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 249215 ns 238132.5 ns 1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 25042 ns 24708 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 25209 ns 24584 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25062.5 ns 25292 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24437.5 ns 24500 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 1118971 ns 1094067.5 ns 1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 106635541 ns 106709834 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 118368771 ns 116906583.5 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 124785875 ns 127036729 ns 0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117816500 ns 117807000 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2665253 ns 2657653 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 393341625 ns 392558792 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 367145584 ns 365774917 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 430545354 ns 431860937.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 486902292 ns 483379250 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 15155419.5 ns 15196086 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 761855250 ns 758564875.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 759969042 ns 761412666 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 748877750 ns 748747542 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 767805458.5 ns 765232583 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7458 ns 6625 ns 1.13
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7791 ns 7334 ns 1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8208 ns 9041.5 ns 0.91
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7895.5 ns 8250 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 240391.5 ns 231038.5 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14250 ns 14625 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13625 ns 14750 ns 0.92
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14958 ns 14292 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13541 ns 14542 ns 0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 1079844 ns 1043294.5 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7875 ns 5875 ns 1.34
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 8333 ns 7959 ns 1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 9833 ns 9167 ns 1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 8145.5 ns 6333 ns 1.29
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 236528.5 ns 228571 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12792 ns 12791 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12459 ns 13167 ns 0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13604 ns 13375 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12354 ns 12333 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 791731 ns 779066.5 ns 1.02
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 341416 ns 347625 ns 0.98
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 345104.5 ns 342625 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 421708 ns 416812 ns 1.01
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 316750 ns 307083 ns 1.03
batchedmm(2, Bsize=128)/forward/GPU/CUDA 17091 ns 17023 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 706229 ns 710208.5 ns 0.99
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 734625 ns 732125 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 1026854 ns 1032542 ns 0.99
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 654792 ns 653979.5 ns 1.00
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 201401 ns 200196.5 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 334 ns 1.12
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 291 ns 333 ns 0.87
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 24022 ns 23569 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6875 ns 6375 ns 1.08
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6541 ns 6584 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6583 ns 6834 ns 0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6416.5 ns 6042 ns 1.06
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 242349 ns 241926 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5833 ns 5708 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5792 ns 5834 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5834 ns 5875 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5667 ns 5708 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 24894 ns 24556.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21812.5 ns 21562.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21583 ns 22000 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21666.5 ns 21709 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21584 ns 21167 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 266328.5 ns 265433.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 190062.5 ns 144917 ns 1.31
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 146333 ns 191292 ns 0.76
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 150459 ns 149333 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 145542 ns 149250 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167806 ns 167659 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1322917 ns 1319292 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1318333 ns 1331416 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1353437.5 ns 1362958 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1315541 ns 1326125 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1350841 ns 1343729.5 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 23875 ns 22250 ns 1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24541 ns 23791 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 25875 ns 25875 ns 1
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 23708 ns 23666.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 353850.5 ns 286115 ns 1.24
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 178042 ns 146125 ns 1.22
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 149625 ns 118500 ns 1.26
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 157792 ns 129833 ns 1.22
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 182958 ns 175792 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1467205 ns 1461317 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 291 ns 292 ns 1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 23537 ns 23352 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7000 ns 6334 ns 1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6708 ns 6459 ns 1.04
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6667 ns 6709 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6333 ns 6125 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 258302.5 ns 258095.5 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4500 ns 4625 ns 0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4792 ns 4125 ns 1.16
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7292 ns 7625 ns 0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5146 ns 4895.5 ns 1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 256601 ns 256357.5 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10042 ns 9959 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10541 ns 10125 ns 1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10334 ns 10333 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10542 ns 10333 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 1354903 ns 1358318.5 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1625 ns 1584 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1625 ns 1625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1583 ns 1583 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 23008 ns 23389 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5667 ns 5667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5834 ns 5875 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5958 ns 6000 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5625 ns 5625 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 275860.5 ns 275350.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6829125 ns 6780125 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6372104.5 ns 6371125 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6510708 ns 6531396 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7661979 ns 7625875 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 213899 ns 214804 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24098624.5 ns 24015354 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21314479.5 ns 21285667 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21021666.5 ns 21085125 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29805875 ns 29769250 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 2108805 ns 2112477.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 37438896 ns 37264541.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 45528708 ns 45538167 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45741750 ns 45665125 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 38021625 ns 38235958 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6750 ns 6208 ns 1.09
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6541.5 ns 5958.5 ns 1.10
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8125 ns 8750 ns 0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 7792 ns 7500 ns 1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 238056.5 ns 236550 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8875 ns 8750 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8020.5 ns 8375 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8292 ns 8500 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8000 ns 8958 ns 0.89
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 1056341.5 ns 1063848.5 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1550333 ns 1554084 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1257312.5 ns 1262375 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1639875 ns 1631958.5 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2152292 ns 2152375 ns 1.00
lenet(28, 28, 1, 128)/forward/GPU/CUDA 277659 ns 277465 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7893709 ns 7881667 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6521292 ns 6612667 ns 0.99
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7213021.5 ns 7276167 ns 0.99
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10476770.5 ns 10468062.5 ns 1.00
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1865563 ns 1876576 ns 0.99
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 335541.5 ns 346375 ns 0.97
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 348667 ns 348937.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 424167 ns 423416.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 344333 ns 336687 ns 1.02
batchedmm(128, Bsize=4)/forward/GPU/CUDA 46568 ns 46390 ns 1.00
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 725333 ns 735208 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 793166.5 ns 782458 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1069208 ns 1081666.5 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 737667 ns 758458.5 ns 0.97
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 311445 ns 311011.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397250 ns 397375 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288166 ns 288250 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 212708 ns 212583 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 749833 ns 754104.5 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 44294 ns 44494 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 669083 ns 675959 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 531000 ns 532333 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 473583.5 ns 474000 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 974375 ns 973417 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 190394 ns 189847 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 673125 ns 599375 ns 1.12
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 646166.5 ns 650333 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 635354 ns 660375 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 666145.5 ns 655833.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132578 ns 132321 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2463167 ns 2469395.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2454250 ns 2363959 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2506958 ns 2519875.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2443770.5 ns 2465916 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1555584 ns 1345989 ns 1.16
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 338958 ns 345583 ns 0.98
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 344708.5 ns 342834 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 403416.5 ns 416375 ns 0.97
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 316417 ns 306979.5 ns 1.03
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15850 ns 16330 ns 0.97
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 703979 ns 703104 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 734375 ns 729708 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 1023041 ns 1026937.5 ns 1.00
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 650916 ns 645959 ns 1.01
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 196785.5 ns 199885.5 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1461042 ns 1460542 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1500667 ns 1500583 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1492042 ns 1491791 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1440000 ns 1441917 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 40920 ns 41671 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5126333.5 ns 5133500 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5291541.5 ns 5293250 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5301291 ns 5309521 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4979667 ns 4977042 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 196913.5 ns 197710 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3667 ns 3708 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3667 ns 3708 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3708 ns 3709 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3625 ns 3666 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 32799 ns 33362 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15292 ns 15125 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15375 ns 15500 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15167 ns 15125 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15083 ns 15083 ns 1
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 373971 ns 381216.5 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71375 ns 71375 ns 1
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71250 ns 71208 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71333 ns 71583 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 70792 ns 71208 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113302 ns 113946.5 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 317667 ns 319833 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 322209 ns 319208 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 322250 ns 327125 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 319291 ns 318375 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 192855.5 ns 195156 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1083 ns 959 ns 1.13
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1042 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1042 ns 1083 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 958 ns 1000 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 23648 ns 23764 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8417 ns 8084 ns 1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8334 ns 8542 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8625 ns 8416 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7875 ns 7833.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 261768 ns 263039 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 466583 ns 472416 ns 0.99
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 472208 ns 468125 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 551562.5 ns 549250 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 556667 ns 550333 ns 1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA 130676 ns 128804.5 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1386958.5 ns 1375292 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1383166.5 ns 1372208 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1619792 ns 1633459 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 1594729 ns 1580500 ns 1.01
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 274271.5 ns 274739 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31927 ns 31574 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6834 ns 6458 ns 1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6417 ns 6875 ns 0.93
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6625 ns 6708 ns 0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6292 ns 6000 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 264167.5 ns 261869 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1731562.5 ns 1727625 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1724188 ns 1783958 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1732791.5 ns 1730916 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1728917 ns 1729333 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 168364 ns 168455 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4370166 ns 4352625 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3986834 ns 4372937.5 ns 0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4407875 ns 4412458 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4357625 ns 4358042 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 1246279 ns 1234725 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 9083.5 ns 6709 ns 1.35
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6625 ns 6584 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7083 ns 7417 ns 0.95
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6792 ns 6542 ns 1.04
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 20206 ns 19619.5 ns 1.03
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 70500 ns 51083 ns 1.38
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 49333 ns 35625 ns 1.38
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 48625 ns 49875 ns 0.97
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 51625 ns 70208 ns 0.74
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 210517 ns 211156 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 351292 ns 354291 ns 0.99
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 347666 ns 347584 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 439041 ns 432708 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 324750 ns 319521.5 ns 1.02
batchedmm(2, Bsize=512)/forward/GPU/CUDA 18167 ns 18053 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 719375 ns 719104 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 738645.5 ns 735979 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 1043271 ns 1039063 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 674500 ns 672750 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 343170 ns 343671.5 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75209 ns 75417 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75458 ns 75333 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75334 ns 75708 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 75500 ns 74709 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 47044 ns 46983 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 336292 ns 324417 ns 1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 338333.5 ns 327000 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 333875 ns 334917 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 324667 ns 324083 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 208548 ns 207721.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1486334 ns 1486334 ns 1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1527000 ns 1527500 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1518791 ns 1519000 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1464583 ns 1466541 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 51940 ns 51914 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5119271 ns 5119333.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5286375 ns 5300396 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5302167 ns 5303708 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4987020.5 ns 4989375 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 204109 ns 201413 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28208 ns 28167 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28208 ns 28166 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28250 ns 28333 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28208 ns 28208 ns 1
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24295 ns 24393 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66500 ns 66542 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66542 ns 66292 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66292 ns 66542 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66666 ns 66584 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 522901.5 ns 530998 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1501000 ns 1493250 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1136667 ns 1120167 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 936416 ns 947625 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2248959 ns 2256500 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 570544.5 ns 570331 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3097916.5 ns 3075542 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2602791 ns 2732479 ns 0.95
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2630292 ns 2643125 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3818229.5 ns 3814770.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 2066155.5 ns 2010818 ns 1.03
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 8816250 ns 8738917 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 8770729 ns 8777854.5 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 8782521 ns 8781417 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 6355500 ns 6360687.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 82792 ns 81146 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 81229 ns 81708.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 84146 ns 83708 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83687.5 ns 87687.5 ns 0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194104 ns 192383.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2026750 ns 2016791.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2006271 ns 2012708 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2038312.5 ns 2041312 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2024312.5 ns 2015208 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 798621.5 ns 798885.5 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.