-
Notifications
You must be signed in to change notification settings - Fork 62
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: missing zero leads to NaNs (#1044)
- Loading branch information
Showing
3 changed files
with
6 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
name = "Lux" | ||
uuid = "b2108857-7c20-44ae-9111-449ecde12c47" | ||
authors = ["Avik Pal <[email protected]> and contributors"] | ||
version = "1.2.1" | ||
version = "1.2.2" | ||
|
||
[deps] | ||
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
40a0797
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
40a0797
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/118863
Tip: Release Notes
Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment, after the text
"Release notes:", and it will be added to the registry PR; if TagBot is installed, it will also be added to the
release that TagBot creates, i.e.
To add them here just re-invoke and the PR will be updated.
Tagging
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:
40a0797
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lux Benchmarks
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
4709
ns4270.5
ns1.10
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
4187.5
ns4000
ns1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
6458
ns5875
ns1.10
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
4333
ns4895.5
ns0.89
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
59980
ns59833
ns1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
10208
ns10375
ns0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
10250
ns9958
ns1.03
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
10416
ns10792
ns0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
10500
ns10125
ns1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
421819.5
ns422438
ns1.00
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)
3333
ns1083
ns3.08
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)
1125
ns1000
ns1.13
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)
1458
ns1417
ns1.03
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)
3583
ns1125
ns3.18
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA
18027
ns18109
ns1.00
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
4208.5
ns4166
ns1.01
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
4000
ns4125
ns0.97
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
4125
ns4187.5
ns0.99
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
3542
ns4042
ns0.88
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA
110057.5
ns109209
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57167
ns57645.5
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46291
ns47000
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
37667
ns38125
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
81562.5
ns82084
ns0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37973
ns37455
ns1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2023292
ns1973687
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2084875
ns2089416
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2079104.5
ns2085625
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1989416
ns1985813
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
196720
ns195917
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
148250
ns146416.5
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
145250
ns147020.5
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
147000
ns145667
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
146458.5
ns145604.5
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
165401.5
ns166391
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1109125
ns1129209
ns0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1002854
ns1126375
ns0.89
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1120896
ns1147667
ns0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1120062.5
ns1104209
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
522324
ns521058.5
ns1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
3167
ns3416.5
ns0.93
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
3666
ns3333
ns1.10
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
4959
ns6333
ns0.78
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
4208
ns3250
ns1.29
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
67564
ns66594
ns1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9292
ns8792
ns1.06
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
9625
ns9291
ns1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
9666
ns9250
ns1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
9125
ns9292
ns0.98
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
490212.5
ns493812
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
16250
ns14750
ns1.10
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
15666
ns15458
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
17708
ns19167
ns0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
15458.5
ns16437.5
ns0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
53751
ns53833
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
218959
ns215416.5
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
263125
ns213208.5
ns1.23
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
214479.5
ns214271
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
214584
ns227104
ns0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
270657.5
ns271460
ns1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)
584
ns542
ns1.08
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)
667
ns625
ns1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)
791
ns792
ns1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)
500
ns583
ns0.86
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA
17722
ns17470
ns1.01
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
1625
ns1750
ns0.93
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
1542
ns1417
ns1.09
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
1750
ns1709
ns1.02
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
1500
ns1645.5
ns0.91
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA
105921
ns101826.5
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
6708
ns7250
ns0.93
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
5625
ns5916
ns0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5250
ns5292
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
9917
ns10000
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
24704
ns23857.5
ns1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
220833
ns226895.5
ns0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
228854.5
ns230375
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
230667
ns231584
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
214854.5
ns258625
ns0.83
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
173103
ns167659
ns1.03
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)
3834
ns3875
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)
3875
ns3875
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)
3875
ns3916
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)
3833
ns3833
ns1
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA
24587
ns23468
ns1.05
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16750
ns16750
ns1
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
17000
ns17042
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16875
ns17000
ns0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16667
ns16625
ns1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA
165551.5
ns160597
ns1.03
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
578833
ns572166
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
576917
ns575000
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
582166
ns587458
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
576333
ns578334
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA
113849.5
ns113397
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1416354
ns1421708
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1419000
ns1420125
ns1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1419292
ns1430083
ns0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
1422125
ns1413292
ns1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA
215768
ns209669.5
ns1.03
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s)
1077667
ns1074458
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s)
958083
ns958250.5
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s)
1334083.5
ns1334396
ns1.00
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s)
1294500
ns1310875
ns0.99
lenet(28, 28, 1, 64)/forward/GPU/CUDA
279534.5
ns269120.5
ns1.04
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s)
5905708.5
ns5769437
ns1.02
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s)
4464875
ns4470625
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s)
4947583.5
ns4941021
ns1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s)
5694959
ns5552042
ns1.03
lenet(28, 28, 1, 64)/zygote/GPU/CUDA
1102519
ns1066489
ns1.03
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)
542
ns500
ns1.08
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)
542
ns542
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)
500
ns500
ns1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA
23704
ns23585
ns1.01
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2125
ns2083
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2208
ns2167
ns1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2125
ns2250
ns0.94
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2083
ns2125
ns0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA
173233.5
ns169900
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6083
ns4084
ns1.49
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
6062.5
ns6250
ns0.97
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
6833
ns7209
ns0.95
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5625
ns6125
ns0.92
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
65783.5
ns64199
ns1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
11708
ns11083
ns1.06
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
11458
ns11625
ns0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
12041.5
ns12000
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10916
ns10917
ns1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
451682.5
ns446167.5
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7291
ns6042
ns1.21
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
6959
ns7042
ns0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7417
ns8833
ns0.84
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7000
ns7250
ns0.97
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
52765.5
ns51074.5
ns1.03
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
19292
ns17292
ns1.12
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
16791
ns18334
ns0.92
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
18416
ns18083
ns1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
17417
ns17229.5
ns1.01
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
307013
ns299895.5
ns1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
584
ns459
ns1.27
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
459
ns542
ns0.85
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
583
ns542
ns1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
500
ns500
ns1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
33676
ns32630
ns1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9209
ns8458
ns1.09
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
8959
ns9041
ns0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9458
ns9166
ns1.03
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
8541
ns8459
ns1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
163297
ns158907
ns1.03
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)
64208
ns64625
ns0.99
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)
64542
ns64250
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)
64750
ns65000
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)
64625
ns64667
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA
111486
ns111460
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
283750
ns289667
ns0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
272916
ns279750
ns0.98
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
290209
ns289625
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
282250
ns281250
ns1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA
188359
ns184453.5
ns1.02
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s)
3288854
ns3347125
ns0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s)
3022250
ns3015520.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s)
2788959
ns2792979
ns1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s)
3940500
ns4064520.5
ns0.97
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA
573457
ns588037
ns0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s)
7566395.5
ns7500166
ns1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s)
7439354
ns7470229.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s)
7359291
ns7393937.5
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s)
8218750.5
ns8209000
ns1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA
1350797
ns1331630
ns1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s)
18855500
ns19529541
ns0.97
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s)
19172959
ns19142959
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s)
19055083
ns19022708
ns1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s)
15683667
ns15703750
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23397396
ns23617083
ns0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
33591292
ns33598208
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
41001500
ns41100666
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
35016729
ns35022333
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1854981
ns1855178.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
188708583
ns189352250
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
163045187.5
ns163568208
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
158187270.5
ns158452896
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
440633708
ns438607167
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
13912485
ns13925600.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
290204250
ns287704167
ns1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
336699250
ns337952937.5
ns1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
306929083.5
ns291466708
ns1.05
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
360485292
ns395696000
ns0.91
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
24000
ns21334
ns1.12
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
25250
ns24375
ns1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
25542
ns25771
ns0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
24187
ns23584
ns1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
96403.5
ns95861
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
103208
ns103625
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
103958
ns103708
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
106167
ns104625
ns1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
103500
ns103479.5
ns1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
505974.5
ns510517.5
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6583
ns5750
ns1.14
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7187.5
ns7208
ns1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7834
ns7666.5
ns1.02
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7270.5
ns7166
ns1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
68015.5
ns68604
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14916
ns14708
ns1.01
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
16333
ns15916
ns1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
16500
ns16666
ns0.99
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
14417
ns14667
ns0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
478278.5
ns483804.5
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3017708
ns2876500
ns1.05
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2059209
ns2063833
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2271084
ns2288208
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4832125
ns4870416
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA
583236
ns587700
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
23544333.5
ns23421375
ns1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18008625
ns17990750
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
18182895.5
ns18312792
ns0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
35486792
ns35646292
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
3171395
ns3104605
ns1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
33320333
ns33240625
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
27559042
ns27662417
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
27849959
ns27837459
ns1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
41863187
ns41788833
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
74145.5
ns72083
ns1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
75999.5
ns78729
ns0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
76208
ns75729.5
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
74167
ns72459
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
100896
ns100762.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
218250
ns204458
ns1.07
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
320708
ns219041
ns1.46
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
218959
ns320458
ns0.68
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
206479
ns205312.5
ns1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
541885.5
ns541454.5
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
12667
ns11333
ns1.12
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12604
ns12416
ns1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
13250
ns13834
ns0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
12541
ns13125
ns0.96
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
71532.5
ns69856.5
ns1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
26458.5
ns26520.5
ns1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
27625
ns27458
ns1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
27959
ns28291
ns0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
25084
ns26500
ns0.95
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
472937
ns473341
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
13166.5
ns11833
ns1.11
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
12666
ns12750
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
13584
ns14333
ns0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
12750
ns13375
ns0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
53054.5
ns51587
ns1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
26000
ns26375
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
26250
ns26583
ns0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
26667
ns26666
ns1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
25937.5
ns26417
ns0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
305682.5
ns302777.5
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
179250
ns178666.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
183042
ns180292
ns1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
184000
ns184416.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
180521
ns179709
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
56083
ns55677
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
583709
ns591146.5
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
594666.5
ns588583
ns1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
584334
ns593062
ns0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
582417
ns582708.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
284606
ns285027
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
6917
ns5667
ns1.22
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7084
ns7167
ns0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
7458
ns7895.5
ns0.94
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
6000
ns7291
ns0.82
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
69928
ns69657.5
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
13916
ns14167
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
14917
ns14958
ns1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
15500
ns15854.5
ns0.98
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
13708.5
ns14583
ns0.94
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
462602.5
ns460443
ns1.00
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)
1245291.5
ns1194208.5
ns1.04
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)
1249041
ns1216792
ns1.03
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)
1251687.5
ns1262604
ns0.99
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)
1317084
ns1318166.5
ns1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA
301738.5
ns301559
ns1.00
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)
4131000
ns4098416
ns1.01
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)
4425583
ns4352937.5
ns1.02
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)
4803083.5
ns4631875
ns1.04
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)
4483417
ns4436562.5
ns1.01
batchedmm(512, Bsize=4)/zygote/GPU/CUDA
1042425
ns1042661.5
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1792
ns1750
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1834
ns1833
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1833
ns1834
ns1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1833
ns1875
ns0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA
23347
ns23523
ns0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
4792
ns4792
ns1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
4959
ns4875
ns1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
4916
ns4916
ns1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
4875
ns4875
ns1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA
188110.5
ns187370
ns1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
7291
ns5500
ns1.33
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6667
ns6334
ns1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
8500
ns8604
ns0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
6520.5
ns7292
ns0.89
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
55544
ns54466
ns1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
11750
ns10958
ns1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
11542
ns11792
ns0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
11459
ns11708.5
ns0.98
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10500
ns11166
ns0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
333462
ns330839
ns1.01
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)
333
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)
333
ns333
ns1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)
333
ns292
ns1.14
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)
292
ns333
ns0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA
23358
ns22873.5
ns1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)
2750
ns2708
ns1.02
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)
2917
ns2959
ns0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)
2875
ns3042
ns0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)
2709
ns2750
ns0.99
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA
159055
ns157537.5
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
13917
ns10750
ns1.29
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
12167
ns13708
ns0.89
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
14416.5
ns14958
ns0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
14125
ns14583
ns0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
56583
ns55574.5
ns1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
25542
ns25209
ns1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
24750
ns25250
ns0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25292
ns25375
ns1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24666
ns24979.5
ns0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
295253
ns292656
ns1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)
4125
ns4208
ns0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)
4208
ns4125
ns1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)
4125
ns4167
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)
4166
ns4167
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA
24628
ns24774
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)
16209
ns16333
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)
16167
ns16125
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)
16042
ns16125
ns0.99
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)
16042
ns16084
ns1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA
196022
ns195031.5
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
5625
ns5708
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
5750
ns5750
ns1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
5750
ns5750
ns1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5667
ns5709
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
33545
ns33326
ns1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
20812.5
ns21125
ns0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
21292
ns20875
ns1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
21250
ns21583
ns0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
20875
ns21500
ns0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
177718.5
ns175195.5
ns1.01
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)
397187.5
ns415708
ns0.96
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)
367791.5
ns376667
ns0.98
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)
474125.5
ns471499.5
ns1.01
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)
524687.5
ns523500
ns1.00
batchedmm(16, Bsize=512)/forward/GPU/CUDA
66609
ns66680.5
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)
993104.5
ns924750.5
ns1.07
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)
895209
ns849291
ns1.05
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)
1217499.5
ns1217521
ns1.00
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)
1316708.5
ns1302292
ns1.01
batchedmm(16, Bsize=512)/zygote/GPU/CUDA
190077.5
ns189339
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
80792
ns79792
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
81583
ns82667
ns0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
85208
ns84208
ns1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
82625
ns82833
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
193329
ns193132
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1921791.5
ns1917625.5
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1910583
ns1915292
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1932750
ns1940917
ns1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1930937.5
ns1896541
ns1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
400621
ns395963
ns1.01
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)
291
ns292
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)
292
ns333
ns0.88
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA
21903
ns21798
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
1750
ns1792
ns0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
1875
ns1875
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
1833
ns1834
ns1.00
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
1792
ns1792
ns1
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA
171616
ns167505
ns1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
7209
ns5834
ns1.24
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6875
ns7500
ns0.92
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
9083
ns9958
ns0.91
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
8437.5
ns6875
ns1.23
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
61519.5
ns58244.5
ns1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
9375
ns9375
ns1
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
9208
ns9333
ns0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
9542
ns9354.5
ns1.02
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
9083
ns9625
ns0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
316461
ns302935
ns1.04
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
121921729.5
ns119443416.5
ns1.02
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
174226916
ns173896250
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
155004041
ns155811625
ns0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
104742666
ns108054541
ns0.97
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5476999
ns5469386
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
615745833
ns616746166.5
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
553438666
ns555745625
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
466862292
ns468855125
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
758782292
ns760571396
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
38219390.5
ns34956216
ns1.09
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
647920084
ns648663875
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
665540396
ns664591146
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
602573542
ns601178041.5
ns1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
736079500
ns746069334
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58458
ns59458
ns0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47375
ns47083
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
38167
ns39166
ns0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83791
ns83208
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
37680
ns37582
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1919334
ns1926708
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1969750
ns1983042
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1964792
ns1986937.5
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1887083
ns1850250
ns1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
174429
ns173017.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
266604.5
ns265187.5
ns1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
266833
ns267959
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
277917
ns276771
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
267792
ns266917
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
134043
ns128834.5
ns1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
598458
ns604083
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
679625
ns692833.5
ns0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
698500
ns705709
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
589166.5
ns590291.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
739242.5
ns683429
ns1.08
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
2179875
ns2195333
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
2207770.5
ns2225625
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
2230166.5
ns2230583
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
2166541.5
ns2183333
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
133769.5
ns133325.5
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5479791
ns5480833
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5475416
ns5508958
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5556584
ns5585895.5
ns0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5498708.5
ns5490125
ns1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
789077
ns766206
ns1.03
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
648750
ns646750
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
646833
ns660250
ns0.98
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
648791
ns642917
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
641042
ns647375
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA
46829
ns47306
ns0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
1820833
ns1828875
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
1718208
ns1721042
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
1663417
ns1665209
ns1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
2109125
ns2097000
ns1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA
220654.5
ns223896.5
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
57584
ns58667
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
46708
ns47750
ns0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
37187.5
ns38958
ns0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83958
ns82750
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
29081
ns29191
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2026792
ns2029083.5
ns1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2065000
ns2091166
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2093333
ns2107249.5
ns0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2012896
ns1994854.5
ns1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
190966
ns190986
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
13377084
ns13371291
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
12431583
ns12436583.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
12650458
ns12675625
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
15142979
ns15146959
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
518359.5
ns517535.5
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
47276625
ns47259416
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
41724000
ns41746209
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
41175333
ns41384750
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
58162084
ns58440500
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
3219076.5
ns3203835
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
73902667
ns73984667
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
90751250
ns91223791.5
ns0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
90811916.5
ns90609938
ns1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
77025583.5
ns77234000
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
58083
ns59000
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
47042
ns47417
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
38875
ns38917
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
84333
ns81125
ns1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
47304
ns47741
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1915854
ns1911646
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1776104
ns1970541
ns0.90
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1975416
ns1976417
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1882666
ns1882083
ns1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
193083.5
ns195868.5
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
375
ns292
ns1.28
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
292
ns375
ns0.78
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
292
ns333
ns0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
32433
ns32615
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6875
ns6500
ns1.06
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6708
ns6375
ns1.05
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6666
ns6750
ns0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6083
ns6375
ns0.95
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
179829
ns176818
ns1.02
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)
250
ns292
ns0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)
292
ns292
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)
250
ns250
ns1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA
31486
ns32102
ns0.98
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)
2750
ns2625
ns1.05
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)
2875
ns2875
ns1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)
2834
ns2916
ns0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)
2542
ns2625
ns0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA
167204
ns164236.5
ns1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
289802312
ns286096229
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
340067917
ns339570541
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
320767937.5
ns321242167
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
269385459
ns271493208
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA
7005548
ns7111512
ns0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
1000854291
ns987492667
ns1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
940227375
ns939040416
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
867700979
ns868433209
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
1161938542
ns1162204042
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
33905766
ns34040446
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
1313359771
ns1310851000.5
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
1692771333
ns1685402625
ns1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
1614224666
ns1648347125
ns0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
1329771896
ns1310788750
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1466583
ns1412625
ns1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1408729.5
ns1412041.5
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1419646
ns1424625
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1408917
ns1408334
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
127867
ns128501
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5153083
ns5028875
ns1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5008791
ns5030104
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5035021
ns5062042
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4998083
ns5014021
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
635980
ns597004.5
ns1.07
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s)
168439167
ns168008834
ns1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s)
131339666.5
ns130299417
ns1.01
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s)
146925000
ns148283479
ns0.99
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s)
169743500
ns161948354
ns1.05
vgg16(32, 32, 3, 32)/forward/GPU/CUDA
4853143
ns5052268
ns0.96
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s)
663948875
ns662817209
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s)
641456958
ns492884417
ns1.30
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s)
506284584
ns507367709
ns1.00
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s)
678955333
ns678320708
ns1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA
15737054
ns17294527
ns0.91
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)
8738833
ns8884604
ns0.98
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)
8800458
ns8801959
ns1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)
8166459
ns8221541.5
ns0.99
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)
10174791
ns10127167
ns1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA
1595992
ns1611762
ns0.99
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)
35758375
ns36027125
ns0.99
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)
36866500
ns36933063
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)
34436000
ns34547750
ns1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)
38800000
ns38824854
ns1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA
6470958
ns6452267
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)
47708
ns47375
ns1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)
47792
ns47250
ns1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)
47875
ns47542
ns1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)
47500
ns47333
ns1.00
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA
18930
ns19020
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)
50334
ns50312.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)
50792
ns50500
ns1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)
50833
ns50958.5
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)
50417
ns50333
ns1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA
241811.5
ns226580
ns1.07
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
8542
ns6542
ns1.31
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
7208
ns7187.5
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
8708
ns9083
ns0.96
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
8500
ns8625
ns0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
126920.5
ns117383.5
ns1.08
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
9958
ns9625
ns1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10041
ns10208
ns0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10125
ns10333.5
ns0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10167
ns10209
ns1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
677861
ns723908.5
ns0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
8666.5
ns6083
ns1.42
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
7770.5
ns8250
ns0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
9167
ns9417
ns0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
8166
ns8375
ns0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
144327
ns157024.5
ns0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
13375
ns13292
ns1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12959
ns13792
ns0.94
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13334
ns13708
ns0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
13083
ns12834
ns1.02
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
583952.5
ns618769
ns0.94
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
1000
ns1042
ns0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
1083
ns1042
ns1.04
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
1042
ns1042
ns1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
1042
ns1083
ns0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA
32857
ns32863
ns1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
8145.5
ns7875
ns1.03
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
8167
ns8000
ns1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
8166
ns8208
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
8208
ns8250
ns0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA
244316.5
ns246953.5
ns0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
25083.5
ns25062.5
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
23354.5
ns23291.5
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
23500
ns23542
ns1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
23666
ns23250
ns1.02
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA
18385
ns18661
ns0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
52541
ns52625
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
52250
ns52833
ns0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
52625
ns52875
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
52417
ns52333
ns1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA
350310
ns364018
ns0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1402937
ns1403750
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1453604
ns1451354
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1407750
ns1407542
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1405791
ns1406458
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
196659
ns196760
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5009646
ns5023250
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5009167
ns5018687.5
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5033375
ns5042125
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
5002959
ns5001750
ns1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
686014
ns766930
ns0.89
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s)
3040292
ns3048708
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s)
2085208.5
ns2082646
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s)
2277708
ns2300125
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s)
4555708.5
ns4855000
ns0.94
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA
582321
ns583278
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s)
24327959
ns24263250
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s)
18894917
ns18905459
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s)
19053062
ns19193375
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s)
36734333
ns36575416
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA
3192062
ns3216229
ns0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s)
34053333
ns34013563
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s)
28350771
ns28342229
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s)
28305584
ns28436750
ns1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s)
41495750
ns43339875
ns0.96
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)
143539875
ns144288959
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)
141550687.5
ns142279583
ns0.99
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)
125980521
ns126469000.5
ns1.00
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)
174659959
ns168866000
ns1.03
batchedmm(512, Bsize=512)/forward/GPU/CUDA
22767374
ns22582893
ns1.01
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)
927844458
ns1275599313
ns0.73
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)
877900625
ns1058487228.5
ns0.83
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)
714311334
ns712851209
ns1.00
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)
675349375
ns668538250
ns1.01
batchedmm(512, Bsize=512)/zygote/GPU/CUDA
118602048
ns119108875
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
76333.5
ns83125
ns0.92
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
74500
ns76208
ns0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
77792
ns78125
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
73708.5
ns72729
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
274210.5
ns365097
ns0.75
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
281000
ns189959
ns1.48
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
241792
ns287792
ns0.84
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
269583.5
ns268875
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
190458.5
ns189583.5
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1331419.5
ns1559670.5
ns0.85
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)
35658687.5
ns35476167
ns1.01
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)
35264646
ns35447729.5
ns0.99
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)
32554375
ns32304459
ns1.01
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)
40986666
ns40935146
ns1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA
5840917
ns5843273
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)
147151041
ns147875542
ns1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)
154097375
ns152751312.5
ns1.01
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)
137873062
ns139824437
ns0.99
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)
287386750
ns287719375
ns1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA
34877985
ns34882914
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s)
120641166.5
ns120880395.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s)
174219083
ns174358791
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s)
154681542
ns155429791
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s)
102008187.5
ns106966959
ns0.95
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA
5447568
ns5456342
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s)
472274750
ns470623375
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s)
466169500
ns466918000
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s)
452760166.5
ns456589562.5
ns0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s)
748093771
ns742113834
ns1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA
35157294
ns32255425
ns1.09
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s)
641570625
ns706243291.5
ns0.91
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s)
654747250
ns652697541.5
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s)
588175041.5
ns591007625
ns1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s)
851931500
ns851805375
ns1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s)
1341708
ns1320583.5
ns1.02
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s)
979125
ns965875
ns1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s)
754334
ns736687.5
ns1.02
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s)
2083583.5
ns1944666.5
ns1.07
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA
566508.5
ns564187.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s)
2962000
ns2971708.5
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s)
2619167
ns2620334
ns1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s)
2514208.5
ns2535604
ns0.99
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s)
3687791
ns3604083.5
ns1.02
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA
1777871.5
ns1878347.5
ns0.95
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s)
6635084
ns6649958
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s)
6518479
ns6493042
ns1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s)
6471250
ns6437479.5
ns1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s)
4441000
ns4435750
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7375
ns7375
ns1
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6167
ns6208
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5209
ns5375
ns0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10167
ns9916
ns1.03
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
25608
ns25400
ns1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
212500
ns213645.5
ns0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
233312.5
ns221833
ns1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
221125
ns221250
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
206625
ns205875
ns1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
262108.5
ns293719.5
ns0.89
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s)
313057667
ns301604437.5
ns1.04
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s)
221340292
ns221356625
ns1.00
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s)
217860313
ns223278083.5
ns0.98
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s)
312301166
ns312163250
ns1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA
7676793
ns7672763
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s)
1083402667
ns1078062604.5
ns1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s)
904193354
ns896268771
ns1.01
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s)
848367687.5
ns880668729
ns0.96
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s)
1182818917
ns1161143188
ns1.02
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA
26525180
ns26517571
ns1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
6125
ns5500
ns1.11
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
5917
ns5750
ns1.03
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
9208
ns9437.5
ns0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
5604
ns5875
ns0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
179937.5
ns201555
ns0.89
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7916
ns7500
ns1.06
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7125
ns7458
ns0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7625
ns7750
ns0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7292
ns7041.5
ns1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
676020.5
ns699933.5
ns0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
500
ns500
ns1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
542
ns500
ns1.08
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
542
ns583
ns0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
458
ns500
ns0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
24453
ns23724.5
ns1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
9250
ns9208
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
9542
ns9625
ns0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
9875
ns9604.5
ns1.03
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9083
ns9042
ns1.00
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
232408.5
ns234828.5
ns0.99
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)
350791
ns351500
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)
351292
ns350896
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)
353292
ns354624.5
ns1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)
353625
ns351708
ns1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA
21486
ns20984
ns1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)
827167
ns775417
ns1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)
819459
ns824916
ns0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)
825000
ns830958
ns0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)
807520.5
ns823958
ns0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA
314809.5
ns306663
ns1.03
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)
330771.5
ns338083
ns0.98
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)
337249.5
ns341500
ns0.99
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)
440687.5
ns443667
ns0.99
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)
333208
ns325667
ns1.02
batchedmm(16, Bsize=32)/forward/GPU/CUDA
17997
ns17821
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)
684250
ns696042
ns0.98
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)
748834
ns739416.5
ns1.01
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)
1039833
ns1042874.5
ns1.00
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)
691958
ns692645.5
ns1.00
batchedmm(16, Bsize=32)/zygote/GPU/CUDA
295631.5
ns273141.5
ns1.08
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)
345229
ns358458.5
ns0.96
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)
351937.5
ns349125
ns1.01
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)
433625
ns431291.5
ns1.01
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)
372416
ns370875
ns1.00
batchedmm(16, Bsize=128)/forward/GPU/CUDA
22620
ns22357.5
ns1.01
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)
743208
ns756625
ns0.98
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)
752416
ns744208.5
ns1.01
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)
1072792
ns1073250
ns1.00
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)
821708
ns818125.5
ns1.00
batchedmm(16, Bsize=128)/zygote/GPU/CUDA
254388
ns221398.5
ns1.15
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)
3562.5
ns3459
ns1.03
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)
3500
ns3541
ns0.99
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)
3792
ns3792
ns1
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)
3292
ns3291
ns1.00
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA
18327
ns17956
ns1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)
4250
ns4208
ns1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)
4416
ns4208
ns1.05
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)
4458
ns4416
ns1.01
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)
4583
ns4125
ns1.11
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA
288549.5
ns275839.5
ns1.05
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
5937
ns3792
ns1.57
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4083
ns3375
ns1.21
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
6604
ns6750
ns0.98
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
3875
ns6625
ns0.58
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
232718
ns205448.5
ns1.13
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8333.5
ns8334
ns1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8541
ns8459
ns1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8459
ns8500
ns1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8666
ns8541
ns1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
1217009
ns1183984
ns1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
204375
ns202625
ns1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
210334
ns210416
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
209583
ns209292
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
200583
ns200000
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
35617
ns34588
ns1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
647584
ns603792
ns1.07
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
629687.5
ns670625
ns0.94
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
633979
ns630958
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
588542
ns631187.5
ns0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
351232.5
ns352652
ns1.00
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)
956896.5
ns967521
ns0.99
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)
933333
ns927063
ns1.01
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)
970375
ns964437.5
ns1.01
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)
1297208.5
ns1281853.5
ns1.01
batchedmm(128, Bsize=128)/forward/GPU/CUDA
207957
ns207244
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)
4474792
ns4451771
ns1.01
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)
4473791
ns4482750
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)
4462375
ns4474208
ns1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)
6306417
ns6201166
ns1.02
batchedmm(128, Bsize=128)/zygote/GPU/CUDA
944028
ns945549
ns1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3417
ns3604.5
ns0.95
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
4104.5
ns3167
ns1.30
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
5645.5
ns6792
ns0.83
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3334
ns3167
ns1.05
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
234727
ns233201
ns1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7500
ns7500
ns1
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7583
ns7375
ns1.03
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7208
ns7291
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7042
ns7083
ns0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
1010815
ns1014881
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1649292
ns1602833.5
ns1.03
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1200166
ns1187916
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1363417
ns1364062
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2476209
ns2343729.5
ns1.06
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA
214347
ns212955.5
ns1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12318728.5
ns12334792
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9553209
ns9602042
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9347916
ns9404958
ns0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18029249.5
ns17966833
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
1951267
ns1949853
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17370208
ns17347084
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14329458
ns14365000
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14499750
ns14512666
ns1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21060125
ns21005479.5
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
133791
ns89791
ns1.49
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
88333
ns91729.5
ns0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
91375
ns94291
ns0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
89500
ns117416.5
ns0.76
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
127052
ns126285
ns1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1999708
ns2023917
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2008188
ns2013416.5
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2046875
ns2058875
ns0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2034167
ns2027875
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1039077
ns1031286
ns1.01
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)
338125.5
ns346791.5
ns0.98
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)
347771
ns343583.5
ns1.01
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)
421709
ns412250
ns1.02
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)
313021
ns306166
ns1.02
batchedmm(2, Bsize=4)/forward/GPU/CUDA
16444
ns16010
ns1.03
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)
702125
ns702291
ns1.00
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)
732625
ns728979.5
ns1.01
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)
1030209
ns1025458
ns1.00
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)
648604.5
ns639875
ns1.01
batchedmm(2, Bsize=4)/zygote/GPU/CUDA
197115.5
ns193209
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7458
ns7292
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6042
ns6083
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5209
ns5334
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10125
ns10000
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
35200
ns33620
ns1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
225833
ns220479.5
ns1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
233791
ns231958
ns1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
228375
ns232041
ns0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
218042
ns220500
ns0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
318050.5
ns311751
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3667
ns3708
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3667
ns3708
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3709
ns3709
ns1
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3666
ns3667
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA
22861
ns22440
ns1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
14417
ns14500
ns0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
14375
ns14417
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
14208
ns14167
ns1.00
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
14375
ns14291
ns1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA
490016.5
ns468658
ns1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
136520.5
ns95166
ns1.43
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
92583
ns138021
ns0.67
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
99167
ns99167
ns1
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
93104.5
ns122458
ns0.76
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
126414
ns125691
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1919000
ns1931875
ns0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1919833
ns1954979
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1938604.5
ns1946854
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1922208
ns1923729.5
ns1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
960141
ns940251.5
ns1.02
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s)
876459
ns880500
ns1.00
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s)
824583
ns815125
ns1.01
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s)
1193750
ns1172292
ns1.02
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s)
974812.5
ns960167
ns1.02
lenet(28, 28, 1, 32)/forward/GPU/CUDA
281183
ns270704
ns1.04
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s)
2826375
ns2803000
ns1.01
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s)
2536917
ns2526833
ns1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s)
3329042
ns3361333
ns0.99
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s)
3379375
ns3405875
ns0.99
lenet(28, 28, 1, 32)/zygote/GPU/CUDA
1666976
ns1569154
ns1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
17708
ns15146
ns1.17
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
15750
ns18000
ns0.88
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
18250
ns21666
ns0.84
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
16375
ns18125
ns0.90
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
144691
ns141811.5
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
227166
ns217083
ns1.05
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
215750
ns229375
ns0.94
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
223688
ns257396
ns0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
215854
ns215833
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
652421.5
ns635765.5
ns1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
222542
ns219750
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
222500
ns221500
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
223062.5
ns226021
ns0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
223125
ns223937.5
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
274050
ns270450
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
554458
ns509917
ns1.09
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
524042
ns557729
ns0.94
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
557312.5
ns549792
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
508854
ns555791
ns0.92
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1468319
ns1308245
ns1.12
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)
327041.5
ns333479
ns0.98
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)
336125
ns335541.5
ns1.00
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)
440874.5
ns437333
ns1.01
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)
324187.5
ns319417
ns1.01
batchedmm(16, Bsize=4)/forward/GPU/CUDA
17320
ns16583
ns1.04
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)
711083
ns715333
ns0.99
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)
735374.5
ns730292
ns1.01
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)
1023083
ns1025458.5
ns1.00
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)
664999.5
ns655792
ns1.01
batchedmm(16, Bsize=4)/zygote/GPU/CUDA
200084.5
ns193313
ns1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
18917
ns17625
ns1.07
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
17750
ns17625
ns1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
19500
ns20437.5
ns0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
17667
ns18000
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
151497
ns144711.5
ns1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
225250
ns216667
ns1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
220666.5
ns224083
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
221708
ns226625
ns0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
212479
ns223417
ns0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1059613.5
ns903796
ns1.17
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
6833
ns4625
ns1.48
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
7000
ns6750
ns1.04
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
7250
ns7438
ns0.97
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
5000
ns6625
ns0.75
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
253666
ns174159.5
ns1.46
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
10270.5
ns10437.5
ns0.98
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
10792
ns10750
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
10750
ns10770.5
ns1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
10250
ns10833
ns0.95
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
1105579
ns1024421
ns1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
3666
ns3646
ns1.01
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
3459
ns3334
ns1.04
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
5604
ns5625
ns1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
3562.5
ns3500
ns1.02
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
244769.5
ns231660
ns1.06
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7875
ns7708
ns1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7834
ns7792
ns1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7667
ns7625
ns1.01
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7083.5
ns7167
ns0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
1111376
ns1037611.5
ns1.07
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
23504875
ns23838833
ns0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
34821166
ns33990646
ns1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
41404562
ns41585708
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
34980521
ns34896229
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA
1840452
ns1839186
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
184644333
ns184662833
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
159219875
ns159634000
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
151030791
ns151746084
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
414216333
ns415075875
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
16518712
ns16506413
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
428777375
ns427351833
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
254167375
ns251624521
ns1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
233388208
ns233926312.5
ns1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
486791792
ns484091542
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
184146
ns181666
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
182083
ns183416.5
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
185687.5
ns186125
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
182813
ns183834
ns0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
232040.5
ns173529.5
ns1.34
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
586542
ns587541
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
598833
ns600458
ns1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
637583
ns632375
ns1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
597333
ns631354
ns0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1147246
ns1005977
ns1.14
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)
3825646
ns3816041.5
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)
3636125
ns3637833
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)
3534667
ns3539646
ns1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)
5363792
ns5351396
ns1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA
532563
ns554127
ns0.96
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)
17318916
ns17372333
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)
17179354
ns17218458.5
ns1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)
17094375
ns16979478.5
ns1.01
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)
22003917
ns22177625
ns0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA
2614337
ns2616933
ns1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
542
ns583
ns0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
542
ns542
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
542
ns542
ns1
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
500
ns459
ns1.09
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA
33572
ns32036
ns1.05
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
9959
ns9667
ns1.03
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
9416
ns9750
ns0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
9833.5
ns10125
ns0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
9209
ns9291
ns0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA
271645.5
ns260858
ns1.04
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s)
504524917
ns506491042
ns1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s)
431143271
ns428949104
ns1.01
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s)
464642083
ns474815000
ns0.98
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s)
675972708.5
ns671461979
ns1.01
vgg16(32, 32, 3, 128)/forward/GPU/CUDA
12484000
ns12484614.5
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s)
2045443062.5
ns2043435104.5
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s)
1629220084
ns1631358667
ns1.00
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s)
1535256520.5
ns1546812271
ns0.99
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s)
2230955812.5
ns2216473375.5
ns1.01
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA
48693503
ns49204869.5
ns0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
1657208
ns1642542
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
1196666.5
ns1194625
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
1379438
ns1380791
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
2487729.5
ns2487084
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
217266.5
ns215546
ns1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
12684584
ns12711687.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
9939834
ns9927625
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
9748521
ns9788604.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
18391375
ns18464437.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2054532
ns1995889.5
ns1.03
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
17706187
ns17669166.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
14663771
ns14709437.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
14743729.5
ns14807645.5
ns1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
21515479
ns21465708
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
26292
ns26250
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
26250
ns26250
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
26291
ns26291
ns1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
26209
ns26167
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA
24091.5
ns23873
ns1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
67083
ns66917
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
68417
ns67333
ns1.02
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
67041
ns67083
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66958
ns66833
ns1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA
409287.5
ns382426
ns1.07
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
203708
ns203834
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
210083
ns209542
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
209542
ns209584
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
199625
ns199584
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
27233.5
ns26132
ns1.04
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
647771
ns613833.5
ns1.06
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
648208
ns636667
ns1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
633916.5
ns671166.5
ns0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
631916.5
ns628229.5
ns1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
356804
ns308600
ns1.16
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
679458.5
ns671687.5
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
656792
ns645937.5
ns1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
649396
ns644791.5
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
650145.5
ns676334
ns0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
132741
ns131667
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2242083.5
ns2241875
ns1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2223667
ns2192250
ns1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2266583
ns2297042
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2232792
ns2246249.5
ns0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1233696
ns1114838
ns1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
18395.5
ns16791
ns1.10
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
17917
ns17500
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
19417
ns20958
ns0.93
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
17917
ns16770.5
ns1.07
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA
146461
ns143001
ns1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
229458
ns230375
ns1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
258229
ns231791.5
ns1.11
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
244375
ns266208
ns0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
250438
ns260728.5
ns0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1088374.5
ns959584
ns1.13
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
583
ns500
ns1.17
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
542
ns542
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
542
ns542
ns1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
459
ns500
ns0.92
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA
23950
ns23163
ns1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
10208.5
ns9604.5
ns1.06
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
10084
ns10292
ns0.98
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
10292
ns10625
ns0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
9625
ns9584
ns1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA
261195
ns255611
ns1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)
6833.5
ns5416.5
ns1.26
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)
6333
ns5750
ns1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)
9166
ns9458
ns0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)
5292
ns5708
ns0.93
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA
236210
ns219432
ns1.08
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
7708
ns7833
ns0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
7167
ns7750
ns0.92
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
7708
ns7709
ns1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
7375
ns7000
ns1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA
807849
ns764584
ns1.06
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
2250
ns1959
ns1.15
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
2166
ns2083
ns1.04
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
2333
ns2417
ns0.97
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
2292
ns2208
ns1.04
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA
17905
ns17893
ns1.00
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
6709
ns6875
ns0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
6750
ns6542
ns1.03
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
6666
ns6583
ns1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
6770.5
ns6291
ns1.08
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA
334008.5
ns320459
ns1.04
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)
749354
ns747709
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)
749125
ns749833
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)
754125
ns754999.5
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)
751459
ns749375
ns1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA
22038.5
ns21357
ns1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)
792416
ns774854
ns1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)
808250
ns792687.5
ns1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)
789229
ns817042
ns0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)
798542
ns811166
ns0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA
302658
ns295013.5
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
7458
ns7334
ns1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
6000
ns6000
ns1
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
5084
ns5208.5
ns0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
10125
ns10166
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
34362
ns33519
ns1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
234042
ns219666
ns1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
268146
ns268125
ns1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
265333
ns252000.5
ns1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
225833
ns213562
ns1.06
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
364947.5
ns354278
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)
12333.5
ns10875
ns1.13
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)
12500
ns11833
ns1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)
13417
ns12770.5
ns1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)
11958
ns12000
ns1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA
249215
ns238132.5
ns1.05
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
25042
ns24708
ns1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
25209
ns24584
ns1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
25062.5
ns25292
ns0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
24437.5
ns24500
ns1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA
1118971
ns1094067.5
ns1.02
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s)
106635541
ns106709834
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s)
118368771
ns116906583.5
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s)
124785875
ns127036729
ns0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s)
117816500
ns117807000
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA
2665253
ns2657653
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s)
393341625
ns392558792
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s)
367145584
ns365774917
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s)
430545354
ns431860937.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s)
486902292
ns483379250
ns1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA
15155419.5
ns15196086
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s)
761855250
ns758564875.5
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s)
759969042
ns761412666
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s)
748877750
ns748747542
ns1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s)
767805458.5
ns765232583
ns1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)
7458
ns6625
ns1.13
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)
7791
ns7334
ns1.06
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)
8208
ns9041.5
ns0.91
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)
7895.5
ns8250
ns0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA
240391.5
ns231038.5
ns1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
14250
ns14625
ns0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
13625
ns14750
ns0.92
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
14958
ns14292
ns1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
13541
ns14542
ns0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA
1079844
ns1043294.5
ns1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)
7875
ns5875
ns1.34
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)
8333
ns7959
ns1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)
9833
ns9167
ns1.07
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)
8145.5
ns6333
ns1.29
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA
236528.5
ns228571
ns1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)
12792
ns12791
ns1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)
12459
ns13167
ns0.95
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)
13604
ns13375
ns1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)
12354
ns12333
ns1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA
791731
ns779066.5
ns1.02
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)
341416
ns347625
ns0.98
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)
345104.5
ns342625
ns1.01
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)
421708
ns416812
ns1.01
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)
316750
ns307083
ns1.03
batchedmm(2, Bsize=128)/forward/GPU/CUDA
17091
ns17023
ns1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)
706229
ns710208.5
ns0.99
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)
734625
ns732125
ns1.00
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)
1026854
ns1032542
ns0.99
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)
654792
ns653979.5
ns1.00
batchedmm(2, Bsize=128)/zygote/GPU/CUDA
201401
ns200196.5
ns1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
375
ns334
ns1.12
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
291
ns333
ns0.87
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
24022
ns23569
ns1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
6875
ns6375
ns1.08
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6541
ns6584
ns0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6583
ns6834
ns0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6416.5
ns6042
ns1.06
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
242349
ns241926
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)
5833
ns5708
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)
5792
ns5834
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)
5834
ns5875
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)
5667
ns5708
ns0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA
24894
ns24556.5
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)
21812.5
ns21562.5
ns1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)
21583
ns22000
ns0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)
21666.5
ns21709
ns1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)
21584
ns21167
ns1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA
266328.5
ns265433.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
190062.5
ns144917
ns1.31
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
146333
ns191292
ns0.76
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
150459
ns149333
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
145542
ns149250
ns0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
167806
ns167659
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
1322917
ns1319292
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
1318333
ns1331416
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
1353437.5
ns1362958
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
1315541
ns1326125
ns0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1350841
ns1343729.5
ns1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)
23875
ns22250
ns1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)
24541
ns23791
ns1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)
25875
ns25875
ns1
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)
23708
ns23666.5
ns1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA
353850.5
ns286115
ns1.24
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)
178042
ns146125
ns1.22
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)
149625
ns118500
ns1.26
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)
157792
ns129833
ns1.22
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)
182958
ns175792
ns1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA
1467205
ns1461317
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
375
ns292
ns1.28
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
291
ns292
ns1.00
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA
23537
ns23352
ns1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
7000
ns6334
ns1.11
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
6708
ns6459
ns1.04
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
6667
ns6709
ns0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
6333
ns6125
ns1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA
258302.5
ns258095.5
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
4500
ns4625
ns0.97
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
4792
ns4125
ns1.16
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
7292
ns7625
ns0.96
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
5146
ns4895.5
ns1.05
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
256601
ns256357.5
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
10042
ns9959
ns1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
10541
ns10125
ns1.04
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
10334
ns10333
ns1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
10542
ns10333
ns1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
1354903
ns1358318.5
ns1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)
1625
ns1625
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)
1625
ns1584
ns1.03
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)
1625
ns1625
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)
1583
ns1583
ns1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA
23008
ns23389
ns0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)
5667
ns5667
ns1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)
5834
ns5875
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)
5958
ns6000
ns0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)
5625
ns5625
ns1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA
275860.5
ns275350.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s)
6829125
ns6780125
ns1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s)
6372104.5
ns6371125
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s)
6510708
ns6531396
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s)
7661979
ns7625875
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA
213899
ns214804
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s)
24098624.5
ns24015354
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s)
21314479.5
ns21285667
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s)
21021666.5
ns21085125
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s)
29805875
ns29769250
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA
2108805
ns2112477.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s)
37438896
ns37264541.5
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s)
45528708
ns45538167
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s)
45741750
ns45665125
ns1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s)
38021625
ns38235958
ns0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)
6750
ns6208
ns1.09
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)
6541.5
ns5958.5
ns1.10
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)
8125
ns8750
ns0.93
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)
7792
ns7500
ns1.04
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA
238056.5
ns236550
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8875
ns8750
ns1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8020.5
ns8375
ns0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8292
ns8500
ns0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
8000
ns8958
ns0.89
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA
1056341.5
ns1063848.5
ns0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s)
1550333
ns1554084
ns1.00
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s)
1257312.5
ns1262375
ns1.00
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s)
1639875
ns1631958.5
ns1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s)
2152292
ns2152375
ns1.00
lenet(28, 28, 1, 128)/forward/GPU/CUDA
277659
ns277465
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s)
7893709
ns7881667
ns1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s)
6521292
ns6612667
ns0.99
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s)
7213021.5
ns7276167
ns0.99
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s)
10476770.5
ns10468062.5
ns1.00
lenet(28, 28, 1, 128)/zygote/GPU/CUDA
1865563
ns1876576
ns0.99
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)
335541.5
ns346375
ns0.97
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)
348667
ns348937.5
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)
424167
ns423416.5
ns1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)
344333
ns336687
ns1.02
batchedmm(128, Bsize=4)/forward/GPU/CUDA
46568
ns46390
ns1.00
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)
725333
ns735208
ns0.99
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)
793166.5
ns782458
ns1.01
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)
1069208
ns1081666.5
ns0.99
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)
737667
ns758458.5
ns0.97
batchedmm(128, Bsize=4)/zygote/GPU/CUDA
311445
ns311011.5
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)
397250
ns397375
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)
288166
ns288250
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)
212708
ns212583
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)
749833
ns754104.5
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA
44294
ns44494
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)
669083
ns675959
ns0.99
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)
531000
ns532333
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)
473583.5
ns474000
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)
974375
ns973417
ns1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA
190394
ns189847
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
673125
ns599375
ns1.12
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
646166.5
ns650333
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
635354
ns660375
ns0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
666145.5
ns655833.5
ns1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
132578
ns132321
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2463167
ns2469395.5
ns1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2454250
ns2363959
ns1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2506958
ns2519875.5
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2443770.5
ns2465916
ns0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1555584
ns1345989
ns1.16
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)
338958
ns345583
ns0.98
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)
344708.5
ns342834
ns1.01
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)
403416.5
ns416375
ns0.97
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)
316417
ns306979.5
ns1.03
batchedmm(2, Bsize=32)/forward/GPU/CUDA
15850
ns16330
ns0.97
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)
703979
ns703104
ns1.00
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)
734375
ns729708
ns1.01
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)
1023041
ns1026937.5
ns1.00
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)
650916
ns645959
ns1.01
batchedmm(2, Bsize=32)/zygote/GPU/CUDA
196785.5
ns199885.5
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1461042
ns1460542
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1500667
ns1500583
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1492042
ns1491791
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1440000
ns1441917
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA
40920
ns41671
ns0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5126333.5
ns5133500
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5291541.5
ns5293250
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5301291
ns5309521
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4979667
ns4977042
ns1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
196913.5
ns197710
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)
3667
ns3708
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)
3667
ns3708
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)
3708
ns3709
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)
3625
ns3666
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA
32799
ns33362
ns0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)
15292
ns15125
ns1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)
15375
ns15500
ns0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)
15167
ns15125
ns1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)
15083
ns15083
ns1
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA
373971
ns381216.5
ns0.98
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)
71375
ns71375
ns1
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)
71250
ns71208
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)
71333
ns71583
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)
70792
ns71208
ns0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA
113302
ns113946.5
ns0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
317667
ns319833
ns0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
322209
ns319208
ns1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
322250
ns327125
ns0.99
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
319291
ns318375
ns1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA
192855.5
ns195156
ns0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)
1083
ns959
ns1.13
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)
1083
ns1042
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)
1042
ns1083
ns0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)
958
ns1000
ns0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA
23648
ns23764
ns1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)
8417
ns8084
ns1.04
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)
8334
ns8542
ns0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)
8625
ns8416
ns1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)
7875
ns7833.5
ns1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA
261768
ns263039
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)
466583
ns472416
ns0.99
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)
472208
ns468125
ns1.01
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)
551562.5
ns549250
ns1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)
556667
ns550333
ns1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA
130676
ns128804.5
ns1.01
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)
1386958.5
ns1375292
ns1.01
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)
1383166.5
ns1372208
ns1.01
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)
1619792
ns1633459
ns0.99
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)
1594729
ns1580500
ns1.01
batchedmm(128, Bsize=32)/zygote/GPU/CUDA
274271.5
ns274739
ns1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)
375
ns416
ns0.90
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)
375
ns416
ns0.90
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)
375
ns375
ns1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)
333
ns292
ns1.14
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA
31927
ns31574
ns1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)
6834
ns6458
ns1.06
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)
6417
ns6875
ns0.93
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)
6625
ns6708
ns0.99
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)
6292
ns6000
ns1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA
264167.5
ns261869
ns1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1731562.5
ns1727625
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1724188
ns1783958
ns0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1732791.5
ns1730916
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1728917
ns1729333
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
168364
ns168455
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
4370166
ns4352625
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
3986834
ns4372937.5
ns0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
4407875
ns4412458
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4357625
ns4358042
ns1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
1246279
ns1234725
ns1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)
9083.5
ns6709
ns1.35
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)
6625
ns6584
ns1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)
7083
ns7417
ns0.95
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)
6792
ns6542
ns1.04
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA
20206
ns19619.5
ns1.03
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
70500
ns51083
ns1.38
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
49333
ns35625
ns1.38
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
48625
ns49875
ns0.97
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
51625
ns70208
ns0.74
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA
210517
ns211156
ns1.00
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)
351292
ns354291
ns0.99
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)
347666
ns347584
ns1.00
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)
439041
ns432708
ns1.01
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)
324750
ns319521.5
ns1.02
batchedmm(2, Bsize=512)/forward/GPU/CUDA
18167
ns18053
ns1.01
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)
719375
ns719104
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)
738645.5
ns735979
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)
1043271
ns1039063
ns1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)
674500
ns672750
ns1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA
343170
ns343671.5
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)
75209
ns75417
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)
75458
ns75333
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)
75334
ns75708
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)
75500
ns74709
ns1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA
47044
ns46983
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)
336292
ns324417
ns1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)
338333.5
ns327000
ns1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)
333875
ns334917
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)
324667
ns324083
ns1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA
208548
ns207721.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
1486334
ns1486334
ns1
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
1527000
ns1527500
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
1518791
ns1519000
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
1464583
ns1466541
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
51940
ns51914
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
5119271
ns5119333.5
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
5286375
ns5300396
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
5302167
ns5303708
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
4987020.5
ns4989375
ns1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
204109
ns201413
ns1.01
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)
28208
ns28167
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)
28208
ns28166
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)
28250
ns28333
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)
28208
ns28208
ns1
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA
24295
ns24393
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)
66500
ns66542
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)
66542
ns66292
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)
66292
ns66542
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)
66666
ns66584
ns1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA
522901.5
ns530998
ns0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s)
1501000
ns1493250
ns1.01
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s)
1136667
ns1120167
ns1.01
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s)
936416
ns947625
ns0.99
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s)
2248959
ns2256500
ns1.00
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA
570544.5
ns570331
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s)
3097916.5
ns3075542
ns1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s)
2602791
ns2732479
ns0.95
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s)
2630292
ns2643125
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s)
3818229.5
ns3814770.5
ns1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA
2066155.5
ns2010818
ns1.03
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s)
8816250
ns8738917
ns1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s)
8770729
ns8777854.5
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s)
8782521
ns8781417
ns1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s)
6355500
ns6360687.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)
82792
ns81146
ns1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)
81229
ns81708.5
ns0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)
84146
ns83708
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)
83687.5
ns87687.5
ns0.95
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA
194104
ns192383.5
ns1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)
2026750
ns2016791.5
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)
2006271
ns2012708
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)
2038312.5
ns2041312
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)
2024312.5
ns2015208
ns1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA
798621.5
ns798885.5
ns1.00
This comment was automatically generated by workflow using github-action-benchmark.