forked from pytorch/ao
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'pytorch:main' into main
Showing 105 changed files with 12,321 additions and 1,067 deletions.
.github/workflows/build_wheels_linux.yml
@@ -0,0 +1,54 @@
# From https://github.com/pytorch/test-infra/wiki/Using-Nova-Reusable-Build-Workflows
name: Build Linux Wheels

on:
  pull_request:
    paths:
      - build/packaging/**
      - .github/workflows/build_wheels_linux.yml
  push:
    branches:
      - nightly
      - main
      - release/*
    tags:
      # NOTE: Binary build pipelines should only get triggered on release candidate builds
      # Release candidate tags look like: v1.11.0-rc1
      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
  schedule:
    - cron: '0 0 * * *'  # Runs at midnight UTC every day
  workflow_dispatch:

jobs:
  generate-matrix:
    uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
    with:
      package-type: wheel
      os: linux
      with-cpu: enable
      with-cuda: enable
      with-rocm: disable

  build:
    needs: generate-matrix
    permissions:
      id-token: write
      contents: read
    uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
    with:
      # Set the ref to an empty string instead of the default nightly because
      # torchao doesn't have a nightly branch set up yet; instead, the build is
      # triggered daily from main on a schedule
      repository: pytorch/ao
      ref: ""
      build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
      env-var-script: packaging/env_var_script_linux.sh
      pre-script: packaging/pre_build_script.sh
      post-script: packaging/post_build_script.sh
      smoke-test-script: packaging/smoke_test.py
      package-name: torchao
      trigger-event: ${{ github.event_name }}
      # This is the CUDA version to be uploaded to the torchao-nightly PyPI index
      upload-to-pypi: cu121
    secrets:
      PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
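For illustration, the release-candidate tag filter above uses GitHub's glob-style pattern syntax; a rough regex translation (an approximation for sanity-checking, not GitHub's actual matcher) shows which tags would trigger a binary build:

import re

# Approximate regex form of the workflow's tag filter v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
rc_tag = re.compile(r"^v[0-9]+\.[0-9]+\.[0-9]+-rc[0-9]+$")

for tag in ["v1.11.0-rc1", "v1.11.0", "v0.2.0-rc3", "nightly"]:
    print(tag, bool(rc_tag.match(tag)))  # only the -rcN tags match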
@@ -0,0 +1,82 @@
import torch
import torchao
from torch.utils.benchmark import Timer
import pandas as pd
from tqdm import tqdm


def benchmark(m, k, n, splitK):
    # Randomly initialize each byte. randint's (exclusive) upper bound is set to the max value of uint32_t.
    fp6_weight = torch.randint(4294967295, (n, k // 16 * 3)).to(torch.int)
    fp16_scale = torch.rand(n).half() + 0.5
    fp16_activation = torch.rand(m, k).half() + 0.5
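    # Why k // 16 * 3 int32 columns for the FP6 weight above: each FP6 value
    # is 6 bits, so 16 values pack into 96 bits, i.e. three 32-bit words
    # (a sanity check on the layout, assuming k is divisible by 16):
    #   k * 6 / 32 == k // 16 * 3, e.g. k = 8192 -> 1536 columns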
    fp6_weight_packed = torchao.ops.prepack_fp6_weight(fp6_weight)
    act_cuda = fp16_activation.cuda()
    weight_cuda = fp6_weight_packed.cuda()
    scale_cuda = fp16_scale.cuda()

    # need to do this since Timer cannot see torchao
    def fp6_linear(act_cuda, weight_cuda, scale_cuda, splitK):
        return torchao.ops.fp16act_fp6weight_linear(act_cuda, weight_cuda, scale_cuda, splitK)

    fp6_output = fp6_linear(act_cuda, weight_cuda, scale_cuda, splitK)

    fp6_measurement = Timer(
        stmt="fp6_linear(act_cuda, weight_cuda, scale_cuda, splitK)",
        globals=locals(),
    ).blocked_autorange()
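    # blocked_autorange() runs the statement repeatedly and returns a
    # Measurement; its .median is in seconds, hence the * 1000 for ms below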
    fp16_weight = torchao.ops.fp6_weight_dequant(fp6_weight, fp16_scale).cuda()
    fp16_output = act_cuda @ fp16_weight.T

    fp16_measurement = Timer(
        stmt="act_cuda @ fp16_weight.T",
        globals=locals(),
    ).blocked_autorange()

    # follows https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/tests/python/kernel_test.py,
    # though this doesn't seem to be the right way to check for correctness
    correct = (fp6_output - fp16_output).abs().mean() / fp16_output.abs().mean() < 1e-3
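    # i.e. a mean relative error check: mean(|y_fp6 - y_fp16|) / mean(|y_fp16|) < 1e-3,
    # a single aggregate tolerance rather than an elementwise allclose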
    return {
        "m": m,
        "k": k,
        "n": n,
        "fp6_latency (ms)": fp6_measurement.median * 1000,
        "fp16_latency (ms)": fp16_measurement.median * 1000,
        "speedup (d/s)": fp16_measurement.median / fp6_measurement.median,
        "correct": correct,
    }


if __name__ == "__main__":
    # from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/tests/python/run.sh
    k_vals = (8192, 8192, 8192, 28672)
    n_vals = (10240, 8192, 57344, 8192)

    results = []

    # splitK can be tuned based on m, k, n
    for m, splitK_vals in tqdm([
        (1, (5, 6, 7, 6)),
        (2, (5, 6, 7, 6)),
        (4, (5, 6, 7, 6)),
        (8, (5, 6, 7, 6)),
        # (16, (5, 6, 7, 6)),
        # (64, (5, 6, 7, 6)),
        # (128, (5, 3, 3, 3)),
        # (256, (4, 3, 2, 3)),
        # (512, (2, 5, 2, 4)),
        (1024, (1, 2, 1, 2)),
        (2048, (1, 1, 1, 1)),
        (4096, (1, 1, 1, 1)),
        # (8192, (1, 1, 1, 1)),
        # (16384, (1, 1, 1, 1)),
    ]):
        for n, k, splitK in zip(n_vals, k_vals, splitK_vals):
            # pass k and n in the order the benchmark() signature expects
            results.append(benchmark(m, k, n, splitK))

    df = pd.DataFrame(results)
    df.to_csv("fp6_benchmark_results.csv", index=False)
    print(df.to_markdown(index=False))
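As a usage note: running the script end to end requires a CUDA device and a torchao build that includes the FP6 kernels; the results can then be reloaded from the CSV it writes, for example:

import pandas as pd

# Reload the results written by the benchmark script above
df = pd.read_csv("fp6_benchmark_results.csv")
# Largest FP6-over-FP16 speedups first
print(df.sort_values("speedup (d/s)", ascending=False).to_markdown(index=False))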