
Merge branch 'pytorch:main' into main
weifengpy authored May 28, 2024
2 parents f27760b + 42c2376 commit e5093d5
Showing 105 changed files with 12,321 additions and 1,067 deletions.
54 changes: 54 additions & 0 deletions .github/workflows/build_wheels_linux.yml
@@ -0,0 +1,54 @@
# From https://github.com/pytorch/test-infra/wiki/Using-Nova-Reusable-Build-Workflows
name: Build Linux Wheels

on:
pull_request:
paths:
- build/packaging/**
- .github/workflows/build_wheels_linux.yml
push:
branches:
- nightly
- main
- release/*
tags:
# NOTE: Binary build pipelines should only get triggered on release candidate builds
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
schedule:
- cron: '0 0 * * *' # Runs at midnight UTC every day
workflow_dispatch:

jobs:
generate-matrix:
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
with:
package-type: wheel
os: linux
with-cpu: enable
with-cuda: enable
with-rocm: disable

build:
needs: generate-matrix
permissions:
id-token: write
contents: read
uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
with:
      # Set the ref to an empty string instead of the default nightly because
      # torchao doesn't have a nightly branch set up yet; instead, the build is
      # triggered daily from main on a schedule
repository: pytorch/ao
ref: ""
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
env-var-script: packaging/env_var_script_linux.sh
pre-script: packaging/pre_build_script.sh
post-script: packaging/post_build_script.sh
smoke-test-script: packaging/smoke_test.py
package-name: torchao
trigger-event: ${{ github.event_name }}
# This is the CUDA version to be uploaded to torchao-nightly pypi
upload-to-pypi: cu121
secrets:
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
1 change: 1 addition & 0 deletions .github/workflows/doc_build.yml
@@ -41,6 +41,7 @@ jobs:
run: |
python -m pip install torch
python -m pip install -e .
pip install -r dev-requirements.txt
cd docs
python -m pip install -r requirements.txt
- name: Build docs
31 changes: 0 additions & 31 deletions .github/workflows/nightly-build.yml

This file was deleted.

13 changes: 9 additions & 4 deletions .github/workflows/regression_test.yml
@@ -31,9 +31,9 @@ jobs:
torch-spec: 'torch==2.3.0'
gpu-arch-type: "cuda"
gpu-arch-version: "12.1"
- name: CUDA 2.4.0.dev20240421
- name: CUDA Nightly
runs-on: linux.g5.12xlarge.nvidia.gpu
torch-spec: '--pre torch==2.4.0.dev20240421+cu121 --index-url https://download.pytorch.org/whl/nightly/cu121'
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121'
gpu-arch-type: "cuda"
gpu-arch-version: "12.1"
- name: CPU 2.2.2
@@ -46,7 +46,7 @@
torch-spec: 'torch==2.3.0 --index-url https://download.pytorch.org/whl/cpu'
gpu-arch-type: "cpu"
gpu-arch-version: ""
- name: Nightly CPU
- name: CPU Nightly
runs-on: linux.4xlarge
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cpu'
gpu-arch-type: "cpu"
@@ -58,9 +58,14 @@
gpu-arch-type: ${{ matrix.gpu-arch-type }}
gpu-arch-version: ${{ matrix.gpu-arch-version }}
script: |
conda create -n venv python=3.9 -y
conda activate venv
echo "::group::Install newer objcopy that supports --set-section-alignment"
yum install -y devtoolset-10-binutils
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
python -m pip install --upgrade pip
pip install ${{ matrix.torch-spec }}
pip install -r requirements.txt
pip install -r dev-requirements.txt
python setup.py install
pip install .
pytest test --verbose -s
36 changes: 25 additions & 11 deletions README.md
@@ -1,6 +1,6 @@
# torchao: PyTorch Architecture Optimization

[![](https://dcbadge.vercel.app/api/server/cudamode?style=flat)](discord.gg/cudamode)
[![](https://dcbadge.vercel.app/api/server/cudamode?style=flat)](https://discord.gg/cudamode)

This repository is currently under heavy development; if you have suggestions on the API or use cases you'd like to be covered, please open an [issue](https://github.com/pytorch/ao/issues)

@@ -27,9 +27,25 @@ From source
```Shell
git clone https://github.com/pytorch/ao
cd ao
pip install .
pip install -r requirements.txt
pip install -r dev-requirements.txt
```

There are two options:

If you plan to develop the library, run:
```Shell
python setup.py develop
```

If you want to install from source, run:
```Shell
python setup.py install
```

**Note:** Since we build PyTorch C++/CUDA extensions by default, running `pip install .` will not work.

### Quantization

```python
@@ -44,12 +60,9 @@ torch._inductor.config.use_mixed_mm = True
model = torch.nn.Sequential(torch.nn.Linear(32, 64)).cuda().to(torch.bfloat16)
input = torch.randn(32,32, dtype=torch.bfloat16, device='cuda')

# perform autoquantization
torchao.autoquant(model, (input))

# compile the model to recover performance
model = torch.compile(model, mode='max-autotune')
model(input)
# perform autoquantization and compilation
q_model = torchao.autoquant(torch.compile(model, mode='max-autotune'))
q_model(input)
```

### Sparsity
@@ -100,6 +113,7 @@ To learn more try out our APIs, you can check out API examples in
3. Support for lower precision [dtypes](./torchao/dtypes) such as
- [nf4](https://github.com/pytorch/ao/blob/main/torchao/dtypes/nf4tensor.py) which was used to [implement QLoRA](https://github.com/pytorch/torchtune/blob/main/docs/source/tutorials/qlora_finetune.rst) without writing custom Triton or CUDA code
- [uint4](https://github.com/pytorch/ao/blob/main/torchao/dtypes/uint4.py)
- [MX](https://github.com/pytorch/ao/blob/main/torchao/prototype/mx_formats) implementing training and inference support with tensors using the [OCP MX spec](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) data types, which can be described as groupwise scaled float8/float6/float4/int8, with the scales being constrained to powers of two. This work is prototype as the hardware support is not available yet.
4. [Bleeding Edge Kernels](./torchao/prototype/) for experimental kernels without backwards compatibility guarantees
- [GaLore](https://github.com/pytorch/ao/tree/main/torchao/prototype/galore) for memory efficient finetuning
- [fused HQQ Gemm Kernel](https://github.com/pytorch/ao/tree/main/torchao/prototype/hqq) for compute bound workloads
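
As a quick illustration of the nf4 dtype listed above, here is a minimal sketch of quantizing a weight tensor and dequantizing it back. The `to_nf4` import path, the block sizes, and `get_original_weight()` are assumptions based on the linked `nf4tensor.py`, not a documented API.

```python
import torch
from torchao.dtypes.nf4tensor import to_nf4  # assumed import path, per the nf4tensor.py linked above

# Quantize a bf16 weight to nf4 (the block sizes below are illustrative assumptions).
weight = torch.randn(256, 256, dtype=torch.bfloat16)
nf4_weight = to_nf4(weight, block_size=64, scaler_block_size=256)

# Dequantize back to bf16 to inspect the round-trip error.
restored = nf4_weight.get_original_weight()
print((restored - weight).abs().mean())
```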
@@ -123,9 +137,9 @@ torchao has been integrated with other libraries including
## Success stories
Our kernels have been used to achieve SOTA inference performance on

* Image segmentation models with [sam-fast](pytorch.org/blog/accelerating-generative-ai)
* Language models with [gpt-fast](pytorch.org/blog/accelerating-generative-ai-2)
* Diffusion models with [sd-fast](pytorch.org/blog/accelerating-generative-ai-3)
* Image segmentation models with [sam-fast](https://pytorch.org/blog/accelerating-generative-ai)
* Language models with [gpt-fast](https://pytorch.org/blog/accelerating-generative-ai-2)
* Diffusion models with [sd-fast](https://pytorch.org/blog/accelerating-generative-ai-3)

## License

82 changes: 82 additions & 0 deletions benchmarks/benchmark_fp6.py
@@ -0,0 +1,82 @@
import torch
import torchao
from torch.utils.benchmark import Timer
import pandas as pd
from tqdm import tqdm


def benchmark(m, k, n, splitK):
    # Randomly initialize each byte. The highest value for randint() is set to the max value of uint32_t.
fp6_weight = torch.randint(4294967295, (n, k // 16 * 3)).to(torch.int)
fp16_scale = torch.rand(n).half() + 0.5
fp16_activation = torch.rand(m, k).half() + 0.5

fp6_weight_packed = torchao.ops.prepack_fp6_weight(fp6_weight)
act_cuda = fp16_activation.cuda()
weight_cuda = fp6_weight_packed.cuda()
scale_cuda = fp16_scale.cuda()

# need to do this since Timer cannot see torchao
def fp6_linear(act_cuda, weight_cuda, scale_cuda, splitK):
return torchao.ops.fp16act_fp6weight_linear(act_cuda, weight_cuda, scale_cuda, splitK)

fp6_output = fp6_linear(act_cuda, weight_cuda, scale_cuda, splitK)

fp6_measurement = Timer(
stmt="fp6_linear(act_cuda, weight_cuda, scale_cuda, splitK)",
globals=locals(),
).blocked_autorange()

fp16_weight = torchao.ops.fp6_weight_dequant(fp6_weight, fp16_scale).cuda()
fp16_output = act_cuda @ fp16_weight.T

fp16_measurement = Timer(
stmt="act_cuda @ fp16_weight.T",
globals=locals(),
).blocked_autorange()

# follow https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/tests/python/kernel_test.py
# doesn't seem to be the right way to check for correctness
correct = (fp6_output - fp16_output).abs().mean() / fp16_output.abs().mean() < 1e-3

return {
"m": m,
"k": k,
"n": n,
"fp6_latency (ms)": fp6_measurement.median * 1000,
"fp16_latency (ms)": fp16_measurement.median * 1000,
"speedup (d/s)": fp16_measurement.median / fp6_measurement.median,
"correct": correct,
}


if __name__ == "__main__":
# from https://github.com/usyd-fsalab/fp6_llm/blob/ce76774bcfc26b325c1b558abcf1935026d9abbc/tests/python/run.sh
k_vals = (8192, 8192, 8192, 28672)
n_vals = (10240, 8192, 57344, 8192)

results = []

# splitK can be tuned based on m, k, n
for m, splitK_vals in tqdm([
(1, (5, 6, 7, 6)),
(2, (5, 6, 7, 6)),
(4, (5, 6, 7, 6)),
(8, (5, 6, 7, 6)),
# (16, (5, 6, 7, 6)),
# (64, (5, 6, 7, 6)),
# (128, (5, 3, 3, 3)),
# (256, (4, 3, 2, 3)),
# (512, (2, 5, 2, 4)),
(1024, (1, 2, 1, 2)),
(2048, (1, 1, 1, 1)),
(4096, (1, 1, 1, 1)),
# (8192, (1, 1, 1, 1)),
# (16384, (1, 1, 1, 1)),
]):
for n, k, splitK in zip(n_vals, k_vals, splitK_vals):
results.append(benchmark(m, n, k, splitK))

df = pd.DataFrame(results)
df.to_csv("fp6_benchmark_results.csv", index=False)
print(df.to_markdown(index=False))
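
The comment in `benchmark()` above notes that the correctness check may not be the right approach; below is a small, self-contained sketch of a per-element relative-error check as one alternative. The tolerance and the clamp floor are illustrative assumptions, not values from this commit.

```python
import torch

# Stand-ins for the benchmark's fp6_output / fp16_output (hypothetical data).
fp16_output = torch.rand(64, 64, dtype=torch.half) + 0.5
fp6_output = fp16_output + torch.randn_like(fp16_output) * 1e-4

# Per-element relative error flags localized mismatches that a ratio of means can average away.
rel_err = (fp6_output - fp16_output).abs() / fp16_output.abs().clamp(min=1e-6)
correct = bool(rel_err.max() < 1e-2)
print(correct)
```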