Skip to content

Commit

Permalink
Build Zoom wheel
Browse files Browse the repository at this point in the history
  • Loading branch information
makslevental committed Jan 22, 2025
1 parent aaef6b9 commit 7fb906f
Show file tree
Hide file tree
Showing 5 changed files with 252 additions and 11 deletions.
141 changes: 141 additions & 0 deletions .github/workflows/build_zoom_backend.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Builds the Zoom-backend PyTorch wheel, repairs it with auditwheel, runs the
# Zoom smoke/device tests, uploads the wheel as a workflow artifact and
# publishes it under the rolling `latest` release.
name: "Build Zoom wheel"

on:
  # Manual runs can opt into an interactive tmate debugging session.
  workflow_dispatch:
    inputs:
      force_debug_with_tmate:
        type: boolean
        description: 'Run the build with tmate session'
        required: false
        default: false
      debug_with_tmate:
        type: boolean
        description: 'Run the build with a tmate session ONLY in case of failure'
        required: false
        default: false
  pull_request:
  push:
    branches:
      - main

# One live run per pull request (or per commit for non-PR events); a newer run
# in the same group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
  cancel-in-progress: true

jobs:
  build:

    strategy:
      fail-fast: false
      matrix:
        include:
          - name: "ubuntu-22.04"
            runs-on: "azure-cpubuilder-linux-scale"
            # runs-on: "mi300"
            # container: "rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0"

    runs-on: ${{ matrix.runs-on }}

    name: ${{ matrix.name }}

    env:
      # Directory used as CCACHE_DIR by the build and persisted by the cache steps.
      CACHE_DIR: ${{ github.workspace }}/.container-cache
      # either the PR number or `branch-N` where N always increments
      CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }}

    defaults:
      run:
        shell: bash

    permissions:
      id-token: write
      # Needed by the release step to create/update the `latest` release.
      contents: write

    # NOTE(review): `matrix.container` is currently commented out in the matrix,
    # so this expression evaluates to an empty value; presumably the job then
    # runs directly on the runner - confirm.
    container:
      image: ${{ matrix.container }}

    steps:
      # NOTE(review): the action version was garbled in the scraped source
      # ("[email protected]"); reconstructed as v4.2.2 - confirm against the repository.
      - name: "Check out repository"
        uses: actions/checkout@v4.2.2
        with:
          submodules: true

      # CACHE_KEY is unique per run, so the restore relies on the prefix match
      # in `restore-keys` to pick up an earlier cache.
      - name: Enable cache
        uses: actions/cache/restore@v3
        with:
          path: ${{ env.CACHE_DIR }}
          key: ${{ env.CACHE_KEY }}
          restore-keys: linux-build-test-cpp-

      - name: "Build PyTorch"
        id: build
        run: |
          # NOTE(review): this downloads and executes a script from a third-party
          # branch head; consider pinning it to a commit.
          # --fail makes an HTTP error abort the step instead of saving the error page.
          curl -fsSL https://raw.githubusercontent.com/mrodden/get-rocm/refs/heads/master/get-rocm.py -o get-rocm.py
          python3.11 get-rocm.py --rocm-version 6.2.3
          # Route C/C++ compilation through ccache backed by the cached directory.
          export CCACHE_DIR="${{ env.CACHE_DIR }}"
          export CMAKE_C_COMPILER_LAUNCHER=ccache
          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
          export CCACHE_SLOPPINESS=include_file_ctime,include_file_mtime,time_macros
          python3.11 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt
          ./build.sh

      - name: "Audit"
        id: audit
        run: |
          # Non-interactive install: refresh the package index and never prompt.
          sudo apt-get update
          sudo apt-get install -y patchelf
          source venv/bin/activate
          pip install auditwheel
          auditwheel repair -w dist --plat manylinux_2_39_x86_64 dist/torch*

      - name: "Test"
        id: test
        run: |
          # Every `run` step starts a fresh shell, so the venv created by the
          # build step must be re-activated here (as the Audit step does).
          # NOTE(review): build.sh only produces a wheel; confirm that torch is
          # importable from this venv or install the built wheel first.
          source venv/bin/activate
          # smoke test
          python zoom_extension/examples/test.py
          # device tests
          PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh
          cat zoom_test_errors.log
          cat zoom_unimplemented_operators.log

      # Saved even when an earlier step failed so the ccache contents are kept.
      - name: Save cache
        uses: actions/cache/save@v3
        if: ${{ !cancelled() }}
        with:
          path: ${{ env.CACHE_DIR }}
          key: ${{ env.CACHE_KEY }}

      - name: Upload artifacts
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.name }}_artifact
          path: dist
          if-no-files-found: warn

      # Pull-request builds must not overwrite the rolling `latest` release, so
      # publishing is limited to push and manual runs.
      # NOTE(review): the action version was garbled in the scraped source;
      # reconstructed as v1.12.0 - confirm against the repository.
      - name: Release current commit
        if: ${{ !cancelled() && github.event_name != 'pull_request' }}
        uses: ncipollo/release-action@v1.12.0
        with:
          artifacts: "dist/torch*.whl"
          token: "${{ secrets.GITHUB_TOKEN }}"
          tag: "latest"
          name: "latest"
          removeArtifacts: false
          allowUpdates: true
          replacesArtifacts: true
          makeLatest: true

      # Opens a tmate session either unconditionally (force_debug_with_tmate)
      # or only after a failure (debug_with_tmate).
      # NOTE(review): the action version was garbled in the scraped source;
      # reconstructed as v3.18 - confirm against the repository.
      - name: "Setup tmate session"
        if: ${{ (failure() && inputs.debug_with_tmate) || inputs.force_debug_with_tmate }}
        uses: mxschmitt/action-tmate@v3.18
        with:
          limit-access-to-actor: true
          install-dependencies: ${{ startsWith(matrix.runs-on, 'macos') || startsWith(matrix.runs-on, 'windows') }}
18 changes: 7 additions & 11 deletions build.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
#!/bin/bash

rm -rf build
git clean -fdx -e .idea
git clean -fdX -e .idea


export USE_ZOOM=1
export USE_ROCM=0
Expand Down Expand Up @@ -118,13 +115,12 @@ export USE_VULKAN_FP16_INFERENCE=0
# Spelled to match PyTorch's USE_VULKAN_RELAXED_PRECISION build option; the
# previous name ended in "PRECISI0" and therefore set an unrelated variable.
export USE_VULKAN_RELAXED_PRECISION=0
export USE_XNNPACK=0
export USE_XPU=0
export ONNX_ML=0

# for the ligerllama example we need distributed and tensorpipe, only because
# huggingface model.generate insists on querying torch.distributed and distributed relies on tensorpipe
# this could be a factor of nod-pytorch being out of date with upstream:
# https://github.com/pytorch/pytorch/issues/97397
export PYTORCH_ROCM_ARCH="gfx90a;gfx940;gfx941;gfx942;gfx1100;"
source venv/bin/activate
#python setup.py develop
python setup.py bdist_wheel

python setup.py develop
python zoom_extension/examples/test.py
PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh
python setup.py bdist_wheel
#python zoom_extension/examples/test.py
#PYTORCH_TEST_WITH_SLOW=1 TORCH_TEST_DEVICES=zoom_extension/test/pytorch_test_base.py ./test.sh
59 changes: 59 additions & 0 deletions test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash
#
# Runs the TestTorchDeviceTypePRIVATEUSEONE tests from test/test_torch.py,
# captures all output in test.log, and derives two summaries from that log:
#   zoom_unimplemented_operators.log - aten operators reported as not runnable
#                                      on the 'zoom' backend, with counts
#   zoom_test_errors.log             - error messages (first 100 characters),
#                                      with counts
# Previous versions of all three files are copied to *.bak first.
# The script's exit status is that of its final echo; it does not reflect
# whether the Python tests passed.

log_file="test.log"
bak_file="test.log.bak"
output_file="zoom_unimplemented_operators.log"
bak_out="zoom_unimplemented_operators.log.bak"
error_file="zoom_test_errors.log"
bak_err="zoom_test_errors.log.bak"

# backup logs
[ -f "$log_file" ] && cp "$log_file" "$bak_file"
[ -f "$output_file" ] && cp "$output_file" "$bak_out"
[ -f "$error_file" ] && cp "$error_file" "$bak_err"

python test/test_torch.py --run-parallel 0 -k TestTorchDeviceTypePRIVATEUSEONE --verbose &> "$log_file"
#python test/test_ops.py -k TestCommonPRIVATEUSEONE
#python test/test_ops.py -k TestCommonPRIVATEUSEONE.test_compare_cpu --verbose &> $log_file
#python test/test_ops.py -k TestCommonPRIVATEUSEONE.test_numpy_ref --verbose &> $log_file

## Find Unimplemented Operator Errors from failing tests
# Pattern to search for
pattern="Could not run 'aten::[^']*' with arguments from the 'zoom' backend"

# Extract aten operators, count frequencies, sort by frequency (descending), and save to output file
grep -oP "$pattern" "$log_file" |
    sed -n "s/.*'aten::\([^']*\)'.*/\1/p" |
    sort |
    uniq -c |
    sort -rn |
    sed 's/^ *//; s/ /\t/' > "$output_file"

# Count total matches (grep -c counts matching lines)
total_matches=$(grep -cP "$pattern" "$log_file")

# Append total matches to the output file
echo -e "\nTotal unimplemented operator failures: $total_matches" >> "$output_file"
echo "A list of unimplemented operators has been saved to $output_file"

## Find errors from failing tests
# Extract error messages, count frequencies, sort by frequency (descending), and save to output file
# Pattern to search for
pattern="^.*Error: (?!test)(.+?)(?=\n|$)"

# Strip the leading "<ExceptionType>Error: " prefix so entries are grouped by
# message. The previous expression used ( ) without -E, which sed treats as
# literal parentheses in basic-regex mode, so the prefix was never removed.
# [^:]* keeps any later "Error: " that is part of the message itself.
grep -oP "$pattern" "$log_file" |
    sed -E 's/^[^:]*Error: //' |
    awk '{print substr($0, 1, 100)}' |  # Limit to first 100 characters
    sort |
    uniq -c |
    sort -rn |
    sed 's/^ *//; s/ /\t/' > "$error_file"

# Count total matches (grep -c counts matching lines)
total_matches=$(grep -cP "$pattern" "$log_file")

# Append total matches to the output file
echo -e "\nTotal test errors failures: $total_matches" >> "$error_file"
echo "A list of test errors has been saved to $error_file"

echo "Test logs have been saved to $log_file"
10 changes: 10 additions & 0 deletions zoom_extension/examples/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import torch.zoom

# Smoke test: register the PrivateUse1 backend under the name "zoom", generate
# its convenience methods, then allocate and print one tensor on device zoom:0.
torch.utils.rename_privateuse1_backend("zoom")

# TODO: figure this out
unsupported_dtypes = None
torch.utils.generate_methods_for_privateuse1_backend(unsupported_dtype=unsupported_dtypes)

# Uninitialized int64 tensor of five elements; printing it exercises the
# device allocation path.
probe = torch.empty(5, device="zoom:0", dtype=torch.int64)
print(probe)
35 changes: 35 additions & 0 deletions zoom_extension/test/pytorch_test_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import torch
import torch.zoom
from typing import ClassVar

# Register the PrivateUse1 backend under the name 'zoom' and generate its
# convenience methods before the test base class below is defined.
torch.utils.rename_privateuse1_backend('zoom')
# None is passed through as `unsupported_dtype`, i.e. no dtype list is supplied.
unsupported_dtypes = None
torch.utils.generate_methods_for_privateuse1_backend(unsupported_dtype=unsupported_dtypes)

class ZoomTestBase(DeviceTypeTestBase):
    """Device-type test base for the 'zoom' backend.

    NOTE(review): ``DeviceTypeTestBase`` is not imported in this file;
    presumably it is supplied by whatever loads this module through the
    ``TORCH_TEST_DEVICES`` environment variable - confirm.
    """

    device_type = 'privateuseone'
    # Set by setUpClass to 'zoom:<index>' of the device current at that time.
    primary_device: ClassVar[str]

    @classmethod
    def get_primary_device(cls):
        """Return the device string recorded by ``setUpClass``."""
        return cls.primary_device

    @classmethod
    def get_all_devices(cls):
        """Return every zoom device string, with the primary device first."""
        primary = cls.get_primary_device()
        primary_index = int(primary.split(':')[1])
        device_total = torch.zoom.device_count()

        devices = [primary]
        for index in range(device_total):
            if index == primary_index:
                continue
            devices.append('zoom:{0}'.format(index))
        return devices

    @classmethod
    def setUpClass(cls):
        """Initialize the backend and record the current device as primary."""
        # Force Zoom Init
        _init_probe = torch.ones(1, device='zoom')
        # Acquires the current device as the primary (test) device
        cls.primary_device = f'zoom:{torch.zoom.current_device()}'


# Module-level handle to the test base class defined above.
TEST_CLASS = ZoomTestBase

0 comments on commit 7fb906f

Please sign in to comment.