diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 5dbafee9f..a6300211f 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1055,7 +1055,54 @@ jobs: ./runner/build_android.sh echo "Tests complete." - test-torchao-experimental: + test-torchao-experimental-python: + strategy: + matrix: + runner: [macos-14-xlarge] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + submodules: true + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.10.11 + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Print machine info + run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - name: Install torchchat + run: | + echo "Installing pip3 packages" + ./install/install_requirements.sh + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + - name: Run inference + run: | + python torchchat.py download stories110M + wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model + export PRMT="Once upon a time in a land far away" + echo "Generate eager" + python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' + echo "Generate compile" + python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile + echo "Export AOTI" + python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' + echo "Generate AOTI" + python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}" + echo "Tests complete."
+ + test-torchao-experimental-cpp: strategy: matrix: runner: [macos-14-xlarge] runs-on: ${{matrix.runner}} steps: @@ -1109,18 +1156,12 @@ jobs: python torchchat.py download stories110M wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model export PRMT="Once upon a time in a land far away" - echo "Generate eager" - python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' - echo "Generate compile" - python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile echo "Export and run ET (C++ runner)" python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}" echo "Export and run AOTI (C++ runner)" python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}" - echo "Generate AOTI" - python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}" echo "Tests complete." test-torchao-experimental-mps: diff --git a/docs/quantization.md b/docs/quantization.md index 704a7ed6a..d1de63b14 100644 --- a/docs/quantization.md +++ b/docs/quantization.md @@ -120,13 +120,15 @@ python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my n ## Experimental TorchAO lowbit kernels -WARNING: These kernels only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon. +If you are on a Mac with Apple Silicon, we have 1-8 bit quantization available for embedding and linear layers, backed by CPU and MPS kernels. + +The CPU kernels are installed automatically by the torchchat install script and can be used out of the box. To use the MPS kernels, follow the setup instructions below. ### Use #### linear:a8wxdq The quantization scheme linear:a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize. -It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false). +It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7, 8), groupsize (-1 if channelwise desired), and has_weight_zeros (true, false). The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true). Roughly speaking, {bitwidth: 4, groupsize: 32, has_weight_zeros: false} is similar to GGML's Q4_0 quantization scheme. @@ -138,7 +140,9 @@ The quantization scheme embedding:wx quantizes embeddings in a groupwise manner You should expect high performance on ARM CPU if groupsize is divisible by 32. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization. ### Setup -To use linear:a8wxdq and embedding:wx, you must set up the torchao experimental kernels.
These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon. +If you are using the torchao ops from Python (i.e., not with a C++ runner), they are available out of the box on a Mac with Apple Silicon, and you can skip these setup steps. + +If you plan to use the kernels from the AOTI/ExecuTorch C++ runners, follow the setup steps below. From the torchchat root directory, run ``` @@ -147,7 +151,7 @@ bash torchchat/utils/scripts/build_torchao_ops.sh This should take about 10 seconds to complete. -Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts the build the runners. +When building the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts that build the runners. ``` bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops @@ -175,8 +179,8 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype fl #### AOTI ``` -OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-dso llama3_1.so -OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so --prompt "Once upon a time," --num-samples 5 +OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-aoti-package-path llama3_1.pt2 +OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --aoti-package-path llama3_1.pt2 --prompt "Once upon a time," --num-samples 5 ``` If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner: diff --git a/install/.pins/et-pin.txt b/install/.pins/et-pin.txt index e79e9c341..640cd889c 100644 --- a/install/.pins/et-pin.txt +++ b/install/.pins/et-pin.txt @@ -1 +1 @@ -9c043290ad3944268290e015c3063bc411e6ef6b +9836b39fe690e1906f133b4a233863149c30d499 diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 360ba1801..35a6967a9 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -117,9 +117,11 @@ fi # For torchao need to install from github since nightly build doesn't have macos build.
# TODO: Remove this and install nightly build, once it supports macos +# USE_CPP=1 indicates that the torchao experimental aten kernels will be built and loaded +# if on Mac with Apple Silicon ( set -x - $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@2f97b0955953fa1a46594a27f0df2bc48d93e79d + USE_CPP=1 $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@11333ba2cb5c4e792bc4f5c0d70c12991f972008 ) if [[ -x "$(command -v nvidia-smi)" ]]; then diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index b90d098b3..15736b035 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -50,6 +50,18 @@ state_dict_device, use_et_backend, ) +from torchao.experimental.packed_linear_int8_dynamic_activation_intx_weight_layout import ( + PackedLinearInt8DynamicActivationIntxWeightLayout, +) +from torchao.experimental.quant_api import ( + int8_dynamic_activation_intx_weight, + IntxWeightEmbeddingQuantizer, +) +from torchao.quantization.granularity import ( + PerGroup, + PerRow, +) +from torchao.dtypes import PlainLayout # Flag for whether the a8wxdq quantizer is available. @@ -117,7 +129,45 @@ def quantize_model( unwrap_tensor_subclass(model) continue - if quantizer in ["linear:a8wxdq", "embedding:wx"]: + if quantizer == "linear:a8wxdq": + if get_precision() != torch.float32: + print(f"Quantizer {quantizer} requires float32 inputs, but received {get_precision()}. Changing dtype to float32. Note that after quantization, the weights will be lowbit integers, not float32.") + set_precision(torch.float32) + + group_size = q_kwargs["groupsize"] + bit_width = q_kwargs["bitwidth"] + has_weight_zeros = q_kwargs["has_weight_zeros"] + granularity = PerRow() if group_size == -1 else PerGroup(group_size) + weight_dtype = getattr(torch, f"int{bit_width}") + + try: + quantize_( + model, + int8_dynamic_activation_intx_weight( + weight_dtype=weight_dtype, + granularity=granularity, + has_weight_zeros=has_weight_zeros, + layout=PackedLinearInt8DynamicActivationIntxWeightLayout(), + ), + ) + except Exception as e: + print(f"Encountered error during quantization: {e}") + print("Trying with PlainLayout") + quantize_( + model, + int8_dynamic_activation_intx_weight( + weight_dtype=weight_dtype, + granularity=granularity, + has_weight_zeros=has_weight_zeros, + layout=PlainLayout(), + ), + ) + + if not support_tensor_subclass: + unwrap_tensor_subclass(model) + continue + + if quantizer == "embedding:wx": # These quantizers require float32 input weights.
Note that after quantization, # the weights will no longer be float32, but lowbit integers if get_precision() != torch.float32: @@ -889,10 +939,12 @@ def quantized_model(self) -> nn.Module: # class references quantizer_class_dict = { "embedding": EmbeddingOnlyQuantHandler, + "embedding:wx": IntxWeightEmbeddingQuantizer, "linear:int8": WeightOnlyInt8QuantHandler, "precision": PrecisionHandler, "executor": ExecutorHandler, "linear:int4": Int4WeightOnlyQuantizer, + "linear:a8wxdq": None, # uses quantize_ API "linear:a8w4dq": Int8DynActInt4WeightQuantizer, } @@ -915,27 +967,10 @@ def quantized_model(self) -> nn.Module: torchao_experimental_quant_api_spec.loader.exec_module( torchao_experimental_quant_api ) - from torchao_experimental_quant_api import ( - Int8DynActIntxWeightLinearQuantizer, - IntxWeightEmbeddingQuantizer, - UIntxWeightOnlyLinearQuantizer, - ) - - quantizer_class_dict["linear:a8wxdq"] = Int8DynActIntxWeightLinearQuantizer - quantizer_class_dict["embedding:wx"] = IntxWeightEmbeddingQuantizer + from torchao_experimental_quant_api import UIntxWeightOnlyLinearQuantizer quantizer_class_dict["linear:afpwx"] = UIntxWeightOnlyLinearQuantizer # Try loading custom op - try: - import glob - - libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*") - libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs)) - torch.ops.load_library(libs[0]) - print("Loaded torchao cpu ops.") - except Exception as e: - print("Unable to load torchao cpu ops library. Slow fallback kernels will be used.") - try: libname = "libtorchao_ops_mps_aten.dylib" libpath = f"{torchao_build_path}/cmake-out/lib/{libname}"