Commit

Update
[ghstack-poisoned]
vkuzo committed Oct 4, 2024
2 parents ac6f768 + 5dd0132 commit 1f01df9
Showing 16 changed files with 1,023 additions and 437 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/regression_test.yml
@@ -38,12 +38,12 @@ jobs:
torch-spec: 'torch==2.4.0'
gpu-arch-type: "cuda"
gpu-arch-version: "12.1"
-- name: CUDA Nightly
+- name: CUDA Nightly (Oct 1)
  runs-on: linux.g5.12xlarge.nvidia.gpu
- torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121'
+ torch-spec: '--pre torch==2.6.0.dev20241001+cu121 --index-url https://download.pytorch.org/whl/nightly/cu121'
gpu-arch-type: "cuda"
gpu-arch-version: "12.1"

- name: CPU 2.2.2
runs-on: linux.4xlarge
torch-spec: 'torch==2.2.2 --index-url https://download.pytorch.org/whl/cpu "numpy<2" '
9 changes: 7 additions & 2 deletions README.md
@@ -170,14 +170,19 @@ For *most* developers you probably want to skip building custom C++/CUDA extensions
USE_CPP=0 pip install -e .
```

-## Integrations
+## OSS Integrations

We're also fortunate to be integrated into some of the leading open-source libraries, including:
1. Hugging Face transformers with a [builtin inference backend](https://huggingface.co/docs/transformers/main/quantization/torchao) and [low bit optimizers](https://github.com/huggingface/transformers/pull/31865) (a minimal usage sketch follows this list)
-2. Hugging Face diffusers best practices with torch.compile and torchao [standalone repo](https://github.com/sayakpaul/diffusers-torchao)
+2. Hugging Face diffusers best practices with torch.compile and torchao in a standalone repo [diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao)
3. Mobius HQQ backend leveraged our int4 kernels to get [195 tok/s on a 4090](https://github.com/mobiusml/hqq#faster-inference)
4. [TorchTune](https://github.com/pytorch/torchtune) for our QLoRA and QAT recipes
5. [torchchat](https://github.com/pytorch/torchchat) for post-training quantization
6. [SGLang](https://github.com/sgl-project/sglang/pull/1341) for LLM inference quantization
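
A minimal sketch of the transformers integration from item 1, assuming a recent `transformers` release with torchao support installed (the model id below is only a placeholder):

```python
import torch
from transformers import AutoModelForCausalLM, TorchAoConfig

# int4 weight-only quantization via torchao, applied while the checkpoint loads
quant_config = TorchAoConfig("int4_weight_only", group_size=128)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",  # placeholder model id
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quant_config,
)
```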

## Videos
* [Keynote talk at GPU MODE IRL](https://youtu.be/FH5wiwOyPX4?si=VZK22hHz25GRzBG1&t=1009)
* [Low precision dtypes at PyTorch conference](https://youtu.be/xcKwEZ77Cps?si=7BS6cXMGgYtFlnrA)
* [Slaying OOMs at the Mastering LLMs course](https://www.youtube.com/watch?v=UvRl4ansfCg)
* [Advanced Quantization at CUDA MODE](https://youtu.be/1u9xUK3G4VM?si=4JcPlw2w8chPXW8J)
* [Chip Huyen's GPU Optimization Workshop](https://www.youtube.com/live/v_q2JTIqE20?si=mf7HeZ63rS-uYpS6)
1 change: 1 addition & 0 deletions test/prototype/test_quantized_training.py
@@ -188,6 +188,7 @@ def snr(ref, actual):
assert snr(inputs_ref.grad, inputs_int8mp.grad) > 20
assert snr(linear.weight.grad, linear_int8mp.weight.grad) > 20

+@pytest.mark.skip('Flaky on CI')
@parametrize("compile", [False, True])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_bitnet_training(self, compile):
6 changes: 4 additions & 2 deletions test/quantization/test_quant_primitives.py
@@ -202,14 +202,16 @@ def test_choose_qparams_group_sym_no_clipping_err(self):
self.assertTrue(torch.equal(scale, scale_ref))
self.assertTrue(torch.equal(zero_point, zp_ref))
@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_3, "skipping when torch version is lower than 2.3")
+@unittest.skipIf(TORCH_VERSION_AT_LEAST_2_6, "skipping when torch version is 2.6 or higher")
@unittest.skipIf(is_fbcode(), "broken in fbcode")
def test_choose_qparams_token_asym(self):
input = torch.randn(10, 10)
mapping_type = MappingType.ASYMMETRIC
dtype = torch.int8
block_size = (1, 10)
-scale, zero_point = choose_qparams_affine(input, mapping_type, block_size, dtype, eps=torch.finfo(torch.float32).eps)
+if TORCH_VERSION_AT_LEAST_2_6:
+    scale, zero_point = choose_qparams_affine(input, mapping_type, block_size, dtype, eps=torch.finfo(torch.float32).eps, scale_dtype=torch.float64, zero_point_dtype=torch.int64)
+else:
+    scale, zero_point = choose_qparams_affine(input, mapping_type, block_size, dtype, eps=torch.finfo(torch.float32).eps)

scale_ref, zp_ref = torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric(input, dtype)
scale_ref = scale_ref.squeeze()
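
For reference, the per-token asymmetric call exercised by the hunk above can be run standalone. A minimal sketch, assuming a recent torchao where `choose_qparams_affine` accepts the explicit `scale_dtype`/`zero_point_dtype` arguments the test now passes:

```python
import torch
from torchao.quantization.quant_primitives import MappingType, choose_qparams_affine

x = torch.randn(10, 10)

# block_size (1, 10) on a (10, 10) input means one quantization group per token (row)
scale, zero_point = choose_qparams_affine(
    x,
    MappingType.ASYMMETRIC,
    (1, 10),      # block_size
    torch.int8,   # target quantized dtype
    eps=torch.finfo(torch.float32).eps,
    scale_dtype=torch.float64,     # explicit dtypes, matching the reference
    zero_point_dtype=torch.int64,  # quantized_decomposed op on torch >= 2.6
)

print(scale.shape, scale.dtype, zero_point.dtype)  # one (scale, zero_point) pair per token row
```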
(12 more changed files not shown)
