diff --git a/tests/llmcompressor/transformers/compression/recipes/sparse_24_int8.yaml b/tests/llmcompressor/transformers/compression/recipes/sparse_int8.yaml
similarity index 61%
rename from tests/llmcompressor/transformers/compression/recipes/sparse_24_int8.yaml
rename to tests/llmcompressor/transformers/compression/recipes/sparse_int8.yaml
index 73279db2c..e394e4bb3 100644
--- a/tests/llmcompressor/transformers/compression/recipes/sparse_24_int8.yaml
+++ b/tests/llmcompressor/transformers/compression/recipes/sparse_int8.yaml
@@ -3,9 +3,9 @@ pruning_stage:
     SparseGPTModifier:
       sparsity: 0.5
       sequential_update: true
-      mask_structure: "2:4"
+      mask_structure: "0:0"
       targets: ['re:model.layers.\d*$']
-quant_stage:
+test_stage:
   quant_modifiers:
     QuantizationModifier:
       ignore: ["lm_head"]
@@ -13,17 +13,25 @@ quant_stage:
         group_0:
           weights:
             num_bits: 8
-            type: int
-            strategy: tensor
-            dynamic: false
+            type: "int"
             symmetric: true
+            strategy: "tensor"
          input_activations:
             num_bits: 8
-            type: int
-            strategy: tensor
-            dynamic: true
-            symmetric: true
+            type: "int"
+            symmetric: false
+            strategy: "tensor"
+          output_activations: null
           targets: ["Linear"]
+        group_1:
+          weights:
+            num_bits: 8
+            type: "int"
+            symmetric: true
+            strategy: "tensor"
+          input_activations: null
+          output_activations: null
+          targets: ["Embedding"]
   pruning_modifiers:
     ConstantPruningModifier:
       targets: [
diff --git a/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py b/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py
index 42c965b7a..4eb5d7de5 100644
--- a/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py
+++ b/tests/llmcompressor/transformers/sparsification/test_compress_tensor_utils.py
@@ -370,9 +370,9 @@ def test_model_shared_tensors_gpu(
     "model_stub, recipe, sparse_format, quant_format",
     [
         (
-            "Xenova/llama2.c-stories110M",
-            "tests/llmcompressor/transformers/compression/recipes/sparse_24_int8.yaml",
-            CompressionFormat.sparse_24.value,
+            "Xenova/llama2.c-stories15M",
+            "tests/llmcompressor/transformers/compression/recipes/sparse_int8.yaml",
+            CompressionFormat.sparse_bitmask.value,
             CompressionFormat.int_quantized.value,
         ),
     ],
@@ -445,30 +445,7 @@ def test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tm
         if key.endswith("weight") and quant_format != "dense":
             # we don't expect an exact match for compressed
             diff = torch.abs(dense_tensor - reconstructed_tensor)
-            assert not torch.any(
-                diff > 0.01
-            ).item(), f"{key} has a diff greater than 0.01"
+            assert not torch.any(diff > 0.01), f"Max diff: {torch.max(diff)}"
         else:
             assert torch.equal(dense_tensor, reconstructed_tensor)
     shutil.rmtree(tmp_path)
-
-
-# This parameterization should be added to the test_compressor_stacking test
-# once the lossy nature of FP8 compress-decompress is resolved.
-# Until then, this test is marked as xfail.
-@pytest.mark.xfail(reason="Known issue with FP8 compress-decompress")
-@pytest.mark.parametrize(
-    "model_stub, recipe, sparse_format, quant_format",
-    [
-        (
-            "Xenova/llama2.c-stories110M",
-            "tests/llmcompressor/transformers/compression/recipes/sparse_24_fp8.yaml",
-            CompressionFormat.sparse_24.value,
-            CompressionFormat.float_quantized.value,
-        ),
-    ],
-)
-def test_compressor_stacking_fp8(
-    model_stub, recipe, sparse_format, quant_format, tmp_path
-):
-    test_compressor_stacking(model_stub, recipe, sparse_format, quant_format, tmp_path)
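
For context, a minimal standalone sketch of the round-trip tolerance check that the simplified assertion above performs; the sample tensors and their values are illustrative stand-ins (only the 0.01 threshold and the assertion form come from the diff), assuming nothing beyond PyTorch:

import torch

# Illustrative stand-ins for the test's dense weight and its
# compress -> decompress reconstruction (values are made up).
dense_tensor = torch.randn(8, 8)
reconstructed_tensor = dense_tensor + 0.005 * torch.rand(8, 8)

# Same tolerance check as the updated assertion: lossy compression may
# perturb values slightly, so allow an elementwise error of up to 0.01
# and report the maximum observed difference on failure.
diff = torch.abs(dense_tensor - reconstructed_tensor)
assert not torch.any(diff > 0.01), f"Max diff: {torch.max(diff)}"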