diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index f61c2b93c..e89fa9b44 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -1010,6 +1010,7 @@ def _weight_only_quantization(
     calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
     **kwargs,
 ) -> openvino.runtime.Model:
+    _verify_not_optimized(model)
     config = quantization_config
     if isinstance(config, dict):
         config = OVWeightQuantizationConfig.from_dict(quantization_config)
@@ -1066,6 +1067,7 @@ def _full_quantization(
     calibration_dataset: nncf.Dataset,
     **kwargs,
 ):
+    _verify_not_optimized(model)
     advanced_parameters_kwargs = {}
     if quantization_config.smooth_quant_alpha is not None:
         advanced_parameters_kwargs["smooth_quant_alphas"] = AdvancedSmoothQuantParameters(
@@ -1187,3 +1189,20 @@ def _hybrid_quantization(
         **kwargs,
     )
     return quantized_model
+
+
+def _verify_not_optimized(ov_model):
+    message_template = (
+        "Cannot apply optimization to the model because it was already optimized with the following config: {}. "
+        "To avoid this issue, set load_in_8bit=False and do not provide quantization_config at export in .from_pretrained(), "
+        "or explicitly specify the weight format with --weight_format fp16/fp32 when using the CLI."
+    )
+
+    rt_info = ov_model.get_rt_info()
+    if "nncf" in rt_info:
+        model_weight_compression_config = rt_info["nncf"].get("weight_compression", None)
+        model_quantization_config = rt_info["nncf"].get("quantization", None)
+        if model_weight_compression_config is not None:
+            raise RuntimeError(message_template.format(model_weight_compression_config))
+        elif model_quantization_config is not None:
+            raise RuntimeError(message_template.format(model_quantization_config))
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 1df43d548..c4c0ff247 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -214,6 +214,9 @@ def preprocess_function(examples, tokenizer):
             # Verify that the configuration is correctly saved and loaded
             loaded_config = OVConfig.from_pretrained(tmp_dir)
             self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict())
+            check_optimization_not_applicable_to_optimized_model(
+                model, quantization_config=OVWeightQuantizationConfig(bits=8)
+            )
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET)
     def test_ov_model_static_quantization_with_auto_dataset(
@@ -718,6 +721,13 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust
         else:
             models = [model]
 
+        if model_type == "open-clip":
+            pytest.skip(reason="ticket 161043")
+        elif model_type == "t5":
+            pytest.skip(reason="ticket 160958")
+        else:
+            check_optimization_not_applicable_to_optimized_model(model, quantization_config={"bits": 8})
+
         expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
         for i, model in enumerate(models):
             _, num_weight_nodes = get_num_quantized_nodes(model)
@@ -738,6 +748,7 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_
             self.assertEqual(0, num_weight_nodes["int4"])
 
             model.save_pretrained(tmp_dir)
+            check_optimization_not_applicable_to_optimized_model(model, quantization_config)
 
     def test_stable_diffusion_with_weight_compression(self):
         int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_NAMES["stable-diffusion"], export=True)
@@ -752,6 +763,10 @@ def test_stable_diffusion_with_weight_compression(self):
         self.assertEqual(0, num_fake_nodes)
         self.assertEqual(242, num_weight_nodes["int8"])
         self.assertEqual(0, num_weight_nodes["int4"])
+        quantization_config = OVWeightQuantizationConfig(
+            bits=8, dataset="conceptual_captions", num_samples=2, quant_method=OVQuantizationMethod.HYBRID
+        )
+        check_optimization_not_applicable_to_optimized_model(int8_pipe, quantization_config)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:])
     def test_ovmodel_hybrid_quantization_with_custom_dataset(
@@ -797,6 +812,7 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_
             if model_id == "facebook/opt-125m":
                 for key, value in self.DEFAULT_INT4_CONFIG.items():
                     self.assertEqual(value, getattr(openvino_config.quantization_config, key))
+            check_optimization_not_applicable_to_optimized_model(model, quantization_config={"bits": 8})
 
     @parameterized.expand(LOAD_IN_4_BITS_SCOPE)
     def test_ovmodel_4bit_auto_compression_with_config(
@@ -1321,3 +1337,12 @@ def test_calibration_data_uniqueness(self, model_name, apply_caching):
         else:
             # Without caching, encoder hidden states tensors will be unique for each collected input
             self.assertGreater(len(data_id_per_key["encoder_hidden_states"]), 2)
+
+
+def check_optimization_not_applicable_to_optimized_model(model, quantization_config):
+    quantizer = OVQuantizer(model)
+    with pytest.raises(
+        RuntimeError,
+        match="Cannot apply optimization to the model because it was already optimized with the following config",
+    ):
+        quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))
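
Usage illustration (not part of the patch): a minimal sketch of the behavior the new _verify_not_optimized guard enforces, assuming the public optimum-intel API as exercised by the tests above; the "gpt2" model id is chosen purely for illustration.

from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

# Export with 8-bit weight compression; NNCF records the applied config
# under the "nncf" key of the model's rt_info.
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)

# A second optimization attempt now fails fast instead of silently
# re-quantizing already-compressed weights.
quantizer = OVQuantizer(model)
quantizer.quantize(ov_config=OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8)))
# RuntimeError: Cannot apply optimization to the model because it was already
# optimized with the following config: ...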