Raise an error when OVQuantizer is invoked on a compressed model #1122

Merged · 3 commits · Jan 27, 2025
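In practice, the change makes a second optimization attempt fail fast: a model that was already exported with NNCF weight compression or quantization now raises a RuntimeError instead of being silently re-optimized. A minimal sketch of the scenario this guards against (the model id and exact call sequence are illustrative, not taken from this PR):

from optimum.intel import OVConfig, OVModelForCausalLM, OVQuantizer, OVWeightQuantizationConfig

# Exporting with load_in_8bit=True already applies NNCF 8-bit weight compression.
model = OVModelForCausalLM.from_pretrained("gpt2", export=True, load_in_8bit=True)

# With this PR, trying to optimize the already-compressed model again raises:
# RuntimeError: Cannot apply optimization to the model because it was already optimized
# with the following config: ...
quantizer = OVQuantizer(model)
quantizer.quantize(ov_config=OVConfig(quantization_config=OVWeightQuantizationConfig(bits=8)))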
19 changes: 19 additions & 0 deletions optimum/intel/openvino/quantization.py
@@ -1010,6 +1010,7 @@ def _weight_only_quantization(
calibration_dataset: Optional[Union[nncf.Dataset, Iterable]] = None,
**kwargs,
) -> openvino.runtime.Model:
_verify_not_optimized(model)
[Collaborator review comment on this line: 👍]

config = quantization_config
if isinstance(config, dict):
config = OVWeightQuantizationConfig.from_dict(quantization_config)
@@ -1066,6 +1067,7 @@ def _full_quantization(
calibration_dataset: nncf.Dataset,
**kwargs,
):
_verify_not_optimized(model)
advanced_parameters_kwargs = {}
if quantization_config.smooth_quant_alpha is not None:
advanced_parameters_kwargs["smooth_quant_alphas"] = AdvancedSmoothQuantParameters(
@@ -1187,3 +1189,20 @@ def _hybrid_quantization(
**kwargs,
)
return quantized_model


def _verify_not_optimized(ov_model):
message_template = (
"Cannot apply optimization to the model because it was already optimized with the following config: {}. "
"To avoid this issue, set load_in_8bit=False or do not pass a quantization_config when exporting with .from_pretrained(), "
"or explicitly specify the weight format with --weight_format fp16/fp32 when using the CLI."
)

rt_info = ov_model.get_rt_info()
if "nncf" in rt_info:
model_weight_compression_config = rt_info["nncf"].get("weight_compression", None)
model_quantization_config = rt_info["nncf"].get("quantization", None)
if model_weight_compression_config is not None:
raise RuntimeError(message_template.format(model_weight_compression_config))
elif model_quantization_config is not None:
raise RuntimeError(message_template.format(model_quantization_config))
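The check relies on NNCF recording the applied optimization config in the OpenVINO model's runtime info under the "nncf" key. A minimal sketch of inspecting that metadata on an exported model (the file path is illustrative):

import openvino as ov

core = ov.Core()
ov_model = core.read_model("openvino_model.xml")  # path to an already-exported model (illustrative)

rt_info = ov_model.get_rt_info()
if "nncf" in rt_info:
    # e.g. rt_info["nncf"]["weight_compression"] holds the weight-compression config that was applied
    print(rt_info["nncf"])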
25 changes: 25 additions & 0 deletions tests/openvino/test_quantization.py
@@ -214,6 +214,9 @@ def preprocess_function(examples, tokenizer):
# Verify that the configuration is correctly saved and loaded
loaded_config = OVConfig.from_pretrained(tmp_dir)
self.assertEqual(ov_config.quantization_config.to_dict(), loaded_config.quantization_config.to_dict())
check_optimization_not_applicable_to_optimized_model(
model, quantization_config=OVWeightQuantizationConfig(bits=8)
)

@parameterized.expand(SUPPORTED_ARCHITECTURES_OV_MODEL_WITH_AUTO_DATASET)
def test_ov_model_static_quantization_with_auto_dataset(
@@ -718,6 +721,13 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type, trust
else:
models = [model]

if model_type == "open-clip":
pytest.skip(reason="ticket 161043")
elif model_type == "t5":
pytest.skip(reason="ticket 160958")
else:
check_optimization_not_applicable_to_optimized_model(model, quantization_config={"bits": 8})

expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type]
for i, model in enumerate(models):
_, num_weight_nodes = get_num_quantized_nodes(model)
@@ -738,6 +748,7 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_fake_
self.assertEqual(0, num_weight_nodes["int4"])

model.save_pretrained(tmp_dir)
check_optimization_not_applicable_to_optimized_model(model, quantization_config)

def test_stable_diffusion_with_weight_compression(self):
int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_NAMES["stable-diffusion"], export=True)
@@ -752,6 +763,10 @@ def test_stable_diffusion_with_weight_compression(self):
self.assertEqual(0, num_fake_nodes)
self.assertEqual(242, num_weight_nodes["int8"])
self.assertEqual(0, num_weight_nodes["int4"])
quantization_config = OVWeightQuantizationConfig(
bits=8, dataset="conceptual_captions", num_samples=2, quant_method=OVQuantizationMethod.HYBRID
)
check_optimization_not_applicable_to_optimized_model(int8_pipe, quantization_config)

@parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:])
def test_ovmodel_hybrid_quantization_with_custom_dataset(
@@ -797,6 +812,7 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_
if model_id == "facebook/opt-125m":
for key, value in self.DEFAULT_INT4_CONFIG.items():
self.assertEqual(value, getattr(openvino_config.quantization_config, key))
check_optimization_not_applicable_to_optimized_model(model, quantization_config={"bits": 8})

@parameterized.expand(LOAD_IN_4_BITS_SCOPE)
def test_ovmodel_4bit_auto_compression_with_config(
@@ -1321,3 +1337,12 @@ def test_calibration_data_uniqueness(self, model_name, apply_caching):
else:
# Without caching, encoder hidden states tensors will be unique for each collected input
self.assertGreater(len(data_id_per_key["encoder_hidden_states"]), 2)


def check_optimization_not_applicable_to_optimized_model(model, quantization_config):
quantizer = OVQuantizer(model)
with pytest.raises(
RuntimeError,
match="Cannot apply optimization to the model because it was already optimized with the following config",
):
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config))