diff --git a/src/sparseml/transformers/utils/transformations.py b/src/sparseml/transformers/utils/transformations.py new file mode 100644 index 00000000000..aceff72346d --- /dev/null +++ b/src/sparseml/transformers/utils/transformations.py @@ -0,0 +1,253 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# flake8: noqa #F821,#E501 + +import functools +import logging +from typing import Dict + +import numpy +import torch +from torch import Tensor + + +__all__ = [ + "transform_names", + "add_tensors", + "transform_tensors", + "remove_unwanted_tensors", + "is_quantization_target", +] + +_LOGGER = logging.getLogger(__name__) + + +def _log_call(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + _LOGGER.info("Applying transformation: %s", func.__name__.upper()) + return_value = func(*args, **kwargs) + _LOGGER.info("Transformation: %s complete", func.__name__.upper()) + return return_value + + return wrapper + + +def is_quantization_target(key: str) -> bool: + """ + Assumes self_attn and mlp are the only quantization targets + in model layers of the state_dict. + + :param key: The key of the state_dict + :return: True if the key is a quantization target, False otherwise + """ + return "model.layers" in key and ("self_attn" in key or "mlp" in key) + + +@_log_call +def transform_names(state_dict: Dict[str, Tensor]) -> Dict[str, Tensor]: + """ + Transforms the state_dict keys to match with exllama format + + The renames include: + - weight_fake_quant.scale -> scales + - weight_fake_quant.zero_point -> qzeros + - weight -> qweight + + Note: does not transforms the actual tensor values + + :pre-condition: The state_dict should be for a quantized model + :pre-condition: Targets only the weights of the self_attn and mlp nodes + :param state_dict: The quantized state_dict to be transformed + :return: The transformed state_dict + """ + # mapping of the old names to the new names + name_map: Dict[str, str] = { + ".weight_fake_quant.scale": ".scales", + ".weight_fake_quant.zero_point": ".qzeros", + ".weight": ".qweight", + } + + new_state_dict: Dict[str, Tensor] = {} + for key, tensor in state_dict.items(): + if is_quantization_target(key) and any( + key.endswith(target_suffix := suffix) for suffix in name_map + ): + updated_key = key.replace(target_suffix, name_map[target_suffix]) + new_state_dict[updated_key] = tensor + else: + new_state_dict[key] = tensor + return new_state_dict + + +def pack(weight: Tensor, scales: Tensor, zeros: Tensor, g_idx: Tensor) -> Tensor: + """ + Quantize the weight tensor using the scales, zeros, and g_idx tensors + into 4 bit integers, and packs a group of 8 of them into a single 32 bit integer. + + Adapted from: + https://github.com/AutoGPTQ/AutoGPTQ/blob/ea4a99778f90b60c9b5177d7487af1b4ca87744f/auto_gptq/nn_modules/qlinear/qlinear_exllama.py#L118 + + :param weight: The weight tensor to be quantized of shape [x, 8y] + :param scales: The scales tensor + :param zeros: The zero points tensor + :param g_idx: The group index tensor + :return: The quantized weight tensor of int32 dtype and shape [x, y] + """ + g_idx = g_idx.clone() + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + scales = scales.clone().half() + bits = 4 + + intweight = [] + infeatures = weight.shape[1] + for idx in range(infeatures): + intweight.append( + torch.round( + (weight[:, idx] + scale_zeros[g_idx[idx]]) / scales[g_idx[idx]] + ).to(torch.int)[:, None] + ) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(numpy.uint32) + + i = 0 + row = 0 + qweight = numpy.zeros( + (intweight.shape[0] // 32 * bits, intweight.shape[1]), dtype=numpy.uint32 + ) + while row < qweight.shape[0]: + if bits in [4]: + for j in range(i, i + (32 // bits)): + qweight[row] |= intweight[j] << (bits * (j - i)) + i += 32 // bits + row += 1 + else: + raise NotImplementedError("Only 4 bits are supported.") + + qweight = qweight.astype(numpy.int32) + qweight = torch.from_numpy(qweight) + return qweight + + +@_log_call +def add_tensors(state_dict: Dict[str, Tensor]) -> Dict[str, Tensor]: + + new_dict: Dict[str, Tensor] = {} + + for key, tensor in state_dict.items(): + if is_quantization_target(key) and key.endswith(".qweight"): + # add bias and g_idx tensors + bias_key = key.replace(".qweight", ".bias") + g_idx_key = key.replace(".qweight", ".g_idx") + + # bias tensor + bias_tensor = torch.zeros(tensor.shape[0], dtype=torch.float16) + new_dict[bias_key] = bias_tensor + + # g_idx tensor of shape [num_channels] dtype int32 filled + # with zeros + g_idx_tensor = torch.zeros(tensor.shape[1], dtype=torch.int32) + new_dict[g_idx_key] = g_idx_tensor + + # copy the original tensor, (qweight is also copied in this step) + new_dict[key] = tensor + return new_dict + + +@_log_call +def transform_tensors(state_dict: Dict[str, Tensor]) -> Dict[str, Tensor]: + + new_dict: Dict[str, Tensor] = {} + + # auxillary dict to store transformed weights + weights_dict: Dict[str, Tensor] = {} + + # quantize qweights before scales, and qzeros + # because the ordering is not guaranteed + # in our implementation + for key, tensor in state_dict.items(): + if is_quantization_target(key) and key.endswith(".qweight"): + # quantize the weight tensor + qweight = pack( + weight=tensor, + scales=state_dict[key.replace("qweight", "scales")], + zeros=state_dict[key.replace("qweight", "qzeros")], + g_idx=state_dict[key.replace("qweight", "g_idx")], + ) + assert qweight.dtype == torch.int32 + weights_dict[key] = qweight + + # transform scales and zero points + for key, tensor in state_dict.items(): + if is_quantization_target(key) and key.endswith(".scales"): + # scales [x] should be reshaped to [1, x] + # and converted to fp16 + scales = tensor.reshape(1, -1).to(torch.float16) + new_dict[key] = scales + elif is_quantization_target(key) and key.endswith(".qzeros"): + # zero points [8x] should be reshaped to [1, x] + # of type int32 and filled with zeros (symmetric quantization) + zeros = torch.zeros(tensor.shape[0] // 8, dtype=torch.int32) + new_dict[key] = zeros.reshape(1, -1) + else: + new_dict[key] = tensor + + # overwrite old weights with the new quantized weights + new_dict.update(weights_dict) + + # auxillary weights_dict not needed anymore + del weights_dict + + return new_dict + + +@_log_call +def remove_unwanted_tensors(state_dict: Dict[str, Tensor]) -> Dict[str, Tensor]: + """ + Remove unwanted tensors from the state_dict that are not necessary for inference. + These tensors include: + - eps + - min_val + - max_val + - fake_quant_enabled + - observer_enabled + + """ + to_delete = ["eps", "min_val", "max_val", "fake_quant_enabled", "observer_enabled"] + keys = list(state_dict.keys()) + for key in keys: + if any(key.endswith(suffix) for suffix in to_delete): + del state_dict[key] + return state_dict + + +def check_dicts(actual, expected): + assert len(actual) == len( + expected + ), "The number of tensors in the actual and expected state dicts do not match" + + for key, value in actual.items(): + assert ( + key in expected + ), f"The key {key} is not present in the expected state dict" + assert ( + value.shape == expected[key].shape + ), f"The shape of the tensor {key} in the actual state dict does not match the shape of the tensor in the expected state dict, expected {expected[key].shape} but got {value.shape}" + assert ( + value.dtype == expected[key].dtype + ), f"The dtype of the tensor {key} in the actual state dict does not match the dtype of the tensor in the expected state dict, expected {expected[key].dtype} but got {value.dtype}" diff --git a/src/sparseml/transformers/utils/vllm_export_helpers.py b/src/sparseml/transformers/utils/vllm_export_helpers.py new file mode 100644 index 00000000000..ddf24ebebc3 --- /dev/null +++ b/src/sparseml/transformers/utils/vllm_export_helpers.py @@ -0,0 +1,170 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +General utilities for exporting models to different formats using safe tensors. +""" +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Literal, Optional, Union + +from torch import Tensor +from transformers import PreTrainedModel, PreTrainedTokenizerBase + +from sparseml.transformers.utils.transformations import ( + add_tensors, + remove_unwanted_tensors, + transform_names, + transform_tensors, +) +from sparseml.utils import get_unique_dir_name + + +__all__ = [ + "export_safetensors", + "SUPPORTED_FORMAT_TYPES", +] + +SUPPORTED_FORMAT_TYPES = Literal["exllama", "marlin"] +_LOGGER = logging.getLogger(__name__) + + +def export_safetensors( + model: PreTrainedModel, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + format: SUPPORTED_FORMAT_TYPES = "exllama", + save_dir: Union[str, Path, None] = None, +): + """ + A utility function to export a model to safetensor format. + Calls the appropriate state dict translation function based on the format + and saves the translated state dict to the specified directory. + If the directory is defaults to cwd/exported_model. If the directory already exists, + a new directory is created with a unique name. + + :param model: The loaded model to be exported. + :param tokenizer: The tokenizer associated with the model. + :param format: The format to which the model should be exported. + Default is "exllama". + :param save_dir: The directory where the model should be saved. + """ + # original state dict + state_dict: Dict[Any, Any] = model.state_dict() + + # translation + _LOGGER.info(f"Adding {format} quantization info to config") + model.config.quantization_config = QuantizationConfig() + + _LOGGER.info(f"Translating state dict to {format} format.") + translated_state_dict: Dict[Any, Any] = translate_state_dict( + state_dict=state_dict, format=format + ) + + if save_dir is None: + save_dir = Path.cwd() / f"{format}_model" + + save_dir: str = get_unique_dir_name(dir_name=save_dir) + + # saving + save_checkpoint( + model=model, + tokenizer=tokenizer, + state_dict=translated_state_dict, + save_dir=save_dir, + ) + + +def save_checkpoint( + model: PreTrainedModel, + state_dict: Dict[Any, Any], + save_dir: str, + tokenizer: Optional[PreTrainedTokenizerBase] = None, +): + model.save_pretrained( + save_directory=save_dir, state_dict=state_dict, safe_serialization=True + ) + _LOGGER.info(f"Model and config saved to {save_dir}") + + if tokenizer: + tokenizer.save_pretrained(save_directory=save_dir) + _LOGGER.info(f"tokenizer saved to {save_dir}") + + +def translate_state_dict( + state_dict: Dict[Any, Any], format: SUPPORTED_FORMAT_TYPES +) -> Dict[Any, Any]: + """ + A utility function to translate the state dict to the specified format. + + :param state_dict: The state dict to be translated. + :param format: The format to which the state dict should be translated. + """ + if format == "exllama": + return _translate_state_dict_exllama(state_dict=state_dict) + + +def _translate_state_dict_exllama(state_dict: Dict[str, Any]) -> Dict[Any, Any]: + """ + Translate the state dict to the Exllama format. + + Changes made to quantized params in the passed state_dict: + - weight tensor renamed to qweight, and the corresponding tensor + value of shape [x, 8y] will be repacked to [x, y] + - scale tensor renamed to scales, and the corresponding tensor + value of shape [8x] will be reshaped to [1, 8x] and + then repacked to [1, x] + - zero_point tensor renamed to qzeros, and the corresponding tensor + value of shape [x] will be reshaped to [1, x] + - A g_idx tensor of shape [num_channels] will be added to the + state_dict, this tensor will be filled with zeros + - All fake quantization parameters will be removed from the state_dict + + + + + :param state_dict: The model state dict to be translated. + :return: The translated state dict compatible with Exllama. + """ + + transformations = ( + transform_names, + add_tensors, + transform_tensors, + remove_unwanted_tensors, + ) + for transformation in transformations: + state_dict_copy: Dict[str, Tensor] = transformation(state_dict=state_dict) + + return state_dict_copy + + +@dataclass +class QuantizationConfig: + bits: int = field(default=4, metadata={"choices": [2, 3, 4, 8]}) + group_size: int = field(default=-1) + damp_percent: float = field(default=0.01) + desc_act: bool = field(default=False) + sym: bool = field(default=True) + is_marlin_format: bool = field(default=False) + + def to_dict(self): + return { + "bits": self.bits, + "group_size": self.group_size, + "desc_act": self.desc_act, + "sym": self.sym, + "is_marlin_format": self.is_marlin_format, + "quant_method": "gptq", + } diff --git a/src/sparseml/utils/helpers.py b/src/sparseml/utils/helpers.py index 6c1d4f3ad6c..0c5b4d8330e 100644 --- a/src/sparseml/utils/helpers.py +++ b/src/sparseml/utils/helpers.py @@ -74,6 +74,7 @@ "parse_kwarg_tuples", "download_zoo_training_dir", "is_package_available", + "get_unique_dir_name", ] @@ -974,3 +975,23 @@ def is_package_available( return package_exists, package_version else: return package_exists + + +def get_unique_dir_name(dir_name: Union[str, Path]) -> str: + """ + A utility function to get a unique directory name by appending + a number to the directory name if the directory already exists + (Note: the function does not create the directory, it only + returns the unique directory name) + + :param dir_name: The directory name to get a unique name for + :return: The unique directory name + """ + dir_name: str = str(dir_name) + counter: str = 1 + new_dir_name: str = dir_name + + while Path(new_dir_name).exists(): + new_dir_name = f"{dir_name}_{counter}" + counter += 1 + return new_dir_name