diff --git a/src/sparseml/transformers/utils/gptq_utils/__init__.py b/src/sparseml/transformers/utils/gptq_utils/__init__.py
new file mode 100644
index 00000000000..d07ff5d5779
--- /dev/null
+++ b/src/sparseml/transformers/utils/gptq_utils/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# flake8: noqa
+
+from .vllm_export_helpers import *
diff --git a/src/sparseml/transformers/utils/gptq_utils/transformations.py b/src/sparseml/transformers/utils/gptq_utils/transformations.py
new file mode 100644
index 00000000000..36004f5f0f7
--- /dev/null
+++ b/src/sparseml/transformers/utils/gptq_utils/transformations.py
@@ -0,0 +1,327 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa #F821,#E501
+
+import functools
+import logging
+from typing import Callable, Dict, List, Tuple
+
+import numpy
+import torch
+from torch import Tensor
+
+
+__all__ = [
+    "transform_to_exllama_names",
+    "add_exllama_tensors",
+    "transform_gptq_weights_and_reshape_tensors",
+    "remove_unwanted_tensors_for_exllama",
+    "is_gptq_quantization_target",
+    "convert_fp32_tensors_to_fp16",
+    "gptq_exllama_transformations",
+    "GPTQ_EXLLAMA_TRANSFORMATIONS",
+]
+
+_LOGGER = logging.getLogger(__name__)
+
+TransformationType = Callable[[Dict[str, torch.Tensor]], Dict[str, torch.Tensor]]
+
+
+def _log_call(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        _LOGGER.debug("Applying transformation: %s", func.__name__.upper())
+        return_value = func(*args, **kwargs)
+        _LOGGER.debug("Transformation: %s complete", func.__name__.upper())
+        return return_value
+
+    return wrapper
+
+
+def is_gptq_quantization_target(key: str) -> bool:
+    """
+    Assumes self_attn and mlp are the only quantization targets
+    in model layers of the state_dict.
+
+    :param key: The key of the state_dict
+    :return: True if the key is a quantization target, False otherwise
+    """
+    return "model.layers" in key and ("self_attn" in key or "mlp" in key)
+
+
+@_log_call
+def transform_to_exllama_names(state_dict: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    """
+    Transforms the state_dict keys to match with exllama format
+
+    The renames include:
+        - weight_fake_quant.scale -> scales
+        - weight_fake_quant.zero_point -> qzeros
+        - weight -> qweight
+
+    Note: does not transform the actual tensor values
+
+    :pre-condition: The state_dict should be for a quantized model
+    :pre-condition: Targets only the weights of the self_attn and mlp nodes
+    :param state_dict: The quantized state_dict to be transformed
+    :return: The transformed state_dict
+    """
+    # mapping of the old names to the new names
+    name_map: Dict[str, str] = {
+        ".weight_fake_quant.scale": ".scales",
+        ".weight_fake_quant.zero_point": ".qzeros",
+        ".weight": ".qweight",
+    }
+
+    updated_state_dict: Dict[str, Tensor] = {}
+    for key, tensor in state_dict.items():
+        if is_gptq_quantization_target(key) and any(
+            key.endswith(target_suffix := suffix) for suffix in name_map
+        ):
+            updated_key = key.replace(target_suffix, name_map[target_suffix])
+            updated_state_dict[updated_key] = tensor
+        else:
+            updated_state_dict[key] = tensor
+    return updated_state_dict
+
+
+@_log_call
+def add_exllama_tensors(state_dict: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    """
+    Add the bias and g_idx tensors to the state_dict, wherever
+    a qweight tensor is present
+
+    The added tensors include:
+        - bias: A tensor of shape [output channels] filled with zeros
+            and dtype float16
+        - g_idx: A tensor of shape [num_channels] filled with zeros
+            and dtype int32
+
+    :pre-condition: The state_dict should be for a quantized model
+    :pre-condition: The state_dict should have been transformed to exllama names
+    :param state_dict: The state_dict to be transformed
+    :return: The state_dict with the added bias and g_idx tensors
+    """
+
+    updated_state_dict: Dict[str, Tensor] = {}
+
+    for key, tensor in state_dict.items():
+        if is_gptq_quantization_target(key) and key.endswith(".qweight"):
+            # add bias and g_idx tensors
+            bias_key = key.replace(".qweight", ".bias")
+            g_idx_key = key.replace(".qweight", ".g_idx")
+
+            # bias tensor
+            bias_tensor = torch.zeros(tensor.shape[0], dtype=torch.float16)
+            updated_state_dict[bias_key] = bias_tensor
+
+            # g_idx tensor of shape [num_channels] dtype int32 filled
+            # with zeros
+            g_idx_tensor = torch.zeros(tensor.shape[1], dtype=torch.int32)
+            updated_state_dict[g_idx_key] = g_idx_tensor
+
+        # copy the original tensor, (qweight is also copied in this step)
+        updated_state_dict[key] = tensor
+    return updated_state_dict
+
+
+@_log_call
+def transform_gptq_weights_and_reshape_tensors(
+    state_dict: Dict[str, Tensor]
+) -> Dict[str, Tensor]:
+    """
+    Transforms weights into their required shapes and types for Exllama format
+
+    The transformations include:
+        - Quantize the weight tensor using the scales, zeros, and g_idx tensors,
+            additionally pack a group of 8 of them into a single 32 bit integer
+            and rename the tensor to qweight
+        - Reshape the scales tensor to [1, x] and convert to fp16
+        - Reshape the zero points tensor to [1, x] of type int32 and fill with zeros
+            (it is assumed that quantization was symmetric)
+
+    :pre-condition: The state_dict should be for a quantized model
+    :pre-condition: The state_dict should have been transformed to exllama names
+    :pre-condition: The state_dict should have
+        the bias and g_idx tensors added
+    :param state_dict: The state_dict to be transformed
+    :return: The transformed state_dict, with repacked and reshaped tensors
+    """
+
+    transformed_state_dict: Dict[str, Tensor] = {}
+
+    # auxiliary dict to store transformed weights
+    transformed_weights_dict: Dict[str, Tensor] = {}
+
+    # quantize qweights before scales, and qzeros
+    # because the ordering in which tensors are fetched
+    # is not guaranteed by our implementation
+    for key, tensor in state_dict.items():
+        if is_gptq_quantization_target(key) and key.endswith(".qweight"):
+            # quantize the weight tensor
+            qweight = _pack_fp32_into_int32(
+                weight=tensor,
+                scales=state_dict[key.replace("qweight", "scales")],
+                zeros=state_dict[key.replace("qweight", "qzeros")],
+                g_idx=state_dict[key.replace("qweight", "g_idx")],
+            )
+            assert qweight.dtype == torch.int32
+            transformed_weights_dict[key] = qweight
+
+    # transform scales and zero points
+    for key, tensor in state_dict.items():
+        if is_gptq_quantization_target(key) and key.endswith(".scales"):
+            # scales [x] should be reshaped to [1, x]
+            # and converted to fp16
+            scales = tensor.reshape(1, -1).half()
+            transformed_state_dict[key] = scales
+        elif is_gptq_quantization_target(key) and key.endswith(".qzeros"):
+            # zero points [8x] should be reshaped to [1, x]
+            # of type int32 and filled with zeros (symmetric quantization)
+            zeros = torch.zeros(tensor.shape[0] // 8, dtype=torch.int32)
+            transformed_state_dict[key] = zeros.reshape(1, -1)
+        else:
+            transformed_state_dict[key] = tensor
+
+    # overwrite old weights with the new quantized weights
+    transformed_state_dict.update(transformed_weights_dict)
+
+    # auxiliary weights_dict not needed anymore
+    del transformed_weights_dict
+
+    return transformed_state_dict
+
+
+@_log_call
+def remove_unwanted_tensors_for_exllama(
+    state_dict: Dict[str, Tensor]
+) -> Dict[str, Tensor]:
+    """
+    Remove unwanted tensors from the state_dict that are not necessary for inference.
+    These tensors include:
+        - eps
+        - min_val
+        - max_val
+        - fake_quant_enabled
+        - observer_enabled
+
+    :param state_dict: The state_dict to be cleaned
+    :return: The cleaned state_dict with all keys ending with the unwanted suffixes removed
+    """
+    suffixes_to_delete: List[str] = [
+        "eps",
+        "min_val",
+        "max_val",
+        "fake_quant_enabled",
+        "observer_enabled",
+    ]
+    keys = list(state_dict.keys())
+    for key in keys:
+        if any(key.endswith(suffix) for suffix in suffixes_to_delete):
+            del state_dict[key]
+    return state_dict
+
+
+@_log_call
+def convert_fp32_tensors_to_fp16(state_dict: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    """
+    Convert all remaining fp32 tensors to fp16 tensors in the state_dict
+    This is expected by the Exllama format
+
+    :param state_dict: The state_dict to be converted
+    :return: The converted state_dict, with all fp32 tensors converted to fp16
+    """
+    converted_state_dict: Dict[str, Tensor] = {}
+    for key, tensor in state_dict.items():
+        converted_state_dict[key] = (
+            tensor.half() if tensor.dtype == torch.float32 else tensor
+        )
+    return converted_state_dict
+
+
+def gptq_exllama_transformations() -> Tuple[TransformationType, ...]:
+    """
+    :return: An Iterable of transformations that must be applied to
+        the state_dict IN_ORDER to convert it to the Exllama format
+        for GPTQ style quantization. Each transformation is a
+        callable that accepts a state_dict and returns a transformed
+        state_dict.
+    """
+
+    return (
+        transform_to_exllama_names,
+        add_exllama_tensors,
+        transform_gptq_weights_and_reshape_tensors,
+        remove_unwanted_tensors_for_exllama,
+        convert_fp32_tensors_to_fp16,
+    )
+
+
+def _pack_fp32_into_int32(
+    weight: Tensor, scales: Tensor, zeros: Tensor, g_idx: Tensor
+) -> Tensor:
+    """
+    Quantizes the weight tensor using the scales, zeros, and g_idx tensors
+    into 4 bit integers, and packs a group of 8 of them into a single 32 bit integer.
+
+    Adapted from:
+    https://github.com/AutoGPTQ/AutoGPTQ/blob/ea4a99778f90b60c9b5177d7487af1b4ca87744f/auto_gptq/nn_modules/qlinear/qlinear_exllama.py#L118
+
+    :param weight: The weight tensor to be quantized of shape [x, 8y]
+    :param scales: The scales tensor
+    :param zeros: The zero points tensor
+    :param g_idx: The group index tensor
+    :return: The quantized weight tensor of int32 dtype and shape [x, y]
+    """
+    g_idx = g_idx.clone()
+
+    scales = scales.t().contiguous()
+    zeros = zeros.t().contiguous()
+    scale_zeros = zeros * scales
+    scales = scales.clone().half()
+    bits = 4
+
+    intweight = []
+    infeatures = weight.shape[1]
+    for idx in range(infeatures):
+        intweight.append(
+            torch.round(
+                (weight[:, idx] + scale_zeros[g_idx[idx]]) / scales[g_idx[idx]]
+            ).to(torch.int)[:, None]
+        )
+    intweight = torch.cat(intweight, dim=1)
+    intweight = intweight.t().contiguous()
+    intweight = intweight.numpy().astype(numpy.uint32)
+
+    i = 0
+    row = 0
+    qweight = numpy.zeros(
+        (intweight.shape[0] // 32 * bits, intweight.shape[1]), dtype=numpy.uint32
+    )
+    while row < qweight.shape[0]:
+        if bits in [4]:
+            for j in range(i, i + (32 // bits)):
+                qweight[row] |= intweight[j] << (bits * (j - i))
+            i += 32 // bits
+            row += 1
+        else:
+            raise NotImplementedError("Only 4 bits are supported.")
+
+    qweight = qweight.astype(numpy.int32)
+    qweight = torch.from_numpy(qweight)
+    return qweight
+
+
+GPTQ_EXLLAMA_TRANSFORMATIONS: Tuple[
+    TransformationType, ...
+] = gptq_exllama_transformations()
diff --git a/src/sparseml/transformers/utils/gptq_utils/vllm_export_helpers.py b/src/sparseml/transformers/utils/gptq_utils/vllm_export_helpers.py
new file mode 100644
index 00000000000..4a4d3ea51ac
--- /dev/null
+++ b/src/sparseml/transformers/utils/gptq_utils/vllm_export_helpers.py
@@ -0,0 +1,265 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+General utilities for exporting models to different formats using safe tensors.
+"""
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, Literal, Optional, Tuple, Union, get_args
+
+from torch import Tensor
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+from sparseml.pytorch.model_load.helpers import fallback_to_cpu
+from sparseml.transformers.utils.gptq_utils.transformations import (
+    GPTQ_EXLLAMA_TRANSFORMATIONS,
+)
+from sparseml.transformers.utils.sparse_model import SparseAutoModelForCausalLM
+from sparseml.transformers.utils.sparse_tokenizer import SparseAutoTokenizer
+from sparseml.utils import get_unique_dir_name
+
+
+__all__ = [
+    "export_vllm_compatible_checkpoint",
+    "SUPPORTED_FORMAT_TYPES",
+]
+
+SUPPORTED_FORMAT_TYPES = Literal["exllama", "marlin"]
+_LOGGER = logging.getLogger(__name__)
+
+
+def export_vllm_compatible_checkpoint(
+    model: Union[PreTrainedModel, str],
+    tokenizer: Union[PreTrainedTokenizerBase, str, None] = None,
+    format: SUPPORTED_FORMAT_TYPES = "exllama",
+    save_dir: Union[str, Path, None] = None,
+    device: str = "cuda",
+):
+    """
+    A utility function to export a GPTQ quantized model to safetensors,
+    compatible with the vLLM library.
+    Calls the appropriate state dict translation function based on the format
+    and saves the translated state dict to the specified directory.
+    If the directory is not specified, defaults to `<format>_model` under the
+    current working directory.
+    If the directory already exists, a new directory is created with a unique name.
+
+    :param model: The loaded model to be exported, can also be a local model
+        directory path, or a HuggingFace/SparseZoo stub
+    :param tokenizer: The tokenizer associated with the model, can also
+        be a HuggingFace/SparseZoo stub.
+    :param format: The format to which the model should be exported.
+        Default is "exllama".
+    :param save_dir: The directory where the model should be saved.
+    :param device: The device to use for the model. Default is "cuda".
+        If cuda is not available, it will fall back to cpu.
+    """
+
+    validate_specified_format(format=format)
+
+    model, tokenizer = _create_model_and_tokenizer(
+        model=model, tokenizer=tokenizer, device=device
+    )
+
+    _LOGGER.info(f"Translating state dict to {format} format.")
+    translated_state_dict: Dict[str, Any] = translate_state_dict(
+        state_dict=model.state_dict(), format=format
+    )
+
+    model.config.quantization_config = _QuantizationConfig()
+    _LOGGER.info(f"Added {format} quantization info to model.config")
+
+    if save_dir is None:
+        save_dir = Path.cwd() / f"{format}_model"
+
+    save_dir: str = get_unique_dir_name(dir_name=save_dir)
+
+    save_checkpoint(
+        model=model,
+        tokenizer=tokenizer,
+        state_dict=translated_state_dict,
+        save_dir=save_dir,
+    )
+
+
+def save_checkpoint(
+    model: PreTrainedModel,
+    state_dict: Dict[Any, Any],
+    save_dir: str,
+    tokenizer: Optional[PreTrainedTokenizerBase] = None,
+):
+    """
+    Saves the model and tokenizer to the specified directory,
+    with the specified state dict.
+
+    :param model: The model to be saved.
+    :param state_dict: The state dict to be saved.
+    :param save_dir: The directory where the model should be saved.
+    :param tokenizer: The tokenizer associated with the model. This will
+        be saved to the same directory as the model.
+    """
+    model.save_pretrained(
+        save_directory=save_dir, state_dict=state_dict, safe_serialization=True
+    )
+    _LOGGER.info(f"Model and config saved to {save_dir}")
+
+    if tokenizer:
+        tokenizer.save_pretrained(save_directory=save_dir)
+        _LOGGER.info(f"tokenizer saved to {save_dir}")
+
+
+def translate_state_dict(
+    state_dict: Dict[Any, Any], format: SUPPORTED_FORMAT_TYPES
+) -> Dict[Any, Any]:
+    """
+    A utility function to translate the state dict to the specified format.
+
+    :pre-condition: The format must be one of the supported formats.
+    :param state_dict: The state dict to be translated.
+    :param format: The format to which the state dict should be translated.
+    :return: The translated state dict.
+    """
+    if format == "exllama":
+        return _translate_state_dict_exllama(state_dict=state_dict)
+
+    # raise appropriate error if this function is called as a standalone
+    validate_specified_format(format=format)
+
+
+def validate_specified_format(format: SUPPORTED_FORMAT_TYPES):
+    """
+    Validates the specified format is supported and raises
+    an error if not.
+
+    :param format: The format to validate.
+    :raises ValueError: If the specified format is not supported.
+    :raises NotImplementedError: for marlin format.
+    """
+
+    # validate
+    if format not in get_args(SUPPORTED_FORMAT_TYPES):
+        raise ValueError(
+            f"Unsupported format {format}, supported formats "
+            f"are {SUPPORTED_FORMAT_TYPES}"
+        )
+
+    if format != "exllama":
+        raise NotImplementedError(f"Exporting to format {format} is not supported yet.")
+
+
+@dataclass(frozen=True)
+class _QuantizationConfig:
+    """
+    A dataclass to hold the quantization configuration for the model.
+    This class is specific to GPTQ style quantization, and an instance
+    of this class can be added to the model.config.quantization_config
+    to enable the model to be exported to Exllama format.
+
+    Right now, the defaults are specific to sparseml GPTQ quantization.
+    In future versions we may support more general quantization configurations.
+
+    This class is frozen to prevent modification of the instance after creation.
+    """
+
+    bits: int = field(default=4, metadata={"choices": [2, 3, 4, 8]})
+    group_size: int = field(default=-1)
+    damp_percent: float = field(default=0.01)
+    desc_act: bool = field(default=False)
+    sym: bool = field(default=True)
+    is_marlin_format: bool = field(default=False)
+
+    def to_dict(self):
+        return {
+            "bits": self.bits,
+            "group_size": self.group_size,
+            "desc_act": self.desc_act,
+            "sym": self.sym,
+            "is_marlin_format": self.is_marlin_format,
+            "quant_method": "gptq",
+        }
+
+
+def _translate_state_dict_exllama(state_dict: Dict[str, Any]) -> Dict[Any, Any]:
+    """
+    Translate the state dict to the Exllama format.
+
+    Changes made to quantized params in the passed state_dict:
+        - weight tensor renamed to qweight, and the corresponding tensor
+            value of shape [x, 8y] will be repacked to [x, y]
+        - scale tensor renamed to scales, and the corresponding tensor
+            value of shape [8x] will be reshaped to [1, 8x] and
+            then repacked to [1, x]
+        - zero_point tensor renamed to qzeros, and the corresponding tensor
+            value of shape [x] will be reshaped to [1, x]
+        - A g_idx tensor of shape [num_channels] will be added to the
+            state_dict, this tensor will be filled with zeros
+        - All fake quantization parameters will be removed from the state_dict
+
+    :param state_dict: The model state dict to be translated.
+    :return: The translated state dict compatible with Exllama.
+    """
+
+    state_dict_copy = {}
+    for transformation in GPTQ_EXLLAMA_TRANSFORMATIONS:
+        state_dict_copy: Dict[str, Tensor] = transformation(
+            state_dict=state_dict_copy or state_dict
+        )
+
+    return state_dict_copy
+
+
+def _create_model_and_tokenizer(
+    model: Union[PreTrainedModel, str],
+    tokenizer: Union[PreTrainedTokenizerBase, str, None] = None,
+    device: str = "cuda",
+) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
+    """
+    Create/infer model and tokenizer instances from the passed
+    in model and tokenizer. Additionally moves the model to the
+    specified device.
+
+    :param model: The model to be exported, can also be
+        path to a local model directory or a HuggingFace/SparseZoo stub
+    :param tokenizer: The tokenizer associated with the model,
+        can also be a HuggingFace/SparseZoo stub, if not passed in,
+        it will be inferred from the model. An error will be raised if it
+        cannot be inferred.
+    :param device: The device to use for the model. Default is "cuda".
+        If cuda is not available, it will fall back to cpu.
+    :return: A tuple of (model, tokenizer) instances. If both were
+        passed into this function, they are returned as is.
+        If tokenizer was not passed in, it is inferred from the
+        model path/stub
+    """
+    if isinstance(tokenizer, str):
+        # tokenizer from its own path/stub
+        tokenizer = SparseAutoTokenizer.from_pretrained(tokenizer)
+
+    if tokenizer is None and isinstance(model, str):
+        # tokenizer from model path/stub
+        tokenizer = SparseAutoTokenizer.from_pretrained(model)
+
+    if tokenizer is None:
+        raise ValueError(
+            "tokenizer not passed in and could not be inferred from model. "
+            "Please pass in a tokenizer."
+        )
+
+    if isinstance(model, str):
+        model = SparseAutoModelForCausalLM.from_pretrained(model)
+
+    # move model to gpu if available
+    model.to(fallback_to_cpu(device=device))
+
+    return model, tokenizer
diff --git a/src/sparseml/utils/helpers.py b/src/sparseml/utils/helpers.py
index 6c1d4f3ad6c..0c5b4d8330e 100644
--- a/src/sparseml/utils/helpers.py
+++ b/src/sparseml/utils/helpers.py
@@ -74,6 +74,7 @@
     "parse_kwarg_tuples",
     "download_zoo_training_dir",
     "is_package_available",
+    "get_unique_dir_name",
 ]
 
 
@@ -974,3 +975,23 @@
         return package_exists, package_version
     else:
        return package_exists
+
+
+def get_unique_dir_name(dir_name: Union[str, Path]) -> str:
+    """
+    A utility function to get a unique directory name by appending
+    a number to the directory name if the directory already exists
+    (Note: the function does not create the directory, it only
+    returns the unique directory name)
+
+    :param dir_name: The directory name to get a unique name for
+    :return: The unique directory name
+    """
+    dir_name: str = str(dir_name)
+    counter: int = 1
+    new_dir_name: str = dir_name
+
+    while Path(new_dir_name).exists():
+        new_dir_name = f"{dir_name}_{counter}"
+        counter += 1
+    return new_dir_name
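
For reference, a minimal usage sketch of the new export entrypoint. The checkpoint path, save directory, and argument comments below are hypothetical placeholders for illustration and are not part of this PR:

# Hypothetical usage sketch of the export entrypoint added in this PR.
# "./my-gptq-quantized-model" stands in for any GPTQ-quantized SparseML
# checkpoint directory (or HuggingFace/SparseZoo stub).
from sparseml.transformers.utils.gptq_utils import export_vllm_compatible_checkpoint

export_vllm_compatible_checkpoint(
    model="./my-gptq-quantized-model",  # hypothetical local checkpoint directory
    tokenizer=None,                     # inferred from the model path when possible
    format="exllama",                   # "marlin" is declared but not implemented yet
    save_dir="./exllama_model",         # a unique suffixed name is used if it exists
    device="cuda",                      # falls back to cpu when cuda is unavailable
)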
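
A small self-contained sketch (not part of the diff) of how the first two transformations behave on a toy state_dict; the key names are invented here purely to mimic the "model.layers...self_attn/mlp" layout the transformations target:

# Illustrative sketch: exercise the renaming and tensor-adding transformations
# on a toy state_dict. Shapes and key names are made up for demonstration.
import torch

from sparseml.transformers.utils.gptq_utils.transformations import (
    add_exllama_tensors,
    transform_to_exllama_names,
)

toy_state_dict = {
    "model.layers.0.self_attn.q_proj.weight": torch.randn(16, 32),
    "model.layers.0.self_attn.q_proj.weight_fake_quant.scale": torch.ones(1),
    "model.layers.0.self_attn.q_proj.weight_fake_quant.zero_point": torch.zeros(1),
}

renamed = transform_to_exllama_names(toy_state_dict)
# keys become ...q_proj.qweight, ...q_proj.scales, ...q_proj.qzeros
with_aux = add_exllama_tensors(renamed)
# adds ...q_proj.bias (float16, shape [16]) and ...q_proj.g_idx (int32, shape [32])
print(sorted(with_aux.keys()))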
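
The packing performed by _pack_fp32_into_int32 stores eight 4-bit values per 32-bit word, with value j occupying bits 4j through 4j+3. A standalone sketch of that arithmetic, independent of the PR's helper and shown only to clarify the layout:

# Standalone sketch of the 4-bit packing layout used by the exllama format:
# eight 4-bit integers share one 32-bit word, value j at bit offset 4 * j.
import numpy

nibbles = numpy.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=numpy.uint32)  # int4 values

packed = numpy.uint32(0)
for j, value in enumerate(nibbles):
    packed |= value << numpy.uint32(4 * j)  # place value j at bit offset 4 * j

unpacked = [(int(packed) >> (4 * j)) & 0xF for j in range(8)]
assert unpacked == nibbles.tolist()
print(hex(int(packed)))  # 0x87654321 - the lowest nibble holds the first value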