diff --git a/src/sparseml/transformers/utils/gptq_utils/__init__.py b/src/sparseml/transformers/utils/gptq_utils/__init__.py
new file mode 100644
index 00000000000..d07ff5d5779
--- /dev/null
+++ b/src/sparseml/transformers/utils/gptq_utils/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# flake8: noqa
+
+from .vllm_export_helpers import *
diff --git a/src/sparseml/transformers/utils/gptq_utils/transformations.py b/src/sparseml/transformers/utils/gptq_utils/transformations.py
new file mode 100644
index 00000000000..36004f5f0f7
--- /dev/null
+++ b/src/sparseml/transformers/utils/gptq_utils/transformations.py
@@ -0,0 +1,327 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa #F821,#E501
+
+import functools
+import logging
+from typing import Callable, Dict, List, Tuple
+
+import numpy
+import torch
+from torch import Tensor
+
+
+__all__ = [
+    "transform_to_exllama_names",
+    "add_exllama_tensors",
+    "transform_gptq_weights_and_reshape_tensors",
+    "remove_unwanted_tensors_for_exllama",
+    "is_gptq_quantization_target",
+    "convert_fp32_tensors_to_fp16",
+    "gptq_exllama_transformations",
+    "GPTQ_EXLLAMA_TRANSFORMATIONS",
+]
+
+_LOGGER = logging.getLogger(__name__)
+
+TransformationType = Callable[[Dict[str, torch.Tensor]], Dict[str, torch.Tensor]]
+
+
+def _log_call(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        _LOGGER.debug("Applying transformation: %s", func.__name__.upper())
+        return_value = func(*args, **kwargs)
+        _LOGGER.debug("Transformation: %s complete", func.__name__.upper())
+        return return_value
+
+    return wrapper
+
+
+def is_gptq_quantization_target(key: str) -> bool:
+    """
+    Assumes self_attn and mlp are the only quantization targets
+    in model layers of the state_dict.
+
+    :param key: The key of the state_dict
+    :return: True if the key is a quantization target, False otherwise
+    """
+    return "model.layers" in key and ("self_attn" in key or "mlp" in key)
+
+
+@_log_call
+def transform_to_exllama_names(state_dict: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    """
+    Transforms the state_dict keys to match with exllama format
+
+    The renames include:
+        - weight_fake_quant.scale -> scales
+        - weight_fake_quant.zero_point -> qzeros
+        - weight -> qweight
+
+    Note: does not transform the actual tensor values
+
+    :pre-condition: The state_dict should be for a quantized model
+    :pre-condition: Targets only the weights of the self_attn and mlp nodes
+    :param state_dict: The quantized state_dict to be transformed
+    :return: The transformed state_dict
+    """
+    # mapping of the old names to the new names
+    name_map: Dict[str, str] = {
+        ".weight_fake_quant.scale": ".scales",
+        ".weight_fake_quant.zero_point": ".qzeros",
+        ".weight": ".qweight",
+    }
+
+    updated_state_dict: Dict[str, Tensor] = {}
+    for key, tensor in state_dict.items():
+        if is_gptq_quantization_target(key) and any(
+            key.endswith(target_suffix := suffix) for suffix in name_map
+        ):
+            updated_key = key.replace(target_suffix, name_map[target_suffix])
+            updated_state_dict[updated_key] = tensor
+        else:
+            updated_state_dict[key] = tensor
+    return updated_state_dict
+
+
+@_log_call
+def add_exllama_tensors(state_dict: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    """
+    Add the bias and g_idx tensors to the state_dict, wherever
+    a qweight tensor is present
+
+    The added tensors include:
+        - bias: A tensor of shape [output channels] filled with zeros
+            and dtype float16
+        - g_idx: A tensor of shape [num_channels] filled with zeros
+            and dtype int32
+
+    :pre-condition: The state_dict should be for a quantized model
+    :pre-condition: The state_dict should have been transformed to exllama names
+    :param state_dict: The state_dict to be transformed
+    :return: The state_dict with the added bias and g_idx tensors
+    """
+
+    updated_state_dict: Dict[str, Tensor] = {}
+
+    for key, tensor in state_dict.items():
+        if is_gptq_quantization_target(key) and key.endswith(".qweight"):
+            # add bias and g_idx tensors
+            bias_key = key.replace(".qweight", ".bias")
+            g_idx_key = key.replace(".qweight", ".g_idx")
+
+            # bias tensor
+            bias_tensor = torch.zeros(tensor.shape[0], dtype=torch.float16)
+            updated_state_dict[bias_key] = bias_tensor
+
+            # g_idx tensor of shape [num_channels] dtype int32 filled
+            # with zeros
+            g_idx_tensor = torch.zeros(tensor.shape[1], dtype=torch.int32)
+            updated_state_dict[g_idx_key] = g_idx_tensor
+
+        # copy the original tensor, (qweight is also copied in this step)
+        updated_state_dict[key] = tensor
+    return updated_state_dict
+
+
+@_log_call
+def transform_gptq_weights_and_reshape_tensors(
+    state_dict: Dict[str, Tensor]
+) -> Dict[str, Tensor]:
+    """
+    Transforms weights into their required shapes and types for Exllama format
+
+    The transformations include:
+        - Quantize the weight tensor using the scales, zeros, and g_idx tensors,
+            additionally pack a group of 8 of them into a single 32 bit integer
+            and rename the tensor to qweight
+        - Reshape the scales tensor to [1, x] and convert to fp16
+        - Reshape the zero points tensor to [1, x] of type int32 and fill with zeros
+            (it is assumed that quantization was symmetric)
+
+    :pre-condition: The state_dict should be for a quantized model
+    :pre-condition: The state_dict should have been transformed to exllama names
+    :pre-condition: The state_dict should have
+        the bias and g_idx tensors added
+    :param state_dict: The state_dict to be transformed
+    :return: The transformed state_dict, with repacked and reshaped tensors
+    """
+
+    transformed_state_dict: Dict[str, Tensor] = {}
+
+    # auxiliary dict to store transformed weights
+    transformed_weights_dict: Dict[str, Tensor] = {}
+
+    # quantize qweights before scales, and qzeros
+    # because the ordering in which tensors are fetched
+    # is not guaranteed by our implementation
+    for key, tensor in state_dict.items():
+        if is_gptq_quantization_target(key) and key.endswith(".qweight"):
+            # quantize the weight tensor
+            qweight = _pack_fp32_into_int32(
+                weight=tensor,
+                scales=state_dict[key.replace("qweight", "scales")],
+                zeros=state_dict[key.replace("qweight", "qzeros")],
+                g_idx=state_dict[key.replace("qweight", "g_idx")],
+            )
+            assert qweight.dtype == torch.int32
+            transformed_weights_dict[key] = qweight
+
+    # transform scales and zero points
+    for key, tensor in state_dict.items():
+        if is_gptq_quantization_target(key) and key.endswith(".scales"):
+            # scales [x] should be reshaped to [1, x]
+            # and converted to fp16
+            scales = tensor.reshape(1, -1).half()
+            transformed_state_dict[key] = scales
+        elif is_gptq_quantization_target(key) and key.endswith(".qzeros"):
+            # zero points [8x] should be reshaped to [1, x]
+            # of type int32 and filled with zeros (symmetric quantization)
+            zeros = torch.zeros(tensor.shape[0] // 8, dtype=torch.int32)
+            transformed_state_dict[key] = zeros.reshape(1, -1)
+        else:
+            transformed_state_dict[key] = tensor
+
+    # overwrite old weights with the new quantized weights
+    transformed_state_dict.update(transformed_weights_dict)
+
+    # auxiliary weights_dict not needed anymore
+    del transformed_weights_dict
+
+    return transformed_state_dict
+
+
+@_log_call
+def remove_unwanted_tensors_for_exllama(
+    state_dict: Dict[str, Tensor]
+) -> Dict[str, Tensor]:
+    """
+    Remove unwanted tensors from the state_dict that are not necessary for inference.
+    These tensors include:
+        - eps
+        - min_val
+        - max_val
+        - fake_quant_enabled
+        - observer_enabled
+
+    :param state_dict: The state_dict to be cleaned
+    :return: The cleaned state_dict with all keys ending with the unwanted suffixes removed
+    """
+    suffixes_to_delete: List[str] = [
+        "eps",
+        "min_val",
+        "max_val",
+        "fake_quant_enabled",
+        "observer_enabled",
+    ]
+    keys = list(state_dict.keys())
+    for key in keys:
+        if any(key.endswith(suffix) for suffix in suffixes_to_delete):
+            del state_dict[key]
+    return state_dict
+
+
+@_log_call
+def convert_fp32_tensors_to_fp16(state_dict: Dict[str, Tensor]) -> Dict[str, Tensor]:
+    """
+    Convert all remaining fp32 tensors to fp16 tensors in the state_dict
+    This is expected by the Exllama format
+
+    :param state_dict: The state_dict to be converted
+    :return: The converted state_dict, with all fp32 tensors converted to fp16
+    """
+    converted_state_dict: Dict[str, Tensor] = {}
+    for key, tensor in state_dict.items():
+        converted_state_dict[key] = (
+            tensor.half() if tensor.dtype == torch.float32 else tensor
+        )
+    return converted_state_dict
+
+
+def gptq_exllama_transformations() -> Tuple[TransformationType, ...]:
+    """
+    :return: An Iterable of transformations that must be applied to
+        the state_dict IN_ORDER to convert it to the Exllama format
+        for GPTQ style quantization. Each transformation is a
+        callable that accepts a state_dict and returns a transformed
+        state_dict.
+    """
+
+    return (
+        transform_to_exllama_names,
+        add_exllama_tensors,
+        transform_gptq_weights_and_reshape_tensors,
+        remove_unwanted_tensors_for_exllama,
+        convert_fp32_tensors_to_fp16,
+    )
+
+
+def _pack_fp32_into_int32(
+    weight: Tensor, scales: Tensor, zeros: Tensor, g_idx: Tensor
+) -> Tensor:
+    """
+    Quantizes the weight tensor using the scales, zeros, and g_idx tensors
+    into 4 bit integers, and packs a group of 8 of them into a single 32 bit integer.
+
+    Adapted from:
+    https://github.com/AutoGPTQ/AutoGPTQ/blob/ea4a99778f90b60c9b5177d7487af1b4ca87744f/auto_gptq/nn_modules/qlinear/qlinear_exllama.py#L118
+
+    :param weight: The weight tensor to be quantized of shape [x, 8y]
+    :param scales: The scales tensor
+    :param zeros: The zero points tensor
+    :param g_idx: The group index tensor
+    :return: The quantized weight tensor of int32 dtype and shape [x, y]
+    """
+    g_idx = g_idx.clone()
+
+    scales = scales.t().contiguous()
+    zeros = zeros.t().contiguous()
+    scale_zeros = zeros * scales
+    scales = scales.clone().half()
+    bits = 4
+
+    intweight = []
+    infeatures = weight.shape[1]
+    for idx in range(infeatures):
+        intweight.append(
+            torch.round(
+                (weight[:, idx] + scale_zeros[g_idx[idx]]) / scales[g_idx[idx]]
+            ).to(torch.int)[:, None]
+        )
+    intweight = torch.cat(intweight, dim=1)
+    intweight = intweight.t().contiguous()
+    intweight = intweight.numpy().astype(numpy.uint32)
+
+    i = 0
+    row = 0
+    qweight = numpy.zeros(
+        (intweight.shape[0] // 32 * bits, intweight.shape[1]), dtype=numpy.uint32
+    )
+    while row < qweight.shape[0]:
+        if bits in [4]:
+            for j in range(i, i + (32 // bits)):
+                qweight[row] |= intweight[j] << (bits * (j - i))
+            i += 32 // bits
+            row += 1
+        else:
+            raise NotImplementedError("Only 4 bits are supported.")
+
+    qweight = qweight.astype(numpy.int32)
+    qweight = torch.from_numpy(qweight)
+    return qweight
+
+
+GPTQ_EXLLAMA_TRANSFORMATIONS: Tuple[
+    TransformationType, ...
+] = gptq_exllama_transformations()
diff --git a/src/sparseml/transformers/utils/gptq_utils/vllm_export_helpers.py b/src/sparseml/transformers/utils/gptq_utils/vllm_export_helpers.py
new file mode 100644
index 00000000000..4a4d3ea51ac
--- /dev/null
+++ b/src/sparseml/transformers/utils/gptq_utils/vllm_export_helpers.py
@@ -0,0 +1,265 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+General utilities for exporting models to different formats using safe tensors.
+"""
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, Literal, Optional, Tuple, Union, get_args
+
+from torch import Tensor
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+from sparseml.pytorch.model_load.helpers import fallback_to_cpu
+from sparseml.transformers.utils.gptq_utils.transformations import (
+    GPTQ_EXLLAMA_TRANSFORMATIONS,
+)
+from sparseml.transformers.utils.sparse_model import SparseAutoModelForCausalLM
+from sparseml.transformers.utils.sparse_tokenizer import SparseAutoTokenizer
+from sparseml.utils import get_unique_dir_name
+
+
+__all__ = [
+    "export_vllm_compatible_checkpoint",
+    "SUPPORTED_FORMAT_TYPES",
+]
+
+SUPPORTED_FORMAT_TYPES = Literal["exllama", "marlin"]
+_LOGGER = logging.getLogger(__name__)
+
+
+def export_vllm_compatible_checkpoint(
+    model: Union[PreTrainedModel, str],
+    tokenizer: Union[PreTrainedTokenizerBase, str, None] = None,
+    format: SUPPORTED_FORMAT_TYPES = "exllama",
+    save_dir: Union[str, Path, None] = None,
+    device: str = "cuda",
+):
+    """
+    A utility function to export a GPTQ quantized model to safetensors,
+    compatible with the vLLM library.
+    Calls the appropriate state dict translation function based on the format
+    and saves the translated state dict to the specified directory.
+    If the directory is not specified, defaults to `<format>_model` under the
+    current working directory.
+    If the directory already exists, a new directory is created with a unique name.
+
+    :param model: The loaded model to be exported, can also be a local model
+        directory path, or a HuggingFace/SparseZoo stub
+    :param tokenizer: The tokenizer associated with the model, can also
+        be a HuggingFace/SparseZoo stub.
+    :param format: The format to which the model should be exported.
+        Default is "exllama".
+    :param save_dir: The directory where the model should be saved.
+    :param device: The device to use for the model. Default is "cuda".
+        If cuda is not available, it will fall back to cpu.
+    """
+
+    validate_specified_format(format=format)
+
+    model, tokenizer = _create_model_and_tokenizer(
+        model=model, tokenizer=tokenizer, device=device
+    )
+
+    _LOGGER.info(f"Translating state dict to {format} format.")
+    translated_state_dict: Dict[str, Any] = translate_state_dict(
+        state_dict=model.state_dict(), format=format
+    )
+
+    model.config.quantization_config = _QuantizationConfig()
+    _LOGGER.info(f"Added {format} quantization info to model.config")
+
+    if save_dir is None:
+        save_dir = Path.cwd() / f"{format}_model"
+
+    save_dir: str = get_unique_dir_name(dir_name=save_dir)
+
+    save_checkpoint(
+        model=model,
+        tokenizer=tokenizer,
+        state_dict=translated_state_dict,
+        save_dir=save_dir,
+    )
+
+
+def save_checkpoint(
+    model: PreTrainedModel,
+    state_dict: Dict[Any, Any],
+    save_dir: str,
+    tokenizer: Optional[PreTrainedTokenizerBase] = None,
+):
+    """
+    Saves the model and tokenizer to the specified directory,
+    with the specified state dict.
+
+    :param model: The model to be saved.
+    :param state_dict: The state dict to be saved.
+    :param save_dir: The directory where the model should be saved.
+    :param tokenizer: The tokenizer associated with the model. This will
+        be saved to the same directory as the model.
+    """
+    model.save_pretrained(
+        save_directory=save_dir, state_dict=state_dict, safe_serialization=True
+    )
+    _LOGGER.info(f"Model and config saved to {save_dir}")
+
+    if tokenizer:
+        tokenizer.save_pretrained(save_directory=save_dir)
+        _LOGGER.info(f"tokenizer saved to {save_dir}")
+
+
+def translate_state_dict(
+    state_dict: Dict[Any, Any], format: SUPPORTED_FORMAT_TYPES
+) -> Dict[Any, Any]:
+    """
+    A utility function to translate the state dict to the specified format.
+
+    :pre-condition: The format must be one of the supported formats.
+    :param state_dict: The state dict to be translated.
+    :param format: The format to which the state dict should be translated.
+    :return: The translated state dict.
+    """
+    if format == "exllama":
+        return _translate_state_dict_exllama(state_dict=state_dict)
+
+    # raise appropriate error if this function is called as a standalone
+    validate_specified_format(format=format)
+
+
+def validate_specified_format(format: SUPPORTED_FORMAT_TYPES):
+    """
+    Validates the specified format is supported and raises
+    an error if not.
+
+    :param format: The format to validate.
+    :raises ValueError: If the specified format is not supported.
+    :raises NotImplementedError: for marlin format.
+    """
+
+    # validate
+    if format not in get_args(SUPPORTED_FORMAT_TYPES):
+        raise ValueError(
+            f"Unsupported format {format}, supported formats "
+            f"are {SUPPORTED_FORMAT_TYPES}"
+        )
+
+    if format != "exllama":
+        raise NotImplementedError(f"Exporting to format {format} is not supported yet.")
+
+
+@dataclass(frozen=True)
+class _QuantizationConfig:
+    """
+    A dataclass to hold the quantization configuration for the model.
+    This class is specific to GPTQ style quantization, and an instance
+    of this class can be added to the model.config.quantization_config
+    to enable the model to be exported to Exllama format.
+
+    Right now, the defaults are specific to sparseml GPTQ quantization.
+    In future versions we may support more general quantization configurations.
+
+    This class is frozen to prevent modification of the instance after creation.
+    """
+
+    bits: int = field(default=4, metadata={"choices": [2, 3, 4, 8]})
+    group_size: int = field(default=-1)
+    damp_percent: float = field(default=0.01)
+    desc_act: bool = field(default=False)
+    sym: bool = field(default=True)
+    is_marlin_format: bool = field(default=False)
+
+    def to_dict(self):
+        return {
+            "bits": self.bits,
+            "group_size": self.group_size,
+            "desc_act": self.desc_act,
+            "sym": self.sym,
+            "is_marlin_format": self.is_marlin_format,
+            "quant_method": "gptq",
+        }
+
+
+def _translate_state_dict_exllama(state_dict: Dict[str, Any]) -> Dict[Any, Any]:
+    """
+    Translate the state dict to the Exllama format.
+
+    Changes made to quantized params in the passed state_dict:
+        - weight tensor renamed to qweight, and the corresponding tensor
+            value of shape [x, 8y] will be repacked to [x, y]
+        - scale tensor renamed to scales, and the corresponding tensor
+            value of shape [8x] will be reshaped to [1, 8x] and
+            then repacked to [1, x]
+        - zero_point tensor renamed to qzeros, and the corresponding tensor
+            value of shape [x] will be reshaped to [1, x]
+        - A g_idx tensor of shape [num_channels] will be added to the
+            state_dict, this tensor will be filled with zeros
+        - All fake quantization parameters will be removed from the state_dict
+
+    :param state_dict: The model state dict to be translated.
+    :return: The translated state dict compatible with Exllama.
+    """
+
+    state_dict_copy = {}
+    for transformation in GPTQ_EXLLAMA_TRANSFORMATIONS:
+        state_dict_copy: Dict[str, Tensor] = transformation(
+            state_dict=state_dict_copy or state_dict
+        )
+
+    return state_dict_copy
+
+
+def _create_model_and_tokenizer(
+    model: Union[PreTrainedModel, str],
+    tokenizer: Union[PreTrainedTokenizerBase, str, None] = None,
+    device: str = "cuda",
+) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
+    """
+    Create/infer model and tokenizer instances from the passed
+    in model and tokenizer. Additionally moves the model to the
+    specified device.
+
+    :param model: The model to be exported, can also be
+        path to a local model directory or a HuggingFace/SparseZoo stub
+    :param tokenizer: The tokenizer associated with the model,
+        can also be a HuggingFace/SparseZoo stub, if not passed in,
+        it will be inferred from the model. An error will be raised if it
+        cannot be inferred.
+    :param device: The device to use for the model. Default is "cuda".
+        If cuda is not available, it will fall back to cpu.
+    :return: A tuple of (model, tokenizer) instances. If both were
+        passed into this function, they are returned as is.
+        If tokenizer was not passed in, it is inferred from the
+        model path/stub
+    """
+    if isinstance(tokenizer, str):
+        # tokenizer from its own path/stub
+        tokenizer = SparseAutoTokenizer.from_pretrained(tokenizer)
+
+    if tokenizer is None and isinstance(model, str):
+        # tokenizer from model path/stub
+        tokenizer = SparseAutoTokenizer.from_pretrained(model)
+
+    if tokenizer is None:
+        raise ValueError(
+            "tokenizer not passed in and could not be inferred from model. "
+            "Please pass in a tokenizer."
+        )
+
+    if isinstance(model, str):
+        model = SparseAutoModelForCausalLM.from_pretrained(model)
+
+    # move model to gpu if available
+    model.to(fallback_to_cpu(device=device))
+
+    return model, tokenizer
diff --git a/src/sparseml/utils/helpers.py b/src/sparseml/utils/helpers.py
index 6c1d4f3ad6c..0c5b4d8330e 100644
--- a/src/sparseml/utils/helpers.py
+++ b/src/sparseml/utils/helpers.py
@@ -74,6 +74,7 @@
     "parse_kwarg_tuples",
     "download_zoo_training_dir",
     "is_package_available",
+    "get_unique_dir_name",
 ]
 
 
@@ -974,3 +975,23 @@
         return package_exists, package_version
     else:
        return package_exists
+
+
+def get_unique_dir_name(dir_name: Union[str, Path]) -> str:
+    """
+    A utility function to get a unique directory name by appending
+    a number to the directory name if the directory already exists
+    (Note: the function does not create the directory, it only
+    returns the unique directory name)
+
+    :param dir_name: The directory name to get a unique name for
+    :return: The unique directory name
+    """
+    dir_name: str = str(dir_name)
+    counter: int = 1
+    new_dir_name: str = dir_name
+
+    while Path(new_dir_name).exists():
+        new_dir_name = f"{dir_name}_{counter}"
+        counter += 1
+    return new_dir_name
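
For reference, a minimal usage sketch of the new export entrypoint. The checkpoint path, save directory, and argument comments below are hypothetical placeholders for illustration and are not part of this PR:

# Hypothetical usage sketch of the export entrypoint added in this PR.
# "./my-gptq-quantized-model" stands in for any GPTQ-quantized SparseML
# checkpoint directory (or HuggingFace/SparseZoo stub).
from sparseml.transformers.utils.gptq_utils import export_vllm_compatible_checkpoint

export_vllm_compatible_checkpoint(
    model="./my-gptq-quantized-model",  # hypothetical local checkpoint directory
    tokenizer=None,                     # inferred from the model path when possible
    format="exllama",                   # "marlin" is declared but not implemented yet
    save_dir="./exllama_model",         # a unique suffixed name is used if it exists
    device="cuda",                      # falls back to cpu when cuda is unavailable
)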
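
A small self-contained sketch (not part of the diff) of how the first two transformations behave on a toy state_dict; the key names are invented here purely to mimic the "model.layers...self_attn/mlp" layout the transformations target:

# Illustrative sketch: exercise the renaming and tensor-adding transformations
# on a toy state_dict. Shapes and key names are made up for demonstration.
import torch

from sparseml.transformers.utils.gptq_utils.transformations import (
    add_exllama_tensors,
    transform_to_exllama_names,
)

toy_state_dict = {
    "model.layers.0.self_attn.q_proj.weight": torch.randn(16, 32),
    "model.layers.0.self_attn.q_proj.weight_fake_quant.scale": torch.ones(1),
    "model.layers.0.self_attn.q_proj.weight_fake_quant.zero_point": torch.zeros(1),
}

renamed = transform_to_exllama_names(toy_state_dict)
# keys become ...q_proj.qweight, ...q_proj.scales, ...q_proj.qzeros
with_aux = add_exllama_tensors(renamed)
# adds ...q_proj.bias (float16, shape [16]) and ...q_proj.g_idx (int32, shape [32])
print(sorted(with_aux.keys()))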
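
The packing performed by _pack_fp32_into_int32 stores eight 4-bit values per 32-bit word, with value j occupying bits 4j through 4j+3. A standalone sketch of that arithmetic, independent of the PR's helper and shown only to clarify the layout:

# Standalone sketch of the 4-bit packing layout used by the exllama format:
# eight 4-bit integers share one 32-bit word, value j at bit offset 4 * j.
import numpy

nibbles = numpy.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=numpy.uint32)  # int4 values

packed = numpy.uint32(0)
for j, value in enumerate(nibbles):
    packed |= value << numpy.uint32(4 * j)  # place value j at bit offset 4 * j

unpacked = [(int(packed) >> (4 * j)) & 0xF for j in range(8)]
assert unpacked == nibbles.tolist()
print(hex(int(packed)))  # 0x87654321 - the lowest nibble holds the first value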