
SparseML Compression Pt 1: saving w/compression configs #2177

Merged · 22 commits · Mar 20, 2024
Changes from 15 commits
2 changes: 1 addition & 1 deletion setup.py
@@ -39,7 +39,7 @@
 _deps = [
     "setuptools<=59.5.0",
     "pyyaml>=5.0.0",
-    "numpy>=1.0.0",
+    "numpy>=1.17.0",
     "matplotlib>=3.0.0",
     "merge-args>=0.1.0",
     "onnx>=1.5.0,<1.15.0",
8 changes: 6 additions & 2 deletions src/sparseml/pytorch/model_load/helpers.py
@@ -229,17 +229,21 @@ def reload_model_from_checkpoint(model: Module, checkpoint: Optional[str] = None


 def save_model_and_recipe(
-    model: Module, save_path: str, tokenizer: Optional[Any] = None
+    model: Module,
+    save_path: str,
+    tokenizer: Optional[Any] = None,
+    save_safetensors: bool = False,
 ):
     """
     Save a model, tokenizer and the currently loaded recipe to file

     :param model: pytorch model to save
     :param save_path: path to save output to
     :param tokenizer: model tokenizer to save
+    :param save_safetensors: whether to save as safetensors or pickle (.bin)
     """

-    model.save_pretrained(save_path)
+    model.save_pretrained(save_path, safe_serialization=save_safetensors)

     if tokenizer is not None:
         tokenizer.save_pretrained(save_path)
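For reference, a minimal call exercising the new flag might look like the sketch below (the model stub path and output directory are hypothetical; any Hugging Face model/tokenizer pair works):

```python
from transformers import AutoTokenizer

from sparseml.pytorch.model_load.helpers import save_model_and_recipe
from sparseml.transformers import SparseAutoModelForCausalLM

# hypothetical model stub; substitute a real SparseZoo stub or local path
model = SparseAutoModelForCausalLM.from_pretrained("model_stub_or_path")
tokenizer = AutoTokenizer.from_pretrained("model_stub_or_path")

save_model_and_recipe(
    model=model,
    save_path="./exported_model",  # hypothetical output directory
    tokenizer=tokenizer,
    save_safetensors=True,  # write safetensors instead of pickled .bin weights
)
```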
1 change: 1 addition & 0 deletions src/sparseml/transformers/__init__.py
@@ -59,3 +59,4 @@ def _check_transformers_install():
 from .utils import *
 from .export import *
 from .finetune import *
+from .compression import *
67 changes: 67 additions & 0 deletions src/sparseml/transformers/compression/README.md
@@ -0,0 +1,67 @@
# Save/Load Compressed SafeTensors

## Motivation

* Reduce disk space by saving sparse models in a compressed format. Models in this compressed format will be loaded by vLLM for more efficient inference
* Set up the save/load architecture so that it can easily be extended to additional compression formats in the future. The config should be human readable so users can understand the compression format at a quick glance

## SafeTensors File Format

For each parameter in the uncompressed state_dict, we store the following attributes
needed for decompression in the compressed state_dict:

* compressed tensor
* bitmask
* uncompressed shape
* row offsets

```python
# dense
{
    PARAM_NAME: uncompressed_tensor
}

# compressed
{
    PARAM_NAME.compressed: compressed_tensor,  # 1d tensor of non-zero values
    PARAM_NAME.bitmask: value,  # 2d bitmask tensor (nrows x ceil(ncols / 8))
    PARAM_NAME.shape: value,  # uncompressed shape tensor
    PARAM_NAME.row_offsets: value,  # 1d offsets tensor
}
```
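As a hand-worked illustration (hypothetical parameter name `W`; bit values computed assuming the little-endian `numpy.packbits` ordering used by `pack_bitmasks`), a small 2x8 tensor would be stored as:

```python
# dense 2x8 tensor:
# [[0, 1, 0, 0, 2, 0, 0, 0],
#  [3, 0, 0, 0, 0, 0, 0, 4]]

# compressed representation:
{
    "W.compressed": [1, 2, 3, 4],  # non-zero values, row-major order
    "W.bitmask": [[0b00010010],    # row 0: bits 1 and 4 set (little-endian)
                  [0b10000001]],   # row 1: bits 0 and 7 set
    "W.shape": [2, 8],             # original dense shape
    "W.row_offsets": [0, 2],       # row i's values start at index row_offsets[i]
}
```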

## Example Code

```python
from sparseml.transformers import SparseAutoModelForCausalLM
from sparseml.transformers.compression import BitmaskConfig, BitmaskCompressor
from safetensors import safe_open
import os

MODEL_PATH = "zoo:llama2-7b-gsm8k_llama2_pretrain-pruned50.oneshot"
OUTPUT_PATH = "./test_compress_output"

model = SparseAutoModelForCausalLM.from_pretrained(MODEL_PATH)

sparsity_config = BitmaskConfig()
compressor = BitmaskCompressor(config=sparsity_config)

model_state_dict = model.state_dict()
sparse_state_dict = compressor.compress(model_state_dict)


model.save_pretrained(OUTPUT_PATH, safe_serialization=True, state_dict=sparse_state_dict)

safetensors_path = os.path.join(OUTPUT_PATH, "model-00001-of-00002.safetensors")
with safe_open(safetensors_path, framework="pt", device=0) as f:
    test_name = "model.layers.4.self_attn.k_proj.weight"
    bitmask = f.get_tensor(test_name + ".bitmask")
    shape = f.get_tensor(test_name + ".shape")
    values = f.get_tensor(test_name + ".compressed")
    row_offsets = f.get_tensor(test_name + ".row_offsets")
    print(f"bitmask: {bitmask}")
    print(f"shape: {shape}")
    print(f"values: {values}")
    print(f"row offsets: {row_offsets}")
```
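To sanity-check that a single tensor round-trips, something like the following sketch should work (note that `BitmaskTensor` lives in the `sparse_bitmask` module and is not re-exported from the package `__init__`):

```python
import torch

from sparseml.transformers.compression.compressors.sparse_bitmask import (
    BitmaskTensor,
)

dense = torch.tensor([[0.0, 1.5, 0.0], [2.0, 0.0, 3.0]])
compressed = BitmaskTensor.from_dense(dense)

# decompress() rebuilds the dense tensor from values + bitmask + shape
assert torch.equal(compressed.decompress(), dense)
print(compressed)  # BitmaskTensor(shape=torch.Size([2, 3]), compressed=True)
```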
18 changes: 18 additions & 0 deletions src/sparseml/transformers/compression/__init__.py
@@ -0,0 +1,18 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# flake8: noqa

from .compressors import *
from .config import *
18 changes: 18 additions & 0 deletions src/sparseml/transformers/compression/compressors/__init__.py
@@ -0,0 +1,18 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# flake8: noqa

from .base import ModelCompressor
from .sparse_bitmask import BitmaskCompressor
52 changes: 52 additions & 0 deletions src/sparseml/transformers/compression/compressors/base.py
@@ -0,0 +1,52 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict

from torch import Tensor

from sparseml.transformers.compression.config import CompressionConfig
from sparsezoo.utils.registry import RegistryMixin


__all__ = ["ModelCompressor"]


class ModelCompressor(RegistryMixin):
    """
    Base class representing a model compression algorithm.

    :param config: config specifying compression parameters
    """

    def __init__(self, config: CompressionConfig):
        self.config = config

    def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """
        Compresses a dense state dict

        :param model_state: state dict of uncompressed model
        :return: compressed state dict
        """
        raise NotImplementedError()

    def decompress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """
        Decompresses a compressed state dict back to dense

        :param model_state: state dict of compressed model
        :return: decompressed dense state dict
        """
        raise NotImplementedError()
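Since `ModelCompressor` mixes in `RegistryMixin`, registered subclasses can be looked up by name. A sketch, assuming sparsezoo's `RegistryMixin` exposes a `load_from_registry` helper (if not, direct instantiation of the subclass works the same way):

```python
from sparseml.transformers.compression import BitmaskConfig
from sparseml.transformers.compression.compressors import ModelCompressor

# "sparse_bitmask" matches the name passed to @ModelCompressor.register below
config = BitmaskConfig()
compressor = ModelCompressor.load_from_registry("sparse_bitmask", config=config)
```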
195 changes: 195 additions & 0 deletions src/sparseml/transformers/compression/compressors/sparse_bitmask.py
@@ -0,0 +1,195 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from typing import Dict, Tuple

import numpy
import torch
from torch import Tensor
from tqdm import tqdm

from sparseml.transformers.compression.compressors import ModelCompressor


__all__ = [
    "BitmaskCompressor",
    "BitmaskTensor",
    "bitmask_compress",
    "bitmask_decompress",
    "pack_bitmasks",
    "unpack_bitmasks",
]

_LOGGER: logging.Logger = logging.getLogger(__name__)


@ModelCompressor.register(name="sparse_bitmask")
class BitmaskCompressor(ModelCompressor):
"""
Compression for sparse models using bitmasks. Non-zero weights are stored in a 1d
values tensor, with their locations stored in a 2d bitmask
"""

def compress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""
Compresses a dense state dict using bitmask compression

:param model_state: state dict of uncompressed model
:return: compressed state dict
"""
compressed_dict = {}
_LOGGER.info(
f"Compressing model with {len(model_state)} parameterized layers..."
)
for name, value in tqdm(model_state.items()):
bitmask_tensor = BitmaskTensor(value)
compressed_dict |= bitmask_tensor.dict(name_prefix=name)
Satrat marked this conversation as resolved.
Show resolved Hide resolved

return compressed_dict

def decompress(self, model_state: Dict[str, Tensor]) -> Dict[str, Tensor]:
"""
Uncompresses a bitmask compressed state dict back to dense

:param model_state: state dict of uncompressed model
:return: compressed state dict
"""
raise NotImplementedError()


class BitmaskTensor:
    """
    Owns compression and decompression for a single bitmask compressed tensor.
    Adapted from: https://github.com/mgoin/torch_bitmask/tree/main

    :param tensor: dense tensor to compress
    """

    def __init__(self, tensor: Tensor):
        self.shape = tensor.shape
        self.values, self.bitmasks, self.row_offsets = bitmask_compress(tensor.cpu())

    def decompress(self) -> Tensor:
        """
        :return: reconstructed dense tensor
        """
        return bitmask_decompress(self.values, self.bitmasks, self.shape)

    @staticmethod
    def from_dense(tensor: Tensor) -> "BitmaskTensor":
        """
        :param tensor: dense tensor to compress
        :return: instantiated compressed tensor
        """
        return BitmaskTensor(tensor)

    def curr_memory_size_bytes(self):
        """
        :return: size in bytes required to store the compressed tensor on disk
        """

        def sizeof_tensor(a):
            return a.element_size() * a.nelement()

        return (
            sizeof_tensor(self.values)
            + sizeof_tensor(self.bitmasks)
            + sizeof_tensor(self.row_offsets)
        )

    def dict(self, name_prefix: str) -> Dict[str, Tensor]:
        """
        :param name_prefix: name of original tensor to store compressed weight as
        :return: dict of compressed data for the stored weight
        """
        return {
            name_prefix + ".compressed": self.values,
            name_prefix + ".bitmask": self.bitmasks,
            name_prefix + ".shape": torch.tensor(self.shape, device="cpu"),
            name_prefix + ".row_offsets": self.row_offsets,
        }

    def __repr__(self):
        return f"BitmaskTensor(shape={self.shape}, compressed=True)"


def bitmask_compress(tensor: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Compresses a dense tensor using bitmask compression

    :param tensor: dense tensor to compress
    :return: tuple of compressed data representing tensor
    """
    bytemasks = tensor != 0
    row_counts = bytemasks.sum(dim=-1)
    row_offsets = torch.cumsum(row_counts, 0) - row_counts
    values = tensor[bytemasks]
    bitmasks_packed = pack_bitmasks(bytemasks)

    return values, bitmasks_packed, row_offsets


def bitmask_decompress(
    values: Tensor, bitmasks: Tensor, original_shape: torch.Size
) -> Tensor:
    """
    Reconstructs a dense tensor from a compressed one

    :param values: 1d tensor of non-zero values
    :param bitmasks: 2d int8 tensor flagging locations of non-zero values in the
        tensor's original shape
    :param original_shape: shape of the dense tensor
    :return: decompressed dense tensor
    """
    bytemasks_unpacked = unpack_bitmasks(bitmasks, original_shape)

    decompressed_tensor = torch.zeros(original_shape, dtype=values.dtype)
    decompressed_tensor[bytemasks_unpacked] = values

    return decompressed_tensor


def pack_bitmasks(bytemasks: Tensor) -> Tensor:
    """
    Converts a bytemask tensor to a bitmask tensor to reduce memory. Shape RxC will be
    compressed to R x ceil(C/8)

    :param bytemasks: mask tensor where each byte corresponds to a weight
    :return: mask tensor where each bit corresponds to a weight
    """
    packed_bits_numpy = numpy.packbits(bytemasks.numpy(), axis=-1, bitorder="little")
    packed_bits_torch = torch.from_numpy(packed_bits_numpy)

    return packed_bits_torch


def unpack_bitmasks(packed_bitmasks: Tensor, original_shape: torch.Size) -> Tensor:
    """
    Converts a bitmask tensor back to a bytemask tensor for use during decompression

    :param packed_bitmasks: mask tensor where each bit corresponds to a weight
    :param original_shape: dense shape to decompress to
    :return: boolean mask of weights in the original dense shape
    """
    # Unpack the bits
    unpacked_bits = numpy.unpackbits(
        packed_bitmasks.numpy(), axis=-1, count=original_shape[-1], bitorder="little"
    )

    # Reshape to match the original shape
    unpacked_bitmasks_torch = torch.from_numpy(
        unpacked_bits.reshape(original_shape).astype(bool)
    )

    return unpacked_bitmasks_torch
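The packing helpers round-trip as well; a quick sketch exercising them directly on a random boolean mask:

```python
import torch

from sparseml.transformers.compression.compressors.sparse_bitmask import (
    pack_bitmasks,
    unpack_bitmasks,
)

mask = torch.rand(4, 16) > 0.5  # boolean bytemask, one entry per weight
packed = pack_bitmasks(mask)  # shape (4, 2): 16 bits -> 2 bytes per row
restored = unpack_bitmasks(packed, mask.shape)

assert packed.dtype == torch.uint8
assert torch.equal(restored, mask)
```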