Compression UI Changes + One-shot/Finetune Support (#2194)

* working implementation * remove unneeded file * update README * clean up and docstrings * finetuning and one-shot interface * update README * update save * update README
neuralmagic · Mar 22, 2024 · ed2978e · ed2978e
1 parent b5fd814
commit ed2978e
Show file tree

Hide file tree

Showing 11 changed files with 249 additions and 124 deletions.
diff --git a/src/sparseml/pytorch/model_load/helpers.py b/src/sparseml/pytorch/model_load/helpers.py
@@ -233,6 +233,7 @@ def save_model_and_recipe(
     save_path: str,
     tokenizer: Optional[Any] = None,
     save_safetensors: bool = False,
+    save_compressed: bool = False,
 ):
     """
     Save a model, tokenizer and the currently loaded recipe to file
@@ -241,9 +242,12 @@ def save_model_and_recipe(
     :param save_path: path to save output to
     :param tokenizer: model tokenizer to save
     :param save_safetensors: whether to save as safetensors or pickle (bin)
+    :param save_compressed: whether to compress sparse weights on disk
     """
 
-    model.save_pretrained(save_path, safe_serialization=save_safetensors)
+    model.save_pretrained(
+        save_path, save_compressed=save_compressed, safe_serialization=save_safetensors
+    )
 
     if tokenizer is not None:
         tokenizer.save_pretrained(save_path)

diff --git a/src/sparseml/transformers/compression/README.md b/src/sparseml/transformers/compression/README.md
@@ -61,17 +61,16 @@ model = SparseAutoModelForCausalLM.from_pretrained(
 ```
 
 Saving a compressed model with an explicitly provided compression config. The config
-is saved to the model's `config.json` file
+is saved to the model's `config.json` file. **Note:** the model must have been 
+initialized with `SparseAutoModelForCausalLM.from_pretrained()`.
 
 ```python
-from sparseml.transformers.utils import SparseAutoModelForCausalLM
 from sparseml.transformers.compression import BitmaskConfig
 
 output_dir = "/PATH/TO/SAVE/COMPRESSED_MODEL"
 sparsity_config = BitmaskConfig()
 
-SparseAutoModelForCausalLM.save_pretrained(
-    model,
+model.save_pretrained(
     save_directory=output_dir,
     sparsity_config=sparsity_config,
 )
@@ -80,14 +79,13 @@ SparseAutoModelForCausalLM.save_pretrained(
 Saving a compressed model, inferring the config from the model attributes
 
 ```python
-SparseAutoModelForCausalLM.save_compressed(
+model.save_compressed(
     model,
     save_directory=output_dir,
 )
 
 # alternative
-SparseAutoModelForCausalLM.save_pretrained(
-    model,
+model.save_pretrained(
     save_directory=output_dir,
     save_compressed=True
 )
@@ -97,19 +95,38 @@ Saving a model in the dense format, but still include a sparsity config in `conf
 with global sparsity and sparsity structure information
 
 ```python
-from sparseml.transformers.utils import SparseAutoModelForCausalLM
 from sparseml.transformers.compression import DenseSparsityConfig
 
-SparseAutoModelForCausalLM.save_pretrained(
-    model,
+model.save_pretrained(
     save_directory=output_dir,
     sparsity_config=DenseSparsityConfig()
 )
 ```
 
+## Enable Compression During One-Shot and Sparse Finetuning
+Models that are saved in a supported compressed format on disk will automatically be
+decompressed when loaded as input to `sparseml.transformers.oneshot` or 
+`sparseml.transformers.train`
+
+To enable compression on save after oneshot or finetuning, simply add the 
+`save_compressed=True` argument to `sparseml.transformers.oneshot` or 
+`sparseml.transformers.train`.
+
+```python
+from sparseml.transformers import train
+
+train(
+    save_compressed=True,
+    model="neuralmagic/TinyLlama-1.1B-Chat-v1.0-pruned2.4",
+    recipe=RECIPE,
+    dataset=DATASET
+)
+```
+
+
 ## Example Code
 
-Loads a 50% sparse model, compresses it using the inferred bitmask compression, then 
+Loads a 60% sparse model, compresses it using the inferred bitmask compression, then 
 reloads the compressed model.
 
 ```python
@@ -126,11 +143,10 @@ with measure_cuda_memory() as m:
     model = SparseAutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="cuda:0")
 print(f"Load dense model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB")
 
-print(f"Sparsity config before compression: {model.sparsity_config}")
+sparsity_config = getattr(model,"sparsity_config", None)
+print(f"Sparsity config before compression: {sparsity_config}")
 with measure_cuda_memory() as m:
-    SparseAutoModelForCausalLM.save_compressed(
-        model, OUTPUT_PATH
-    )
+    model.save_compressed(OUTPUT_PATH)
 print(f"Save compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB")
 
 torch.cuda.set_device(1)
@@ -139,5 +155,6 @@ with measure_cuda_memory() as m:
         OUTPUT_PATH, device_map="cuda:1"
     )
 print(f"Load compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB")
-print(f"Sparsity config after compression: {model_again.sparsity_config}")
+sparsity_config = getattr(model_again,"sparsity_config", None)
+print(f"Sparsity config after compression: {sparsity_config}")
 ```
diff --git a/src/sparseml/transformers/compression/utils/__init__.py b/src/sparseml/transformers/compression/utils/__init__.py
@@ -14,5 +14,6 @@
 
 # flake8: noqa
 
+from .compress_save import *
 from .helpers import *
 from .safetensors_load import *
diff --git a/src/sparseml/transformers/compression/utils/compress_save.py b/src/sparseml/transformers/compression/utils/compress_save.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import types
+import weakref
+from functools import wraps
+from typing import Optional
+
+from transformers import PreTrainedModel
+from transformers.file_utils import CONFIG_NAME
+
+from sparseml.transformers.compression.compressors import ModelCompressor
+from sparseml.transformers.compression.config import CompressionConfig
+from sparseml.transformers.utils.helpers import SPARSITY_CONFIG_NAME
+
+
+__all__ = [
+    "modify_save_pretrained",
+    "add_save_compressed_method",
+]
+
+
+def modify_save_pretrained(model: PreTrainedModel):
+    """
+    Replaces save_pretrained() on the given model instance with a wrapper that adds
+    compressed saving. Calling this again on an already-wrapped model is a no-op
+    """
+
+    def save_pretrained_compressed(save_pretrained_method):
+        if getattr(save_pretrained_method, "_overridden", False):
+            # `model.save_pretrained` has already been replaced, return.
+            return save_pretrained_method
+
+        # Hold only a weak reference to the model, plus the unbound original
+        # save_pretrained, so the wrapper stored on the model can call the original
+        model_ref = weakref.ref(save_pretrained_method.__self__)
+        original_save_pretrained = save_pretrained_method.__func__
+        model_class = model_ref().__class__
+        del save_pretrained_method
+
+        @wraps(original_save_pretrained)
+        def save_pretrained_wrapper(
+            save_directory: str,
+            sparsity_config: Optional[CompressionConfig] = None,
+            save_compressed: bool = False,
+            **kwargs,
+        ):
+            """
+            Wrapper around PreTrainedModel.save_pretrained(), adds functionality for
+            saving models in a compressed format on disk. The compression format is
+            saved to the model's config file
+
+            :param save_directory: output directory to save model to
+            :param sparsity_config: optional sparsity config to compress model with,
+            if no config is provided it will be inferred from the model
+            :param save_compressed: whether or not to compress the model on disk
+            :param kwargs: additional kwargs to pass on to model.save_pretrained
+            """
+            model = model_ref()
+
+            if sparsity_config is not None:
+                # if a sparsity config is provided, always save compressed
+                sparsity_config.fill_config_details(model)
+                save_compressed = True
+            elif save_compressed:
+                # try to infer a sparsity config from the model if none is provided
+                sparsity_config = CompressionConfig.infer_config_from_model(
+                    model, compress=save_compressed
+                )
+
+            if sparsity_config is None:
+                # model is not sparse, save as dense
+                return original_save_pretrained.__get__(model, model_class)(
+                    save_directory, **kwargs
+                )
+
+            # if we've gotten to this point we have a config so we can run compression
+            kwargs["safe_serialization"] = True
+            compressor = ModelCompressor.load_from_registry(
+                sparsity_config.format, config=sparsity_config
+            )
+
+            # state_dict gets passed in as a kwarg for FSDP models
+            state_dict = kwargs.get("state_dict", None)
+            if state_dict is None:
+                state_dict = model.state_dict()
+
+            # only save when there are weights to write (presumably main process only)
+            if state_dict is not None and len(state_dict) > 0:
+                compressed_state_dict = compressor.compress(state_dict)
+                kwargs["state_dict"] = compressed_state_dict
+
+                original_save_pretrained.__get__(model, model_class)(
+                    save_directory, **kwargs
+                )
+                sparsity_config_data = sparsity_config.dict()
+                config_file_path = os.path.join(save_directory, CONFIG_NAME)
+
+                # add the sparsity config to the model's config file
+                with open(config_file_path, "r") as config_file:
+                    config_data = json.load(config_file)
+                config_data[SPARSITY_CONFIG_NAME] = sparsity_config_data
+                with open(config_file_path, "w") as config_file:
+                    json.dump(config_data, config_file, indent=4, sort_keys=True)
+
+        save_pretrained_wrapper._overridden = True  # read by the guard above
+        return save_pretrained_wrapper
+
+    # wrap save_pretrained
+    model.save_pretrained = save_pretrained_compressed(model.save_pretrained)
+
+
+def add_save_compressed_method(model: PreTrainedModel):
+    """
+    Attaches a save_compressed() method to a PreTrainedModel instance. The method
+    delegates to model.save_pretrained() with save_compressed=True, so it requires
+    modify_save_pretrained() to have already been run on the model instance
+    """
+
+    def save_compressed(
+        self,
+        save_directory: str,
+        sparsity_config: Optional[CompressionConfig] = None,
+        **kwargs,
+    ):
+        """
+        Alias for the model's save_pretrained() that always passes
+        save_compressed=True
+
+        :param save_directory: output directory to save model to
+        :param sparsity_config: optional sparsity config to compress model with, if no
+            config is provided it will be inferred from the model
+        :param kwargs: additional kwargs to pass on to model.save_pretrained
+        """
+        return self.save_pretrained(
+            save_directory=save_directory,
+            sparsity_config=sparsity_config,
+            save_compressed=True,
+            **kwargs,
+        )
+
+    model.save_compressed = types.MethodType(save_compressed, model)
diff --git a/src/sparseml/transformers/finetune/runner.py b/src/sparseml/transformers/finetune/runner.py
@@ -194,6 +194,7 @@ def one_shot(self, stage: Optional[str] = None):
                 save_path=self._output_dir,
                 tokenizer=self.tokenizer,
                 save_safetensors=self._training_args.save_safetensors,
+                save_compressed=self._training_args.save_compressed,
             )
 
     def train(self, checkpoint: str, stage: Optional[str] = None):

diff --git a/src/sparseml/transformers/finetune/session_mixin.py b/src/sparseml/transformers/finetune/session_mixin.py
@@ -457,25 +457,29 @@ def save_model(
 
         :param output_dir: the path to save the recipes into
         """
-        self._check_super_defined("save_model")
-        super().save_model(output_dir=output_dir, _internal_call=_internal_call)
-
         if session_manager.active_session() is None:
             return  # nothing to save
 
         if output_dir is None:
             output_dir = self.args.output_dir
 
-        # don't export the gathered model on checkpoints
-        if is_fsdp_model(self.model) and not _internal_call:
+        if not is_fsdp_model(self.model):
+            self.model.save_pretrained(
+                output_dir,
+                save_compressed=self.args.save_compressed,
+                safe_serialization=self.args.save_safetensors,
+            )
+        else:  # FSDP model
             save_pretrained_fsdp(
                 model=self.model,
                 accelerator=self.accelerator,
                 output_dir=output_dir,
+                save_compressed=self.args.save_compressed,
                 save_safetensors=self.metadata.get("save_safetensors", False),
             )
 
         self.save_state()
+        self.tokenizer.save_pretrained(output_dir)
         if not _is_oneshot:  # optimizer/scheduler not relevant to one-shot
             self.save_optimizer_and_scheduler(output_dir)
 

diff --git a/src/sparseml/transformers/finetune/text_generation.py b/src/sparseml/transformers/finetune/text_generation.py
@@ -309,10 +309,6 @@ def main(
     if isinstance(tokenizer, str) or tokenizer is None:
         tokenizer = initialize_tokenizer_from_path(model_args, model, teacher)
 
-    # setup new SparseSession unless user requests otherwise
-    if training_args.clear_sparse_session:
-        session_manager.create_session()
-        session_manager.active_session().reset()
     session_manager.pre_initialize_structure(model=model, framework=Framework.pytorch)
 
     # intialize session manager
@@ -381,6 +377,10 @@ def main(
     if training_args.do_predict:
         stage_runner.predict()
 
+    # Clean up the SparseSession before exit if requested
+    if training_args.clear_sparse_session:
+        session_manager.active_session().reset()
+
 
 if __name__ == "__main__":
     apply()
diff --git a/src/sparseml/transformers/finetune/training_args.py b/src/sparseml/transformers/finetune/training_args.py
@@ -54,6 +54,10 @@ class TrainingArguments(HFTrainingArgs):
             )
         },
     )
+    save_compressed: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Whether to compress sparse models during save"},
+    )
     do_oneshot: Optional[bool] = field(
         default=False,
         metadata={"help": "Whether to run one-shot calibration"},