Skip to content

Commit

Permalink
Compression UI Changes + One-shot/Finetune Support (#2194)
Browse files Browse the repository at this point in the history
* working implementation

* remove unneeded file

* update README

* clean up and docstrings

* finetuning and one-shot interface

* update README

* update save

* update README
  • Loading branch information
Sara Adkins authored Mar 22, 2024
1 parent b5fd814 commit ed2978e
Show file tree
Hide file tree
Showing 11 changed files with 249 additions and 124 deletions.
6 changes: 5 additions & 1 deletion src/sparseml/pytorch/model_load/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ def save_model_and_recipe(
save_path: str,
tokenizer: Optional[Any] = None,
save_safetensors: bool = False,
save_compressed: bool = False,
):
"""
Save a model, tokenizer and the currently loaded recipe to file
Expand All @@ -241,9 +242,12 @@ def save_model_and_recipe(
:param save_path: path to save output to
:param tokenizer: model tokenizer to save
:param save_safetensors: whether to save as safetensors or pickle (bin)
:param save_compressed: whether to compress sparse weights on disk
"""

model.save_pretrained(save_path, safe_serialization=save_safetensors)
model.save_pretrained(
save_path, save_compressed=save_compressed, safe_serialization=save_safetensors
)

if tokenizer is not None:
tokenizer.save_pretrained(save_path)
Expand Down
49 changes: 33 additions & 16 deletions src/sparseml/transformers/compression/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,16 @@ model = SparseAutoModelForCausalLM.from_pretrained(
```

Saving a compressed model with an explicitly provided compression config. The config
is saved to the model's `config.json` file
is saved to the model's `config.json` file. **Note:** the model must have been
initialized with `SparseAutoModelForCausalLM.from_pretrained()`.

```python
from sparseml.transformers.utils import SparseAutoModelForCausalLM
from sparseml.transformers.compression import BitmaskConfig

output_dir = "/PATH/TO/SAVE/COMPRESSED_MODEL"
sparsity_config = BitmaskConfig()

SparseAutoModelForCausalLM.save_pretrained(
model,
model.save_pretrained(
save_directory=output_dir,
sparsity_config=sparsity_config,
)
Expand All @@ -80,14 +79,13 @@ SparseAutoModelForCausalLM.save_pretrained(
Saving a compressed model, inferring the config from the model attributes

```python
SparseAutoModelForCausalLM.save_compressed(
model.save_compressed(
save_directory=output_dir,
)

# alternative
SparseAutoModelForCausalLM.save_pretrained(
model,
model.save_pretrained(
save_directory=output_dir,
save_compressed=True
)
Expand All @@ -97,19 +95,38 @@ Saving a model in the dense format, but still include a sparsity config in `conf
with global sparsity and sparsity structure information

```python
from sparseml.transformers.utils import SparseAutoModelForCausalLM
from sparseml.transformers.compression import DenseSparsityConfig

SparseAutoModelForCausalLM.save_pretrained(
model,
model.save_pretrained(
save_directory=output_dir,
sparsity_config=DenseSparsityConfig()
)
```

## Enable Compression During One-Shot and Sparse Finetuning
Models that are saved in a supported compressed format on disk will automatically be
decompressed when loaded as input to `sparseml.transformers.oneshot` or
`sparseml.transformers.train`

To enable compression on save after oneshot or finetuning, simply add the
`save_compressed=True` argument to `sparseml.transformers.oneshot` or
`sparseml.transformers.train`.

```python
from sparseml.transformers import train

train(
save_compressed=True,
model="neuralmagic/TinyLlama-1.1B-Chat-v1.0-pruned2.4",
recipe=RECIPE,
dataset=DATASET
)
```


## Example Code

Loads a 50% sparse model, compresses it using the inferred bitmask compression, then
Loads a 60% sparse model, compresses it using the inferred bitmask compression, then
reloads the compressed model.

```python
Expand All @@ -126,11 +143,10 @@ with measure_cuda_memory() as m:
model = SparseAutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="cuda:0")
print(f"Load dense model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB")

print(f"Sparsity config before compression: {model.sparsity_config}")
sparsity_config = getattr(model,"sparsity_config", None)
print(f"Sparsity config before compression: {sparsity_config}")
with measure_cuda_memory() as m:
SparseAutoModelForCausalLM.save_compressed(
model, OUTPUT_PATH
)
model.save_compressed(OUTPUT_PATH)
print(f"Save compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB")

torch.cuda.set_device(1)
Expand All @@ -139,5 +155,6 @@ with measure_cuda_memory() as m:
OUTPUT_PATH, device_map="cuda:1"
)
print(f"Load compressed model peak GPU {m.overall_peak_memory / float(2**30):.4f} GB")
print(f"Sparsity config after compression: {model_again.sparsity_config}")
sparsity_config = getattr(model_again,"sparsity_config", None)
print(f"Sparsity config after compression: {sparsity_config}")
```
1 change: 1 addition & 0 deletions src/sparseml/transformers/compression/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@

# flake8: noqa

from .compress_save import *
from .helpers import *
from .safetensors_load import *
155 changes: 155 additions & 0 deletions src/sparseml/transformers/compression/utils/compress_save.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import types
import weakref
from functools import wraps
from typing import Optional

from transformers import PreTrainedModel
from transformers.file_utils import CONFIG_NAME

from sparseml.transformers.compression.compressors import ModelCompressor
from sparseml.transformers.compression.config import CompressionConfig
from sparseml.transformers.utils.helpers import SPARSITY_CONFIG_NAME


# Public names of this module: the two helpers defined below that patch a
# PreTrainedModel instance with compression-aware save methods.
__all__ = [
"modify_save_pretrained",
"add_save_compressed_method",
]


def modify_save_pretrained(model: PreTrainedModel):
    """
    Overrides a PreTrainedModel's save_pretrained() method with a wrapped version that
    supports compression. The replacement is installed on the given instance only.
    Calling this function again on an already-modified instance leaves the existing
    wrapper in place.

    :param model: model instance whose save_pretrained() is replaced in place
    """

    def save_pretrained_compressed(save_pretrained_method):
        if getattr(save_pretrained_method, "_overridden", False):
            # `model.save_pretrained` has already been replaced, return.
            return save_pretrained_method

        # Keep a weak reference to the model and the unbound save_pretrained
        # function so we can call the original from inside the wrapper
        model_ref = weakref.ref(save_pretrained_method.__self__)
        original_save_pretrained = save_pretrained_method.__func__
        model_class = model_ref().__class__
        del save_pretrained_method

        @wraps(original_save_pretrained)
        def save_pretrained_wrapper(
            save_directory: str,
            sparsity_config: Optional[CompressionConfig] = None,
            save_compressed: bool = False,
            **kwargs,
        ):
            """
            Wrapper around PreTrainedModel.save_pretrained(), adds functionality for
            saving models in a compressed format on disk. The compression format is
            saved to the model's config file

            :param save_directory: output directory to save model to
            :param sparsity_config: optional sparsity config to compress model with,
                if no config is provided it will be inferred from the model
            :param save_compressed: whether or not to compress the model on disk
            :param kwargs: additional kwargs to pass on to model.save_pretrained
            :return: the result of the original save_pretrained() when the model is
                saved dense, otherwise None
            """
            model = model_ref()

            if sparsity_config is not None:
                # if a sparsity config is provided, always save compressed
                sparsity_config.fill_config_details(model)
                save_compressed = True
            elif save_compressed:
                # try to infer a sparsity config from the model if none is provided
                sparsity_config = CompressionConfig.infer_config_from_model(
                    model, compress=save_compressed
                )

            if sparsity_config is None:
                # no config was given or inferred, save as dense
                return original_save_pretrained.__get__(model, model_class)(
                    save_directory, **kwargs
                )

            # if we've gotten to this point we have a config so we can run compression
            kwargs["safe_serialization"] = True
            compressor = ModelCompressor.load_from_registry(
                sparsity_config.format, config=sparsity_config
            )

            # state_dict gets passed in as a kwarg for FSDP models
            state_dict = kwargs.get("state_dict", None)
            if state_dict is None:
                state_dict = model.state_dict()

            # only compress when there is a non-empty state dict to work on
            if state_dict is not None and len(state_dict) > 0:
                compressed_state_dict = compressor.compress(state_dict)
                kwargs["state_dict"] = compressed_state_dict

            original_save_pretrained.__get__(model, model_class)(
                save_directory, **kwargs
            )
            sparsity_config_data = sparsity_config.dict()
            config_file_path = os.path.join(save_directory, CONFIG_NAME)

            # add the sparsity config to the model's config file
            with open(config_file_path, "r", encoding="utf-8") as config_file:
                config_data = json.load(config_file)
            config_data[SPARSITY_CONFIG_NAME] = sparsity_config_data
            with open(config_file_path, "w", encoding="utf-8") as config_file:
                json.dump(config_data, config_file, indent=4, sort_keys=True)

        # the attribute name must match the one read by the guard at the top of
        # save_pretrained_compressed, otherwise a second call re-wraps the wrapper
        save_pretrained_wrapper._overridden = True
        return save_pretrained_wrapper

    # wrap save_pretrained
    model.save_pretrained = save_pretrained_compressed(model.save_pretrained)


def add_save_compressed_method(model: PreTrainedModel):
    """
    Attaches a save_compressed() method to a single PreTrainedModel instance. The
    new method forwards to the instance's save_pretrained() with
    save_compressed=True, so modify_save_pretrained() is required to have been run
    on the same instance beforehand

    :param model: model instance that receives the new bound method
    """

    def save_compressed(
        self,
        save_directory: str,
        sparsity_config: Optional[CompressionConfig] = None,
        **kwargs,
    ):
        """
        Shortcut for save_pretrained() that always requests the compressed
        on-disk format

        :param save_directory: output directory to save model to
        :param sparsity_config: optional sparsity config to compress model with, if
            omitted it is left as None and passed through to save_pretrained
        :param kwargs: additional kwargs to pass on to model.save_pretrained
        :return: whatever self.save_pretrained returns
        """
        save_fn = self.save_pretrained
        return save_fn(
            save_directory=save_directory,
            sparsity_config=sparsity_config,
            save_compressed=True,
            **kwargs,
        )

    # bind to this instance only
    bound_method = types.MethodType(save_compressed, model)
    model.save_compressed = bound_method
1 change: 1 addition & 0 deletions src/sparseml/transformers/finetune/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ def one_shot(self, stage: Optional[str] = None):
save_path=self._output_dir,
tokenizer=self.tokenizer,
save_safetensors=self._training_args.save_safetensors,
save_compressed=self._training_args.save_compressed,
)

def train(self, checkpoint: str, stage: Optional[str] = None):
Expand Down
14 changes: 9 additions & 5 deletions src/sparseml/transformers/finetune/session_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,25 +457,29 @@ def save_model(
:param output_dir: the path to save the recipes into
"""
self._check_super_defined("save_model")
super().save_model(output_dir=output_dir, _internal_call=_internal_call)

if session_manager.active_session() is None:
return # nothing to save

if output_dir is None:
output_dir = self.args.output_dir

# don't export the gathered model on checkpoints
if is_fsdp_model(self.model) and not _internal_call:
if not is_fsdp_model(self.model):
self.model.save_pretrained(
output_dir,
save_compressed=self.args.save_compressed,
safe_serialization=self.args.save_safetensors,
)
else: # FSDP model
save_pretrained_fsdp(
model=self.model,
accelerator=self.accelerator,
output_dir=output_dir,
save_compressed=self.args.save_compressed,
save_safetensors=self.metadata.get("save_safetensors", False),
)

self.save_state()
self.tokenizer.save_pretrained(output_dir)
if not _is_oneshot: # optimizer/scheduler not relevant to one-shot
self.save_optimizer_and_scheduler(output_dir)

Expand Down
8 changes: 4 additions & 4 deletions src/sparseml/transformers/finetune/text_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,10 +309,6 @@ def main(
if isinstance(tokenizer, str) or tokenizer is None:
tokenizer = initialize_tokenizer_from_path(model_args, model, teacher)

# setup new SparseSession unless user requests otherwise
if training_args.clear_sparse_session:
session_manager.create_session()
session_manager.active_session().reset()
session_manager.pre_initialize_structure(model=model, framework=Framework.pytorch)

# intialize session manager
Expand Down Expand Up @@ -381,6 +377,10 @@ def main(
if training_args.do_predict:
stage_runner.predict()

# Clean up the SparseSession before exit if requested
if training_args.clear_sparse_session:
session_manager.active_session().reset()


if __name__ == "__main__":
apply()
4 changes: 4 additions & 0 deletions src/sparseml/transformers/finetune/training_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ class TrainingArguments(HFTrainingArgs):
)
},
)
save_compressed: Optional[bool] = field(
default=False,
metadata={"help": "Whether to compress sparse models during save"},
)
do_oneshot: Optional[bool] = field(
default=False,
metadata={"help": "Whether to run one-shot calibration"},
Expand Down
Loading

0 comments on commit ed2978e

Please sign in to comment.