support vision datasets
kylesayrs committed Nov 5, 2024
1 parent f137347 commit 593d4fd
Showing 8 changed files with 202 additions and 27 deletions.
6 changes: 3 additions & 3 deletions examples/quantization_w4a16/llama3_example.py
@@ -9,7 +9,7 @@
 
 model = SparseAutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    device_map="auto",
+    device_map="cuda:0",
     torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -20,7 +20,7 @@
 
 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 512
+NUM_CALIBRATION_SAMPLES = 160 #2048
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.
@@ -55,7 +55,7 @@ def tokenize(sample):
 
 # Configure the quantization algorithm to run.
 # * quantize the weights to 4 bit with GPTQ with a group size 128
-recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"], batch_size=-1, dampening_frac=0.5)
 
 # Apply algorithms.
 oneshot(
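For readers unfamiliar with the W4A16 scheme named in the recipe: weights are stored as 4-bit integers with one scale per group of 128 weights, while activations stay in 16-bit. The sketch below is an illustration only, a plain round-to-nearest groupwise quantizer assuming a symmetric int4 grid; GPTQ additionally applies Hessian-based error compensation, which this sketch omits.

import torch

def quantize_w4_groupwise(weight: torch.Tensor, group_size: int = 128):
    """Round-to-nearest symmetric 4-bit quantization, one scale per group of weights."""
    out_features, in_features = weight.shape
    w = weight.reshape(out_features, in_features // group_size, group_size)

    # int4 symmetric range is [-8, 7]; map each group's largest magnitude onto it
    scale = w.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 7.0
    q = torch.clamp(torch.round(w / scale), -8, 7)
    w_dq = q * scale  # dequantized weights as they would be used at inference

    return q.reshape_as(weight), w_dq.reshape_as(weight), scale

weight = torch.randn(256, 512)
q, w_dq, scale = quantize_w4_groupwise(weight)
print("mean abs quantization error:", (weight - w_dq).abs().mean().item())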
83 changes: 83 additions & 0 deletions examples/quantization_w4a16/vision2_example.py
@@ -0,0 +1,83 @@
from datasets import load_dataset
from transformers import AutoProcessor, MllamaForConditionalGeneration

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"

model = MllamaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="cuda:0",
    torch_dtype="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 160 #2048
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def preprocess(example):
    return {
        "text": processor.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return processor(
        None,
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm to run.
# * quantize the weights to 4 bit with GPTQ with a group size 128
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"], batch_size=1, dampening_frac=0.5)

# Apply algorithms.
oneshot(
    model=model,
    tokenizer=MODEL_ID,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = processor("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(processor.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
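Note that the sample-generation block in this file sends a text-only prompt through the processor. A hedged sketch of a multimodal sanity check is shown below; it is not part of this commit, it reuses the model and processor loaded earlier in the example, and the grey placeholder image stands in for real input.

from PIL import Image

# assumes `model` and `processor` are the Mllama objects loaded above
image = Image.new("RGB", (560, 560), color=(127, 127, 127))  # placeholder image
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What does the image show?"},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(output[0], skip_special_tokens=True))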
88 changes: 88 additions & 0 deletions examples/quantization_w4a16/vision_example.py
@@ -0,0 +1,88 @@
from datasets import load_dataset
from transformers import AutoProcessor

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"

model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda:0",
    torch_dtype="auto",
)
breakpoint()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Select calibration dataset.
DATASET_ID = "lmms-lab/flickr30k"
DATASET_SPLIT = "test[:165]"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 165 #2048
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def preprocess(example):
    messages = [
        [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What does the image show?"}
                ]
            }
        ],
    ]
    return {
        "text": processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
        ),
    }


ds = ds.map(preprocess)


# Tokenize inputs.
def tokenize(sample):
    return processor(sample["image"], sample["text"], add_special_tokens=False, return_tensors="pt", max_length=MAX_SEQUENCE_LENGTH)


ds = ds.map(tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm to run.
# * quantize the weights to 4 bit with GPTQ with a group size 128
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"], batch_size=-1, dampening_frac=0.5)

# Apply algorithms.
oneshot(
    model=model,
    tokenizer=MODEL_ID,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    trust_remote_code_model=True,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = processor("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(processor.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
3 changes: 2 additions & 1 deletion src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -209,7 +209,7 @@ def on_initialize(self, state: "State", **kwargs) -> bool:
             self.calibration_forward(state.model, state.data.calib)
         finally:
             pass
-            #torch.cuda.memory._dump_snapshot("bs256.pickle")
+            #torch.cuda.memory._dump_snapshot("bs10.pickle")
             #torch.cuda.memory._record_memory_history(enabled=None)
             #exit(0)
 
@@ -306,6 +306,7 @@ def compress_module(
         logger.info(f"Using {inp.size(0)} samples")
 
         with align_module(module):
+            print(inp.shape)
             loss, quantized_weight, _scale, _zero_point, _g_idx = quantize_weight(
                 module.weight.data,
                 inp,
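The commented-out lines in the first hunk use PyTorch's CUDA memory-history tooling. For reference, a minimal sketch of how that snapshot workflow is typically wired up is below; these are underscore-prefixed (private) APIs and may change between PyTorch releases, and the file name is arbitrary.

import torch

if torch.cuda.is_available():
    # start recording allocation history
    torch.cuda.memory._record_memory_history()

    # ... run the workload to profile, e.g. a calibration forward pass ...
    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x

    # write a snapshot that can be inspected with the pytorch.org/memory_viz viewer
    torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")

    # stop recording
    torch.cuda.memory._record_memory_history(enabled=None)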
24 changes: 12 additions & 12 deletions src/llmcompressor/modifiers/utils/hooks.py
@@ -124,11 +124,11 @@ def register_hooks(self, model: torch.nn.Module):
         # if no targets are provided, default to the modules that shouldn't be
         # split by FSDP. For Transformers models this is equivalent to the
         # decoder layers (ie LlamaDecoderLayer)
-        sequential_targets = self.sequential_targets
-        if sequential_targets is None:
-            sequential_targets = get_no_split_params(model)
-        layers = get_layers(sequential_targets, model)
-        self._num_layers = len(layers)
+        # sequential_targets = self.sequential_targets
+        # if sequential_targets is None:
+        #     sequential_targets = get_no_split_params(model)
+        # layers = get_layers(sequential_targets, model)
+        # self._num_layers = len(layers)
 
         for name, module in model.named_modules():
             if getattr_chain(module, "quantization_scheme.weights", None) is not None:
@@ -139,13 +139,13 @@
 
                 self.pre_compress_module(module)
 
-            if name in layers.keys():
-                pre_hook = partial(self.layer_pre_forward, name)
-                post_hook = partial(self.layer_post_forward, name)
-                self.register_hook(module.register_forward_pre_hook(pre_hook, with_kwargs=True))
-                self.register_hook(
-                    module.register_forward_hook(post_hook, with_kwargs=True)
-                )
+            # if name in layers.keys():
+            #     pre_hook = partial(self.layer_pre_forward, name)
+            #     post_hook = partial(self.layer_post_forward, name)
+            #     self.register_hook(module.register_forward_pre_hook(pre_hook, with_kwargs=True))
+            #     self.register_hook(
+            #         module.register_forward_hook(post_hook, with_kwargs=True)
+            #     )
 
 
     @HooksMixin.hook
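For context on what the now-commented block was doing: it attached forward pre- and post-hooks to whole layers, binding the layer name with functools.partial. A self-contained sketch of that general hook pattern on a toy module follows (illustrative only, not the modifier's actual hooks; the layer names and print statements are stand-ins).

from functools import partial

import torch

def layer_pre_forward(name, module, args, kwargs):
    print(f"entering {name}")
    return None  # keep args/kwargs unchanged

def layer_post_forward(name, module, args, kwargs, output):
    print(f"leaving {name}")
    return output

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 4))
handles = []
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear):
        handles.append(
            module.register_forward_pre_hook(partial(layer_pre_forward, name), with_kwargs=True)
        )
        handles.append(
            module.register_forward_hook(partial(layer_post_forward, name), with_kwargs=True)
        )

model(torch.randn(2, 8))
for handle in handles:
    handle.remove()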
9 changes: 6 additions & 3 deletions src/llmcompressor/transformers/finetune/data/base.py
@@ -51,17 +51,20 @@ def __init__(
         self.padding = False
 
         if self.tokenizer:
+            if hasattr(self.tokenizer, "tokenizer"):
+                self.tokenizer = self.tokenizer.tokenizer
+
             if not self.tokenizer.pad_token:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
 
         # configure sequence length
         max_seq_length = data_args.max_seq_length
-        model_max_length = tokenizer.model_max_length if tokenizer else max_seq_length
+        model_max_length = self.tokenizer.model_max_length if self.tokenizer else max_seq_length
         if self.tokenizer and max_seq_length > model_max_length:
             logger.warning(
                 f"The max_seq_length passed ({max_seq_length}) is larger than "
-                f"the maximum length for the model ({tokenizer.model_max_length}). "
-                f"Using max_seq_length={tokenizer.model_max_length}."
+                f"the maximum length for the model ({self.tokenizer.model_max_length}). "
+                f"Using max_seq_length={self.tokenizer.model_max_length}."
             )
         self.max_seq_length = min(data_args.max_seq_length, model_max_length)
 
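The new hasattr check lets the dataset code accept either a tokenizer or a multimodal processor, which wraps a tokenizer under its .tokenizer attribute. A tiny mock-based sketch of that duck-typing pattern is below; the FakeProcessor and FakeTokenizer classes are hypothetical stand-ins so the example runs without downloading a model.

class FakeTokenizer:
    pad_token = None
    eos_token = "</s>"
    model_max_length = 2048

class FakeProcessor:
    """Stands in for a multimodal processor that wraps a tokenizer."""
    def __init__(self):
        self.tokenizer = FakeTokenizer()

def unwrap_tokenizer(tokenizer_or_processor):
    # if a processor was passed, use the tokenizer it wraps
    if hasattr(tokenizer_or_processor, "tokenizer"):
        tokenizer_or_processor = tokenizer_or_processor.tokenizer
    if not tokenizer_or_processor.pad_token:
        tokenizer_or_processor.pad_token = tokenizer_or_processor.eos_token
    return tokenizer_or_processor

tok = unwrap_tokenizer(FakeProcessor())
print(tok.pad_token, tok.model_max_length)  # </s> 2048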
4 changes: 2 additions & 2 deletions src/llmcompressor/transformers/finetune/data/data_helpers.py
@@ -38,8 +38,8 @@ def create_batch_dataloader(
 
     def pad_sequences(batch):
         # extract input_ids and attention_mask from the batch
-        input_ids = [torch.tensor(item["input_ids"]) for item in batch]
-        masks = [torch.tensor(item["attention_mask"]) for item in batch]
+        input_ids = [torch.tensor(item["input_ids"]).squeeze(0) for item in batch]
+        masks = [torch.tensor(item["attention_mask"]).squeeze(0) for item in batch]
 
         # while 0 is not necessarily the "correct" padding value, the padded
         # input_ids are ignored according to the attention_mask
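The added .squeeze(0) handles processor outputs created with return_tensors="pt" (as in vision_example.py), which carry a leading batch dimension of 1 that would otherwise break per-sample padding. A small torch-only sketch of the difference:

import torch
from torch.nn.utils.rnn import pad_sequence

# samples as a plain tokenizer stores them: 1-D lists of ids
flat_batch = [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}]
# samples as a processor with return_tensors="pt" stores them: shape (1, seq_len)
nested_batch = [{"input_ids": [[1, 2, 3]]}, {"input_ids": [[4, 5]]}]

def pad(batch):
    # squeeze(0) is a no-op on 1-D inputs and drops the extra batch dim on (1, seq_len) inputs
    seqs = [torch.tensor(item["input_ids"]).squeeze(0) for item in batch]
    return pad_sequence(seqs, batch_first=True, padding_value=0)

print(pad(flat_batch).shape)    # torch.Size([2, 3])
print(pad(nested_batch).shape)  # torch.Size([2, 3]); without squeeze(0) this call would fail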
12 changes: 6 additions & 6 deletions src/llmcompressor/transformers/finetune/text_generation.py
@@ -23,7 +23,7 @@
 from loguru import logger
 from transformers import (
     AutoConfig,
-    AutoTokenizer,
+    AutoProcessor,
     DefaultDataCollator,
     HfArgumentParser,
     set_seed,
@@ -221,12 +221,12 @@ def initialize_model_from_path(
 def initialize_tokenizer_from_path(model_args, model, teacher):
     tokenizer_src = model_args.tokenizer
     tokenizer_src = tokenizer_src or get_shared_tokenizer_src(model, teacher)
-    tokenizer = AutoTokenizer.from_pretrained(
+    tokenizer = AutoProcessor.from_pretrained(
         tokenizer_src,
-        cache_dir=model_args.cache_dir,
-        use_fast=True,
-        revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
+        # cache_dir=model_args.cache_dir,
+        # use_fast=True,
+        # revision=model_args.model_revision,
+        # use_auth_token=True if model_args.use_auth_token else None,
         trust_remote_code=model_args.trust_remote_code_model,
     )
 
