[GPTQ] Vision Model Support #850

Closed. kylesayrs wants to merge 51 commits into main from kylesayrs/gptq-hooks.

Commits (51, all authored by kylesayrs)
98b284b  Oct 16, 2024  WIP
e3a98cc  Oct 16, 2024  WIP: begin quantize_weight
bc9b3bc  Oct 16, 2024  WIP
b77c7bf  Oct 16, 2024  WIP
7be5aed  Oct 16, 2024  wip
e01094f  Oct 16, 2024  compilable
ad9f5a8  Oct 16, 2024  compilable
e4ee0af  Oct 16, 2024  wip
d9ba539  Oct 16, 2024  add example
83a5762  Oct 16, 2024  wip
7f49ab4  Oct 16, 2024  runnable
ac0d926  Oct 21, 2024  batching
6304973  Oct 21, 2024  calibration forward context
868a480  Oct 21, 2024  fix stuff
86c8a06  Oct 21, 2024  wip
1305173  Oct 21, 2024  use hooks list
e6adc5a  Oct 22, 2024  layer compressor
f65f832  Oct 22, 2024  style
1e22569  Oct 22, 2024  use layer compressor
9324695  Oct 22, 2024  replicate dtypes
eef4fb6  Oct 22, 2024  write weight changes
485813a  Oct 22, 2024  revert example
6006155  Oct 22, 2024  organization
c10d2ee  Oct 22, 2024  add create_single_batch_dataloader
6371193  Oct 22, 2024  add back empty_cache until I can justify removing it
92315a5  Oct 22, 2024  better type hinting, faster mask applying
8903fbf  Oct 22, 2024  Merge remote-tracking branch 'origin' into kylesayrs/gptq-hooks
8a25c68  Oct 22, 2024  remove breakpoint
6cd0d6c  Oct 22, 2024  apply style, add true_sequential docstring
0e0c586  Oct 22, 2024  update docstring
d23aabb  Oct 22, 2024  use private attrs
355074b  Oct 23, 2024  more docstring
bf2184d  Oct 23, 2024  docstrings
0b418c7  Oct 23, 2024  docstrings
56cceea  Oct 23, 2024  docstrings
7c7e3bc  Oct 23, 2024  move hooksmixin to separate file
2d52183  Oct 23, 2024  docstrings
d6ff46a  Oct 23, 2024  Merge branch 'main' into kylesayrs/gptq-hooks
9081f12  Oct 23, 2024  fix docstring, better arguments grouping
96e9496  Oct 24, 2024  use LayerCompressorMixin
7fbf8b1  Oct 24, 2024  docstrings
3d3af2a  Oct 24, 2024  add back hessian hook to support bs1
b3021ab  Oct 25, 2024  wip
8508b63  Oct 25, 2024  accumulate
3ff271d  Oct 25, 2024  virtualize batches for layers
d6c6dc3  Oct 25, 2024  maybe works, but padding is wrong
c4d2dde  Oct 29, 2024  revert weird batching, support image text datasets
670b35e  Oct 29, 2024  remove breakpoint
3892b90  Oct 29, 2024  add example script
2beb59a  Nov 5, 2024   remove tokenizer args
4a336fe  Nov 5, 2024   fix shapes
Changes from 16 commits
examples/quantization_w4a16/llama3_example.py (14 changes: 9 additions & 5 deletions)

@@ -1,11 +1,14 @@
+import torch
 from datasets import load_dataset
 from transformers import AutoTokenizer
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
 
 # Select model and load it.
-MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+#MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+#MODEL_ID = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
 
 model = SparseAutoModelForCausalLM.from_pretrained(
     MODEL_ID,
@@ -20,8 +23,8 @@

 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 512
-MAX_SEQUENCE_LENGTH = 2048
+NUM_CALIBRATION_SAMPLES = 512 // 6
+MAX_SEQUENCE_LENGTH = 2048 // 2
 
 # Load dataset and preprocess.
 ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
@@ -41,10 +44,11 @@ def preprocess(example):


 # Tokenize inputs.
+tokenizer.add_special_tokens({'pad_token': '[PAD]'})
 def tokenize(sample):
     return tokenizer(
         sample["text"],
-        padding=False,
+        padding=True,
         max_length=MAX_SEQUENCE_LENGTH,
         truncation=True,
         add_special_tokens=False,
@@ -55,7 +59,7 @@ def tokenize(sample):

 # Configure the quantization algorithm to run.
 # * quantize the weights to 4 bit with GPTQ with a group size 128
-recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"], percdamp=0.01)
 
 # Apply algorithms.
 oneshot(
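
A note on the tokenizer change above: Llama tokenizers ship without a pad token, so padding=True would fail without first registering one. Adding a new [PAD] special token also grows the vocabulary, which normally requires resizing the model's embeddings to match. A minimal sketch of that pattern, assuming a standard transformers model (the resize_token_embeddings call is illustrative and not part of this diff):

from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

# Llama tokenizers define no pad token by default; register one so
# padding=True can batch calibration samples to a common length.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    # The new token enlarges the vocabulary, so the embedding matrix
    # must grow with it, or indexing the pad id goes out of range.
    model.resize_token_embeddings(len(tokenizer))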
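
Likewise, for the percdamp=0.01 argument added to the GPTQModifier recipe: in GPTQ, percdamp conventionally sets the relative dampening added to the Hessian diagonal before inversion, which keeps the Cholesky factorization numerically stable. A sketch of that standard step as commonly formulated (an illustration of the usual technique, not code from this branch):

import torch

def dampen_hessian(H: torch.Tensor, percdamp: float = 0.01) -> torch.Tensor:
    # Add percdamp * mean(diag(H)) to every diagonal entry so the
    # Hessian stays positive definite and safe to invert.
    damp = percdamp * torch.mean(torch.diag(H))
    idx = torch.arange(H.shape[0], device=H.device)
    H[idx, idx] += damp
    return H

Raising percdamp generally makes the solve more robust on layers with ill-conditioned Hessians, at some cost in reconstruction accuracy.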