Skip to content

Commit

Permalink
Offloading Bug Fix (#58)
Browse the repository at this point in the history
* fix fstring

* fix offloaded sparsity calculation
  • Loading branch information
Sara Adkins authored Aug 6, 2024
1 parent 0a0a2de commit 066d1e4
Showing 1 changed file with 7 additions and 5 deletions.
12 changes: 7 additions & 5 deletions — src/llmcompressor/transformers/compression/helpers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Dict, List, Optional, Union

from accelerate.accelerator import get_state_dict_offloaded_model
import psutil
import torch
from accelerate import infer_auto_device_map, init_empty_weights
Expand Down Expand Up @@ -73,10 +73,12 @@ def infer_sparsity_structure_from_model(model: torch.nn.Module) -> Optional[str]
structures = {"2:4"}
for sparsity_structure in structures:
linear_modules = get_linear_layers(model)
offloaded_params = get_state_dict_offloaded_model(model)

linear_modules_with_sparsity_structure = [
tensor_follows_mask_structure(layer.weight)
for layer in tqdm(
linear_modules.values(),
tensor_follows_mask_structure(offloaded_params[f"{name}.weight"])
for name in tqdm(
linear_modules.keys(),
desc="Checking whether model follows "
f"{sparsity_structure} sparsity structure",
)
Expand Down Expand Up @@ -199,7 +201,7 @@ def calculate_offload_device_map(
available_gpus = torch.cuda.device_count()
if available_gpus < num_gpus:
raise ValueError(
"Requested {num_gpus} GPUs but only {available_gpus} are available."
f"Requested {num_gpus} GPUs but only {available_gpus} are available."
)
max_gpu_memory = [max_gpu_memory] * num_gpus

Expand Down

0 comments on commit 066d1e4

Please sign in to comment.