Commit af19b0f

Merge branch 'main' into rigl
ohaijen authored Nov 20, 2023
2 parents 5b22a49 + 1011fbc commit af19b0f
Showing 14 changed files with 143 additions and 25 deletions.
7 changes: 7 additions & 0 deletions docker/Dockerfile
@@ -65,6 +65,8 @@ ENV PATH="${VENV}/bin:$PATH"
 ENV PIP_DEFAULT_TIMEOUT=200
 ARG VERSION
 ARG MODE=""
+ARG BRANCH
+
 RUN \
 if [ -n "$BRANCH" ] ; then \
 echo Installing from BRANCH && \
@@ -92,6 +94,8 @@ ENV PATH="${VENV}/bin:$PATH"
 ENV PIP_DEFAULT_TIMEOUT=200
 ARG VERSION
 ARG MODE
+ARG BRANCH
+
 RUN \
 if [ -n "$BRANCH" ] ; then \
 echo Installing from BRANCH && \
@@ -115,6 +119,8 @@ ENV PATH="${VENV}/bin:$PATH"
 ENV PIP_DEFAULT_TIMEOUT=200
 ARG VERSION
 ARG MODE
+ARG BRANCH
+
 RUN \
 if [ -n "$BRANCH" ] ; then \
 echo Installing from BRANCH with editable mode && \
@@ -141,5 +147,6 @@ ARG VENV
 COPY --from=build $VENV $VENV
 ENV PATH="${VENV}/bin:$PATH"
 HEALTHCHECK CMD python -c 'import sparseml'
+RUN pip list | grep sparseml
 CMD bash

14 changes: 7 additions & 7 deletions setup.py
@@ -63,17 +63,17 @@

 _onnxruntime_deps = ["onnxruntime>=1.0.0"]
 _clip_deps = ["open_clip_torch==2.20.0"]
-supported_torch_version = "torch>=1.7.0,<=2.0"
+supported_torch_version = "torch>=1.7.0,<2.2"
 _pytorch_deps = [
     supported_torch_version,
     "gputils",
 ]
 _pytorch_all_deps = _pytorch_deps + [
-    "torchvision>=0.3.0,<=0.15.1",
+    "torchvision>=0.3.0,<0.17",
     "torchaudio<=2.0.1",
 ]
 _pytorch_vision_deps = _pytorch_deps + [
-    "torchvision>=0.3.0,<=0.15.1",
+    "torchvision>=0.3.0,<0.17",
     "opencv-python<=4.6.0.66",
 ]
 _transformers_deps = _pytorch_deps + [
@@ -103,9 +103,9 @@
     "black==22.12.0",
     "flake8==3.9.2",
     "isort==5.8.0",
-    "m2r2~=0.2.7",
+    "m2r2>=0.2.7",
     "mistune<3,>=2.0.3",
-    "myst-parser~=0.14.0",
+    "myst-parser>=0.14.0",
     "rinohtype~=0.4.2",
     "sphinx~=3.5.0",
     "sphinx-copybutton~=0.3.0",
@@ -114,8 +114,8 @@
     "sphinx-pydantic~=0.1.0",
     "sphinx-rtd-theme~=0.5.0",
     "wheel>=0.36.2",
-    "pytest~=6.2.0",
-    "pytest-mock~=3.6.0",
+    "pytest>=6.0.0",
+    "pytest-mock>=3.6.0",
     "flaky~=3.7.0",
     "sphinx-rtd-theme",
     "docutils<0.17",
8 changes: 1 addition & 7 deletions src/sparseml/modifiers/obcq/pytorch.py
@@ -46,7 +46,6 @@ class SparseGPTModifierPyTorch(SparseGPTModifier):

     model: Any = None
     device_: str = "cuda:0"
-    finalization_kwargs_: Optional[Dict] = None
     layer_prefix_: Optional[str] = None

     def on_initialize(self, state: "State", **kwargs) -> bool:
@@ -61,14 +60,12 @@ def on_initialize(self, state: "State", **kwargs) -> bool:
         self.on_initialize_structure(state, **kwargs)
         if self.quantization_modifier_:
             self.quantization_modifier_.initialize(state, **kwargs)
-        self.finalization_kwargs_ = {}
         modifiable_model = state.model
         calibration_dataloader = state.data.calib
         device = state.hardware.device

         self.initialize_obcq(modifiable_model, device)
-        extras = self.apply_obcq(calibration_dataloader)
-        self.finalization_kwargs_.update(extras)
+        self.apply_obcq(calibration_dataloader)

         return True

@@ -99,7 +96,6 @@ def apply_obcq(
         Run OBCQ on the loaded model, using dataloader as calibration data
         :param dataloader: calibration data for OBCQ
-        :return: compression outputs used for finalization
         """
         accum_kwargs = {"dataloader": dataloader}

@@ -147,8 +143,6 @@ def apply_obcq(
             layer_kwargs = layer_compressor.compress(dev=self.device_, **accum_kwargs)
             accum_kwargs.update(layer_kwargs)

-        return extras
-
     def on_finalize(self, state: "State", **kwargs) -> bool:
         """
         disable the observers used by the OBCQ algorithm and set kv-cache configuration
3 changes: 2 additions & 1 deletion src/sparseml/modifiers/obcq/utils/sparsegpt.py
@@ -199,7 +199,8 @@ def fasterprune(
         _LOGGER.debug(torch.sum((self.layer(self._inp1) - self.out1) ** 2))
         _LOGGER.debug(torch.sum(Losses))

-        torch.cuda.synchronize()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
         _LOGGER.info("time %.2f" % (time.time() - tick))
         _LOGGER.info("error %.2f" % torch.sum(Losses).item())

1 change: 1 addition & 0 deletions src/sparseml/modifiers/smoothquant/pytorch.py
@@ -127,6 +127,7 @@ def _calibrate(self, model: ModifiableModelPyTorch, calibration_dataloader: List
             calibration_dataloader,
             self.num_calibration_steps,
             self.calibration_function,
+            self.device_,
         )

         # remove the hooks now that we are done calibrating
2 changes: 1 addition & 1 deletion src/sparseml/pytorch/base.py
@@ -49,7 +49,7 @@


 _TORCH_MIN_VERSION = "1.0.0"
-_TORCH_MAX_VERSION = os.environ.get("MAX_TORCH", "2.0.100")
+_TORCH_MAX_VERSION = os.environ.get("MAX_TORCH", "2.1.10")


 def check_torch_install(
1 change: 1 addition & 0 deletions src/sparseml/transformers/data/__init__.py
@@ -15,6 +15,7 @@
 # flake8: noqa
 from .base_llm import TransformersDataset
 from .c4 import *
+from .evolcodealpaca import *
 from .gsm8k import *
 from .open_platypus import *
 from .ptb import *
56 changes: 56 additions & 0 deletions src/sparseml/transformers/data/evolcodealpaca.py
@@ -0,0 +1,56 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from torch.nn import Module

from sparseml.transformers.data.base_llm import TransformersDataset


@TransformersDataset.register(name="evolcodealpaca")
class EvolCodeAlpaca(TransformersDataset):
    def __init__(
        self,
        model: Module,
        seqlen: int,
        nsamples: int,
        seed: int = 0,
        split: str = "train",
        split_percent_to_use: float = 1.0,
    ):
        super().__init__(
            model=model,
            seqlen=seqlen,
            nsamples=nsamples,
            path="theblackcat102/evol-codealpaca-v1",
            name=None,
            seed=seed,
            split=split,
            use_max_tokens=False,
            split_percent_to_use=split_percent_to_use,
        )

        processed_data = []
        for sample in self._data:
            processed_sample = (
                "Below is an instruction that describes a "
                "programming task. Write a program that appropriately "
                "completes the request.\n\n### Instruction:\n{instruction}"
                "\n\n### Response:\n"
            ).format(instruction=sample["instruction"])

            if "output" in sample:
                processed_sample += sample["output"]
            processed_data.append(processed_sample)

        self.create_dataloader(processed_data)
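
For orientation: importing this module (now triggered by the data/__init__.py change above) runs the @TransformersDataset.register decorator, so the dataset becomes selectable by name. Below is a minimal construction sketch, not part of the commit; all constructor values are illustrative placeholders, and the model argument is assumed (as in the sibling dataset classes) to be the model path from which the tokenizer is built:

# Hedged usage sketch (not part of this commit). The constructor values are
# placeholders; "model" is assumed to be a Hugging Face model path used to
# build the tokenizer, mirroring the sibling dataset classes.
from sparseml.transformers.data.evolcodealpaca import EvolCodeAlpaca

dataset = EvolCodeAlpaca(
    model="facebook/opt-1.3b",  # placeholder model path
    seqlen=2048,                # placeholder sequence length
    nsamples=128,               # placeholder number of calibration samples
)
# __init__ has already formatted the prompts and called
# create_dataloader(processed_data), so the calibration batches are ready.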
41 changes: 40 additions & 1 deletion src/sparseml/transformers/sparsification/obcq/obcq.py
@@ -18,6 +18,7 @@
 from pathlib import Path
 from typing import Optional

+import torch
 from torch.nn import Module
 from transformers import AutoConfig

@@ -38,6 +39,7 @@
 _LOGGER = logging.getLogger(__name__)
 SUPPORTED_DATASETS = TransformersDataset.registered_names()
 SUPPORTED_MODELS = ["opt", "llama", "mistral"]
+SUPPORTED_PRECISION = ["auto", "half", "full", "float16", "bfloat16", "float32"]


 def one_shot(
@@ -47,6 +49,7 @@ def one_shot(
     device: str = "cuda:0",
     deploy_dir: Optional[str] = ".",
     recipe_file: Optional[str] = None,
+    precision: str = "auto",
     eval_data: Optional[str] = None,
     do_save: Optional[bool] = False,
 ) -> Module:
@@ -59,6 +62,7 @@
     :param device: Device (cuda:index or cpu) to use for computation
     :param deploy_dir: The output directory to save the model to
     :param recipe_file: recipe containing SparseGPT configuration
+    :param precision: precision to load model as, either auto, half or full
     :param eval_data: dataset to use for perplexity evaluation, or none to skip
     :param do_save: whether to save the output model to disk
@@ -71,6 +75,10 @@
     if deploy_dir.exists():
         raise RuntimeError(f"deploy_dir={deploy_dir} already exists")

+    # fallback to cpu if cuda not available
+    device = _fallback_to_cpu(device)
+    _LOGGER.info(f"Running one_shot on device {device}")
+
     # Load the configuration from the model path
     config = AutoConfig.from_pretrained(model_path)
     model_type = config.model_type.lower()
@@ -88,7 +96,8 @@
         forward_fn = llama_forward
     else:
         raise ValueError(f"model_path={model_path} should be one of {SUPPORTED_MODELS}")
-    model = model_loader_fn(model_path)
+    torch_dtype = _parse_dtype(precision)
+    model = model_loader_fn(model_path, torch_dtype=torch_dtype)

     if dataset_name not in SUPPORTED_DATASETS:
         raise ValueError(
@@ -137,6 +146,18 @@
     return model


+def _parse_dtype(dtype_arg):
+    dtype = "auto"  # get precision from model by default
+    if dtype_arg == "half" or dtype_arg == "float16":
+        dtype = torch.float16
+    elif dtype_arg == "bfloat16":
+        dtype = torch.bfloat16
+    elif dtype_arg == "full" or dtype_arg == "float32":
+        dtype = torch.float32
+
+    return dtype
+
+
 def _save(model, tokenizer, save_path, recipe_path):
     model.save_pretrained(save_path)
     tokenizer.save_pretrained(save_path)
@@ -147,6 +168,16 @@ def _save(model, tokenizer, save_path, recipe_path):
         fp.write(load_recipe_yaml_str(recipe_path))


+def _fallback_to_cpu(device):
+    if "cuda" in device and not torch.cuda.is_available():
+        _LOGGER.warning(
+            f"Requested {device} but CUDA is not available, falling back to CPU"
+        )
+        return "cpu"
+
+    return device
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

@@ -163,6 +194,13 @@ def _save(model, tokenizer, save_path, recipe_path):
     parser.add_argument("--device", type=str, default="cuda:0")
     parser.add_argument("--deploy-dir", type=str, default=".")
     parser.add_argument("--recipe", type=str, default=None)
+    parser.add_argument(
+        "--precision",
+        type=str,
+        choices=SUPPORTED_PRECISION,
+        default="auto",
+        help="Precision to cast model weights to, default to auto",
+    )
     parser.add_argument(
         "--eval", type=str, default=None, help="Optional dataset for perplexity eval"
     )
@@ -179,6 +217,7 @@ def _save(model, tokenizer, save_path, recipe_path):
         num_samples=args.nsamples,
         device=args.device,
         recipe_file=args.recipe,
+        precision=args.precision,
         eval_data=args.eval,
         do_save=args.save,
     )
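
Taken together, the new precision argument and the CPU fallback can also be exercised straight from Python. A hedged sketch follows; model_path, dataset_name, and num_samples are real parameter names visible in this diff, while both paths are placeholders:

# Hedged sketch of the new one_shot options (not part of this commit);
# the two paths below are placeholders.
from sparseml.transformers.sparsification.obcq.obcq import one_shot

model = one_shot(
    model_path="/path/to/llama/checkpoint",  # placeholder checkpoint path
    dataset_name="evolcodealpaca",           # dataset registered in this commit
    num_samples=128,
    device="cuda:0",            # _fallback_to_cpu() rewrites this to "cpu"
                                # with a warning when CUDA is unavailable
    recipe_file="recipe.yaml",  # placeholder SparseGPT recipe
    precision="half",           # _parse_dtype("half") -> torch.float16
)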
@@ -42,6 +42,7 @@ def opt_forward(model: Module, data_loader: List, device: str, nsamples: int = N
         dataloader=data_loader,
         device=device,
         nsamples=nsamples,
+        target_ids=["attention_mask"],
         layer_prefix="decoder",
     )
     buffer = [b[0] for b in cached_inputs.pop("inputs")]
@@ -95,6 +96,7 @@ def llama_forward(model: Module, data_loader: List, device: str, nsamples: int =
         dataloader=data_loader,
         device=device,
         nsamples=nsamples,
+        target_ids=["attention_mask", "position_ids"],
         layer_prefix=None,
     )
     buffer = [b[0] for b in cached_inputs.pop("inputs")]