diff --git a/.github/workflows/test_quark.yml b/.github/workflows/test_quark.yml
new file mode 100644
index 0000000..70ac2aa
--- /dev/null
+++ b/.github/workflows/test_quark.yml
@@ -0,0 +1,52 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Test Lemonade with Quark Quantization
+
+on:
+  push:
+    branches: ["main"]
+  pull_request:
+    branches: ["main"]
+
+permissions:
+  contents: read
+
+jobs:
+  make-quark-lemonade:
+    env:
+      LEMONADE_CI_MODE: "True"
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Miniconda with 64-bit Python
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          miniconda-version: "latest"
+          activate-environment: lemon
+          python-version: "3.10"
+          run-post: "false"
+      - name: Install dependencies
+        shell: bash -el {0}
+        run: |
+          python -m pip install --upgrade pip
+          conda install pylint
+          python -m pip check
+          pip install -e .[llm-oga-cpu]
+          lemonade-install --quark 0.6.0
+      - name: Lint with Black
+        uses: psf/black@stable
+        with:
+          options: "--check --verbose"
+          src: "./src"
+      - name: Lint with PyLint
+        shell: bash -el {0}
+        run: |
+          pylint src/lemonade/tools/quark --rcfile .pylintrc --disable E0401
+      - name: Run lemonade tests
+        shell: bash -el {0}
+        env:
+          HF_TOKEN: "${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}" # Required by OGA model_builder in OGA 0.4.0 but not future versions
+        run: |
+          python test/lemonade/quark_api.py
+
diff --git a/docs/lemonade/getting_started.md b/docs/lemonade/getting_started.md
index 5506329..9b2456f 100644
--- a/docs/lemonade/getting_started.md
+++ b/docs/lemonade/getting_started.md
@@ -1,46 +1,109 @@
# Lemonade

Welcome to the project page for `lemonade` the Turnkey LLM Aide!

-Contents:
-1. [Getting Started](#getting-started)
-1. [Install Specialized Tools](#install-specialized-tools)
-    - [OnnxRuntime GenAI](#install-onnxruntime-genai)
-    - [RyzenAI NPU for PyTorch](#install-ryzenai-npu-for-pytorch)
+1. [Install](#install)
+1. [CLI Commands](#cli-commands)
+    - [Syntax](#syntax)
+    - [Chatting](#chatting)
+    - [Accuracy](#accuracy)
+    - [Benchmarking](#benchmarking)
+    - [Memory Usage](#memory-usage)
+    - [Serving](#serving)
+1. [API Overview](#api)
1. [Code Organization](#code-organization)
1. [Contributing](#contributing)

-# Getting Started
-`lemonade` introduces a brand new set of LLM-focused tools.
+# Install

-## Install
+You can quickly get started with `lemonade` by installing the `turnkeyml` [PyPI package](#from-pypi) with the appropriate extras for your backend, or you can [install from source](#from-source-code) by cloning and installing this repository.
+
+## From PyPI
+
+To install `lemonade` from PyPI:
+
+1. Create and activate a [miniconda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe) environment.
+    ```bash
+    conda create -n lemon python=3.10
+    conda activate lemon
+    ```
+
+3. Install lemonade for your backend of choice:
+    - [OnnxRuntime GenAI with CPU backend](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_igpu.md):
+      ```bash
+      pip install -e turnkeyml[llm-oga-cpu]
+      ```
+    - [OnnxRuntime GenAI with Integrated GPU (iGPU, DirectML) backend](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_igpu.md):
+      > Note: Requires Windows and a DirectML-compatible iGPU.
+ ```bash + pip install -e turnkeyml[llm-oga-igpu] + ``` + - OnnxRuntime GenAI with Ryzen AI Hybrid (NPU + iGPU) backend: + > Note: Ryzen AI Hybrid requires a Windows 11 PC with a AMD Ryzen™ AI 9 HX375, Ryzen AI 9 HX370, or Ryzen AI 9 365 processor. + > - Install the [Ryzen AI driver >= 32.0.203.237](https://ryzenai.docs.amd.com/en/latest/inst.html#install-npu-drivers) (you can check your driver version under Device Manager > Neural Processors). + > - Visit the [AMD Hugging Face page](https://huggingface.co/collections/amd/quark-awq-g128-int4-asym-fp16-onnx-hybrid-13-674b307d2ffa21dd68fa41d5) for supported checkpoints. + ```bash + pip install -e turnkeyml[llm-oga-hybrid] + lemonade-install --ryzenai hybrid + ``` + - Hugging Face (PyTorch) LLMs for CPU backend: + ```bash + pip install -e turnkeyml[llm] + ``` + - llama.cpp: see [instructions](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/llamacpp.md). + +4. Use `lemonade -h` to explore the LLM tools, and see the [command](#cli-commands) and [API](#api) examples below. + + +## From Source Code + +To install `lemonade` from source code: 1. Clone: `git clone https://github.com/onnx/turnkeyml.git` -1. `cd turnkeyml` (where `turnkeyml` is the repo root of your TurnkeyML clone) +1. `cd turnkeyml` (where `turnkeyml` is the repo root of your clone) - Note: be sure to run these installation instructions from the repo root. -1. Create and activate a conda environment: - 1. `conda create -n lemon python=3.10` - 1. `conda activate lemon` -1. Install lemonade: `pip install -e .[llm]` - - or `pip install -e .[llm-oga-igpu]` if you want to use `onnxruntime-genai` (see [OGA](#install-onnxruntime-genai)) -1. `lemonade -h` to explore the LLM tools +1. Follow the same instructions as in the [PyPI installation](#from-pypi), except replace the `turnkeyml` with a `.`. + - For example: `pip install -e .[llm-oga-igpu]` + +# CLI Commands + +The `lemonade` CLI uses a unique command syntax that enables convenient interoperability between models, frameworks, devices, accuracy tests, and deployment options. + +Each unit of functionality (e.g., loading a model, running a test, deploying a server, etc.) is called a `Tool`, and a single call to `lemonade` can invoke any number of `Tools`. Each `Tool` will perform its functionality, then pass its state to the next `Tool` in the command. + +You can read each command out loud to understand what it is doing. For example, a command like this: + +```bash +lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are" +``` + +Can be read like this: -## Syntax +> Run `lemonade` on the input `(-i)` checkpoint `microsoft/Phi-3-mini-4k-instruct`. First, load it in the OnnxRuntime GenAI framework (`oga-load`), on to the integrated GPU device (`--device igpu`) in the int4 data type (`--dtype int4`). Then, pass the OGA model to the prompting tool (`llm-prompt`) with the prompt (`-p`) "Hello, my thoughts are" and print the response. + +The `lemonade -h` command will show you which options and Tools are available, and `lemonade TOOL -h` will tell you more about that specific Tool. -The `lemonade` CLI uses the same style of syntax as `turnkey`, but with a new set of LLM-specific tools. You can read about that syntax [here](https://github.com/onnx/turnkeyml#how-it-works). 
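+
+Because each Tool passes its state to the next one, several Tools can be combined in a single command. The sketch below is illustrative (each of these Tools is documented in the sections that follow; composing them in exactly this order is an assumption, not a documented recipe): it loads a model once, runs one MMLU subject on it, and then benchmarks the same model.
+
+```bash
+lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 accuracy-mmlu --tests management oga-bench
+```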
## Chatting To chat with your LLM try: -`lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "Hello, my thoughts are"` +OGA iGPU: +```bash + lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are" +``` + +Hugging Face: +```bash + lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "Hello, my thoughts are" +``` -The LLM will run on CPU with your provided prompt, and the LLM's response to your prompt will be printed to the screen. You can replace the `"Hello, my thoughts are"` with any prompt you like. +The LLM will run with your provided prompt, and the LLM's response to your prompt will be printed to the screen. You can replace the `"Hello, my thoughts are"` with any prompt you like. -You can also replace the `facebook/opt-125m` with any Huggingface checkpoint you like, including LLaMA-2, Phi-2, Qwen, Mamba, etc. +You can also replace the `facebook/opt-125m` with any Hugging Face checkpoint you like, including LLaMA-2, Phi-2, Qwen, Mamba, etc. -You can also set the `--device` argument in `huggingface-load` to load your LLM on a different device. +You can also set the `--device` argument in `oga-load` and `huggingface-load` to load your LLM on a different device. Run `lemonade huggingface-load -h` and `lemonade llm-prompt -h` to learn more about those tools. @@ -48,7 +111,15 @@ Run `lemonade huggingface-load -h` and `lemonade llm-prompt -h` to learn more ab To measure the accuracy of an LLM using MMLU, try this: -`lemonade -i facebook/opt-125m huggingface-load accuracy-mmlu --tests management` +OGA iGPU: +```bash + lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 accuracy-mmlu --tests management +``` + +Hugging Face: +```bash + lemonade -i facebook/opt-125m huggingface-load accuracy-mmlu --tests management +``` That command will run just the management test from MMLU on your LLM and save the score to the lemonade cache at `~/.cache/lemonade`. @@ -58,18 +129,34 @@ You can run the full suite of MMLU subjects by omitting the `--test` argument. Y To measure the time-to-first-token and tokens/second of an LLM, try this: -`lemonade -i facebook/opt-125m huggingface-load huggingface-bench` +OGA iGPU: +```bash + lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 oga-bench +``` + +Hugging Face: +```bash + lemonade -i facebook/opt-125m huggingface-load huggingface-bench +``` That command will run a few warmup iterations, then a few generation iterations where performance data is collected. -The prompt size, number of output tokens, and number iterations are all parameters. Learn more by running `lemonade huggingface-bench -h`. +The prompt size, number of output tokens, and number iterations are all parameters. Learn more by running `lemonade oga-bench -h` or `lemonade huggingface-bench -h`. ## Memory Usage -The peak memory used by the lemonade build is captured in the build output. To capture more granular +The peak memory used by the `lemonade` build is captured in the build output. To capture more granular memory usage information, use the `--memory` flag. 
For example: -`lemonade -i facebook/opt-125m --memory huggingface-load huggingface-bench` +OGA iGPU: +```bash + lemonade --memory -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 oga-bench +``` + +Hugging Face: +```bash + lemonade --memory -i facebook/opt-125m huggingface-load huggingface-bench +``` In this case a `memory_usage.png` file will be generated and stored in the build folder. This file contains a figure plotting the memory usage over the build time. Learn more by running `lemonade -h`. @@ -78,70 +165,66 @@ contains a figure plotting the memory usage over the build time. Learn more by You can launch a WebSocket server for your LLM with: -`lemonade -i facebook/opt-125m huggingface-load serve` - -Once the server has launched, you can connect to it from your own application, or interact directly by following the on-screen instructions to open a basic web app. - -Note that the `llm-prompt`, `accuracy-mmlu`, and `serve` tools can all be used with other model-loading tools, for example `onnxruntime-genai` or `ryzenai-transformers`. See [Install Specialized Tools](#install-specialized-tools) for details. - -## API - -Lemonade is also available via API. Here's a quick example of how to benchmark an LLM: - -```python -import lemonade.tools.torch_llm as tl -import lemonade.tools.chat as cl -from turnkeyml.state import State - -state = State(cache_dir="cache", build_name="test") - -state = tl.HuggingfaceLoad().run(state, input="facebook/opt-125m") -state = cl.Prompt().run(state, prompt="hi", max_new_tokens=15) +OGA iGPU: +```bash + lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 serve +``` -print("Response:", state.response) +Hugging Face: +```bash + lemonade -i facebook/opt-125m huggingface-load serve ``` -# Install Specialized Tools +Once the server has launched, you can connect to it from your own application, or interact directly by following the on-screen instructions to open a basic web app. -Lemonade supports specialized tools that each require their own setup steps. **Note:** These tools will only appear in `lemonade -h` if you run in an environment that has completed setup. +# API -## Install OnnxRuntime-GenAI +Lemonade is also available via API. -To install support for [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai), use `pip install -e .[llm-oga-igpu]` instead of the default installation command. +## LEAP APIs -You can then load supported OGA models on to CPU or iGPU with the `oga-load` tool, for example: +The lemonade enablement platform (LEAP) API abstracts loading models from any supported framework (e.g., Hugging Face, OGA) and backend (e.g., CPU, iGPU, Hybrid). This makes it easy to integrate lemonade LLMs into Python applications. -`lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"` +OGA iGPU: +```python +from lemonade import leap -You can also launch a server process with: +model, tokenizer = leap.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", recipe="oga-igpu") -The `oga-bench` tool is available to capture tokens/second and time-to-first-token metrics: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 oga-bench`. Learn more with `lemonade oga-bench -h`. 
+input_ids = tokenizer("This is my prompt", return_tensors="pt").input_ids +response = model.generate(input_ids, max_new_tokens=30) -You can also try Phi-3-Mini-128k-Instruct with the following commands: +print(tokenizer.decode(response[0])) +``` -`lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 serve` +You can learn more about the LEAP APIs [here](https://github.com/onnx/turnkeyml/tree/main/examples/lemonade). -You can learn more about the CPU and iGPU support in our [OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_igpu.md). +## Low-Level API -> Note: early access to AMD's RyzenAI NPU is also available. See the [RyzenAI NPU OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_npu.md) for more information. +The low-level API is useful for designing custom experiments. For example, sweeping over specific checkpoints, devices, and/or tools. -## Install RyzenAI NPU for PyTorch +Here's a quick example of how to prompt a Hugging Face LLM using the low-level API, which calls the load and prompt tools one by one: -To run your LLMs on RyzenAI NPU, first install and set up the `ryzenai-transformers` conda environment (see instructions [here](https://github.com/amd/RyzenAI-SW/blob/main/example/transformers/models/llm/docs/README.md)). Then, install `lemonade` into `ryzenai-transformers`. The `ryzenai-npu-load` Tool will become available in that environment. +```python +import lemonade.tools.torch_llm as tl +import lemonade.tools.chat as cl +from turnkeyml.state import State -You can try it out with: `lemonade -i meta-llama/Llama-2-7b-chat-hf ryzenai-npu-load --device DEVICE llm-prompt -p "Hello, my thoughts are"` +state = State(cache_dir="cache", build_name="test") -Where `DEVICE` is either "phx" or "stx" if you have a RyzenAI 7xxx/8xxx or 3xx/9xxx processor, respectively. +state = tl.HuggingfaceLoad().run(state, input="facebook/opt-125m") +state = cl.Prompt().run(state, prompt="hi", max_new_tokens=15) -> Note: only `meta-llama/Llama-2-7b-chat-hf` and `microsoft/Phi-3-mini-4k-instruct` are supported by `lemonade` at this time. Contributions appreciated! +print("Response:", state.response) +``` # Contributing -If you decide to contribute, please: +Contributions are welcome! If you decide to contribute, please: -- do so via a pull request. -- write your code in keeping with the same style as the rest of this repo's code. -- add a test under `test/lemonade/llm_api.py` that provides coverage of your new feature. +- Do so via a pull request. +- Write your code in keeping with the same style as the rest of this repo's code. +- Add a test under `test/lemonade` that provides coverage of your new feature. The best way to contribute is to add new tools to cover more devices and usage scenarios. @@ -150,3 +233,5 @@ To add a new tool: 1. (Optional) Create a new `.py` file under `src/lemonade/tools` (or use an existing file if your tool fits into a pre-existing family of tools). 1. Define a new class that inherits the `Tool` class from `TurnkeyML`. 1. Register the class by adding it to the list of `tools` near the top of `src/lemonade/cli.py`. + +You can learn more about contributing on the repository's [contribution guide](https://github.com/onnx/turnkeyml/blob/main/docs/contribute.md). 
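+
+To make these steps concrete, the snippet below is a minimal sketch of what a new Tool can look like, modeled on the tools added in this repository (for example, `quark_load.py`). The class name, argument, and state attribute are illustrative only, and the exact base-class requirements may differ:
+
+```python
+import argparse
+
+from turnkeyml.state import State
+from turnkeyml.tools import Tool
+
+
+class ExampleTool(Tool):
+    """Illustrative Tool that stores a message on the lemonade state."""
+
+    unique_name = "example-tool"
+
+    def __init__(self):
+        super().__init__(monitor_message="Running the example tool")
+
+    @staticmethod
+    def parser(add_help: bool = True) -> argparse.ArgumentParser:
+        parser = __class__.helpful_parser(
+            short_description="Store a message on the lemonade state",
+            add_help=add_help,
+        )
+        parser.add_argument("--message", default="hello", help="Message to store")
+        return parser
+
+    def run(self, state: State, message: str = "hello") -> State:
+        # Each Tool receives the State produced by the previous Tool,
+        # modifies it, and returns it for the next Tool in the command.
+        state.example_message = message
+        return state
+```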
diff --git a/docs/lemonade/ort_genai_hybrid.md b/docs/lemonade/ort_genai_hybrid.md deleted file mode 100644 index 16e0429..0000000 --- a/docs/lemonade/ort_genai_hybrid.md +++ /dev/null @@ -1,109 +0,0 @@ -# Introduction - -[onnxruntime-genai (aka OGA)](https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file) is a new framework created by Microsoft for running ONNX LLMs. - -## Hybrid instructions - -### Warnings - - - The OGA wheels need to be installed in a specific order or you will end up with the wrong packages in your environment. If you see pip dependency errors, please delete your conda env and start over with a fresh environment. - -### Requirements - - [NPU Drivers (version .237)](https://ryzenai.docs.amd.com/en/latest/inst.html#install-npu-drivers) - - [Hybrid LLM artifacts package](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) - -### Installation - -1. NOTE: ⚠️ DO THESE STEPS IN EXACTLY THIS ORDER ⚠️ -1. Install `lemonade`: - 1. Create a conda environment: `conda create -n oga-hybrid python=3.10` (Python 3.10 is required) - 1. Activate: `conda activate oga-hybrid` - 1. `cd REPO_ROOT` - 1. `pip install -e .[llm-oga-hybrid]` -1. Download required OGA packages - 1. Access the [Hybrid LLM artifacts package](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `hybrid-llm-artifacts_1.3.0.zip` and `onnxruntime_vitisai-1.19.0.dev20241217-cp310-cp310-win_amd64.whl`. - 1. Copy the `onnxruntime_vitisai-1.19.0.dev20241217-cp310-cp310-win_amd64.whl` file to the `hybrid-llm-artifacts_1.3.0\hybrid-llm-artifacts\onnxruntime_genai\wheel` folder. - 1. Unzip `hybrid-llm-artifacts_1.3.0.zip` - 1. Create the system environment variable `AMD_OGA_HYBRID` and set it to the path of the `hybrid-llm-artifacts_1.3.0` folder. - 1. Restart your terminal -1. Install the wheels: - 1. `cd hybrid-llm-artifacts_1.3.0\hybrid-llm-artifacts\onnxruntime_genai\wheel` - 1. `pip install onnxruntime_genai_directml-0.4.0.dev0-cp310-cp310-win_amd64.whl` - 1. `pip install onnxruntime_vitisai-1.19.0.dev20241217-cp310-cp310-win_amd64.whl` -1. Install driver - 1. Download NPU driver from [NPU Drivers (version .237)](https://ryzenai.docs.amd.com/en/latest/inst.html#install-npu-drivers) - 1. Unzip `NPU_RAI1.3.zip` - 1. Right click `kipudrv.inf` and select `Install` - 1. Check under `Device Manager` to ensure that `NPU Compute Accelerator` is using version `32.0.203.237`. - -### Runtime - -To test basic functionality, point lemonade to any of the models under [quark-quantized-onnx-hybrid-llms-for-ryzen-ai-1.3](https://huggingface.co/collections/amd/quark-awq-g128-int4-asym-fp16-onnx-hybrid-13-674b307d2ffa21dd68fa41d5): - -``` -lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid oga-load --device hybrid --dtype int4 llm-prompt -p "hello whats your name?" --max-new-tokens 15 -``` - -``` -Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid" - ✓ Loading OnnxRuntime-GenAI model - ✓ Prompting LLM - -amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid: - (executed 1x) - Build dir: C:\Users\ramkr\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid - Status: Successful build! - Dtype: int4 - Device: hybrid - Response: hello whats your name? i'm a robot, and i'm here to help you with any questions - - - -Woohoo! 
Saved to ~\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid -``` - -To test/use the websocket server: - -``` -lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid oga-load --device hybrid --dtype int4 serve --max-new-tokens 50 -``` - -Then open the address (http://localhost:8000) in a browser and chat with it. - -``` -Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid" - ✓ Loading OnnxRuntime-GenAI model - Launching LLM Server - -INFO: Started server process [8704] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://localhost:8000 (Press CTRL+C to quit) -INFO: ::1:57038 - "GET / HTTP/1.1" 200 OK -INFO: ('::1', 57042) - "WebSocket /ws" [accepted] -INFO: connection open -``` - -To run a single MMLU test: - -``` -lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid oga-load --device hybrid --dtype int4 accuracy-mmlu --tests management -``` - -``` -Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid" - ✓ Loading OnnxRuntime-GenAI model - ✓ Measuring accuracy with MMLU - -amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid: - (executed 1x) - Build dir: C:\Users\ramkr\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid - Status: Successful build! - Dtype: int4 - Device: hybrid - Mmlu Management Accuracy: 49.515 % - - - -Woohoo! Saved to ~\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid -``` diff --git a/docs/lemonade/ort_genai_igpu.md b/docs/lemonade/ort_genai_igpu.md index affb8c8..8226821 100644 --- a/docs/lemonade/ort_genai_igpu.md +++ b/docs/lemonade/ort_genai_igpu.md @@ -1,50 +1,41 @@ # OnnxRuntime GenAI (OGA) for iGPU and CPU -[onnxruntime-genai (aka OGA)](https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file) is a new framework created by Microsoft for running ONNX LLMs +[onnxruntime-genai (aka OGA)](https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file) is a new framework created by Microsoft for running ONNX LLMs. ## Installation -To install: +See [lemonade installation](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/getting_started.md#install) for the OGA iGPU backend. -1. `conda create -n oga-igpu python=3.9` -1. `conda activate oga-igpu` -1. `pip install -e .[llm-oga-igpu]` - - Note: don't forget the `[llm-oga-igpu]` at the end, this is what installs ort-genai -1. Get models: - - The oga-load tool can download models from Hugging Face and build ONNX files using oga model_builder. Models can be quantized and optimized for both igpu and cpu. 
- - Download and build ONNX model files: - - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4` - - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4` - - The ONNX model files will be stored in the respective subfolder of the lemonade cache folder and will be reused in future oga-load calls: - - `oga_models\microsoft_phi-3-mini-4k-instruct\dml-int4` - - `oga_models\microsoft_phi-3-mini-4k-instruct\cpu-int4` - - The ONNX model build process can be forced to run again, overwriting the above cache, by using the --force flag: - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 --force` - - Transformer model architectures supported by the model_builder tool include many popular state-of-the-art models: - - Gemma - - LLaMa - - Mistral - - Phi - - Qwen - - Nemotron - - For the full list of supported models, please see the - [model_builder documentation](https://github.com/microsoft/onnxruntime-genai/blob/main/src/python/py/models/README.md). - - The following quantizations are supported for automatically building ONNXRuntime GenAI model files from the Hugging Face repository: - - cpu: fp32, int4 - - igpu: fp16, int4 -1. Directory structure: - - The model_builder tool caches Hugging Face files and temporary ONNX external data files in `\model_builder` - - The output from model_builder is stored in `\oga_models\\` - - `MODELNAME` is the Hugging Face checkpoint name where any '/' is mapped to an '_' and everything is lower case - - `SUBFOLDER` is `-`, where `EP` is the execution provider (`dml` for igpu, `cpu` for cpu, and `npu` for npu) and `DTYPE` is the datatype - - If the --int4-block-size flag is used then `SUBFOLDER` is` --block-` where `SIZE` is the specified block size - - Other ONNX models in the format required by onnxruntime-genai can be loaded in lemonade if placed in the `\oga_models` folder. - Use the -i and --subfolder flags to specify the folder and subfolder: - `lemonade -i my_model_name --subfolder my_subfolder --device igpu --dtype int4 oga-load` - Lemonade will expect the ONNX model files to be located in `\oga_models\my_model_name\my_subfolder` - -## Usage +## Get models -Prompt: `lemonade -i meta-llama/Llama-3.2-1B-Instruct oga-load --device igpu --dtype int4 llm-prompt -p "My thoughts are" --max-new-tokens 50` +- The oga-load tool can download models from Hugging Face and build ONNX files using OGA's `model_builder`, which can quantized and optimize models for both igpu and cpu. 
+- Download and build ONNX model files: + - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4` + - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4` +- The ONNX model files will be stored in the respective subfolder of the lemonade cache folder and will be reused in future oga-load calls: + - `oga_models\microsoft_phi-3-mini-4k-instruct\dml-int4` + - `oga_models\microsoft_phi-3-mini-4k-instruct\cpu-int4` +- The ONNX model build process can be forced to run again, overwriting the above cache, by using the --force flag: + - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 --force` +- Transformer model architectures supported by the model_builder tool include many popular state-of-the-art models: + - Gemma + - LLaMa + - Mistral + - Phi + - Qwen + - Nemotron +- For the full list of supported models, please see the [model_builder documentation](https://github.com/microsoft/onnxruntime-genai/blob/main/src/python/py/models/README.md). +- The following quantizations are supported for automatically building ONNXRuntime GenAI model files from the Hugging Face repository: + - cpu: fp32, int4 + - igpu: fp16, int4 -Serving: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --dtype int4 --device igpu serve --max-new-tokens 100` \ No newline at end of file +## Directory structure: +- The model_builder tool caches Hugging Face files and temporary ONNX external data files in `\model_builder` +- The output from model_builder is stored in `\oga_models\\` + - `MODELNAME` is the Hugging Face checkpoint name where any '/' is mapped to an '_' and everything is lower case. + - `SUBFOLDER` is `-`, where `EP` is the execution provider (`dml` for igpu, `cpu` for cpu, and `npu` for npu) and `DTYPE` is the datatype. + - If the --int4-block-size flag is used then `SUBFOLDER` is` --block-` where `SIZE` is the specified block size. +- Other ONNX models in the format required by onnxruntime-genai can be loaded in lemonade if placed in the `\oga_models` folder. + - Use the -i and --subfolder flags to specify the folder and subfolder: + - `lemonade -i my_model_name --subfolder my_subfolder --device igpu --dtype int4 oga-load` + - Lemonade will expect the ONNX model files to be located in `\oga_models\my_model_name\my_subfolder` diff --git a/docs/lemonade/ort_genai_npu.md b/docs/lemonade/ort_genai_npu.md deleted file mode 100644 index a4e1c8d..0000000 --- a/docs/lemonade/ort_genai_npu.md +++ /dev/null @@ -1,107 +0,0 @@ -# Introduction - -[onnxruntime-genai (aka OGA)](https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file) is a new framework created by Microsoft for running ONNX LLMs - -## NPU instructions - -### Warnings - - - The OGA wheels need to be installed in a specific order or you will end up with the wrong packages in your environment. If you see pip dependency errors, please delete your conda env and start over with a fresh environment. - -### Installation - -1. NOTE: ⚠️ DO THESE STEPS IN EXACTLY THIS ORDER ⚠️ -1. Install `lemonade`: - 1. Create a conda environment: `conda create -n oga-npu python=3.10` (Python 3.10 is required) - 1. Activate: `conda activate oga-npu` - 1. `cd REPO_ROOT` - 1. `pip install -e .[llm-oga-npu]` -1. Download required OGA packages - 1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `npu-llm-artifacts_1.3.0.zip` from `Ryzen AI 1.3 Model Release`. - 1. 
Unzip `npu-llm-artifacts_1.3.0.zip` -1. Setup your folder structure: - 1. Copy the `amd_oga` folder from the above zip file, if desired - 1. Create the system environment variable `AMD_OGA` and set it to the path to the `amd_oga` folder -1. Install the wheels: - 1. `cd %AMD_OGA%\wheels` - 1. `pip install onnxruntime_genai-0.5.0.dev0-cp310-cp310-win_amd64.whl` - 1. `pip install onnxruntime_vitisai-1.20.0-cp310-cp310-win_amd64.whl` - 1. `pip install voe-1.2.0-cp310-cp310-win_amd64.whl` -1. Install driver - 1. Download NPU driver from [NPU Drivers (version .237)](https://ryzenai.docs.amd.com/en/latest/inst.html#install-npu-drivers) - 1. Unzip `NPU_RAI1.3.zip` - 1. Right click `kipudrv.inf` and select `Install` - 1. Check under `Device Manager` to ensure that `NPU Compute Accelerator` is using version `32.0.203.237`. - -### Runtime - -To test basic functionality, point lemonade to any of the models under [quark_awq_g128_int4_asym_bf16_onnx_npu 1.3](https://huggingface.co/collections/amd/quark-awq-g128-int4-asym-bf16-onnx-npu-13-6759f510b8132db53e044aaf) - -``` -lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix --device npu --dtype int4 llm-prompt -p "hello whats your name?" --max-new-tokens 15 -``` - -``` -Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid" - ✓ Loading OnnxRuntime-GenAI model - ✓ Prompting LLM - -amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix: - (executed 1x) - Build dir: C:\Users\ramkr\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix - Status: Successful build! - Dtype: int4 - Device: npu - Response: hello whats your name? i'm a robot, and i'm here to help you with any questions - - - -Woohoo! Saved to ~\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix -``` - -To test/use the websocket server: - -``` -lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix --device npu --dtype int4 serve --max-new-tokens 50 -``` - -Then open the address (http://localhost:8000) in a browser and chat with it. - -``` -Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix" - ✓ Loading OnnxRuntime-GenAI model - Launching LLM Server - -INFO: Started server process [8704] -INFO: Waiting for application startup. -INFO: Application startup complete. -INFO: Uvicorn running on http://localhost:8000 (Press CTRL+C to quit) -INFO: ::1:57038 - "GET / HTTP/1.1" 200 OK -INFO: ('::1', 57042) - "WebSocket /ws" [accepted] -INFO: connection open -``` - -To run a single MMLU test: - -``` -lemonade -i amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix oga-load --device npu --dtype int4 accuracy-mmlu --tests management -``` - -``` -Building "amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix" - ✓ Loading OnnxRuntime-GenAI model - ✓ Measuring accuracy with MMLU - -amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix: - (executed 1x) - Build dir: C:\Users\ramkr\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix - Status: Successful build! - Dtype: int4 - Device: npu - Mmlu Management Accuracy: 49.515 % - - - -Woohoo! 
Saved to ~\.cache\lemonade\amd_Llama-3.2-1B-Instruct-awq-g128-int4-asym-bf16-onnx-ryzen-strix
-
-```
diff --git a/docs/lemonade/quark.md b/docs/lemonade/quark.md
new file mode 100644
index 0000000..854d37e
--- /dev/null
+++ b/docs/lemonade/quark.md
@@ -0,0 +1,77 @@
+# Quick Guide to Quark Quantization Tools
+
+## Introduction
+Quark is AMD's recommended quantization framework for targeting Ryzen AI platforms, supporting both PyTorch and ONNX formats. For Quark-specific information, please visit [quark-doc](https://quark.docs.amd.com/latest/). The following is a guide to quantizing a model with Quark and then reloading the quantized model using lemonade:
+
+## Installation
+
+1. Create and activate a conda environment:
+   - `conda create -n quark python=3.10`
+   - `conda activate quark`
+2. Install the requirements for this environment.
+Depending on your use case, you can install for CPU, NPU, or hybrid.
+   ```bash
+   pip install -e .[llm-oga-cpu] # Can also work with llm-oga-npu or llm-oga-hybrid
+   ```
+3. Install `quark` using `lemonade-install` for an easy install:
+   ```bash
+   # Install the latest external version of quark
+   lemonade-install --quark 0.6.0
+   ```
+   This downloads the .whl files and zip archive from the Quark page, installs them, and sets up the environment for Quark.
+
+## Usage
+```bash
+lemonade -i <checkpoint> huggingface-load quark-quantize
+    --model-export   # Export formats [quark_safetensors, onnx, gguf]
+    --quant-algo     # Supported algorithms [gptq, awq, autosmoothquant]
+    --quant-scheme   # Quant schemes [w_int4, w_uint4, w_int8...]
+    --device         # Target device [cpu, cuda]
+    llm-prompt -p "<prompt>"
+```
+## Example Workflows
+### Quantize and Export
+
+This command quantizes opt-125m, loaded from Hugging Face, using the AWQ quantization algorithm to generate an A8W8 quantized model. Running quantization on CPU can be time-consuming: this test can take up to an hour and will make heavy use of your CPU.
+
+```bash
+lemonade -i facebook/opt-125m huggingface-load quark-quantize --quant-algo awq --quant-scheme w_int8_a_int8_per_tensor_sym --model-export quark_safetensors --device cpu
+```
+
+#### Load Quantized Model
+This command loads the exported model from a cache folder that corresponds to the quantization recipe used during its export.
+```bash
+lemonade -i facebook/opt-125m huggingface-load quark-load --safetensors-model-reload --quant-algo awq --quant-scheme w_int8_a_int8_per_tensor_sym --device cpu llm-prompt -p "Hello world"
+```
+
+### Supported Quantization Schemes
+
+The following are some of the quantization schemes supported for various models.
+For a comprehensive list of datatype support for specific models, refer to the [support matrix](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html#id11).
+
+- w_uint4_per_group_asym
+- w_int4_per_channel_sym
+- w_int8_a_int8_per_tensor_sym
+- w_int8_per_tensor_sym, and more
+
+For more information on the supported quantization schemes, see [Language Model Post Training Quantization (PTQ) Using Quark](https://quark.docs.amd.com/latest/pytorch/example_quark_torch_llm_ptq.html).
+
+### Supported Export Formats
+
+Lemonade supports exporting Quark-quantized models in various formats. The following export formats are available:
+
+- quark_safetensors
+- onnx
+- vllm_adopted_safetensors
+- gguf
+
+## Known Issues
+- There is currently no PyPI installer for Quark. You can use `lemonade-install` as described in the [Installation Section](#installation) of this guide for Quark installation.
+- There are limited Quark APIs currently available. Users will need to rely on the zip archive released by Quark.
+- The latest Quark version is hardcoded in `quark_quantize` for download checks.
+- Logging output from Quark cannot be fully suppressed. Using `log_severity_level`, you can suppress the quantization logs, but not the info and warning messages emitted when reloading the model.
\ No newline at end of file
diff --git a/docs/readme.md b/docs/readme.md
index a9aee42..344e7be 100644
--- a/docs/readme.md
+++ b/docs/readme.md
@@ -10,10 +10,7 @@ The `docs/lemonade` directory has documentation for the LLM-focused `lemonade` t
- [Perplexity](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/perplexity.md): details of the Perplexity test for LLMs.
- Tool-specific setup guides:
  - [llama.cpp](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/llamacpp.md)
-  - OnnxRuntime GenaI:
-    - [iGPU/NPU hybrid](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_hybrid.md)
-    - [iGPU](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_igpu.md)
-    - [NPU](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_npu.md)
+  - [OnnxRuntime GenAI iGPU and CPU](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/ort_genai_igpu.md)

## CNNs and Transformers: `turnkey` tooling

diff --git a/setup.py b/setup.py
index 73c40bb..8370514 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@
        "lemonade",
        "lemonade.tools",
        "lemonade.tools.ort_genai",
-        "lemonade.tools.ryzenai_npu",
+        "lemonade.tools.quark",
        "turnkeyml_models",
        "turnkeyml_models.graph_convolutions",
        "turnkeyml_models.selftest",
@@ -48,6 +48,7 @@
        "wmi",
        "pytz",
        "tqdm",
+        "zstandard",
        "matplotlib",
        "tabulate",
        # Conditional dependencies for ONNXRuntime backends
diff --git a/src/lemonade/cli.py b/src/lemonade/cli.py
index 890dcae..c102709 100644
--- a/src/lemonade/cli.py
+++ b/src/lemonade/cli.py
@@ -23,6 +23,8 @@
from lemonade.tools.perplexity import AccuracyPerplexity
from lemonade.tools.chat import LLMPrompt, Serve
from lemonade.tools.serve import ServerPreview
+from lemonade.tools.quark.quark_load import QuarkLoad
+from lemonade.tools.quark.quark_quantize import QuarkQuantize


def main():
@@ -40,6 +42,8 @@ def main():
        Serve,
        HuggingfaceBench,
        OgaBench,
+        QuarkQuantize,
+        QuarkLoad,
        # Inherited from TurnkeyML
        Report,
        Cache,
@@ -57,14 +61,6 @@ def main():
        except ModuleNotFoundError:
            pass

-    # Import RyzenAI NPU modules only if RyzenAI NPU is installed
-    try:
-        from lemonade.tools.ryzenai_npu.ryzenai_npu import RyzenAINPULoad
-
-        tools = tools + [RyzenAINPULoad]
-    except ModuleNotFoundError:
-        pass
-
    # Define the argument parser
    parser = cli.CustomArgumentParser(
        description="Turnkey analysis and benchmarking of GenAI models. 
" diff --git a/src/lemonade/tools/huggingface_load.py b/src/lemonade/tools/huggingface_load.py index 8b48ec6..da98196 100644 --- a/src/lemonade/tools/huggingface_load.py +++ b/src/lemonade/tools/huggingface_load.py @@ -55,6 +55,9 @@ def batch_decode(self, tokens, **kwargs): def eos_token_id(self): return self.tokenizer.eos_token_id + def save_pretrained(self, model_dir, **kwargs): + return self.tokenizer.save_pretrained(model_dir, **kwargs) + class HuggingfaceLoad(FirstTool): """ diff --git a/src/lemonade/tools/ryzenai_npu/__init__.py b/src/lemonade/tools/quark/__init__.py similarity index 100% rename from src/lemonade/tools/ryzenai_npu/__init__.py rename to src/lemonade/tools/quark/__init__.py diff --git a/src/lemonade/tools/quark/quark_load.py b/src/lemonade/tools/quark/quark_load.py new file mode 100644 index 0000000..b7713f2 --- /dev/null +++ b/src/lemonade/tools/quark/quark_load.py @@ -0,0 +1,168 @@ +import argparse +import os +import sys + +import torch +from turnkeyml.state import State +from turnkeyml.tools import Tool +import turnkeyml.common.printing as printing +import turnkeyml.common.build as build +from lemonade_install.install import DEFAULT_QUARK_DIR + + +class QuarkLoad(Tool): + """ + Load a model Quantized and exported using Quark. + Required Input State: + - state.model: Pretrained model instance to be quantized. + - state.tokenizer: Tokenizer instance from Hugging Face. + Output: + - state of the loaded model + + See docs/quark.md for more details. + """ + + unique_name = "quark-load" + + def __init__(self): + super().__init__(monitor_message="Load Quark Quantized model") + + @staticmethod + def parser(add_help: bool = True) -> argparse.ArgumentParser: + parser = __class__.helpful_parser( + short_description="Load a quantized model using Quark", + add_help=add_help, + ) + + parser.add_argument( + "--quant-scheme", + type=str, + required=True, + default=None, + help="Supported quantization schemes in Quark", + ) + + parser.add_argument( + "--quant-algo", + type=str, + required=True, + default=None, + choices=["awq", "gptq", "autosmoothquant", None], + help="Supported quantization algorithms in Quark", + ) + + parser.add_argument( + "--torch-compile", action="store_true", help="Model torch compile" + ) + + parser.add_argument( + "--safetensors-model-reload", + action="store_true", + help="Safetensors model reload", + ) + + parser.add_argument( + "--safetensors-model-dir", + default=None, + help="Directory of safetensors model", + ) + + parser.add_argument( + "--params-load", action="store_true", help="Model parameters load" + ) + + parser.add_argument("--json-path", help="Specify the path of saved json file") + + parser.add_argument( + "--safetensors-path", + default=None, + help="Specify the path of saved safetensors file", + ) + + return parser + + def run( + self, + state: State, + quant_scheme: str, + quant_algo: str, + torch_compile: bool = False, + safetensors_model_reload: bool = False, + safetensors_model_dir: str = None, + params_load: bool = False, + json_path: str = None, + safetensors_path: str = None, + ) -> State: + """ + Executes the QuarkLoad process. + Returns: + State: The updated state after loading the model. + Raises: + Exception: If an error occurs during the QuarkLoad process. 
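+
+        Example (a representative CLI invocation, taken from docs/quark.md):
+            lemonade -i facebook/opt-125m huggingface-load quark-load --safetensors-model-reload --quant-algo awq --quant-scheme w_int8_a_int8_per_tensor_sym --device cpu llm-prompt -p "Hello world"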
+        """
+
+        try:
+            if os.path.isdir(DEFAULT_QUARK_DIR):
+                quark_llm_path = os.path.join(
+                    DEFAULT_QUARK_DIR, "examples", "torch", "language_modeling"
+                )
+                sys.path.insert(0, quark_llm_path)
+            else:
+                raise FileNotFoundError(
+                    f"The directory {DEFAULT_QUARK_DIR} does not exist. \
+                    Please check your installation."
+                )
+
+            # Default load path, specific to the quantization recipe.
+            # NOTE: the export path may be created uniquely (e.g., with a timestamp),
+            # in which case this default will not match; pass explicit load paths
+            # in that situation.
+            model_export_path = os.path.join(
+                build.output_dir(state.cache_dir, state.build_name),
+                "exported_model",
+                quant_scheme,
+                quant_algo,
+            )
+
+            # Set default paths only if current values are None
+            if safetensors_model_dir is None:
+                safetensors_model_dir = model_export_path
+            if safetensors_path is None:
+                safetensors_path = os.path.join(model_export_path, "model.safetensors")
+            printing.log_info("Loading model ...")
+            model = state.model
+            if not params_load and not safetensors_model_reload:
+                raise ValueError(
+                    "Specify load format: 'params_load' or 'safetensors_model_reload'."
+                )
+
+            # Reload quantized model if specified
+            from quark.torch import load_params, import_model_info
+
+            if params_load:
+                printing.log_info(
+                    "Restoring quantized model from JSON/safetensors files"
+                )
+                model = load_params(
+                    model,
+                    json_path=json_path,
+                    safetensors_path=safetensors_path,
+                )
+            elif safetensors_model_reload:
+                printing.log_info(
+                    "Restoring quantized model from quark_safetensors files"
+                )
+                model = import_model_info(model, model_info_dir=safetensors_model_dir)
+
+            if torch_compile:
+                printing.log_info("torch.compile...")
+                model = torch.compile(model)
+
+            state.model = model
+            state.dtype = model.dtype
+
+            printing.log_info("Quark Load process completed.")
+
+        except Exception as e:
+            printing.log_error(f"An error occurred during the QuarkLoad process: {e}")
+            raise
+        return state
diff --git a/src/lemonade/tools/quark/quark_quantize.py b/src/lemonade/tools/quark/quark_quantize.py
new file mode 100644
index 0000000..c562aa8
--- /dev/null
+++ b/src/lemonade/tools/quark/quark_quantize.py
@@ -0,0 +1,435 @@
+import argparse
+import os
+import sys
+from pathlib import Path
+
+import torch
+from transformers import AutoProcessor
+from turnkeyml.state import State
+from turnkeyml.tools import Tool
+import turnkeyml.common.printing as printing
+import turnkeyml.common.build as build
+from lemonade_install.install import DEFAULT_QUARK_DIR
+
+
+class QuarkQuantize(Tool):
+    """
+    Quantize a model using the Quark Quantization tool.
+
+    This Tool performs the following steps:
+    1. Downloads and extracts necessary resources from the AMD Quark web page.
+    2. Based on the target model, it prepares the model, tokenizer, and calibration data.
+    3. Optionally quantizes, freezes, and exports the model.
+    4. Optionally evaluates the model.
+
+    Required Input State:
+        - state.model: Pretrained model instance to be quantized.
+        - state.tokenizer: Tokenizer instance from Hugging Face.
+    Output:
+        - Modifies `state` with quantized and optionally exported model.
+
+    See docs/quark.md for more details.
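+
+    Example (a representative CLI invocation, taken from docs/quark.md):
+        lemonade -i facebook/opt-125m huggingface-load quark-quantize --quant-algo awq --quant-scheme w_int8_a_int8_per_tensor_sym --model-export quark_safetensors --device cpu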
+ """ + + unique_name = "quark-quantize" + + def __init__(self): + super().__init__(monitor_message="Quark Quantizing model") + + @staticmethod + def parser(add_help: bool = True) -> argparse.ArgumentParser: + parser = __class__.helpful_parser( + short_description="Quantize a model using Quark", + add_help=add_help, + ) + parser.add_argument( + "--device", + default="cpu", + choices=["cuda", "cpu"], + help="Device for running the quantizer", + ) + parser.add_argument("--multi-gpu", action="store_true") + parser.add_argument( + "--data-type", + default="auto", + choices=["auto", "float16", "bfloat16", "float32"], + help="Input datatype of the model", + ) + parser.add_argument( + "--seq-len", type=int, default=512, help="Sequence length of data" + ) + parser.add_argument( + "--batch-size", type=int, default=1, help="Batch size for calibration." + ) + parser.add_argument( + "--num-fewshot", + type=int, + default=None, + metavar="N", + help="Number of examples in few-shot context", + ) + parser.add_argument( + "--output-dir", default=None, help="Output directory for exported model" + ) + parser.add_argument( + "--no-weight-matrix-merge", + action="store_true", + help="If set, merges onnx model and weight \ + together before export.\ + By default, for onnx export, spits out a model.onnx and a model.weights", + ) + parser.add_argument( + "--dataset", + default="pileval", + choices=[ + "pileval", + "wikitext", + "pileval_for_awq_benchmark", + "wikitext_for_gptq_benchmark", + "HuggingFaceH4/ultrachat_200k", + ], + help="Dataset for calibration", + ) + parser.add_argument( + "--num-calib-data", + type=int, + default=512, + help="Number of samples for calibration.", + ) + + # See docs/quark.md for more details. + parser.add_argument( + "--quant-scheme", + type=str, + default=None, + choices=[ + "w_fp8_a_fp8", + "w_int4_per_channel_sym", + "w_uint4_per_group_asym", + "w_int4_per_group_sym", + "w_uint4_a_bfloat16_per_group_asym", + "w_int8_per_tensor_sym", + "w_int8_per_group_sym", + "w_uint8_per_group_asym", + "w_int8_a_int8_per_tensor_sym", + "w_int8_a_int8_per_tensor_sym_dynamic", + "w_uint8_a_uint8_per_tensor_asym", + "w_fp8_a_fp8_o_fp8", + "w_mx_fp8", + "w_mx_fp8_a_mx_fp8", + "w_int8_a_int8_per_token_dynamic", + "w_bfp16", + "w_bfp16_a_bfp16", + "w_mx6", + "w_mx6_a_mx6", + "w_fp8_per_channel_sym", + "w_int4_per_channel_asym", + "w_int4_per_group_asym", + "w_uint4_per_group_sym", + "w_uint4_per_channel_sym", + "w_uint4_per_channel_asym", + "w_int8_per_tensor_percentile", + "w_int8_per_tensor_mse", + "w_uint8_per_tensor_percentile", + "w_uint8_per_tensor_mse", + "w_mx_fp4_per_group_sym", + "w_mx_fp6_e3m2_per_group_sym", + "w_mx_fp6_e2m3_per_group_sym", + "w_mx_int8_per_group_sym", + "w_uint4_per_channel_a_int8_per_tensor", + "w_uint4_per_group_a_int8_per_tensor", + "w_bfp16_per_group_sym", + None, + ], + help="Supported quantization schemes in Quark", + ) + parser.add_argument( + "--quant-algo", + type=str, + default=None, + choices=["awq", "gptq", "autosmoothquant", None], + help="Support quantization algorithms in Quark", + ) + parser.add_argument( + "--pre-optimization-config-file-path", + type=str, + default=None, + help="The JSON file path of pre-optimization config", + ) + parser.add_argument( + "--quant-algo-config-file-path", + type=str, + default=None, + help="The JSON file path of quantization algorithm config", + ) + parser.add_argument( + "--group-size", + type=int, + default=128, + help="Group size for per_group quantization", + ) + parser.add_argument( + "--pack-method", + type=str, + 
default="reorder", + choices=["order", "reorder"], + help="Pack method for awq_export", + ) + parser.add_argument( + "--exclude-layers", + type=str, + nargs="*", + default=None, + help="List of layers to exclude from quantization.", + ) + parser.add_argument( + "--kv-cache-dtype", + default=None, + choices=["fp8", None], + help="KV Cache dtype.", + ) + parser.add_argument( + "--pre-quantization-optimization", + action="append", + default=[], + choices=["rotation", "smoothquant"], + help="Pre Quantization Optimization.", + ) + parser.add_argument( + "--model-export", + default=None, + action="append", + choices=[ + None, + "onnx", + "vllm_adopted_safetensors", + "quark_safetensors", + "gguf", + ], + help="Model export format", + ) + parser.add_argument( + "--custom-mode", + default="quark", + type=str, + choices=["quark", "awq", "fp8"], + help="Custom mode for export \ + This is especially relevant for npu/hybrid export", + ) + parser.add_argument( + "--torch-compile", + action="store_true", + help="Compile the quantized model using torch.compile", + ) + parser.add_argument( + "--params-save", action="store_true", help="Save model params" + ) + parser.add_argument( + "--save-dir", + help="Directory to save model parameters as \ + safetensors or pth, in the case when --params_save is used.", + ) + parser.add_argument( + "--log-severity-level", type=int, default=3, help="DEBUG=1, INFO=2, ERROR=3" + ) + parser.add_argument("--skip-quantization", action="store_true") + + return parser + + def run(self, state: State, **kwargs) -> State: + """ + Executes the QuarkQuantize process. + + Args: + state (State): The current state of the process, containing necessary + information such as cache directory and build name. + **kwargs: Additional keyword arguments that may include: + - output_dir (str): Directory to save the output model. + - safetensors_model_dir (str): Directory to save the safetensors model. + - save_dir (str): Directory to save model parameters. + - safetensors_path (str): Path to the safetensors model. + - quant_algo (str): The quantization algorithm to use. + - quant_algo_config_file_path (str): Path to the quantization algorithm + configuration file. + - model_dir (str): Directory of the model. + Returns: + State: The updated state after the quantization process. + Raises: + Exception: If an error occurs during the QuarkQuantize process + and when installation path does not exist. + """ + + try: + + if os.path.isdir(DEFAULT_QUARK_DIR): + quark_llm_path = os.path.join( + DEFAULT_QUARK_DIR, "examples", "torch", "language_modeling" + ) + sys.path.extend([quark_llm_path]) + else: + raise FileNotFoundError( + f"The directory {DEFAULT_QUARK_DIR} does not exist. \ + Please check your installation." 
+ ) + model_build_path = os.path.join( + build.output_dir(state.cache_dir, state.build_name) + ) + model_export_path = os.path.join( + model_build_path, + "exported_model", + kwargs.get("quant_scheme"), + kwargs.get("quant_algo"), + ) + # Set default paths only if current values are None + if kwargs.get("model_dir") is None: + kwargs["model_dir"] = model_build_path + if kwargs.get("output_dir") is None: + kwargs["output_dir"] = model_export_path + if kwargs.get("save_dir") is None: + kwargs["save_dir"] = os.path.join(model_export_path, "model_params") + + from llm_utils.model_preparation import get_model_type + + model_type = get_model_type(state.model) + + quant_algo = kwargs.get("quant_algo") + kwargs["quant_algo_config_file_path"] = os.path.join( + quark_llm_path, + "llm_ptq", + "models", + model_type, + f"{quant_algo}_config.json", + ) + + self._quantize(state, **kwargs) + + except Exception as e: + printing.log_error(f"Error during the QuarkQuantize process: {e}") + raise + return state + + def _quantize(self, state: State, **kwargs) -> None: + """ + Main quantization and export process. + + This method is responsible for: + - Loading the model and tokenizer. + - Preparing the calibration dataset. + - Quantizing the model. + - Optionally exporting, compiling, and evaluating the model. + """ + + model = state.model + tokenizer = state.tokenizer + + # Importing quark utils after adding to sys.path + from llm_utils.data_preparation import get_calib_dataloader + from llm_utils.model_preparation import get_model_type + from llm_ptq.configuration_preparation import get_config, get_export_config + from quark.torch import ModelQuantizer, ModelExporter, save_params + + # 1. Load Model + printing.log_info("Loading model ...") + model_type = get_model_type(model) + + # [mllama specifics] + if model_type == "mllama" and kwargs.get("model_export") is not None: + processor = AutoProcessor.from_pretrained(kwargs.get("model_dir")) + export_dir = Path(kwargs.get("output_dir")) + export_dir.mkdir(parents=True, exist_ok=True) + processor.save_pretrained(kwargs.get("output_dir")) + + # 2. Load dataset + printing.log_info("Loading dataset ...") + main_device = model.device if kwargs.get("multi_gpu") else kwargs.get("device") + calib_dataloader = get_calib_dataloader( + dataset_name=kwargs.get("dataset"), + tokenizer=tokenizer, + batch_size=1, + num_calib_data=kwargs.get("num_calib_data"), + seqlen=kwargs.get("seq_len"), + device=main_device, + ) + + # 3. Quantize model + if not kwargs.get("skip_quantization"): + printing.log_info("Starting quantization process ...") + args = argparse.Namespace(**kwargs) + quant_config = get_config(args, model_type) + quant_config.log_severity_level = kwargs.get("log_severity_level", 3) + quantizer = ModelQuantizer(quant_config) + model = quantizer.quantize_model(model, calib_dataloader) + printing.log_info("Quantization completed.") + + if ( + kwargs.get("model_export") is not None + or kwargs.get("params_save") + or kwargs.get("torch_compile") + ): + printing.log_info("Freezing the quantized model ...") + model = quantizer.freeze(model) + + # 4. 
Export model + if kwargs.get("model_export") is not None: + printing.log_info("Exporting the model ...") + export_path = kwargs.get("output_dir") + + args = argparse.Namespace(**kwargs) + export_config = get_export_config(args, model_type) + exporter = ModelExporter(config=export_config, export_dir=export_path) + if "quark_safetensors" in kwargs.get("model_export"): + printing.log_info("Exporting quark native json and safetensors...") + with torch.no_grad(): + quant_config = get_config(args, model_type) + exporter.export_model_info( + model, + quant_config=quant_config, + tokenizer=tokenizer, + custom_mode=kwargs.get("custom_mode"), + ) + if "vllm_adopted_safetensors" in kwargs.get("model_export"): + printing.log_info("Exporting vllm adopted json and safetensors...") + with torch.inference_mode(): + exporter.export_model_info( + model, + model_type=model_type, + model_dtype=state.dtype, + export_type="vllm-adopt", + ) + if "onnx" in kwargs.get("model_export"): + printing.log_info("Exporting onnx graph...") + with torch.inference_mode(): + batch_iter = iter(calib_dataloader) + input_args = next(batch_iter) + if kwargs.get("quant_scheme") in [ + "w_int4_per_channel_sym", + "w_uint4_per_group_asym", + "w_int4_per_group_sym", + "w_uint4_a_bfloat16_per_group_asym", + ]: + uint4_int4_flag = True + else: + uint4_int4_flag = False + exporter.export_onnx_model( + model, input_args, uint4_int4_flag=uint4_int4_flag + ) + if "gguf" in kwargs.get("model_export"): + printing.log_info("Exporting gguf model...") + with torch.inference_mode(): + exporter.export_gguf_model( + model, kwargs.get("model_dir"), model_type + ) + + # 6. [Optional] Compile model + if kwargs.get("torch_compile"): + printing.log_info("torch.compile...") + model = torch.compile(model) + + # 7. 
Save model parameters + if kwargs.get("params_save"): + printing.log_info("Saving model parameters ...") + save_params(model, model_type=model_type, export_dir=kwargs.get("save_dir")) + + state.model = model + state.dtype = model.dtype + printing.log_info("QuarkQuantize process completed.") diff --git a/src/lemonade/tools/ryzenai_npu/ryzenai_npu.py b/src/lemonade/tools/ryzenai_npu/ryzenai_npu.py deleted file mode 100644 index 1d94d44..0000000 --- a/src/lemonade/tools/ryzenai_npu/ryzenai_npu.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -import argparse -import torch -from transformers import ( - LlamaForCausalLM, - LlamaTokenizer, - AutoTokenizer, - PreTrainedTokenizerFast, -) -from ryzenai_llm_engine import RyzenAILLMEngine, TransformConfig -from ryzenai_llm_quantizer import QuantConfig, RyzenAILLMQuantizer -from modeling_phi3 import Phi3ForCausalLM -from turnkeyml.state import State -from turnkeyml.tools import FirstTool -from lemonade.tools.adapter import ModelAdapter -from lemonade.cache import Keys - -npu_root_dir = os.path.dirname(__file__) -quantized_models_path = os.path.join(npu_root_dir, "quantized_models") -if not os.path.exists(quantized_models_path): - os.mkdir(quantized_models_path) - - -class LlamaModelEval(LlamaForCausalLM): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.model_name = "llama-2-7b-chat" - self.tokenizer = None - - def forward(self, *args, **kwargs): - outputs = super().forward(*args, **kwargs) # pylint: disable=no-member - return outputs - - -class Phi3ModelEval(Phi3ForCausalLM): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.model_name = "phi-3-mini-4k-instruct" - self.tokenizer = None - - def forward(self, *args, **kwargs): - outputs = super().forward(*args, **kwargs) - return outputs - - def get_position_embeddings(self): - raise NotImplementedError( - f"`get_position_embeddings` is not implemented for {self.__class__}`. " - f"To implement it, you should overwrite this method in the class {self.__class__} " - f"in `modeling_{self.__class__.__module__}.py`" - ) - - def resize_position_embeddings(self, new_num_position_embeddings: int): - raise NotImplementedError( - f"`resize_position_embeddings` is not implemented for {self.__class__}`." - f"To implement it, you should overwrite this method in the class {self.__class__} " - f"in `modeling_{self.__class__.__module__}.py`" - ) - - -class RyzenAiModel(ModelAdapter): - """ - RyzenAI NPU models require an attention_mask of all 1's to be passed - as input to generate. This class exists for the purpose of inserting - that attention mask. - """ - - def __init__(self, model): - super().__init__() - self.model = model - - # pylint: disable=arguments-differ - def generate(self, input_ids, **kwargs): - attention_mask = torch.ones(input_ids.shape) - return self.model.generate( - input_ids=input_ids, attention_mask=attention_mask, **kwargs - ) - - def __getattr__(self, name): - """ - Forward all attribute access to self.model. - """ - return getattr(self.model, name) - - -class RyzenAINPULoad(FirstTool): - """ - Tool that loads an LLM checkpoint on to a RyzenAI NPU. - - Input: the name or path to a checkpoint. 
Supported options: - "TheBloke/Llama-2-7b-Chat-fp16" - "meta-llama/Llama-2-7b-chat-hf" - "microsoft/Phi-3-mini-4k-instruct" - "meta-llama/Meta-Llama-3-8B-Instruct" - "meta-llama/Meta-Llama-3-8B" - - Output: - state.model: handle to a Huggingface-style LLM loaded on NPU - state.tokenizer = Huggingface-style LLM tokenizer instance - state.dtype = data type of the model on NPU - - Note: This tool expects the ryzenai-transformers library to be pre-installed. - If that library is not installed, this tool will not load. - """ - - unique_name = "ryzenai-npu-load" - - def __init__(self): - super().__init__(monitor_message="Loading LLM on RyzenAI NPU") - - self.status_stats = [Keys.DTYPE] - - @staticmethod - def parser(add_help: bool = True) -> argparse.ArgumentParser: - parser = __class__.helpful_parser( - short_description="Quantize and transform a model using AWQ \ - in int4 format in RyzenAI NPU", - add_help=add_help, - ) - - parser.add_argument("--device", required=True, choices=["phx", "stx"]) - - return parser - - # pylint: disable=C0103 - def run(self, state: State, input: str = "", device=None) -> State: - - checkpoint = input - - w_bit = 4 - group_size = 128 - - if ( - checkpoint == "TheBloke/Llama-2-7b-Chat-fp16" - or checkpoint == "meta-llama/Llama-2-7b-chat-hf" - ): - model_name = "llama-2-7b-chat" - algorithm = "awqplus" - flash_attention_plus = False - trust_remote_code = False - CausalLMModel = LlamaModelEval - LMTokenizer = LlamaTokenizer - quantized_model_path = os.path.join( - quantized_models_path, - f"quantized_llama-2-7b-chat_w{w_bit}_g{group_size}_{algorithm}.pth", - ) - - elif ( - checkpoint == "meta-llama/Meta-Llama-3-8B-Instruct" - or checkpoint == "meta-llama/Meta-Llama-3-8B" - ): - model_name = checkpoint.replace("meta-llama/", "") - algorithm = "awqplus" - flash_attention_plus = False - trust_remote_code = False - CausalLMModel = LlamaModelEval - LMTokenizer = PreTrainedTokenizerFast - quantized_model_path = os.path.join( - quantized_models_path, - f"quantized_{model_name}_w{w_bit}_g{group_size}_{algorithm}.pth", - ) - - elif checkpoint == "microsoft/Phi-3-mini-4k-instruct": - model_name = "phi-3-mini-4k-instruct" - algorithm = "pergrp" - flash_attention_plus = False - trust_remote_code = True - CausalLMModel = Phi3ModelEval - LMTokenizer = AutoTokenizer - - quantized_model_path = os.path.join( - quantized_models_path, - f"quantized_Phi-3-mini-4k-instruct_w{w_bit}_g{group_size}_{algorithm}.pth", - ) - - else: - raise ValueError(f"Model {checkpoint} is not a supported model.") - - if not os.path.exists(quantized_model_path): - - model = CausalLMModel.from_pretrained( - checkpoint, - torch_dtype=torch.bfloat16, - trust_remote_code=trust_remote_code, - attn_implementation="eager", - ) - - model.tokenizer = LMTokenizer.from_pretrained( - checkpoint, trust_remote_code=trust_remote_code - ) - - quant_config = QuantConfig( - quant_mode=algorithm, - model_name=checkpoint, - dataset="raw", - w_bit=w_bit, - group_size=group_size, - use_qscales=True, - ) - - model = RyzenAILLMQuantizer.quantize(model, quant_config=quant_config) - torch.save(model, quantized_model_path) - else: - model = torch.load(quantized_model_path) - - if device == "phx": - fast_attention = False - elif device == "stx": - fast_attention = True - else: - raise Exception(f"Use a supported device instead of {device}") - - # Different library versions support different flags - # We maintain a safe set of flags and a cutting-edge set of flags, - # and attempt each - try: - transform_config = TransformConfig( - 
flash_attention_plus=flash_attention_plus, - fast_attention=fast_attention, - fast_mlp=device != "phx", - fast_norm=device != "phx", - precision="w4abf16", - model_name=model_name, - target="aie", - w_bit=w_bit, - group_size=group_size, - profilegemm=False, - ) - except TypeError: - transform_config = TransformConfig( - flash_attention_plus=False, - fast_attention=False, - fast_mlp=False, - precision="w4abf16", - model_name=model_name, - target="aie", - w_bit=w_bit, - group_size=group_size, - profilegemm=False, - ) - - model = RyzenAILLMEngine.transform(model, transform_config) - model = model.to(torch.bfloat16) - model.eval() - - state.model = RyzenAiModel(model) - state.tokenizer = model.tokenizer - state.dtype = "int4" - - state.save_stat(Keys.CHECKPOINT, checkpoint) - state.save_stat(Keys.DEVICE, "ryzenai-npu") - state.save_stat(Keys.DTYPE, "int4") - - return state diff --git a/src/lemonade_install/install.py b/src/lemonade_install/install.py index 8dfa8ad..547c6af 100644 --- a/src/lemonade_install/install.py +++ b/src/lemonade_install/install.py @@ -10,10 +10,10 @@ import subprocess import sys import shutil +from pathlib import Path from typing import Optional import zipfile import requests -from pathlib import Path lemonade_install_dir = Path(__file__).parent.parent.parent @@ -27,6 +27,10 @@ DEFAULT_AMD_OGA_HYBRID_DIR, "hybrid-llm-artifacts_1.3.0_lounge", ) +DEFAULT_QUARK_VERSION = "quark-0.6.0" +DEFAULT_QUARK_DIR = os.path.join( + lemonade_install_dir, "install", "quark", DEFAULT_QUARK_VERSION +) def download_lfs_file(token, file, output_filename): @@ -67,11 +71,23 @@ def download_lfs_file(token, file, output_filename): raise ValueError(f"Error: {output_filename} does not exist.") -def download_file(url, output_filename): - response = requests.get(url) +def download_file(url: str, output_filename: str, description: str = None): + try: + response = requests.get(url) + if response.status_code != 200: + raise Exception( + f"Failed to fetch the content from GitHub API. \ + Status code: {response.status_code}, Response: {response.json()}" + ) - with open(output_filename, "wb") as file: - file.write(response.content) + with open(output_filename, "wb") as file: + file.write(response.content) + + if not os.path.isfile(output_filename): + raise Exception(f"\nError: Failed to write to {output_filename}") + + except Exception as e: + raise Exception(f"\nError downloading {description or 'file'}: {str(e)}") def unzip_file(zip_path, extract_to): @@ -80,6 +96,67 @@ def unzip_file(zip_path, extract_to): zip_ref.extractall(extract_to) +def download_and_extract_package( + url: str, + version: str, + install_dir: str, + package_name: str, +) -> str: + """ + Downloads, Extracts and Renames the folder + + Args: + url: Download URL for the package + version: Version string + install_dir: Directory to install to + package_name: Name of the package + + Returns: + str: Path where package was extracted (renamed to package-version) + """ + zip_filename = f"{package_name}-{version}.zip" + zip_path = os.path.join(install_dir, zip_filename) + target_folder = os.path.join(install_dir, f"{package_name}-{version}") + + print(f"\nDownloading {package_name} from {url}") + response = requests.get(url) + if response.status_code == 200: + with open(zip_path, "wb") as f: + f.write(response.content) + else: + raise Exception( + f"Failed to download {package_name}. 
Status code: {response.status_code}" + ) + + print("\n[INFO]: Extracting zip file ...") + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(install_dir) + print("\n[INFO]: Extraction completed.") + + os.remove(zip_path) + + extracted_folder = None + for folder in os.listdir(install_dir): + folder_path = os.path.join(install_dir, folder) + if os.path.isdir(folder_path) and folder.startswith(f"{package_name}-"): + extracted_folder = folder_path + break + + if extracted_folder is None: + raise ValueError( + f"Error: Extracted folder for {package_name} version {version} not found." + ) + + # Rename extracted folder to package-version + if extracted_folder != target_folder: + if os.path.exists(target_folder): + shutil.rmtree(target_folder) # Remove if already exists + os.rename(extracted_folder, target_folder) + print(f"\n[INFO]: Renamed folder to {target_folder}") + + return target_folder + + class LicenseRejected(Exception): """ Raise an exception if the user rejects the license prompt. @@ -118,15 +195,25 @@ def parser() -> argparse.ArgumentParser: "variable (e.g., Ryzen AI uses environment variable OGA_TOKEN).", ) + parser.add_argument( + "--quark", + help="Install Quark Quantization tool for LLMs", + choices=["0.6.0"], + ) + return parser def run( self, ryzenai: Optional[str] = None, + quark: Optional[str] = None, yes: bool = False, token: Optional[str] = None, ): - + if ryzenai is None and quark is None: + raise ValueError( + "You must select something to install, for example `--ryzenai` and/or `--quark`" + ) if ryzenai is not None: if ryzenai == "npu": file = "ryzen_ai_13_ga/npu-llm-artifacts_1.3.0.zip" @@ -220,10 +307,32 @@ def run( # Delete the zip file print(f"\nCleaning up, removing {archive_file_path}\n") os.remove(archive_file_path) - else: - raise ValueError( - "You must select something to install, for example `--ryzenai`" + + if quark is not None: + quark_install_dir = os.path.join(lemonade_install_dir, "install", "quark") + os.makedirs(quark_install_dir, exist_ok=True) + + # Install Quark utilities + quark_url = f"https://www.xilinx.com/bin/public/openDownload?filename=quark-{quark}.zip" + quark_path = download_and_extract_package( + url=quark_url, + version=quark, + install_dir=quark_install_dir, + package_name="quark", + ) + # Install Quark wheel + wheel_url = f"https://www.xilinx.com/bin/public/openDownload?filename=quark-{quark}-py3-none-any.whl" + wheel_path = os.path.join( + quark_install_dir, f"quark-{quark}-py3-none-any.whl" ) + print(f"\nInstalling Quark wheel from {wheel_url}") + download_file(wheel_url, wheel_path, "wheel file") + + install_cmd = f"{sys.executable} -m pip install --no-deps {wheel_path}" + subprocess.run(install_cmd, check=True, shell=True) + os.remove(wheel_path) + + print(f"\nQuark installed successfully at: {quark_path}") def main(): diff --git a/test/lemonade/quark_api.py b/test/lemonade/quark_api.py new file mode 100644 index 0000000..f518919 --- /dev/null +++ b/test/lemonade/quark_api.py @@ -0,0 +1,53 @@ +import unittest +import shutil +import os +from turnkeyml.state import State +import turnkeyml.common.test_helpers as common +from lemonade.tools.chat import LLMPrompt +from lemonade.tools.huggingface_load import HuggingfaceLoad +from lemonade.tools.quark.quark_quantize import QuarkQuantize +from lemonade.tools.quark.quark_load import QuarkLoad + + +class Testing(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Load default args from QuarkQuantize parser + parser = QuarkQuantize.parser() + cls.default_args = 
vars(parser.parse_args([]))
+
+    def setUp(self) -> None:
+        shutil.rmtree(cache_dir, ignore_errors=True)
+
+    def test_001_quantize(self):
+        """
+        Run QuarkQuantize with the quark_safetensors export target (quantization
+        itself is skipped to keep CI fast), then prompt the model for a response.
+        """
+        checkpoint = "facebook/opt-125m"
+        device = "cpu"
+        prompt = "What if?"
+
+        state = State(cache_dir=cache_dir, build_name="test")
+        state = HuggingfaceLoad().run(state, input=checkpoint)
+
+        quantize_args = {
+            "model_export": "quark_safetensors",
+            "quant_algo": "awq",
+            "quant_scheme": "w_uint4_per_group_asym",
+            "device": device,
+            "skip_quantization": True,
+        }
+        # Combine test-specific quant args with the parser defaults
+        quantize_args = {**self.default_args, **quantize_args}
+        state = QuarkQuantize().run(state, **quantize_args)
+        state = LLMPrompt().run(state, prompt=prompt, max_new_tokens=10)
+
+        assert len(state.response) > 0, state.response
+
+
+if __name__ == "__main__":
+    cache_dir, _ = common.create_test_dir(
+        "lemonade_quark_api", base_dir=os.path.abspath(".")
+    )
+    unittest.main()
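
For reference, the flow exercised by `test/lemonade/quark_api.py` can also be driven directly from a Python session. The sketch below mirrors the test: it starts from the `QuarkQuantize` parser defaults, overrides a few of them, and prompts the model afterwards. The cache directory, build name, and override values are illustrative assumptions rather than requirements of the API; the imports and call signatures are taken from the test itself.

```python
# Minimal sketch of the Quark quantize-and-prompt flow, mirroring quark_api.py.
# The cache directory and override values are assumptions for illustration.
import os

from turnkeyml.state import State
from lemonade.tools.chat import LLMPrompt
from lemonade.tools.huggingface_load import HuggingfaceLoad
from lemonade.tools.quark.quark_quantize import QuarkQuantize

# Start from the tool's own defaults so every expected kwarg is present
defaults = vars(QuarkQuantize.parser().parse_args([]))

state = State(
    cache_dir=os.path.abspath("quark_demo_cache"),  # assumed scratch location
    build_name="quark_demo",
)
state = HuggingfaceLoad().run(state, input="facebook/opt-125m")

overrides = {
    "model_export": "quark_safetensors",
    "quant_algo": "awq",
    "quant_scheme": "w_uint4_per_group_asym",
    "device": "cpu",
    # As in the CI test, add "skip_quantization": True to exercise the export
    # path without running the (slow) quantization step itself.
}
state = QuarkQuantize().run(state, **{**defaults, **overrides})

# Prompt the resulting model and print its response
state = LLMPrompt().run(state, prompt="What if?", max_new_tokens=10)
print(state.response)
```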