diff --git a/.gitignore b/.gitignore index ba4939dc87..1127dd67a6 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ venv/ *.log settings.json +settings.yaml notification.mp3 img_bot* img_me* diff --git a/README.md b/README.md index f767411c53..e6ca1caeff 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,6 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. * [4-bit GPTQ mode](docs/GPTQ-models-(4-bit-mode).md) * [LoRA (loading and training)](docs/Using-LoRAs.md) * [llama.cpp](docs/llama.cpp-models.md) -* [RWKV model](docs/RWKV-model.md) * 8-bit and 4-bit through bitsandbytes * Layers splitting across GPU(s), CPU, and disk * CPU mode @@ -53,8 +52,6 @@ Just download the zip above, extract it, and double-click on "start". The web UI Recommended if you have some experience with the command line. -On Windows, I additionally recommend carrying out the installation on WSL instead of the base system: [WSL installation guide](https://github.com/oobabooga/text-generation-webui/blob/main/docs/WSL-installation-guide.md). - #### 0. Install Conda https://docs.conda.io/en/latest/miniconda.html @@ -81,6 +78,7 @@ conda activate textgen | Linux/WSL | NVIDIA | `pip3 install torch torchvision torchaudio` | | Linux | AMD | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2` | | MacOS + MPS (untested) | Any | `pip3 install torch torchvision torchaudio` | +| Windows | NVIDIA | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117` | The up-to-date commands can be found here: https://pytorch.org/get-started/locally/. @@ -97,7 +95,7 @@ cd text-generation-webui pip install -r requirements.txt ``` -#### 4. Install GPTQ-for-LLaMa and the monkey patch +#### 4. Install GPTQ The base installation covers [transformers](https://github.com/huggingface/transformers) models (`AutoModelForCausalLM` and `AutoModelForSeq2SeqLM` specifically) and [llama.cpp](https://github.com/ggerganov/llama.cpp) (GGML) models. @@ -105,17 +103,17 @@ To use GPTQ models, the additional installation steps below are necessary: [GPTQ models (4 bit mode)](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md) -#### Note about bitsandbytes +#### llama.cpp with GPU acceleration + +Requires the additional compilation step described here: [GPU acceleration](https://github.com/oobabooga/text-generation-webui/blob/main/docs/llama.cpp-models.md#gpu-acceleration). + +#### bitsandbytes bitsandbytes >= 0.39 may not work on older NVIDIA GPUs. In that case, to use `--load-in-8bit`, you may have to downgrade like this: * Linux: `pip install bitsandbytes==0.38.1` * Windows: `pip install https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl` -### Alternative: manual Windows installation - -As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Windows installation guide](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Windows-installation-guide.md). - ### Alternative: Docker ``` @@ -126,7 +124,7 @@ docker compose up --build ``` * You need to have docker compose v2.17 or higher installed. See [this guide](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Docker.md) for instructions. 
-* For additional docker files, check out [this repository](https://github.com/Atinoda/text-generation-webui/blob/docker-wrapper/docs/Docker.md#dedicated-docker-repository). +* For additional docker files, check out [this repository](https://github.com/Atinoda/text-generation-webui-docker). ### Updating the requirements @@ -156,7 +154,9 @@ For example: python download-model.py facebook/opt-1.3b -If you want to download a model manually, note that all you need are the json, txt, and pytorch\*.bin (or model*.safetensors) files. The remaining files are not necessary. +* If you want to download a model manually, note that all you need are the json, txt, and pytorch\*.bin (or model*.safetensors) files. The remaining files are not necessary. + +* Set env vars `HF_USER` and `HF_PASS` to your Hugging Face username and password (or [User Access Token](https://huggingface.co/settings/tokens)) to download a protected model. The model's terms must first be accepted on the HF website. #### GGML models @@ -207,7 +207,7 @@ Optionally, you can use the following command-line flags: | `--lora-dir LORA_DIR` | Path to directory with all the loras. | | `--model-menu` | Show a model menu in the terminal when the web UI is first launched. | | `--no-stream` | Don't stream the text output in real time. | -| `--settings SETTINGS_FILE` | Load the default interface settings from this json file. See `settings-template.json` for an example. If you create a file called `settings.json`, this file will be loaded by default without the need to use the `--settings` flag. | +| `--settings SETTINGS_FILE` | Load the default interface settings from this yaml file. See `settings-template.yaml` for an example. If you create a file called `settings.yaml`, this file will be loaded by default without the need to use the `--settings` flag. | | `--extensions EXTENSIONS [EXTENSIONS ...]` | The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. | | `--verbose` | Print the prompts to the terminal. | @@ -217,7 +217,7 @@ Optionally, you can use the following command-line flags: |---------------------------------------------|-------------| | `--cpu` | Use the CPU to generate text. Warning: Training on CPU is extremely slow.| | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU. | -| `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` | Maxmimum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. You can also set values in MiB like `--gpu-memory 3500MiB`. | +| `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` | Maximum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. You can also set values in MiB like `--gpu-memory 3500MiB`. | | `--cpu-memory CPU_MEMORY` | Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.| | `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. | | `--disk-cache-dir DISK_CACHE_DIR` | Directory to save the disk cache to. Defaults to `cache/`. | @@ -226,7 +226,7 @@ Optionally, you can use the following command-line flags: | `--no-cache` | Set `use_cache` to False while generating text. This reduces the VRAM usage a bit with a performance cost. | | `--xformers` | Use xformer's memory efficient attention. This should increase your tokens/s. | | `--sdp-attention` | Use torch 2.0's sdp attention. 
| -| `--trust-remote-code` | Set trust_remote_code=True while loading a model. Necessary for ChatGLM. | +| `--trust-remote-code` | Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon. | #### Accelerate 4-bit @@ -266,6 +266,14 @@ Optionally, you can use the following command-line flags: | `--warmup_autotune` | (triton) Enable warmup autotune. | | `--fused_mlp` | (triton) Enable fused mlp. | +#### AutoGPTQ + +| Flag | Description | +|------------------|-------------| +| `--autogptq` | Use AutoGPTQ for loading quantized models instead of the internal GPTQ loader. | +| `--triton` | Use triton. | +|` --desc_act` | For models that don't have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig. | + #### FlexGen | Flag | Description | @@ -308,6 +316,8 @@ Optionally, you can use the following command-line flags: |---------------------------------------|-------------| | `--api` | Enable the API extension. | | `--public-api` | Create a public URL for the API using Cloudfare. | +| `--api-blocking-port BLOCKING_PORT` | The listening port for the blocking API. | +| `--api-streaming-port STREAMING_PORT` | The listening port for the streaming API. | #### Multimodal @@ -319,9 +329,9 @@ Out of memory errors? [Check the low VRAM guide](docs/Low-VRAM-guide.md). ## Presets -Inference settings presets can be created under `presets/` as text files. These files are detected automatically at startup. +Inference settings presets can be created under `presets/` as yaml files. These files are detected automatically at startup. -By default, 10 presets by NovelAI and KoboldAI are included. These were selected out of a sample of 43 presets after applying a K-Means clustering algorithm and selecting the elements closest to the average of each cluster. +By default, 10 presets based on NovelAI and KoboldAI presets are included. These were selected out of a sample of 43 presets after applying a K-Means clustering algorithm and selecting the elements closest to the average of each cluster. [Visualization](https://user-images.githubusercontent.com/112222186/228956352-1addbdb9-2456-465a-b51d-089f462cd385.png) @@ -333,18 +343,13 @@ https://github.com/oobabooga/text-generation-webui/tree/main/docs ## Contributing -Pull requests, suggestions, and issue reports are welcome. - -You are also welcome to review open pull requests. - -Before reporting a bug, make sure that you have: - -1. Created a conda environment and installed the dependencies exactly as in the *Installation* section above. -2. [Searched](https://github.com/oobabooga/text-generation-webui/issues) to see if an issue already exists for the issue you encountered. +* Pull requests, suggestions, and issue reports are welcome. +* Make sure to carefully [search](https://github.com/oobabooga/text-generation-webui/issues) existing issues before starting a new one. +* If you have some experience with git, testing an open pull request and leaving a comment on whether it works as expected or not is immensely helpful. +* A simple way to contribute, even if you are not a programmer, is to leave a 👍 on an issue or pull request that you find relevant. ## Credits - Gradio dropdown menu refresh button, code for reloading the interface: https://github.com/AUTOMATIC1111/stable-diffusion-webui -- Verbose preset: Anonymous 4chan user. 
- NovelAI and KoboldAI presets: https://github.com/KoboldAI/KoboldAI-Client/wiki/Settings-Presets - Code for early stopping in chat mode, code for some of the sliders: https://github.com/PygmalionAI/gradio-ui/ diff --git a/api-example-chat-stream.py b/api-example-chat-stream.py index 337e313606..7314563f0a 100644 --- a/api-example-chat-stream.py +++ b/api-example-chat-stream.py @@ -39,6 +39,8 @@ async def run(user_input, history): 'typical_p': 1, 'epsilon_cutoff': 0, # In units of 1e-4 'eta_cutoff': 0, # In units of 1e-4 + 'tfs': 1, + 'top_a': 0, 'repetition_penalty': 1.18, 'top_k': 40, 'min_length': 0, diff --git a/api-example-chat.py b/api-example-chat.py index 97f552e2cd..905fbca6e2 100644 --- a/api-example-chat.py +++ b/api-example-chat.py @@ -33,6 +33,8 @@ def run(user_input, history): 'typical_p': 1, 'epsilon_cutoff': 0, # In units of 1e-4 'eta_cutoff': 0, # In units of 1e-4 + 'tfs': 1, + 'top_a': 0, 'repetition_penalty': 1.18, 'top_k': 40, 'min_length': 0, diff --git a/api-example-stream.py b/api-example-stream.py index 71eaa30c05..43cde299f1 100644 --- a/api-example-stream.py +++ b/api-example-stream.py @@ -26,6 +26,8 @@ async def run(context): 'typical_p': 1, 'epsilon_cutoff': 0, # In units of 1e-4 'eta_cutoff': 0, # In units of 1e-4 + 'tfs': 1, + 'top_a': 0, 'repetition_penalty': 1.18, 'top_k': 40, 'min_length': 0, diff --git a/api-example.py b/api-example.py index 5cd0243a2c..4e4a7f663b 100644 --- a/api-example.py +++ b/api-example.py @@ -18,6 +18,8 @@ def run(prompt): 'typical_p': 1, 'epsilon_cutoff': 0, # In units of 1e-4 'eta_cutoff': 0, # In units of 1e-4 + 'tfs': 1, + 'top_a': 0, 'repetition_penalty': 1.18, 'top_k': 40, 'min_length': 0, diff --git a/characters/instruction-following/Bluemoon.yaml b/characters/instruction-following/Bluemoon.yaml new file mode 100644 index 0000000000..e53000820a --- /dev/null +++ b/characters/instruction-following/Bluemoon.yaml @@ -0,0 +1,4 @@ +user: "LEAD:" +bot: "ASSOCIATE:" +turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|>\n" +context: "A transcript of a roleplay between two players, LEAD and ASSOCIATE. LEAD sets up a scenario and the characters, from which ASSOCIATE then assumes a character role and continues the story for that role in response to description given by LEAD. The story and characters are developed by exchange of detailed event descriptions and character dialogs, successively given by both LEAD and ASSOCIATE.\n" diff --git a/css/chat.css b/css/chat.css index 4709e8deac..b5c42d0cc2 100644 --- a/css/chat.css +++ b/css/chat.css @@ -41,3 +41,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .sm.svelte-1ipelgc { width: 100%; } + +#main button { + min-width: 0 !important; +} diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md index 0ec28fa6c6..f714840334 100644 --- a/docs/GPTQ-models-(4-bit-mode).md +++ b/docs/GPTQ-models-(4-bit-mode).md @@ -1,12 +1,33 @@ -In 4-bit mode, models are loaded with just 25% of their regular VRAM usage. So LLaMA-7B fits into a 6GB GPU, and LLaMA-30B fits into a 24GB GPU. +GPTQ is a clever quantization algorithm that lightly reoptimizes the weights during quantization so that the accuracy loss is compensated relative to a round-to-nearest quantization. See the paper for more details: https://arxiv.org/abs/2210.17323 -This is possible thanks to [@qwopqwop200](https://github.com/qwopqwop200/GPTQ-for-LLaMa)'s adaptation of the GPTQ algorithm for LLaMA: https://github.com/qwopqwop200/GPTQ-for-LLaMa +4-bit GPTQ models reduce VRAM usage by about 75%. 
So LLaMA-7B fits into a 6GB GPU, and LLaMA-30B fits into a 24GB GPU. -GPTQ is a clever quantization algorithm that lightly reoptimizes the weights during quantization so that the accuracy loss is compensated relative to a round-to-nearest quantization. See the paper for more details: https://arxiv.org/abs/2210.17323 +## Overview + +There are two ways of loading GPTQ models in the web UI at the moment: + +* Using GPTQ-for-LLaMa directly: + * faster CPU offloading + * faster multi-GPU inference + * supports loading LoRAs using a monkey patch + * included by default in the one-click installers + * requires you to manually figure out the wbits/groupsize/model_type parameters for the model to be able to load it + * supports either only cuda or only triton depending on the branch + +* Using AutoGPTQ: + * supports more models + * standardized (no need to guess any parameter) + * is a proper Python library + * ~no wheels are presently available so it requires manual compilation~ + * supports loading both triton and cuda models -## GPTQ-for-LLaMa branches +For creating new quantizations, I recommend using AutoGPTQ: https://github.com/PanQiWei/AutoGPTQ -Different branches of GPTQ-for-LLaMa are available: +## GPTQ-for-LLaMa + +GPTQ-for-LLaMa is the original adaptation of GPTQ for the LLaMA model. It was made possible by [@qwopqwop200](https://github.com/qwopqwop200/GPTQ-for-LLaMa): https://github.com/qwopqwop200/GPTQ-for-LLaMa + +Different branches of GPTQ-for-LLaMa are currently available, including: | Branch | Comment | |----|----| @@ -16,62 +37,76 @@ Different branches of GPTQ-for-LLaMa are available: Overall, I recommend using the old CUDA branch. It is included by default in the one-click-installer for this web UI. -## Installation +### Installation -### Step 0: install nvcc +Start by cloning GPTQ-for-LLaMa into your `text-generation-webui/repositories` folder: ``` -conda activate textgen -conda install -c conda-forge cudatoolkit-dev +mkdir repositories +cd repositories +git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda ``` -The command above takes some 10 minutes to run and shows no progress bar or updates along the way. +If you want to you to use the up-to-date CUDA or triton branches instead of the old CUDA branch, use these commands: + +``` +git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b cuda +``` + +``` +git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b triton +``` -See this issue for more details: https://github.com/oobabooga/text-generation-webui/issues/416#issuecomment-1475078571 +Next you need to install the CUDA extensions. You can do that either by installing the precompiled wheels, or by compiling the wheels yourself. -### Step 1: install GPTQ-for-LLaMa +### Precompiled wheels -Clone the GPTQ-for-LLaMa repository into the `text-generation-webui/repositories` subfolder and install it: +Kindly provided by our friend jllllll: https://github.com/jllllll/GPTQ-for-LLaMa-Wheels + +Windows: ``` -mkdir repositories -cd repositories -git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git -b cuda -cd GPTQ-for-LLaMa -python setup_cuda.py install +pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/main/quant_cuda-0.0.0-cp310-cp310-win_amd64.whl ``` -You are going to need to have a C++ compiler installed into your system for the last command. On Linux, `sudo apt install build-essential` or equivalent is enough. 
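Whichever route you take (precompiled wheel or manual compilation), a quick way to confirm the kernels are usable is the short check below. This is only a sketch: it assumes the extension module is named `quant_cuda`, as the wheel filenames above suggest, and that PyTorch with CUDA support is already installed in the `textgen` environment.

```python
# Sanity check for the GPTQ CUDA kernels (assumes the module is named `quant_cuda`,
# matching the wheel filenames above).
import torch
import quant_cuda  # an ImportError here means the kernels are not installed

print("CUDA available:", torch.cuda.is_available())
print("quant_cuda loaded from:", quant_cuda.__file__)
```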
- -If you want to you to use the up-to-date CUDA or triton branches instead of the old CUDA branch, use these commands: +Linux: ``` -cd repositories -rm -r GPTQ-for-LLaMa -pip uninstall -y quant-cuda -git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b cuda -... +pip install https://github.com/jllllll/GPTQ-for-LLaMa-Wheels/raw/Linux-x64/quant_cuda-0.0.0-cp310-cp310-linux_x86_64.whl ``` +### Manual installation + +#### Step 1: install nvcc + ``` -cd repositories -rm -r GPTQ-for-LLaMa -pip uninstall -y quant-cuda -git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa.git -b triton -... +conda activate textgen +conda install -c conda-forge cudatoolkit-dev ``` +The command above takes some 10 minutes to run and shows no progress bar or updates along the way. -https://github.com/qwopqwop200/GPTQ-for-LLaMa +You are also going to need to have a C++ compiler installed. On Linux, `sudo apt install build-essential` or equivalent is enough. -### Step 2: get the pre-converted weights +If you're using an older version of CUDA toolkit (e.g. 11.7) but the latest version of `gcc` and `g++` (12.0+), you should downgrade with: `conda install -c conda-forge gxx==11.3.0`. Kernel compilation will fail otherwise. + +#### Step 2: compile the CUDA extensions + +``` +cd repositories/GPTQ-for-LLaMa +python setup_cuda.py install +``` + +### Getting pre-converted LLaMA weights + +These are models that you can simply download and place in your `models` folder. * Converted without `group-size` (better for the 7b model): https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483891617 * Converted with `group-size` (better from 13b upwards): https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483941105 ⚠️ The tokenizer files in the sources above may be outdated. Make sure to obtain the universal LLaMA tokenizer as described [here](https://github.com/oobabooga/text-generation-webui/blob/main/docs/LLaMA-model.md#option-1-pre-converted-weights). -### Step 3: Start the web UI: +### Starting the web UI: For the models converted without `group-size`: @@ -91,7 +126,7 @@ The command-line flags `--wbits` and `--groupsize` are automatically detected ba python server.py --model llama-13b-4bit-128g --wbits 4 --groupsize 128 ``` -## CPU offloading +### CPU offloading It is possible to offload part of the layers of the 4-bit model to the CPU with the `--pre_layer` flag. The higher the number after `--pre_layer`, the more layers will be allocated to the GPU. @@ -109,20 +144,13 @@ Output generated in 123.79 seconds (1.61 tokens/s, 199 tokens) You can also use multiple GPUs with `pre_layer` if using the oobabooga fork of GPTQ, eg `--pre_layer 30 60` will load a LLaMA-30B model half onto your first GPU and half onto your second, or `--pre_layer 20 40` will load 20 layers onto GPU-0, 20 layers onto GPU-1, and 20 layers offloaded to CPU. -## Using LoRAs in 4-bit mode +### Using LoRAs with GPTQ-for-LLaMa -At the moment, this feature is not officially supported by the relevant libraries, but a patch exists and is supported by this web UI: https://github.com/johnsmith0031/alpaca_lora_4bit +This requires using a monkey patch that is supported by this web UI: https://github.com/johnsmith0031/alpaca_lora_4bit -In order to use it: +To use it: -1. Make sure that your requirements are up to date: - -``` -cd text-generation-webui -pip install -r requirements.txt --upgrade -``` - -2. Clone `johnsmith0031/alpaca_lora_4bit` into the repositories folder: +1. 
Clone `johnsmith0031/alpaca_lora_4bit` into the repositories folder: ``` cd text-generation-webui/repositories @@ -131,14 +159,58 @@ git clone https://github.com/johnsmith0031/alpaca_lora_4bit ⚠️ I have tested it with the following commit specifically: `2f704b93c961bf202937b10aac9322b092afdce0` -3. Install https://github.com/sterlind/GPTQ-for-LLaMa with this command: +2. Install https://github.com/sterlind/GPTQ-for-LLaMa with this command: ``` pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit ``` -4. Start the UI with the `--monkey-patch` flag: +3. Start the UI with the `--monkey-patch` flag: ``` python server.py --model llama-7b-4bit-128g --listen --lora tloen_alpaca-lora-7b --monkey-patch ``` + +## AutoGPTQ + +### Installation + +No additional steps are necessary as AutoGPTQ is already in the `requirements.txt` for the webui. If you still want or need to install it manually for whatever reason, these are the commands: + +``` +conda activate textgen +git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ +pip install . +``` + +The last command requires `nvcc` to be installed (see the [instructions above](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#step-1-install-nvcc)). + +### Usage + +When you quantize a model using AutoGPTQ, a folder containing a filed called `quantize_config.json` will be generated. Place that folder inside your `models/` folder and load it with the `--autogptq` flag: + +``` +python server.py --autogptq --model model_name +``` + +Alternatively, check the `autogptq` box in the "Model" tab of the UI before loading the model. + +### Offloading + +In order to do CPU offloading or multi-gpu inference with AutoGPTQ, use the `--gpu-memory` flag. It is currently somewhat slower than offloading with the `--pre_layer` option in GPTQ-for-LLaMA. + +For CPU offloading: + +``` +python server.py --autogptq --gpu-memory 3000MiB --model model_name +``` + +For multi-GPU inference: + +``` +python server.py --autogptq --gpu-memory 3000MiB 6000MiB --model model_name +``` + +### Using LoRAs with AutoGPTQ + +Not supported yet. diff --git a/docs/Low-VRAM-guide.md b/docs/Low-VRAM-guide.md index 1dc86f9c7f..7814ecb0c3 100644 --- a/docs/Low-VRAM-guide.md +++ b/docs/Low-VRAM-guide.md @@ -1,4 +1,4 @@ -If you GPU is not large enough to fit a model, try these in the following order: +If you GPU is not large enough to fit a 16-bit model, try these in the following order: ### Load the model in 8-bit mode @@ -6,7 +6,11 @@ If you GPU is not large enough to fit a model, try these in the following order: python server.py --load-in-8bit ``` -This reduces the memory usage by half with no noticeable loss in quality. Only newer GPUs support 8-bit mode. +### Load the model in 4-bit mode + +``` +python server.py --load-in-4bit +``` ### Split the model across your GPU and CPU @@ -34,8 +38,6 @@ python server.py --auto-devices --gpu-memory 3500MiB ... ``` -Additionally, you can also set the `--no-cache` value to reduce the GPU usage while generating text at a performance cost. This may allow you to set a higher value for `--gpu-memory`, resulting in a net performance gain. 
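For intuition, the `--gpu-memory` values shown above end up as a `max_memory` mapping of the kind accepted by accelerate (and by the AutoGPTQ loader described earlier). The sketch below is illustrative only; the function name and the CPU default are hypothetical, not the web UI's actual implementation.

```python
# Illustrative only: roughly how per-device --gpu-memory limits become a
# max_memory dict for accelerate. The function name and CPU default are hypothetical.
def build_max_memory(gpu_limits, cpu_limit="64GiB"):
    # gpu_limits holds one entry per GPU, e.g. ["3500MiB"] or ["10GiB", "5GiB"]
    max_memory = {i: limit for i, limit in enumerate(gpu_limits)}
    max_memory["cpu"] = cpu_limit
    return max_memory

print(build_max_memory(["3500MiB"]))        # {0: '3500MiB', 'cpu': '64GiB'}
print(build_max_memory(["10GiB", "5GiB"]))  # {0: '10GiB', 1: '5GiB', 'cpu': '64GiB'}
```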
- ### Send layers to a disk cache As a desperate last measure, you can split the model across your GPU, CPU, and disk: diff --git a/docs/Using-LoRAs.md b/docs/Using-LoRAs.md index fafd6cde2d..ec060cac35 100644 --- a/docs/Using-LoRAs.md +++ b/docs/Using-LoRAs.md @@ -8,15 +8,16 @@ Based on https://github.com/tloen/alpaca-lora python download-model.py tloen/alpaca-lora-7b ``` -2. Load the LoRA. 16-bit, 8-bit, and CPU modes work: +2. Load the LoRA. 16-bit, `--load-in-8bit`, `--load-in-4bit`, and CPU modes work: ``` python server.py --model llama-7b-hf --lora tloen_alpaca-lora-7b python server.py --model llama-7b-hf --lora tloen_alpaca-lora-7b --load-in-8bit +python server.py --model llama-7b-hf --lora tloen_alpaca-lora-7b --load-in-4bit python server.py --model llama-7b-hf --lora tloen_alpaca-lora-7b --cpu ``` -* For using LoRAs in 4-bit mode, follow [these special instructions](GPTQ-models-(4-bit-mode).md#using-loras-in-4-bit-mode). +* For using LoRAs with GPTQ quantized models, follow [these special instructions](GPTQ-models-(4-bit-mode).md#using-loras-with-gptq-for-llama). * Instead of using the `--lora` command-line flag, you can also select the LoRA in the "Parameters" tab of the interface. diff --git a/docs/llama.cpp-models.md b/docs/llama.cpp-models.md index 153f70affe..0319a7c2b2 100644 --- a/docs/llama.cpp-models.md +++ b/docs/llama.cpp-models.md @@ -10,11 +10,14 @@ Place the model in the `models` folder, making sure that its name contains `ggml Follow the instructions in the llama.cpp README to generate the `ggml-model.bin` file: https://github.com/ggerganov/llama.cpp#usage -## GPU offloading +## GPU acceleration -Enabled with the `--n-gpu-layers` parameter. If you have enough VRAM, use a high number like `--n-gpu-layers 200000` to offload all layers to the GPU. +Enabled with the `--n-gpu-layers` parameter. -Note that you need to manually install `llama-cpp-python` with GPU support. To do that: +* If you have enough VRAM, use a high number like `--n-gpu-layers 200000` to offload all layers to the GPU. +* Otherwise, start with a low number like `--n-gpu-layers 10` and then gradually increase it until you run out of memory. + +To use this feature, you need to manually compile and install `llama-cpp-python` with GPU support. #### Linux diff --git a/download-model.py b/download-model.py index 44b87a8c71..83eab84a33 100644 --- a/download-model.py +++ b/download-model.py @@ -12,6 +12,7 @@ import hashlib import json import re +import os import sys from pathlib import Path @@ -70,173 +71,183 @@ def select_model_from_default_options(): return model, branch -def sanitize_model_and_branch_names(model, branch): - if model[-1] == '/': - model = model[:-1] - if branch is None: - branch = "main" - else: - pattern = re.compile(r"^[a-zA-Z0-9._-]+$") - if not pattern.match(branch): - raise ValueError("Invalid branch name. 
Only alphanumeric characters, period, underscore and dash are allowed.") - - return model, branch - - -def get_download_links_from_huggingface(model, branch, text_only=False): - base = "https://huggingface.co" - page = f"/api/models/{model}/tree/{branch}" - cursor = b"" - - links = [] - sha256 = [] - classifications = [] - has_pytorch = False - has_pt = False - has_ggml = False - has_safetensors = False - is_lora = False - while True: - url = f"{base}{page}" + (f"?cursor={cursor.decode()}" if cursor else "") - r = requests.get(url, timeout=10) - r.raise_for_status() - content = r.content - - dict = json.loads(content) - if len(dict) == 0: - break - - for i in range(len(dict)): - fname = dict[i]['path'] - if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')): - is_lora = True - - is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname) - is_safetensors = re.match(".*\.safetensors", fname) - is_pt = re.match(".*\.pt", fname) - is_ggml = re.match(".*ggml.*\.bin", fname) - is_tokenizer = re.match("(tokenizer|ice).*\.model", fname) - is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer - - if any((is_pytorch, is_safetensors, is_pt, is_ggml, is_tokenizer, is_text)): - if 'lfs' in dict[i]: - sha256.append([fname, dict[i]['lfs']['oid']]) - if is_text: - links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}") - classifications.append('text') - continue - if not text_only: - links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}") - if is_safetensors: - has_safetensors = True - classifications.append('safetensors') - elif is_pytorch: - has_pytorch = True - classifications.append('pytorch') - elif is_pt: - has_pt = True - classifications.append('pt') - elif is_ggml: - has_ggml = True - classifications.append('ggml') - - cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50' - cursor = base64.b64encode(cursor) - cursor = cursor.replace(b'=', b'%3D') - - # If both pytorch and safetensors are available, download safetensors only - if (has_pytorch or has_pt) and has_safetensors: - for i in range(len(classifications) - 1, -1, -1): - if classifications[i] in ['pytorch', 'pt']: - links.pop(i) - - return links, sha256, is_lora - - -def get_output_folder(model, branch, is_lora, base_folder=None): - if base_folder is None: - base_folder = 'models' if not is_lora else 'loras' - - output_folder = f"{'_'.join(model.split('/')[-2:])}" - if branch != 'main': - output_folder += f'_{branch}' - output_folder = Path(base_folder) / output_folder - return output_folder - - -def get_single_file(url, output_folder, start_from_scratch=False): - filename = Path(url.rsplit('/', 1)[1]) - output_path = output_folder / filename - if output_path.exists() and not start_from_scratch: - # Check if the file has already been downloaded completely - r = requests.get(url, stream=True, timeout=10) - total_size = int(r.headers.get('content-length', 0)) - if output_path.stat().st_size >= total_size: - return - # Otherwise, resume the download from where it left off - headers = {'Range': f'bytes={output_path.stat().st_size}-'} - mode = 'ab' - else: - headers = {} - mode = 'wb' - - r = requests.get(url, stream=True, headers=headers, timeout=10) - with open(output_path, mode) as f: - total_size = int(r.headers.get('content-length', 0)) - block_size = 1024 - with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t: - for data in r.iter_content(block_size): - 
t.update(len(data)) - f.write(data) - - -def start_download_threads(file_list, output_folder, start_from_scratch=False, threads=1): - thread_map(lambda url: get_single_file(url, output_folder, start_from_scratch=start_from_scratch), file_list, max_workers=threads, disable=True) - - -def download_model_files(model, branch, links, sha256, output_folder, start_from_scratch=False, threads=1): - # Creating the folder and writing the metadata - if not output_folder.exists(): - output_folder.mkdir() - with open(output_folder / 'huggingface-metadata.txt', 'w') as f: - f.write(f'url: https://huggingface.co/{model}\n') - f.write(f'branch: {branch}\n') - f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n') - sha256_str = '' +class ModelDownloader: + def __init__(self): + self.s = requests.Session() + if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None: + self.s.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS')) + + + def sanitize_model_and_branch_names(self, model, branch): + if model[-1] == '/': + model = model[:-1] + + if branch is None: + branch = "main" + else: + pattern = re.compile(r"^[a-zA-Z0-9._-]+$") + if not pattern.match(branch): + raise ValueError( + "Invalid branch name. Only alphanumeric characters, period, underscore and dash are allowed.") + + return model, branch + + + def get_download_links_from_huggingface(self, model, branch, text_only=False): + base = "https://huggingface.co" + page = f"/api/models/{model}/tree/{branch}" + cursor = b"" + + links = [] + sha256 = [] + classifications = [] + has_pytorch = False + has_pt = False + has_ggml = False + has_safetensors = False + is_lora = False + while True: + url = f"{base}{page}" + (f"?cursor={cursor.decode()}" if cursor else "") + r = self.s.get(url, timeout=10) + r.raise_for_status() + content = r.content + + dict = json.loads(content) + if len(dict) == 0: + break + + for i in range(len(dict)): + fname = dict[i]['path'] + if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')): + is_lora = True + + is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname) + is_safetensors = re.match(".*\.safetensors", fname) + is_pt = re.match(".*\.pt", fname) + is_ggml = re.match(".*ggml.*\.bin", fname) + is_tokenizer = re.match("(tokenizer|ice).*\.model", fname) + is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer + if any((is_pytorch, is_safetensors, is_pt, is_ggml, is_tokenizer, is_text)): + if 'lfs' in dict[i]: + sha256.append([fname, dict[i]['lfs']['oid']]) + + if is_text: + links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}") + classifications.append('text') + continue + + if not text_only: + links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}") + if is_safetensors: + has_safetensors = True + classifications.append('safetensors') + elif is_pytorch: + has_pytorch = True + classifications.append('pytorch') + elif is_pt: + has_pt = True + classifications.append('pt') + elif is_ggml: + has_ggml = True + classifications.append('ggml') + + cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50' + cursor = base64.b64encode(cursor) + cursor = cursor.replace(b'=', b'%3D') + + # If both pytorch and safetensors are available, download safetensors only + if (has_pytorch or has_pt) and has_safetensors: + for i in range(len(classifications) - 1, -1, -1): + if classifications[i] in ['pytorch', 'pt']: + links.pop(i) + + return links, sha256, is_lora + + + def get_output_folder(self, model, 
branch, is_lora, base_folder=None): + if base_folder is None: + base_folder = 'models' if not is_lora else 'loras' + + output_folder = f"{'_'.join(model.split('/')[-2:])}" + if branch != 'main': + output_folder += f'_{branch}' + output_folder = Path(base_folder) / output_folder + return output_folder + + + def get_single_file(self, url, output_folder, start_from_scratch=False): + filename = Path(url.rsplit('/', 1)[1]) + output_path = output_folder / filename + if output_path.exists() and not start_from_scratch: + # Check if the file has already been downloaded completely + r = self.s.get(url, stream=True, timeout=10) + total_size = int(r.headers.get('content-length', 0)) + if output_path.stat().st_size >= total_size: + return + # Otherwise, resume the download from where it left off + headers = {'Range': f'bytes={output_path.stat().st_size}-'} + mode = 'ab' + else: + headers = {} + mode = 'wb' + + r = self.s.get(url, stream=True, headers=headers, timeout=10) + with open(output_path, mode) as f: + total_size = int(r.headers.get('content-length', 0)) + block_size = 1024 + with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t: + for data in r.iter_content(block_size): + t.update(len(data)) + f.write(data) + + + def start_download_threads(self, file_list, output_folder, start_from_scratch=False, threads=1): + thread_map(lambda url: self.get_single_file(url, output_folder, start_from_scratch=start_from_scratch), file_list, max_workers=threads, disable=True) + + + def download_model_files(self, model, branch, links, sha256, output_folder, start_from_scratch=False, threads=1): + # Creating the folder and writing the metadata + if not output_folder.exists(): + output_folder.mkdir(parents=True, exist_ok=True) + with open(output_folder / 'huggingface-metadata.txt', 'w') as f: + f.write(f'url: https://huggingface.co/{model}\n') + f.write(f'branch: {branch}\n') + f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n') + sha256_str = '' + for i in range(len(sha256)): + sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n' + if sha256_str != '': + f.write(f'sha256sum:\n{sha256_str}') + + # Downloading the files + print(f"Downloading the model to {output_folder}") + self.start_download_threads(links, output_folder, start_from_scratch=start_from_scratch, threads=threads) + + + def check_model_files(self, model, branch, links, sha256, output_folder): + # Validate the checksums + validated = True for i in range(len(sha256)): - sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n' - if sha256_str != '': - f.write(f'sha256sum:\n{sha256_str}') - - # Downloading the files - print(f"Downloading the model to {output_folder}") - start_download_threads(links, output_folder, start_from_scratch=start_from_scratch, threads=threads) - - -def check_model_files(model, branch, links, sha256, output_folder): - # Validate the checksums - validated = True - for i in range(len(sha256)): - fpath = (output_folder / sha256[i][0]) - - if not fpath.exists(): - print(f"The following file is missing: {fpath}") - validated = False - continue - - with open(output_folder / sha256[i][0], "rb") as f: - bytes = f.read() - file_hash = hashlib.sha256(bytes).hexdigest() - if file_hash != sha256[i][1]: - print(f'Checksum failed: {sha256[i][0]} {sha256[i][1]}') + fpath = (output_folder / sha256[i][0]) + + if not fpath.exists(): + print(f"The following file is missing: {fpath}") validated = False - else: - print(f'Checksum validated: 
{sha256[i][0]} {sha256[i][1]}') + continue - if validated: - print('[+] Validated checksums of all model files!') - else: - print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.') + with open(output_folder / sha256[i][0], "rb") as f: + bytes = f.read() + file_hash = hashlib.sha256(bytes).hexdigest() + if file_hash != sha256[i][1]: + print(f'Checksum failed: {sha256[i][0]} {sha256[i][1]}') + validated = False + else: + print(f'Checksum validated: {sha256[i][0]} {sha256[i][1]}') + + if validated: + print('[+] Validated checksums of all model files!') + else: + print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.') if __name__ == '__main__': @@ -256,22 +267,23 @@ def check_model_files(model, branch, links, sha256, output_folder): if model is None: model, branch = select_model_from_default_options() + downloader = ModelDownloader() # Cleaning up the model/branch names try: - model, branch = sanitize_model_and_branch_names(model, branch) + model, branch = downloader.sanitize_model_and_branch_names(model, branch) except ValueError as err_branch: print(f"Error: {err_branch}") sys.exit() # Getting the download links from Hugging Face - links, sha256, is_lora = get_download_links_from_huggingface(model, branch, text_only=args.text_only) + links, sha256, is_lora = downloader.get_download_links_from_huggingface(model, branch, text_only=args.text_only) # Getting the output folder - output_folder = get_output_folder(model, branch, is_lora, base_folder=args.output) + output_folder = downloader.get_output_folder(model, branch, is_lora, base_folder=args.output) if args.check: # Check previously downloaded files - check_model_files(model, branch, links, sha256, output_folder) + downloader.check_model_files(model, branch, links, sha256, output_folder) else: # Download files - download_model_files(model, branch, links, sha256, output_folder, threads=args.threads) + downloader.download_model_files(model, branch, links, sha256, output_folder, threads=args.threads) diff --git a/extensions/api/blocking_api.py b/extensions/api/blocking_api.py index 8c2326f4d9..6bcd840c74 100644 --- a/extensions/api/blocking_api.py +++ b/extensions/api/blocking_api.py @@ -5,7 +5,7 @@ from extensions.api.util import build_parameters, try_start_cloudflared from modules import shared from modules.chat import generate_chat_reply -from modules.text_generation import encode, generate_reply +from modules.text_generation import encode, generate_reply, stop_everything_event class Handler(BaseHTTPRequestHandler): @@ -78,6 +78,19 @@ def do_POST(self): self.wfile.write(response.encode('utf-8')) + elif self.path == '/api/v1/stop-stream': + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.end_headers() + + stop_everything_event() + + response = json.dumps({ + 'results': 'success' + }) + + self.wfile.write(response.encode('utf-8')) + elif self.path == '/api/v1/token-count': self.send_response(200) self.send_header('Content-Type', 'application/json') diff --git a/extensions/api/streaming_api.py b/extensions/api/streaming_api.py index 717a80889f..e544066395 100644 --- a/extensions/api/streaming_api.py +++ b/extensions/api/streaming_api.py @@ -32,6 +32,9 @@ async def _handle_connection(websocket, path): for a in generator: to_send = a[skip_index:] + if to_send is None or chr(0xfffd) in to_send: # partial unicode character, don't send it yet. 
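+                # chr(0xfffd) is the Unicode replacement character, which shows up when a
+                # multi-byte UTF-8 sequence has only been partially decoded; skipping here
+                # waits for the rest of the character before sending it to the client.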
+ continue + await websocket.send(json.dumps({ 'event': 'text_stream', 'message_num': message_num, diff --git a/extensions/api/util.py b/extensions/api/util.py index 9c6dd30ed3..59a015de7d 100644 --- a/extensions/api/util.py +++ b/extensions/api/util.py @@ -17,6 +17,8 @@ def build_parameters(body, chat=False): 'typical_p': float(body.get('typical_p', body.get('typical', 1))), 'epsilon_cutoff': float(body.get('epsilon_cutoff', 0)), 'eta_cutoff': float(body.get('eta_cutoff', 0)), + 'tfs': float(body.get('tfs', 1)), + 'top_a': float(body.get('top_a', 0)), 'repetition_penalty': float(body.get('repetition_penalty', body.get('rep_pen', 1.1))), 'encoder_repetition_penalty': float(body.get('encoder_repetition_penalty', 1.0)), 'top_k': int(body.get('top_k', 0)), diff --git a/extensions/openai/README.md b/extensions/openai/README.md index b20eba3326..0a9ed20a33 100644 --- a/extensions/openai/README.md +++ b/extensions/openai/README.md @@ -126,6 +126,7 @@ Everything needs OPENAI_API_KEY=dummy set. | ✅❌ | openai-python | https://github.com/openai/openai-python | only the endpoints from above are working. OPENAI_API_BASE=http://127.0.0.1:5001/v1 | | ✅❌ | openai-node | https://github.com/openai/openai-node | only the endpoints from above are working. environment variables don't work by default, but can be configured (see above) | | ✅❌ | chatgpt-api | https://github.com/transitive-bullshit/chatgpt-api | only the endpoints from above are working. environment variables don't work by default, but can be configured (see above) | +| ✅ | anse | https://github.com/anse-app/anse | API Key & URL configurable in UI | | ✅ | shell_gpt | https://github.com/TheR1D/shell_gpt | OPENAI_API_HOST=http://127.0.0.1:5001 | | ✅ | gpt-shell | https://github.com/jla/gpt-shell | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | | ✅ | gpt-discord-bot | https://github.com/openai/gpt-discord-bot | OPENAI_API_BASE=http://127.0.0.1:5001/v1 | diff --git a/extensions/openai/script.py b/extensions/openai/script.py index d41592a35a..c5c5f2bb79 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -6,6 +6,7 @@ import yaml from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer from threading import Thread +from modules.utils import get_available_models import numpy as np @@ -18,6 +19,41 @@ debug = True if 'OPENEDAI_DEBUG' in os.environ else False +# Slightly different defaults for OpenAI's API +default_req_params = { + 'max_new_tokens': 200, + 'temperature': 1.0, + 'top_p': 1.0, + 'top_k': 1, + 'repetition_penalty': 1.18, + 'encoder_repetition_penalty': 1.0, + 'suffix': None, + 'stream': False, + 'echo': False, + 'seed': -1, + # 'n' : default(body, 'n', 1), # 'n' doesn't have a direct map + 'truncation_length': 2048, + 'add_bos_token': True, + 'do_sample': True, + 'typical_p': 1.0, + 'epsilon_cutoff': 0, # In units of 1e-4 + 'eta_cutoff': 0, # In units of 1e-4 + 'tfs': 1.0, + 'top_a': 0.0, + 'min_length': 0, + 'no_repeat_ngram_size': 0, + 'num_beams': 1, + 'penalty_alpha': 0.0, + 'length_penalty': 1, + 'early_stopping': False, + 'mirostat_mode': 0, + 'mirostat_tau': 5, + 'mirostat_eta': 0.1, + 'ban_eos_token': False, + 'skip_special_tokens': True, + 'custom_stopping_strings': [], +} + # Optional, install the module and download the model to enable # v1/embeddings try: @@ -91,10 +127,32 @@ def float_list_to_base64(float_list): class Handler(BaseHTTPRequestHandler): + def send_access_control_headers(self): + self.send_header("Access-Control-Allow-Origin", "*") + 
self.send_header("Access-Control-Allow-Credentials", "true") + self.send_header( + "Access-Control-Allow-Methods", + "GET,HEAD,OPTIONS,POST,PUT" + ) + self.send_header( + "Access-Control-Allow-Headers", + "Origin, Accept, X-Requested-With, Content-Type, " + "Access-Control-Request-Method, Access-Control-Request-Headers, " + "Authorization" + ) + + def do_OPTIONS(self): + self.send_response(200) + self.send_access_control_headers() + self.send_header('Content-Type', 'application/json') + self.end_headers() + self.wfile.write("OK".encode('utf-8')) + def do_GET(self): if self.path.startswith('/v1/models'): self.send_response(200) + self.send_access_control_headers() self.send_header('Content-Type', 'application/json') self.end_headers() @@ -127,6 +185,8 @@ def do_GET(self): "permission": [] }] + models.extend([{ "id": id, "object": "model", "owned_by": "user", "permission": [] } for id in get_available_models() ]) + response = '' if self.path == '/v1/models': response = json.dumps({ @@ -146,6 +206,7 @@ def do_GET(self): elif '/billing/usage' in self.path: # Ex. /v1/dashboard/billing/usage?start_date=2023-05-01&end_date=2023-05-31 self.send_response(200) + self.send_access_control_headers() self.send_header('Content-Type', 'application/json') self.end_headers() @@ -194,48 +255,21 @@ def do_POST(self): max_tokens = default(body, max_tokens_str, default(shared.settings, 'max_new_tokens', default_max_tokens)) # if the user assumes OpenAI, the max_tokens is way too large - try to ignore it unless it's small enough - req_params = { - 'max_new_tokens': max_tokens, - 'temperature': default(body, 'temperature', 1.0), - 'top_p': default(body, 'top_p', 1.0), - 'top_k': default(body, 'best_of', 1), - # XXX not sure about this one, seems to be the right mapping, but the range is different (-2..2.0) vs 0..2 - # 0 is default in openai, but 1.0 is default in other places. Maybe it's scaled? scale it. - 'repetition_penalty': 1.18, # (default(body, 'presence_penalty', 0) + 2.0 ) / 2.0, # 0 the real default, 1.2 is the model default, but 1.18 works better. - # XXX not sure about this one either, same questions. (-2..2.0), 0 is default not 1.0, scale it. - 'encoder_repetition_penalty': 1.0, # (default(body, 'frequency_penalty', 0) + 2.0) / 2.0, - 'suffix': body.get('suffix', None), - 'stream': default(body, 'stream', False), - 'echo': default(body, 'echo', False), - ##################################################### - 'seed': shared.settings.get('seed', -1), - # int(body.get('n', 1)) # perhaps this should be num_beams or chat_generation_attempts? 'n' doesn't have a direct map - # unofficial, but it needs to get set anyways. - 'truncation_length': truncation_length, - # no more args. 
- 'add_bos_token': shared.settings.get('add_bos_token', True), - 'do_sample': True, - 'typical_p': 1.0, - 'epsilon_cutoff': 0, # In units of 1e-4 - 'eta_cutoff': 0, # In units of 1e-4 - 'min_length': 0, - 'no_repeat_ngram_size': 0, - 'num_beams': 1, - 'penalty_alpha': 0.0, - 'length_penalty': 1, - 'early_stopping': False, - 'mirostat_mode': 0, - 'mirostat_tau': 5, - 'mirostat_eta': 0.1, - 'ban_eos_token': False, - 'skip_special_tokens': True, - } + req_params = default_req_params.copy() - # fixup absolute 0.0's - for par in ['temperature', 'repetition_penalty', 'encoder_repetition_penalty']: - req_params[par] = clamp(req_params[par], 0.001, 1.999) + req_params['max_new_tokens'] = max_tokens + req_params['truncation_length'] = truncation_length + req_params['temperature'] = clamp(default(body, 'temperature', default_req_params['temperature']), 0.001, 1.999) # fixup absolute 0.0 + req_params['top_p'] = clamp(default(body, 'top_p', default_req_params['top_p']), 0.001, 1.0) + req_params['top_k'] = default(body, 'best_of', default_req_params['top_k']) + req_params['suffix'] = default(body, 'suffix', default_req_params['suffix']) + req_params['stream'] = default(body, 'stream', default_req_params['stream']) + req_params['echo'] = default(body, 'echo', default_req_params['echo']) + req_params['seed'] = shared.settings.get('seed', default_req_params['seed']) + req_params['add_bos_token'] = shared.settings.get('add_bos_token', default_req_params['add_bos_token']) self.send_response(200) + self.send_access_control_headers() if req_params['stream']: self.send_header('Content-Type', 'text/event-stream') self.send_header('Cache-Control', 'no-cache') @@ -259,7 +293,7 @@ def do_POST(self): role_formats = { 'user': 'user: {message}\n', - 'bot': 'assistant: {message}\n', + 'assistant': 'assistant: {message}\n', 'system': '{message}', 'context': 'You are a helpful assistant. Answer as concisely as possible.', 'prompt': 'assistant:', @@ -315,7 +349,7 @@ def do_POST(self): # can't really truncate the system messages system_msg = '\n'.join(system_msgs) - if system_msg[-1] != '\n': + if system_msg and system_msg[-1] != '\n': system_msg = system_msg + '\n' system_token_count = len(encode(system_msg)[0]) @@ -389,7 +423,9 @@ def do_POST(self): chunk[resp_list][0]["message"] = {'role': 'assistant', 'content': ''} chunk[resp_list][0]["delta"] = {'role': 'assistant', 'content': ''} - response = 'data: ' + json.dumps(chunk) + '\n' + data_chunk = 'data: ' + json.dumps(chunk) + '\r\n\r\n' + chunk_size = hex(len(data_chunk))[2:] + '\r\n' + response = chunk_size + data_chunk self.wfile.write(response.encode('utf-8')) # generate reply ####################################### @@ -462,7 +498,9 @@ def do_POST(self): # So yeah... do both methods? delta and messages. chunk[resp_list][0]['message'] = {'content': new_content} chunk[resp_list][0]['delta'] = {'content': new_content} - response = 'data: ' + json.dumps(chunk) + '\n' + data_chunk = 'data: ' + json.dumps(chunk) + '\r\n\r\n' + chunk_size = hex(len(data_chunk))[2:] + '\r\n' + response = chunk_size + data_chunk self.wfile.write(response.encode('utf-8')) completion_token_count += len(encode(new_content)[0]) @@ -487,8 +525,12 @@ def do_POST(self): else: # So yeah... do both methods? delta and messages. 
chunk[resp_list][0]['message'] = {'content': ''} - chunk[resp_list][0]['delta'] = {} - response = 'data: ' + json.dumps(chunk) + '\ndata: [DONE]\n' + chunk[resp_list][0]['delta'] = {'content': ''} + + data_chunk = 'data: ' + json.dumps(chunk) + '\r\n\r\n' + chunk_size = hex(len(data_chunk))[2:] + '\r\n' + done = 'data: [DONE]\r\n\r\n' + response = chunk_size + data_chunk + done self.wfile.write(response.encode('utf-8')) # Finished if streaming. if debug: @@ -534,6 +576,7 @@ def do_POST(self): self.wfile.write(response.encode('utf-8')) elif '/edits' in self.path: self.send_response(200) + self.send_access_control_headers() self.send_header('Content-Type', 'application/json') self.end_headers() @@ -550,37 +593,14 @@ def do_POST(self): token_count = len(encode(edit_task)[0]) max_tokens = truncation_length - token_count - req_params = { - 'max_new_tokens': max_tokens, - 'temperature': clamp(default(body, 'temperature', 1.0), 0.001, 1.999), - 'top_p': clamp(default(body, 'top_p', 1.0), 0.001, 1.0), - 'top_k': 1, - 'repetition_penalty': 1.18, - 'encoder_repetition_penalty': 1.0, - 'suffix': None, - 'stream': False, - 'echo': False, - 'seed': shared.settings.get('seed', -1), - # 'n' : default(body, 'n', 1), # 'n' doesn't have a direct map - 'truncation_length': truncation_length, - 'add_bos_token': shared.settings.get('add_bos_token', True), - 'do_sample': True, - 'typical_p': 1.0, - 'epsilon_cutoff': 0, # In units of 1e-4 - 'eta_cutoff': 0, # In units of 1e-4 - 'min_length': 0, - 'no_repeat_ngram_size': 0, - 'num_beams': 1, - 'penalty_alpha': 0.0, - 'length_penalty': 1, - 'early_stopping': False, - 'mirostat_mode': 0, - 'mirostat_tau': 5, - 'mirostat_eta': 0.1, - 'ban_eos_token': False, - 'skip_special_tokens': True, - 'custom_stopping_strings': [], - } + req_params = default_req_params.copy() + + req_params['max_new_tokens'] = max_tokens + req_params['truncation_length'] = truncation_length + req_params['temperature'] = clamp(default(body, 'temperature', default_req_params['temperature']), 0.001, 1.999) # fixup absolute 0.0 + req_params['top_p'] = clamp(default(body, 'top_p', default_req_params['top_p']), 0.001, 1.0) + req_params['seed'] = shared.settings.get('seed', default_req_params['seed']) + req_params['add_bos_token'] = shared.settings.get('add_bos_token', default_req_params['add_bos_token']) if debug: print({'edit_template': edit_task, 'req_params': req_params, 'token_count': token_count}) @@ -629,6 +649,7 @@ def do_POST(self): # url return types will require file management and a web serving files... Perhaps later! self.send_response(200) + self.send_access_control_headers() self.send_header('Content-Type', 'application/json') self.end_headers() @@ -663,6 +684,7 @@ def do_POST(self): self.wfile.write(response.encode('utf-8')) elif '/embeddings' in self.path and embedding_model is not None: self.send_response(200) + self.send_access_control_headers() self.send_header('Content-Type', 'application/json') self.end_headers() @@ -696,6 +718,7 @@ def enc_emb(emb): elif '/moderations' in self.path: # for now do nothing, just don't error. self.send_response(200) + self.send_access_control_headers() self.send_header('Content-Type', 'application/json') self.end_headers() @@ -729,6 +752,7 @@ def enc_emb(emb): elif self.path == '/api/v1/token-count': # NOT STANDARD. lifted from the api extension, but it's still very useful to calculate tokenized length client side. 
self.send_response(200) + self.send_access_control_headers() self.send_header('Content-Type', 'application/json') self.end_headers() diff --git a/models/config.yaml b/models/config.yaml index 19ae6c5a6c..db48e5a510 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -177,3 +177,6 @@ llama-65b-gptq-3bit: .*manticore: mode: 'instruct' instruction_template: 'Manticore Chat' +.*bluemoonrp-(30|13)b: + mode: 'instruct' + instruction_template: 'Bluemoon' diff --git a/modules/AutoGPTQ_loader.py b/modules/AutoGPTQ_loader.py index 622b9f871d..5b87fe5699 100644 --- a/modules/AutoGPTQ_loader.py +++ b/modules/AutoGPTQ_loader.py @@ -1,6 +1,6 @@ from pathlib import Path -from auto_gptq import AutoGPTQForCausalLM +from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig import modules.shared as shared from modules.logging_colors import logger @@ -10,34 +10,45 @@ def load_quantized(model_name): path_to_model = Path(f'{shared.args.model_dir}/{model_name}') pt_path = None - use_safetensors = False # Find the model checkpoint - for ext in ['.safetensors', '.pt', '.bin']: - found = list(path_to_model.glob(f"*{ext}")) - if len(found) > 0: - if len(found) > 1: - logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.') - - pt_path = found[-1] - if ext == '.safetensors': - use_safetensors = True - - break + if shared.args.checkpoint: + pt_path = Path(shared.args.checkpoint) + else: + for ext in ['.safetensors', '.pt', '.bin']: + found = list(path_to_model.glob(f"*{ext}")) + if len(found) > 0: + if len(found) > 1: + logger.warning(f'More than one {ext} model has been found. The last one will be selected. It could be wrong.') + + pt_path = found[-1] + break if pt_path is None: logger.error("The model could not be loaded because its checkpoint file in .bin/.pt/.safetensors format could not be located.") return + use_safetensors = pt_path.suffix == '.safetensors' + if not (path_to_model / "quantize_config.json").exists(): + quantize_config = BaseQuantizeConfig( + bits=bits if (bits := shared.args.wbits) > 0 else 4, + group_size=gs if (gs := shared.args.groupsize) > 0 else -1, + desc_act=shared.args.desc_act + ) + else: + quantize_config = None + # Define the params for AutoGPTQForCausalLM.from_quantized params = { 'model_basename': pt_path.stem, 'device': "cuda:0" if not shared.args.cpu else "cpu", 'use_triton': shared.args.triton, 'use_safetensors': use_safetensors, - 'max_memory': get_max_memory_dict() + 'trust_remote_code': shared.args.trust_remote_code, + 'max_memory': get_max_memory_dict(), + 'quantize_config': quantize_config } - logger.warning(f"The AutoGPTQ params are: {params}") + logger.info(f"The AutoGPTQ params are: {params}") model = AutoGPTQForCausalLM.from_quantized(path_to_model, **params) return model diff --git a/modules/RWKV.py b/modules/RWKV.py index bb6bab50c7..1b0078ad59 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -87,7 +87,9 @@ def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback while len(tokens) > 0: out, state = self.model.forward(tokens[:args.chunk_len], state) tokens = tokens[args.chunk_len:] - + if i == 0: + begin_token= len(all_tokens) + last_token_posi=begin_token # cache the model state after scanning the context # we don't cache the state after processing our own generated tokens because # the output string might be post-processed arbitrarily. 
Therefore, what's fed into the model @@ -116,13 +118,13 @@ def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback occurrence[token] += 1 # output - tmp = self.pipeline.decode([token]) + tmp = self.pipeline.decode(all_tokens[last_token_posi:]) if '\ufffd' not in tmp: # is valid utf-8 string? if callback: callback(tmp) - + out_str += tmp - + last_token_posi = begin_token + i + 1 return out_str diff --git a/modules/chat.py b/modules/chat.py index be5eb9a7ac..f3388737bb 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -277,7 +277,7 @@ def chatbot_wrapper(text, history, state, regenerate=False, _continue=False, loa yield output -def impersonate_wrapper(text, state): +def impersonate_wrapper(text, start_with, state): if shared.model_name == 'None' or shared.model is None: logger.error("No model is loaded! Select one in the Model tab.") yield '' @@ -322,8 +322,13 @@ def generate_chat_reply(text, history, state, regenerate=False, _continue=False, yield history -# Same as above but returns HTML -def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): +# Same as above but returns HTML for the UI +def generate_chat_reply_wrapper(text, start_with, state, regenerate=False, _continue=False): + if start_with != '' and _continue == False: + _continue = True + send_dummy_message(text) + send_dummy_reply(start_with) + for i, history in enumerate(generate_chat_reply(text, shared.history, state, regenerate, _continue, loading_message=True)): if i != 0: shared.history = copy.deepcopy(history) @@ -641,7 +646,7 @@ def save_character(name, greeting, context, picture, filename, instruct=False): data = {k: v for k, v in data.items() if v} # Strip falsy filepath = Path(f'{folder}/{filename}.yaml') with filepath.open('w') as f: - yaml.dump(data, f) + yaml.dump(data, f, sort_keys=False) logger.info(f'Wrote {filepath}') path_to_img = Path(f'{folder}/{filename}.png') diff --git a/modules/evaluate.py b/modules/evaluate.py index 61e30261da..3283278e07 100644 --- a/modules/evaluate.py +++ b/modules/evaluate.py @@ -82,7 +82,13 @@ def calculate_perplexity(models, input_dataset, stride, _max_length): yield cumulative_log + "Tokenizing the input dataset...\n\n" encodings = encode(text, add_special_tokens=False) seq_len = encodings.shape[1] - max_length = _max_length or shared.model.config.max_position_embeddings + if _max_length: + max_length = _max_length + elif hasattr(shared.model.config, 'max_position_embeddings'): + max_length = shared.model.config.max_position_embeddings + else: + max_length = 2048 + nlls = [] prev_end_loc = 0 for begin_loc in tqdm(range(0, seq_len, stride)): diff --git a/modules/models.py b/modules/models.py index b87d3cd9cc..575f28e1ae 100644 --- a/modules/models.py +++ b/modules/models.py @@ -15,7 +15,7 @@ BitsAndBytesConfig, LlamaTokenizer) import modules.shared as shared -from modules import llama_attn_hijack +from modules import llama_attn_hijack, sampler_hijack from modules.logging_colors import logger transformers.logging.set_verbosity_error() @@ -36,6 +36,8 @@ ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration +sampler_hijack.hijack_samplers() + # Some models require special treatment in various parts of the code. 
# This function detects those models @@ -114,7 +116,7 @@ def load_tokenizer(model_name, model): tokenizer = None if shared.model_type == 'gpt4chan' and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/")) - elif type(model) is transformers.LlamaForCausalLM: + elif type(model) is transformers.LlamaForCausalLM or "LlamaGPTQForCausalLM" in str(type(model)): # Try to load an universal LLaMA tokenizer if shared.model_type not in ['llava', 'oasst']: for p in [Path(f"{shared.args.model_dir}/llama-tokenizer/"), Path(f"{shared.args.model_dir}/oobabooga_llama-tokenizer/")]: @@ -277,7 +279,7 @@ def GPTQ_loader(model_name): # Monkey patch if shared.args.monkey_patch: - logger.warning("Applying the monkey patch for using LoRAs in 4-bit mode. It may cause undefined behavior outside its intended scope.") + logger.warning("Applying the monkey patch for using LoRAs with GPTQ models. It may cause undefined behavior outside its intended scope.") from modules.monkey_patch_gptq_lora import load_model_llama model, _ = load_model_llama(model_name) diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py new file mode 100644 index 0000000000..f02bea4ffe --- /dev/null +++ b/modules/sampler_hijack.py @@ -0,0 +1,102 @@ +import torch +import transformers +from transformers import LogitsWarper +from transformers.generation.logits_process import LogitNormalization, LogitsProcessorList + + +class TailFreeLogitsWarper(LogitsWarper): + def __init__(self, tfs: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): + tfs = float(tfs) + if tfs < 0 or tfs > 1.0: + raise ValueError(f"`tfs` has to be a float >= 0 and <= 1, but is {tfs}") + self.tfs = tfs + self.filter_value = filter_value + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + sorted_logits, sorted_indices = torch.sort(scores, descending=True) + probs = sorted_logits.softmax(dim=-1) + + # Compute second derivative normalized CDF + d2 = probs.diff().diff().abs() + normalized_d2 = d2 / d2.sum(dim=-1, keepdim=True) + normalized_d2_cdf = normalized_d2.cumsum(dim=-1) + + # Remove tokens with CDF value above the threshold (token with 0 are kept) + sorted_indices_to_remove = normalized_d2_cdf > self.tfs + + # Centre the distribution around the cutoff as in the original implementation of the algorithm + sorted_indices_to_remove = torch.cat( + ( + torch.zeros(scores.shape[0], 1, dtype=torch.bool, device=scores.device), + sorted_indices_to_remove, + torch.ones(scores.shape[0], 1, dtype=torch.bool, device=scores.device), + ), + dim=-1, + ) + + if self.min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep + sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0 + + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores + + +class TopALogitsWarper(LogitsWarper): + def __init__(self, top_a: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): + top_a = float(top_a) + if top_a < 0 or top_a > 1.0: + raise ValueError(f"`top_a` has to be a float >= 0 and <= 1, but is {top_a}") + self.top_a = top_a + self.filter_value = filter_value + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + sorted_logits, sorted_indices = 
torch.sort(scores, descending=True) + probs = sorted_logits.softmax(dim=-1) + + # Remove tokens with probability less than top_a*(max(probs))^2 (token with 0 are kept) + probs_max = probs[..., 0, None] + sorted_indices_to_remove = probs < probs_max * probs_max * self.top_a + + if self.min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep + sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0 + + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores + + +def get_logits_warper_patch(self, generation_config): + warpers = self._get_logits_warper_old(generation_config) + warpers_to_add = LogitsProcessorList() + min_tokens_to_keep = 2 if generation_config.num_beams > 1 else 1 + + if generation_config.tfs is not None and 0.0 <= generation_config.tfs <= 1.0: + warpers_to_add.append(TailFreeLogitsWarper(tfs=generation_config.tfs, min_tokens_to_keep=min_tokens_to_keep)) + if generation_config.top_a is not None and 0.0 <= generation_config.top_a <= 1.0: + warpers_to_add.append(TopALogitsWarper(top_a=generation_config.top_a, min_tokens_to_keep=min_tokens_to_keep)) + + if warpers and isinstance(warpers[-1], LogitNormalization): + warpers = warpers[:-1] + warpers_to_add + [warpers[-1]] + else: + warpers += warpers_to_add + + return warpers + + +def generation_config_init_patch(self, **kwargs): + self.__init___old(**kwargs) + self.tfs = kwargs.pop("tfs", 1.0) + self.top_a = kwargs.pop("top_a", 0.0) + + +def hijack_samplers(): + transformers.GenerationMixin._get_logits_warper_old = transformers.GenerationMixin._get_logits_warper + transformers.GenerationMixin._get_logits_warper = get_logits_warper_patch + + transformers.GenerationConfig.__init___old = transformers.GenerationConfig.__init__ + transformers.GenerationConfig.__init__ = generation_config_init_patch diff --git a/modules/shared.py b/modules/shared.py index 99391f6528..9a025587ff 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -55,6 +55,7 @@ 'truncation_length_min': 0, 'truncation_length_max': 8192, 'mode': 'chat', + 'start_with': '', 'chat_style': 'cai-chat', 'instruction_template': 'None', 'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>', @@ -65,18 +66,9 @@ 'chat_generation_attempts_min': 1, 'chat_generation_attempts_max': 10, 'default_extensions': [], - 'chat_default_extensions': ["gallery"], - 'presets': { - 'default': 'Default', - '.*(alpaca|llama|llava|vicuna)': "LLaMA-Precise", - '.*pygmalion': 'NovelAI-Storywriter', - '.*RWKV.*\.pth': 'Naive', - '.*moss': 'MOSS', - }, - 'prompts': { - 'default': 'QA', - '.*(gpt4chan|gpt-4chan|4chan)': 'GPT-4chan', - } + 'chat_default_extensions': ['gallery'], + 'preset': 'LLaMA-Precise', + 'prompt': 'QA', } @@ -103,14 +95,14 @@ def str2bool(v): parser.add_argument("--lora-dir", type=str, default='loras/', help="Path to directory with all the loras") parser.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.') parser.add_argument('--no-stream', action='store_true', help='Don\'t stream the text output in real time.') -parser.add_argument('--settings', type=str, help='Load the default interface settings from this json file. See settings-template.json for an example. 
If you create a file called settings.json, this file will be loaded by default without the need to use the --settings flag.') +parser.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.') parser.add_argument('--extensions', type=str, nargs="+", help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.') parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') # Accelerate/transformers parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.') parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') -parser.add_argument('--gpu-memory', type=str, nargs="+", help='Maxmimum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.') +parser.add_argument('--gpu-memory', type=str, nargs="+", help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.') parser.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.') parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directory to save the disk cache to. Defaults to "cache".') @@ -119,7 +111,7 @@ def str2bool(v): parser.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces the VRAM usage a bit at a performance cost.') parser.add_argument('--xformers', action='store_true', help="Use xformer's memory efficient attention. This should increase your tokens/s.") parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0's sdp attention.") -parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM.") +parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM and Falcon.") # Accelerate 4-bit parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision (using bitsandbytes).') @@ -151,6 +143,7 @@ def str2bool(v): # AutoGPTQ parser.add_argument('--autogptq', action='store_true', help='Use AutoGPTQ for loading quantized models instead of the internal GPTQ loader.') parser.add_argument('--triton', action='store_true', help='Use triton.') +parser.add_argument('--desc_act', action='store_true', help='For models that don\'t have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.') # FlexGen parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.') @@ -199,7 +192,7 @@ def str2bool(v): if args.trust_remote_code: logger.warning("trust_remote_code is enabled. 
This is dangerous.") if args.share: - logger.warning("The gradio \"share link\" feature downloads a proprietary and unaudited blob to create a reverse tunnel. This is potentially dangerous.") + logger.warning("The gradio \"share link\" feature uses a proprietary executable to create a reverse tunnel. Use it with care.") def add_extension(name): diff --git a/modules/text_generation.py b/modules/text_generation.py index bc260d8369..f4faf4cc08 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -167,7 +167,7 @@ def _generate_reply(question, state, eos_token=None, stopping_strings=None, is_c if generate_func is None: if shared.model_name == 'None' or shared.model is None: logger.error("No model is loaded! Select one in the Model tab.") - yield question + yield '' return if shared.model_type in ['rwkv', 'llamacpp']: @@ -188,13 +188,25 @@ def _generate_reply(question, state, eos_token=None, stopping_strings=None, is_c shared.stop_everything = False clear_torch_cache() seed = set_manual_seed(state['seed']) + is_stream = state['stream'] + last_update = -1 + reply = '' for reply in generate_func(question, original_question, seed, state, eos_token, stopping_strings, is_chat=is_chat): + if is_stream: + cur_time = time.time() + if cur_time - last_update > 0.041666666666666664: # Limit streaming to 24 fps + last_update = cur_time + yield reply + else: + yield reply + + if is_stream: yield reply def generate_reply_HF(question, original_question, seed, state, eos_token=None, stopping_strings=None, is_chat=False): generate_params = {} - for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']: + for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'tfs', 'top_a']: generate_params[k] = state[k] for k in ['epsilon_cutoff', 'eta_cutoff']: diff --git a/modules/training.py b/modules/training.py index f86fa5a4fb..75ba82ca7d 100644 --- a/modules/training.py +++ b/modules/training.py @@ -234,10 +234,10 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch time.sleep(5) if shared.args.wbits > 0 and not shared.args.monkey_patch: - yield "LoRA training in 4-bit requires loading with `--monkey-patch`" + yield "LoRA training with GPTQ models requires loading with `--monkey-patch`" return - elif not shared.args.load_in_8bit and shared.args.wbits <= 0: + elif not (shared.args.load_in_8bit or shared.args.load_in_4bit) and shared.args.wbits <= 0: yield "It is highly recommended you use `--load-in-8bit` for LoRA training. 
*(Will continue anyway in 2 seconds, press `Interrupt` to stop.)*" logger.warning("It is highly recommended you use `--load-in-8bit` for LoRA training.") time.sleep(2) # Give it a moment for the message to show in UI before continuing diff --git a/modules/ui.py b/modules/ui.py index f10588213c..6279603261 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -30,7 +30,7 @@ def list_model_elements(): - elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed'] + elements = ['cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'autogptq', 'triton', 'desc_act', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed'] for i in range(torch.cuda.device_count()): elements.append(f'gpu_memory_{i}') @@ -38,7 +38,7 @@ def list_model_elements(): def list_interface_input_elements(chat=False): - elements = ['max_new_tokens', 'seed', 'temperature', 'top_p', 'top_k', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'do_sample', 'penalty_alpha', 'num_beams', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'add_bos_token', 'ban_eos_token', 'truncation_length', 'custom_stopping_strings', 'skip_special_tokens', 'preset_menu', 'stream'] + elements = ['max_new_tokens', 'seed', 'temperature', 'top_p', 'top_k', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'do_sample', 'penalty_alpha', 'num_beams', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'add_bos_token', 'ban_eos_token', 'truncation_length', 'custom_stopping_strings', 'skip_special_tokens', 'preset_menu', 'stream', 'tfs', 'top_a'] if chat: elements += ['name1', 'name2', 'greeting', 'context', 'chat_prompt_size', 'chat_generation_attempts', 'stop_at_newline', 'mode', 'instruction_template', 'character_menu', 'name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template', 'chat_style', 'chat-instruct_command'] diff --git a/modules/utils.py b/modules/utils.py index 6722022d89..84ca997fcf 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -29,7 +29,7 @@ def get_available_models(): def get_available_presets(): - return sorted(set((k.stem for k in Path('presets').glob('*.txt'))), key=natural_keys) + return sorted(set((k.stem for k in Path('presets').glob('*.yaml'))), key=natural_keys) def get_available_prompts(): diff --git a/presets/Contrastive Search.txt b/presets/Contrastive Search.txt deleted file mode 100644 index 832bc9caf9..0000000000 --- a/presets/Contrastive Search.txt +++ /dev/null @@ -1,3 +0,0 @@ -do_sample=False -penalty_alpha=0.6 -top_k=4 diff --git a/presets/Debug-deterministic.txt b/presets/Debug-deterministic.txt deleted file mode 100644 index 6673b71c81..0000000000 --- a/presets/Debug-deterministic.txt +++ /dev/null @@ -1 +0,0 @@ -do_sample=False diff --git a/presets/Debug-deterministic.yaml b/presets/Debug-deterministic.yaml new file mode 100644 index 0000000000..e5ccac65d9 --- /dev/null +++ b/presets/Debug-deterministic.yaml @@ -0,0 +1 @@ +do_sample: False diff --git a/presets/Default.txt 
b/presets/Default.txt deleted file mode 100644 index d28ce62f0e..0000000000 --- a/presets/Default.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=0.5 -top_k=40 -temperature=0.7 -repetition_penalty=1.2 -typical_p=1.0 diff --git a/presets/Kobold-Godlike.txt b/presets/Kobold-Godlike.txt deleted file mode 100644 index 0ba5b794b6..0000000000 --- a/presets/Kobold-Godlike.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=0.5 -top_k=0 -temperature=0.7 -repetition_penalty=1.1 -typical_p=0.19 diff --git a/presets/Kobold-Godlike.yaml b/presets/Kobold-Godlike.yaml new file mode 100644 index 0000000000..3d2bdf2f7a --- /dev/null +++ b/presets/Kobold-Godlike.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 0.5 +top_k: 0 +temperature: 0.7 +repetition_penalty: 1.1 +typical_p: 0.19 diff --git a/presets/Kobold-Liminal Drift.txt b/presets/Kobold-Liminal Drift.txt deleted file mode 100644 index be4dd3bd7a..0000000000 --- a/presets/Kobold-Liminal Drift.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=1.0 -top_k=0 -temperature=0.66 -repetition_penalty=1.1 -typical_p=0.6 diff --git a/presets/Kobold-Liminal Drift.yaml b/presets/Kobold-Liminal Drift.yaml new file mode 100644 index 0000000000..3dbcd5cce1 --- /dev/null +++ b/presets/Kobold-Liminal Drift.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 1.0 +top_k: 0 +temperature: 0.66 +repetition_penalty: 1.1 +typical_p: 0.6 diff --git a/presets/LLaMA-Precise.txt b/presets/LLaMA-Precise.txt deleted file mode 100644 index 8098b390a0..0000000000 --- a/presets/LLaMA-Precise.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=0.1 -top_k=40 -temperature=0.7 -repetition_penalty=1.18 -typical_p=1.0 diff --git a/presets/LLaMA-Precise.yaml b/presets/LLaMA-Precise.yaml new file mode 100644 index 0000000000..2d0c2bdc8a --- /dev/null +++ b/presets/LLaMA-Precise.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 0.1 +top_k: 40 +temperature: 0.7 +repetition_penalty: 1.18 +typical_p: 1.0 diff --git a/presets/MOSS.txt b/presets/MOSS.txt deleted file mode 100644 index e895e88623..0000000000 --- a/presets/MOSS.txt +++ /dev/null @@ -1,3 +0,0 @@ -temperature=0.7 -top_p=0.8 -repetition_penalty=1.02 diff --git a/presets/Naive.txt b/presets/Naive.txt deleted file mode 100644 index aa8c058224..0000000000 --- a/presets/Naive.txt +++ /dev/null @@ -1,4 +0,0 @@ -do_sample=True -temperature=0.7 -top_p=0.85 -top_k=50 diff --git a/presets/Naive.yaml b/presets/Naive.yaml new file mode 100644 index 0000000000..af98420198 --- /dev/null +++ b/presets/Naive.yaml @@ -0,0 +1,4 @@ +do_sample: true +temperature: 0.7 +top_p: 0.85 +top_k: 50 diff --git a/presets/NovelAI-Best Guess.txt b/presets/NovelAI-Best Guess.txt deleted file mode 100644 index db3fa75b2a..0000000000 --- a/presets/NovelAI-Best Guess.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=0.9 -top_k=100 -temperature=0.8 -repetition_penalty=1.15 -typical_p=1.0 diff --git a/presets/NovelAI-Best Guess.yaml b/presets/NovelAI-Best Guess.yaml new file mode 100644 index 0000000000..2c21d136d8 --- /dev/null +++ b/presets/NovelAI-Best Guess.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 0.9 +top_k: 100 +temperature: 0.8 +repetition_penalty: 1.15 +typical_p: 1.0 diff --git a/presets/NovelAI-Decadence.txt b/presets/NovelAI-Decadence.txt deleted file mode 100644 index d3109f3e3f..0000000000 --- a/presets/NovelAI-Decadence.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=1.0 -top_k=100 -temperature=2 -repetition_penalty=1 -typical_p=0.97 diff --git a/presets/NovelAI-Decadence.yaml b/presets/NovelAI-Decadence.yaml new file mode 
100644 index 0000000000..28e1a21b60 --- /dev/null +++ b/presets/NovelAI-Decadence.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 1.0 +top_k: 100 +temperature: 2 +repetition_penalty: 1 +typical_p: 0.97 diff --git a/presets/NovelAI-Genesis.txt b/presets/NovelAI-Genesis.txt deleted file mode 100644 index cc7376b3b9..0000000000 --- a/presets/NovelAI-Genesis.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=0.98 -top_k=0 -temperature=0.63 -repetition_penalty=1.05 -typical_p=1.0 diff --git a/presets/NovelAI-Genesis.yaml b/presets/NovelAI-Genesis.yaml new file mode 100644 index 0000000000..41ee4c6bd3 --- /dev/null +++ b/presets/NovelAI-Genesis.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 0.98 +top_k: 0 +temperature: 0.63 +repetition_penalty: 1.05 +typical_p: 1.0 diff --git a/presets/NovelAI-Lycaenidae.txt b/presets/NovelAI-Lycaenidae.txt deleted file mode 100644 index 0134569cef..0000000000 --- a/presets/NovelAI-Lycaenidae.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=0.85 -top_k=12 -temperature=2 -repetition_penalty=1.15 -typical_p=1.0 diff --git a/presets/NovelAI-Lycaenidae.yaml b/presets/NovelAI-Lycaenidae.yaml new file mode 100644 index 0000000000..be296d7957 --- /dev/null +++ b/presets/NovelAI-Lycaenidae.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 0.85 +top_k: 12 +temperature: 2 +repetition_penalty: 1.15 +typical_p: 1.0 diff --git a/presets/NovelAI-Ouroboros.txt b/presets/NovelAI-Ouroboros.txt deleted file mode 100644 index 1e944b54e7..0000000000 --- a/presets/NovelAI-Ouroboros.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=1.0 -top_k=100 -temperature=1.07 -repetition_penalty=1.05 -typical_p=1.0 diff --git a/presets/NovelAI-Ouroboros.yaml b/presets/NovelAI-Ouroboros.yaml new file mode 100644 index 0000000000..53846a1d39 --- /dev/null +++ b/presets/NovelAI-Ouroboros.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 1.0 +top_k: 100 +temperature: 1.07 +repetition_penalty: 1.05 +typical_p: 1.0 diff --git a/presets/NovelAI-Pleasing Results.txt b/presets/NovelAI-Pleasing Results.txt deleted file mode 100644 index 330114a25d..0000000000 --- a/presets/NovelAI-Pleasing Results.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=1.0 -top_k=0 -temperature=0.44 -repetition_penalty=1.15 -typical_p=1.0 diff --git a/presets/NovelAI-Pleasing Results.yaml b/presets/NovelAI-Pleasing Results.yaml new file mode 100644 index 0000000000..ca7408df83 --- /dev/null +++ b/presets/NovelAI-Pleasing Results.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 1.0 +top_k: 0 +temperature: 0.44 +repetition_penalty: 1.15 +typical_p: 1.0 diff --git a/presets/NovelAI-Sphinx Moth.txt b/presets/NovelAI-Sphinx Moth.txt deleted file mode 100644 index bace1e24b5..0000000000 --- a/presets/NovelAI-Sphinx Moth.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=0.18 -top_k=30 -temperature=2.0 -repetition_penalty=1.15 -typical_p=1.0 diff --git a/presets/NovelAI-Sphinx Moth.yaml b/presets/NovelAI-Sphinx Moth.yaml new file mode 100644 index 0000000000..8efd5a74b7 --- /dev/null +++ b/presets/NovelAI-Sphinx Moth.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 0.18 +top_k: 30 +temperature: 2.0 +repetition_penalty: 1.15 +typical_p: 1.0 diff --git a/presets/NovelAI-Storywriter.txt b/presets/NovelAI-Storywriter.txt deleted file mode 100644 index 2df5f81814..0000000000 --- a/presets/NovelAI-Storywriter.txt +++ /dev/null @@ -1,6 +0,0 @@ -do_sample=True -top_p=0.73 -top_k=0 -temperature=0.72 -repetition_penalty=1.1 -typical_p=1.0 diff --git a/presets/NovelAI-Storywriter.yaml b/presets/NovelAI-Storywriter.yaml new file 
mode 100644 index 0000000000..34d11ec814 --- /dev/null +++ b/presets/NovelAI-Storywriter.yaml @@ -0,0 +1,6 @@ +do_sample: true +top_p: 0.73 +top_k: 0 +temperature: 0.72 +repetition_penalty: 1.1 +typical_p: 1.0 diff --git a/presets/Special-Contrastive Search.yaml b/presets/Special-Contrastive Search.yaml new file mode 100644 index 0000000000..290342e23c --- /dev/null +++ b/presets/Special-Contrastive Search.yaml @@ -0,0 +1,3 @@ +do_sample: False +penalty_alpha: 0.6 +top_k: 4 diff --git a/presets/Special-Eta Sampling.yaml b/presets/Special-Eta Sampling.yaml new file mode 100644 index 0000000000..813522e635 --- /dev/null +++ b/presets/Special-Eta Sampling.yaml @@ -0,0 +1,4 @@ +do_sample: true +eta_cutoff: 3 +temperature: 0.7 +repetition_penalty: 1.18 diff --git a/presets/Verbose (Beam Search).txt b/presets/Verbose (Beam Search).txt deleted file mode 100644 index 464a4a5f0d..0000000000 --- a/presets/Verbose (Beam Search).txt +++ /dev/null @@ -1,9 +0,0 @@ -num_beams=10 -min_length=200 -length_penalty=1.4 -no_repeat_ngram_size=2 -early_stopping=True -temperature=0.7 -top_k=150 -top_p=0.92 -repetition_penalty=4.5 diff --git a/requirements.txt b/requirements.txt index 3c8ce39fce..084a3cd696 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ colorama datasets +einops flexgen==0.1.7 gradio_client==0.2.5 gradio==3.31.0 @@ -18,5 +19,7 @@ git+https://github.com/huggingface/transformers@e45e756d22206ca8fa9fb057c8c3d8fa git+https://github.com/huggingface/accelerate@0226f750257b3bf2cadc4f189f9eef0c764a0467 bitsandbytes==0.39.0; platform_system != "Windows" https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.39.0-py3-none-any.whl; platform_system == "Windows" -llama-cpp-python==0.1.53; platform_system != "Windows" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.53/llama_cpp_python-0.1.53-cp310-cp310-win_amd64.whl; platform_system == "Windows" +llama-cpp-python==0.1.56; platform_system != "Windows" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.56/llama_cpp_python-0.1.56-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" +https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.0/auto_gptq-0.2.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" diff --git a/server.py b/server.py index 72d09232b0..1c8a5fe08f 100644 --- a/server.py +++ b/server.py @@ -89,9 +89,11 @@ def load_preset_values(preset_menu, state, return_dict=False): 'typical_p': 1, 'epsilon_cutoff': 0, 'eta_cutoff': 0, + 'tfs': 1, + 'top_a': 0, 'repetition_penalty': 1, 'encoder_repetition_penalty': 1, - 'top_k': 50, + 'top_k': 0, 'num_beams': 1, 'penalty_alpha': 0, 'min_length': 0, @@ -103,19 +105,18 @@ def load_preset_values(preset_menu, state, return_dict=False): 'mirostat_eta': 0.1, } - with open(Path(f'presets/{preset_menu}.txt'), 'r') as infile: - preset = infile.read() - for i in preset.splitlines(): - i = i.rstrip(',').strip().split('=') - if len(i) == 2 and i[0].strip() != 'tokens': - generate_params[i[0].strip()] = eval(i[1].strip()) + with open(Path(f'presets/{preset_menu}.yaml'), 'r') as infile: + preset = yaml.safe_load(infile) + + for k in preset: + generate_params[k] = preset[k] generate_params['temperature'] = min(1.99, generate_params['temperature']) if return_dict: return generate_params else: state.update(generate_params) - return state, *[generate_params[k] for k in ['do_sample', 
'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']] + return state, *[generate_params[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']] def upload_soft_prompt(file): @@ -183,7 +184,8 @@ def count_tokens(text): def download_model_wrapper(repo_id): try: - downloader = importlib.import_module("download-model") + downloader_module = importlib.import_module("download-model") + downloader = downloader_module.ModelDownloader() repo_id_parts = repo_id.split(":") model = repo_id_parts[0] if len(repo_id_parts) > 0 else repo_id branch = repo_id_parts[1] if len(repo_id_parts) > 1 else "main" @@ -301,7 +303,7 @@ def save_model_settings(model, state): shared.model_config[model_regex][k] = state[k] with open(p, 'w') as f: - f.write(yaml.dump(user_config)) + f.write(yaml.dump(user_config, sort_keys=False)) yield (f"Settings for {model} saved to {p}") @@ -367,6 +369,7 @@ def create_model_menus(): shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu) shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) + shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') with gr.Box(): gr.Markdown('Transformers 4-bit') @@ -387,13 +390,19 @@ def create_model_menus(): with gr.Column(): with gr.Box(): - gr.Markdown('GPTQ') with gr.Row(): with gr.Column(): - shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None") - shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") + gr.Markdown('AutoGPTQ') + shared.gradio['autogptq'] = gr.Checkbox(label="autogptq", value=shared.args.autogptq, info='Activate AutoGPTQ loader. 
gpu-memory should be used for CPU offloading instead of pre_layer.') + shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) + shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') with gr.Column(): + gr.Markdown('GPTQ-for-LLaMa') + with gr.Row(): + shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None") + shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") + shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None") shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) @@ -471,8 +480,10 @@ def create_settings_menus(default_preset): shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p', info='If not set to 1, select tokens with probabilities adding up to less than this number. Higher value = higher range of possible random results.') shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k', info='Similar to top_p, but select instead only the top_k most likely tokens. Higher value = higher range of possible random results.') shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p', info='If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text.') - shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff', info='In units of 1e-4') - shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff', info='In units of 1e-4') + shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff', info='In units of 1e-4; a reasonable value is 3. This sets a probability floor below which tokens are excluded from being sampled. Should be used with top_p, top_k, and eta_cutoff set to 0.') + shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff', info='In units of 1e-4; a reasonable value is 3. Should be used with top_p, top_k, and epsilon_cutoff set to 0.') + shared.gradio['tfs'] = gr.Slider(0.0, 1.0, value=generate_params['tfs'], step=0.01, label='tfs') + shared.gradio['top_a'] = gr.Slider(0.0, 1.0, value=generate_params['top_a'], step=0.01, label='top_a') with gr.Column(): shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty', info='Exponential penalty factor for repeating prior tokens. 
1 means no penalty, higher value = less repetition, lower value = more repetition.') @@ -523,7 +534,7 @@ def create_settings_menus(default_preset): gr.Markdown('[Click here for more information.](https://github.com/oobabooga/text-generation-webui/blob/main/docs/Generation-parameters.md)') - shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio[k] for k in ['preset_menu', 'interface_state']], [shared.gradio[k] for k in ['interface_state', 'do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']]) + shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio[k] for k in ['preset_menu', 'interface_state']], [shared.gradio[k] for k in ['interface_state', 'do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'tfs', 'top_a']]) shared.gradio['softprompts_menu'].change(load_soft_prompt, shared.gradio['softprompts_menu'], shared.gradio['softprompts_menu'], show_progress=True) shared.gradio['upload_softprompt'].upload(upload_soft_prompt, shared.gradio['upload_softprompt'], shared.gradio['softprompts_menu']) @@ -551,11 +562,8 @@ def create_interface(): # Defining some variables gen_events = [] - default_preset = shared.settings['presets'][next((k for k in shared.settings['presets'] if re.match(k.lower(), shared.model_name.lower())), 'default')] - if len(shared.lora_names) == 1: - default_text = load_prompt(shared.settings['prompts'][next((k for k in shared.settings['prompts'] if re.match(k.lower(), shared.lora_names[0].lower())), 'default')]) - else: - default_text = load_prompt(shared.settings['prompts'][next((k for k in shared.settings['prompts'] if re.match(k.lower(), shared.model_name.lower())), 'default')]) + default_preset = shared.settings['preset'] + default_text = load_prompt(shared.settings['prompt']) title = 'Text generation web UI' # Authentication variables @@ -603,23 +611,27 @@ def create_interface(): shared.gradio['Continue'] = gr.Button('Continue') with gr.Row(): - shared.gradio['Copy last reply'] = gr.Button('Copy last reply') + shared.gradio['Impersonate'] = gr.Button('Impersonate') shared.gradio['Regenerate'] = gr.Button('Regenerate') - shared.gradio['Replace last reply'] = gr.Button('Replace last reply') + shared.gradio['Remove last'] = gr.Button('Remove last') with gr.Row(): - shared.gradio['Impersonate'] = gr.Button('Impersonate') + shared.gradio['Copy last reply'] = gr.Button('Copy last reply') + shared.gradio['Replace last reply'] = gr.Button('Replace last reply') shared.gradio['Send dummy message'] = gr.Button('Send dummy message') shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply') with gr.Row(): - shared.gradio['Remove last'] = gr.Button('Remove last') shared.gradio['Clear history'] = gr.Button('Clear history') shared.gradio['Clear history-confirm'] = gr.Button('Confirm', variant='stop', visible=False) shared.gradio['Clear history-cancel'] = gr.Button('Cancel', visible=False) - shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'instruct', 'chat-instruct'] else 
'chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under "Chat settings" must match the current model.') - shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') + with gr.Row(): + shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with']) + + with gr.Row(): + shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'instruct', 'chat-instruct'] else 'chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under "Chat settings" must match the current model.') + shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') with gr.Tab('Chat settings', elem_id='chat-settings'): with gr.Row(): @@ -689,7 +701,7 @@ def create_interface(): with gr.Row(): with gr.Column(): shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) - shared.gradio['chat_prompt_size'] = gr.Slider(minimum=shared.settings['chat_prompt_size_min'], maximum=shared.settings['chat_prompt_size_max'], step=1, label='Maximum prompt size in tokens', value=shared.settings['chat_prompt_size']) + shared.gradio['chat_prompt_size'] = gr.Slider(minimum=shared.settings['chat_prompt_size_min'], maximum=shared.settings['chat_prompt_size_max'], step=1, label='chat_prompt_size', info='Set limit on prompt size by removing old messages (while retaining context and user input)', value=shared.settings['chat_prompt_size']) with gr.Column(): shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)', info='New generations will be called until either this number is reached or no new content is generated between two iterations.') @@ -817,7 +829,7 @@ def create_interface(): # chat mode event handlers if shared.is_chat(): - shared.input_params = [shared.gradio[k] for k in ['Chat input', 'interface_state']] + shared.input_params = [shared.gradio[k] for k in ['Chat input', 'start_with', 'interface_state']] clear_arr = [shared.gradio[k] for k in ['Clear history-confirm', 'Clear history', 'Clear history-cancel']] shared.reload_inputs = [shared.gradio[k] for k in ['name1', 'name2', 'mode', 'chat_style']] @@ -1016,16 +1028,19 @@ def create_interface(): settings_file = None if shared.args.settings is not None and Path(shared.args.settings).exists(): settings_file = Path(shared.args.settings) + elif Path('settings.yaml').exists(): + settings_file = Path('settings.yaml') elif Path('settings.json').exists(): settings_file = Path('settings.json') if settings_file is not None: logger.info(f"Loading settings from {settings_file}...") - new_settings = json.loads(open(settings_file, 'r').read()) + file_contents = open(settings_file, 'r', encoding='utf-8').read() + new_settings = json.loads(file_contents) if 
settings_file.suffix == "json" else yaml.safe_load(file_contents) for item in new_settings: shared.settings[item] = new_settings[item] - # Set default model settings based on settings.json + # Set default model settings based on settings file shared.model_config['.*'] = { 'wbits': 'None', 'model_type': 'None', diff --git a/settings-template.json b/settings-template.json deleted file mode 100644 index 0860d853dd..0000000000 --- a/settings-template.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "dark_theme": false, - "autoload_model": true, - "max_new_tokens": 200, - "max_new_tokens_min": 1, - "max_new_tokens_max": 2000, - "seed": -1, - "character": "None", - "name1": "You", - "name2": "Assistant", - "context": "This is a conversation with your Assistant. It is a computer program designed to help you with various tasks such as answering questions, providing recommendations, and helping with decision making. You can ask it anything you want and it will do its best to give you accurate and relevant information.", - "greeting": "", - "turn_template": "", - "custom_stopping_strings": "", - "stop_at_newline": false, - "add_bos_token": true, - "ban_eos_token": false, - "skip_special_tokens": true, - "truncation_length": 2048, - "truncation_length_min": 0, - "truncation_length_max": 8192, - "mode": "chat", - "chat_style": "cai-chat", - "instruction_template": "None", - "chat-instruct_command": "Continue the chat dialogue below. Write a single reply for the character \"<|character|>\".\n\n<|prompt|>", - "chat_prompt_size": 2048, - "chat_prompt_size_min": 0, - "chat_prompt_size_max": 2048, - "chat_generation_attempts": 1, - "chat_generation_attempts_min": 1, - "chat_generation_attempts_max": 10, - "default_extensions": [], - "chat_default_extensions": [ - "gallery" - ], - "presets": { - "default": "Default", - ".*(alpaca|llama|llava|vicuna)": "LLaMA-Precise", - ".*pygmalion": "NovelAI-Storywriter", - ".*RWKV.*\.pth": "Naive", - ".*moss": "MOSS" - }, - "prompts": { - "default": "QA", - ".*(gpt4chan|gpt-4chan|4chan)": "GPT-4chan" - } -} diff --git a/settings-template.yaml b/settings-template.yaml new file mode 100644 index 0000000000..84cf010597 --- /dev/null +++ b/settings-template.yaml @@ -0,0 +1,43 @@ +dark_theme: false +autoload_model: true +max_new_tokens: 200 +max_new_tokens_min: 1 +max_new_tokens_max: 2000 +seed: -1 +character: None +name1: You +name2: Assistant +context: This is a conversation with your Assistant. It is a computer program designed + to help you with various tasks such as answering questions, providing recommendations, + and helping with decision making. You can ask it anything you want and it will do + its best to give you accurate and relevant information. +greeting: '' +turn_template: '' +custom_stopping_strings: '' +stop_at_newline: false +add_bos_token: true +ban_eos_token: false +skip_special_tokens: true +truncation_length: 2048 +truncation_length_min: 0 +truncation_length_max: 8192 +mode: chat +start_with: '' +chat_style: cai-chat +instruction_template: None +chat-instruct_command: 'Continue the chat dialogue below. Write a single reply for + the character "<|character|>". + + + <|prompt|>' +chat_prompt_size: 2048 +chat_prompt_size_min: 0 +chat_prompt_size_max: 2048 +chat_generation_attempts: 1 +chat_generation_attempts_min: 1 +chat_generation_attempts_max: 10 +default_extensions: [] +chat_default_extensions: +- gallery +preset: LLaMA-Precise +prompt: QA
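As a usage note for the settings migration above: the server now prefers `settings.yaml` and only falls back to `settings.json`. A minimal standalone sketch of that resolution order, assuming a simplified `load_settings` helper rather than the actual server.py logic:

```python
import json
from pathlib import Path

import yaml


def load_settings(cli_settings: str | None = None) -> dict:
    # --settings takes priority; otherwise prefer settings.yaml, then settings.json.
    candidates = [Path(cli_settings)] if cli_settings else []
    candidates += [Path('settings.yaml'), Path('settings.json')]
    for settings_file in candidates:
        if settings_file.exists():
            contents = settings_file.read_text(encoding='utf-8')
            # Path.suffix keeps the leading dot, so compare against '.json'.
            if settings_file.suffix == '.json':
                return json.loads(contents)
            return yaml.safe_load(contents)
    return {}
```

The available keys match settings-template.yaml above; the per-model `presets` and `prompts` mappings from the old JSON template are replaced by the single `preset` and `prompt` defaults.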