Add support for starcoder ggml and similar (GPT-2 GGML?) #2892

Closed · wants to merge 4 commits

10 changes: 9 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
# Text generation web UI

A gradio web UI for running Large Language Models like LLaMA, llama.cpp, GPT-J, Pythia, OPT, and GALACTICA.
A gradio web UI for running Large Language Models like LLaMA, llama.cpp, GPT-J, Pythia, OPT, StarCoder, and GALACTICA.

Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation.

@@ -27,6 +27,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
* API, including endpoints for websocket streaming ([see the examples](https://github.com/oobabooga/text-generation-webui/blob/main/api-examples))

To learn how to use the various features, check out the Documentation: https://github.com/oobabooga/text-generation-webui/tree/main/docs
* [starcoder.cpp](docs/starcoder.cpp-models.md)

## Installation

@@ -251,6 +252,13 @@ Optionally, you can use the following command-line flags:
| `--n_ctx N_CTX` | Size of the prompt context. |
| `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |

#### starcoder.cpp

| Flag | Description |
|-------------|-------------|
| `--threads` | Number of threads to use. |
| `--n_ctx N_CTX` | Size of the prompt context. |

#### AutoGPTQ

| Flag | Description |
26 changes: 26 additions & 0 deletions docs/starcoder.cpp-models.md
@@ -0,0 +1,26 @@
# Using starcoder.cpp in the web UI

## Setting up the models

#### Pre-converted

Place the model in the `models` folder, making sure that its name
starts with `starcoder` or `starchat`, contains `ggml` somewhere in
the middle, and ends in `.bin`.
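
As a quick sanity check, here is a minimal sketch (with hypothetical
filenames) of the `*starcoder*ggml*.bin` / `*starchat*ggml*.bin` patterns
the web UI globs for when detecting these models:

```python
from fnmatch import fnmatch

# Hypothetical filenames; only names matching the starcoder/starchat
# GGML patterns are routed to the starcoder loader.
names = [
    "starcoder-ggml-q4_0.bin",       # matches *starcoder*ggml*.bin
    "starchat-alpha-ggml-q5_1.bin",  # matches *starchat*ggml*.bin
    "ggml-starcoder-q4_0.bin",       # not matched: `ggml` comes before `starcoder`
]

for name in names:
    detected = fnmatch(name, "*starcoder*ggml*.bin") or fnmatch(name, "*starchat*ggml*.bin")
    print(name, "->", "detected" if detected else "not detected")
```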

You can find converted models here:

- [StarChat Alpha](https://huggingface.co/NeoDim/starchat-alpha-GGML)
- [StarCoder](https://huggingface.co/NeoDim/starcoder-GGML)
- [StarCoderBase](https://huggingface.co/NeoDim/starcoderbase-GGML)

#### Convert models yourself

Follow the instructions
[here](https://github.com/ggerganov/ggml/tree/master/examples/starcoder).

There is also
[starcoder.cpp](https://github.com/bigcode-project/starcoder.cpp#quantizing-the-models),
but note that it currently has an open [issue](https://github.com/bigcode-project/starcoder.cpp/issues/11).

9 changes: 9 additions & 0 deletions modules/loaders.py
@@ -64,6 +64,15 @@
        'max_seq_len',
        'compress_pos_emb',
        'exllama_HF_info',
    ],
    'starcoder': [
        'n_ctx',
        'n_gpu_layers',
        'n_batch',
        'threads',
        'no_mmap',
        'mlock',
        'llama_cpp_seed',
    ]
}

36 changes: 35 additions & 1 deletion modules/models.py
@@ -58,7 +58,9 @@ def load_model(model_name, loader=None):
        'FlexGen': flexgen_loader,
        'RWKV': RWKV_loader,
        'ExLlama': ExLlama_loader,
        'ExLlama_HF': ExLlama_HF_loader
        'ExLlama_HF': ExLlama_HF_loader,
        'starcoder': starcodercpp_loader,
        'starchat': starchatcpp_loader
    }

if loader is None:
@@ -256,6 +258,38 @@ def llamacpp_loader(model_name):
return model, tokenizer


def starcodercpp_loader(model_name):
    from modules.starcoder_model import StarcoderCppModel

    # Accept either a direct path to a .bin file or a model directory
    # containing a *starcoder*ggml*.bin file.
    path = Path(f'{shared.args.model_dir}/{model_name}')
    if path.is_file():
        model_file = path
    else:
        model_file = list(
            Path(f'{shared.args.model_dir}/{model_name}').glob('*starcoder*ggml*.bin')
        )[0]

    logger.info(f'starcoder.cpp weights detected: {model_file}\n')
    model, tokenizer = StarcoderCppModel().from_pretrained(model_file)
    return model, tokenizer


def starchatcpp_loader(model_name):
    from modules.starcoder_model import StarcoderCppModel

    # Same as starcodercpp_loader, but looks for *starchat*ggml*.bin weights.
    path = Path(f'{shared.args.model_dir}/{model_name}')
    if path.is_file():
        model_file = path
    else:
        model_file = list(
            Path(f'{shared.args.model_dir}/{model_name}').glob('*starchat*ggml*.bin')
        )[0]

    logger.info(f'starchat.cpp weights detected: {model_file}\n')
    model, tokenizer = StarcoderCppModel().from_pretrained(model_file)
    return model, tokenizer
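
The two loaders above differ only in the glob pattern used to locate the
weights file. A possible refactor (a sketch only, not part of this PR;
`_ggml_coder_loader`, `pattern`, and `label` are hypothetical names) would
share the lookup logic:

```python
def _ggml_coder_loader(model_name, pattern, label):
    # Shared helper: resolve the weights file, then hand it to StarcoderCppModel.
    # Relies on Path, shared, and logger already imported in modules/models.py.
    from modules.starcoder_model import StarcoderCppModel

    path = Path(f'{shared.args.model_dir}/{model_name}')
    model_file = path if path.is_file() else next(path.glob(pattern))

    logger.info(f'{label} weights detected: {model_file}\n')
    return StarcoderCppModel().from_pretrained(model_file)


def starcodercpp_loader(model_name):
    return _ggml_coder_loader(model_name, '*starcoder*ggml*.bin', 'starcoder.cpp')


def starchatcpp_loader(model_name):
    return _ggml_coder_loader(model_name, '*starchat*ggml*.bin', 'starchat.cpp')
```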


def GPTQ_loader(model_name):

# Monkey patch
12 changes: 12 additions & 0 deletions modules/models_settings.py
@@ -24,6 +24,18 @@ def infer_loader(model_name):
        loader = None
    elif Path(f'{shared.args.model_dir}/{model_name}/quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
        loader = 'AutoGPTQ'
    elif len(list(path_to_model.glob('*starcoder*ggml*.bin'))) > 0:
        loader = 'starcoder'
    elif re.match('.*starcoder.*ggml.*\.bin', model_name.lower()):
        loader = 'starcoder'
    elif len(list(path_to_model.glob('*starchat*ggml*.bin'))) > 0:
        loader = 'starchat'
    elif re.match('.*starchat.*ggml.*\.bin', model_name.lower()):
        loader = 'starchat'
    elif len(list(path_to_model.glob('*wizardcoder*ggml*.bin'))) > 0:
        loader = 'starcoder'
    elif re.match('.*wizardcoder.*ggml.*\.bin', model_name.lower()):
        loader = 'starcoder'
    elif len(list(path_to_model.glob('*ggml*.bin'))) > 0:
        loader = 'llama.cpp'
    elif re.match('.*ggml.*\.bin', model_name.lower()):
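
Note that the new starcoder/starchat/wizardcoder checks are placed before the
generic `*ggml*.bin` fallback, so these files are not picked up by the
llama.cpp loader. A minimal sketch of the same precedence (standalone, with
hypothetical filenames, for illustration only):

```python
import re

# Patterns in the same order as infer_loader above; order matters because
# every starcoder/starchat GGML file would also match the generic pattern.
patterns = [
    (r'.*starcoder.*ggml.*\.bin', 'starcoder'),
    (r'.*starchat.*ggml.*\.bin', 'starchat'),
    (r'.*wizardcoder.*ggml.*\.bin', 'starcoder'),
    (r'.*ggml.*\.bin', 'llama.cpp'),
]

def guess_loader(name):
    for pattern, loader in patterns:
        if re.match(pattern, name.lower()):
            return loader
    return 'Transformers'

print(guess_loader('starcoder-ggml-q4_0.bin'))       # starcoder
print(guess_loader('starchat-alpha-ggml-q5_1.bin'))  # starchat
print(guess_loader('WizardCoder-ggml-q4_0.bin'))     # starcoder
print(guess_loader('llama-7b.ggmlv3.q4_0.bin'))      # llama.cpp
```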
2 changes: 1 addition & 1 deletion modules/shared.py
@@ -36,7 +36,7 @@
    'autoload_model': True,
    'max_new_tokens': 200,
    'max_new_tokens_min': 1,
    'max_new_tokens_max': 2000,
    'max_new_tokens_max': 8000,
    'seed': -1,
    'character': 'None',
    'name1': 'You',
59 changes: 59 additions & 0 deletions modules/starcoder_model.py
@@ -0,0 +1,59 @@
from ctransformers import AutoModelForCausalLM
from ctransformers import AutoConfig

from modules import shared
from modules.callbacks import Iteratorize


class StarcoderCppModel:
    def __init__(self):
        pass

    @classmethod
    def from_pretrained(cls, path):
        result = cls()

        # Pass the web UI's thread and GPU-layer settings through to ctransformers.
        config = AutoConfig.from_pretrained(
            str(path),
            stop=["<|end|>"],
            threads=shared.args.threads,
            gpu_layers=shared.args.n_gpu_layers
        )
        # Store the model on the instance rather than the class so that
        # loading a new model does not clobber a previously loaded one.
        result.model = AutoModelForCausalLM.from_pretrained(
            str(path), model_type="starcoder", config=config
        )
        # The same object acts as both model and tokenizer.
        return result, result

    def encode(self, string, **kwargs):
        return self.model.tokenize(string)

    def decode(self, ids):
        return self.model.detokenize(ids)

    def generate(self, prompt, state, callback=None):
        prompt = prompt if type(prompt) is str else prompt.decode()
        generator = self.model._stream(
            prompt=prompt,
            max_new_tokens=state['max_new_tokens'],
            temperature=state['temperature'],
            top_p=state['top_p'],
            top_k=state['top_k'],
            repetition_penalty=state['repetition_penalty'],
            threads=shared.args.threads
        )

        # Stream tokens, invoking the callback for each one, and return the
        # concatenated output.
        output = ""
        for token in generator:
            if callback:
                callback(token)
            output += token
        return output

    def generate_with_streaming(self, *args, **kwargs):
        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
            reply = ''
            for token in generator:
                reply += token
                yield reply
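
For reference, a minimal usage sketch of the class above (assuming the
web UI's command-line flags have been parsed into `shared.args`, and using
a hypothetical path to a converted GGML checkpoint):

```python
from modules.starcoder_model import StarcoderCppModel

# from_pretrained returns the same object as both model and tokenizer.
model, tokenizer = StarcoderCppModel.from_pretrained('models/starcoder-ggml-q4_0.bin')

state = {
    'max_new_tokens': 64,
    'temperature': 0.2,
    'top_p': 0.95,
    'top_k': 40,
    'repetition_penalty': 1.1,
}

# Stream tokens as they are produced; the callback receives each new token.
reply = model.generate('def fibonacci(n):', state, callback=print)
print(reply)
```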
6 changes: 3 additions & 3 deletions modules/text_generation.py
@@ -35,7 +35,7 @@ def get_max_prompt_length(state):


def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel']:
    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'StarcoderCppModel']:
        input_ids = shared.tokenizer.encode(str(prompt))
        input_ids = np.array(input_ids).reshape(1, len(input_ids))
        return input_ids
@@ -50,7 +50,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
    if truncation_length is not None:
        input_ids = input_ids[:, -truncation_length:]

    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu:
    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'StarcoderCppModel'] or shared.args.cpu:
        return input_ids
    elif shared.args.flexgen:
        return input_ids.numpy()
@@ -179,7 +179,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False):
        yield ''
        return

    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']:
    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'StarcoderCppModel']:
        generate_func = generate_reply_custom
    elif shared.args.flexgen:
        generate_func = generate_reply_flexgen
1 change: 1 addition & 0 deletions requirements.txt
@@ -25,3 +25,4 @@ https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu
https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/jllllll/exllama/releases/download/0.0.3/exllama-0.0.3+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
https://github.com/jllllll/exllama/releases/download/0.0.3/exllama-0.0.3+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
ctransformers==0.2.10
2 changes: 1 addition & 1 deletion server.py
@@ -202,7 +202,7 @@ def create_model_menus():

with gr.Row():
with gr.Column():
shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "AutoGPTQ", "GPTQ-for-LLaMa", "ExLlama", "ExLlama_HF", "llama.cpp"], value=None)
shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "AutoGPTQ", "GPTQ-for-LLaMa", "ExLlama", "ExLlama_HF", "llama.cpp", "starcoder"], value=None)
with gr.Box():
with gr.Row():
with gr.Column():