Add exllama2 dynamic generator support
epicfilemcnulty committed Jun 20, 2024
1 parent 2398125 commit dde25e0
Showing 4 changed files with 47 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/lilush_llm_backend/__init__.py
@@ -1,6 +1,6 @@
""" Lilush LLM Proxy """

__version__ = "0.1.6"
__version__ = "0.1.8"

from .loader import *
from .generation import *
22 changes: 21 additions & 1 deletion src/lilush_llm_backend/generation.py
@@ -16,7 +16,7 @@ def Exl2Query(query, sampler, tokenizer, generator, lora):
settings.min_p = sampler['min_p']
settings.token_repetition_penalty = sampler['repetition_penalty']

-    input_ids = tokenizer.encode(query, add_bos = sampler['add_bos'], add_eos = sampler['add_eos'], encode_special_tokens = sampler['encode_special_tokens'])
+    input_ids = tokenizer.encode(query, add_bos = sampler['add_bos'], encode_special_tokens = sampler['encode_special_tokens'])
prompt_tokens = input_ids.shape[-1]

generator.set_stop_conditions(stop_conditions)
@@ -33,6 +33,26 @@ def Exl2Query(query, sampler, tokenizer, generator, lora):
stop_reason = "eos" if eos else "length"
return new_text, prompt_tokens, generated_tokens, stop_reason

+def Exl2QueryDynamic(query, sampler, generator):
+
+    stop_conditions = [generator.tokenizer.eos_token_id]  # no tokenizer argument here, so use the generator's own tokenizer
+    if sampler['stop_conditions']:
+        for tid in sampler['stop_conditions']:
+            stop_conditions = stop_conditions + [ tid ]
+
+    settings = ExLlamaV2Sampler.Settings(temperature = sampler['temperature'], top_k = sampler['top_k'], top_p = sampler['top_p'], min_p = sampler['min_p'], token_repetition_penalty = sampler['repetition_penalty'])
+
+    output = generator.generate(
+        prompt = query,
+        gen_settings = settings,
+        max_new_tokens = sampler['max_new_tokens'],
+        add_bos = sampler['add_bos'],
+        encode_special_tokens = sampler['encode_special_tokens'],
+        stop_conditions = stop_conditions,
+        completion_only = True
+    )
+    return output

def TfQuery(query, sampler, model, tokenizer):

input_ids = tokenizer(query, return_tensors='pt').input_ids.to('cuda')
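For context, the new Exl2QueryDynamic path reads the same flat sampler dictionary as the existing streaming path. Below is a minimal sketch of calling it directly, assuming a model was loaded with the dynamic generator enabled; the prompt and all sampler values are illustrative, not defaults taken from the repository.

# Sketch only: `loaded` is assumed to come from LoadExl2Model(..., dynamic=True).
sampler = {
    "temperature": 0.7,
    "top_k": 50,
    "top_p": 0.9,
    "min_p": 0.05,
    "repetition_penalty": 1.1,
    "max_new_tokens": 256,
    "add_bos": True,
    "encode_special_tokens": True,
    "stop_conditions": [],  # optional extra stop token ids
}
completion = Exl2QueryDynamic("Write a haiku about GPUs.", sampler, loaded["generator"])
print(completion)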
20 changes: 13 additions & 7 deletions src/lilush_llm_backend/loader.py
@@ -13,27 +13,33 @@
)
from exllamav2.generator import (
ExLlamaV2StreamingGenerator,
+    ExLlamaV2DynamicGenerator,
ExLlamaV2Sampler
)

-def LoadExl2Model(model_dir, context_length=None, lora_dir=None):
+def LoadExl2Model(model_dir, context_length=None, cache_size=None, dynamic=False, lora_dir=None):
# Initialize model and cache
-    config = ExLlamaV2Config()
-    config.model_dir = model_dir
-    config.prepare()
+    config = ExLlamaV2Config(model_dir)
+    model = ExLlamaV2(config)
if context_length is not None and context_length != 0:
config.max_seq_len = context_length

-    model = ExLlamaV2(config)
-    cache = ExLlamaV2Cache_Q4(model, lazy = True)
+    if cache_size is None:
+        c_size = config.max_seq_len
+    else:
+        c_size = cache_size
+    cache = ExLlamaV2Cache_Q4(model, max_seq_len = c_size, lazy = True)
print("Loading model: " + model_dir)
model.load_autosplit(cache)
tokenizer = ExLlamaV2Tokenizer(config)
lora = None
if lora_dir is not None:
lora = ExLlamaV2Lora.from_directory(model, lora_dir)
# Initialize generator
-    generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
+    if dynamic:
+        generator = ExLlamaV2DynamicGenerator(model, cache, tokenizer)
+    else:
+        generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
# Make sure CUDA is initialized so we can measure performance
generator.warmup()
return { "model": model, "generator": generator, "tokenizer": tokenizer, "cache": cache, "lora": lora, "type": "exl2" }
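To illustrate the new loader options, here is a sketch of loading a model with the default streaming generator versus the new dynamic generator. The model path and size values are placeholders, not values from the repository.

# Default: streaming generator, Q4 cache sized to the model's max_seq_len.
streaming = LoadExl2Model("/models/my-exl2-model")

# Dynamic generator with an explicit, larger cache (example value), which the
# dynamic generator can use to hold more than one request's context at a time.
dynamic = LoadExl2Model("/models/my-exl2-model", context_length=8192,
                        cache_size=32768, dynamic=True)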
15 changes: 12 additions & 3 deletions src/lilush_llm_backend/server.py
@@ -8,7 +8,7 @@
bottle.BaseRequest.MEMFILE_MAX = 1024 * 1024 * 10

from .loader import LoadExl2Model, LoadTfModel, LoadMambaModel
-from .generation import Exl2Query, TfQuery, MambaQuery
+from .generation import Exl2Query, Exl2QueryDynamic, TfQuery, MambaQuery

models = {}
app = Bottle()
@@ -30,9 +30,12 @@ def load_model():
response.status = 400
return {"error": "model_alias is required"}
context_length = data.get('context_length')
+    cache_size = data.get('cache_size')
+    dynamic = data.get('dynamic', False)

lora_dir = data.get('lora_dir')
if model_type == "exl2":
-        models[model_alias] = LoadExl2Model(model_dir, context_length, lora_dir)
+        models[model_alias] = LoadExl2Model(model_dir=model_dir, context_length=context_length, lora_dir=lora_dir, cache_size=cache_size, dynamic=dynamic)
return {"message": "model loaded"}
if model_type == "tf":
models[model_alias] = LoadTfModel(model_dir, context_length, lora_dir, trust_remote_code)
@@ -89,7 +92,13 @@ def complete():

stop_reason = None
if model_type == "exl2":
-        new_text, prompt_tokens, generated_tokens, stop_reason = Exl2Query(query, sampler, models[model_alias]["tokenizer"], models[model_alias]["generator"], models[model_alias]["lora"])
+        dynamic_generator = data.get('dynamic', False)
+        if dynamic_generator:
+            new_text = Exl2QueryDynamic(query, sampler, models[model_alias]["generator"])
+            prompt_tokens = 0
+            generated_tokens = 0
+        else:
+            new_text, prompt_tokens, generated_tokens, stop_reason = Exl2Query(query, sampler, models[model_alias]["tokenizer"], models[model_alias]["generator"], models[model_alias]["lora"])
if model_type == "tf":
new_text, prompt_tokens, generated_tokens = TfQuery(query, sampler, models[model_alias]["model"], models[model_alias]["tokenizer"])
if model_type == "mamba":
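For reference, a hypothetical load request body exercising the new fields is sketched below; the alias and path are placeholders, and the model_dir key name is assumed since the line that reads it lies outside this diff. A completion request can additionally pass a top-level "dynamic": true flag to route through Exl2QueryDynamic, in which case prompt_tokens and generated_tokens are reported as 0.

load_request = {
    "model_type": "exl2",
    "model_alias": "my-exl2",              # placeholder alias
    "model_dir": "/models/my-exl2-model",  # assumed key name, placeholder path
    "context_length": 8192,
    "cache_size": 32768,   # new: explicit Q4 cache size
    "dynamic": True,       # new: load an ExLlamaV2DynamicGenerator
    "lora_dir": None,
}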
