Commit b8e95c6
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
0 parents
Showing 9 changed files with 1,259 additions and 0 deletions.
@@ -0,0 +1,9 @@
__pycache__/
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
Large diffs are not rendered by default.
@@ -0,0 +1,14 @@
[build-system]
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"

[project]
name = "lilush_llm_proxy"
authors = [{name = "Vladimir Zorin", email = "[email protected]"}]
license = {file = "LICENSE"}
classifiers = ["License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)"]
dynamic = ["version", "description"]
dependencies = [ 'torch', 'numpy', 'bltzr', 'peft', 'bottle' ]

[project.urls]
Home = "https://github.com/epicfilemcnulty/lilush-llm-proxy"
@@ -0,0 +1,9 @@
import argparse
from lilush_llm_proxy import Serve

parser = argparse.ArgumentParser()
parser.add_argument("--ip", type=str, default="127.0.0.1", required=False)
parser.add_argument("-p", "--port", type=int, default=8013, required=False)
args = parser.parse_args()

Serve(args.ip, args.port)
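Note: the entry script above only parses a listen address and port and delegates to Serve. Starting the proxy from your own code should look roughly like the following sketch, assuming Serve(ip, port) blocks and runs the HTTP server implemented in lilush_llm_proxy.server (not rendered in this commit view):

# Hypothetical programmatic start, equivalent to running the script above
# with --ip 0.0.0.0 -p 8013; Serve is assumed to block while serving.
from lilush_llm_proxy import Serve

Serve("0.0.0.0", 8013)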
@@ -0,0 +1,8 @@
""" Lilush LLM Proxy """

__version__ = "0.1.0"

from .loader import *
from .generation import *
from .server import Serve
from .mixin import *
@@ -0,0 +1,86 @@
import torch
from exllamav2.generator import ExLlamaV2Sampler
from transformers import GenerationConfig

def Exl2Query(query, sampler, tokenizer, generator, lora):

    stop_conditions = [tokenizer.eos_token_id]
    if sampler['stop_conditions']:
        for tid in sampler['stop_conditions']:
            stop_conditions = stop_conditions + [ tid ]

    settings = ExLlamaV2Sampler.Settings()
    settings.temperature = sampler['temperature']
    settings.top_k = sampler['top_k']
    settings.top_p = sampler['top_p']
    settings.min_p = sampler['min_p']
    settings.token_repetition_penalty = sampler['repetition_penalty']

    input_ids = tokenizer.encode(query, add_bos = sampler['add_bos'], add_eos = sampler['add_eos'], encode_special_tokens = sampler['encode_special_tokens'])
    prompt_tokens = input_ids.shape[-1]

    generator.set_stop_conditions(stop_conditions)
    generator.begin_stream(input_ids, settings, loras = lora)
    generated_tokens = 0
    new_text = ""
    while True:
        chunk, eos, tokens = generator.stream()
        generated_tokens += 1
        new_text += chunk
        if eos or generated_tokens == sampler['max_new_tokens']:
            break

    stop_reason = "eos" if eos else "length"
    return new_text, prompt_tokens, generated_tokens, stop_reason

def TfQuery(query, sampler, model, tokenizer):

    input_ids = tokenizer(query, return_tensors='pt').input_ids.to('cuda')
    prompt_tokens = len(input_ids[0])
    gen_cfg = GenerationConfig.from_model_config(model.config)
    gen_cfg.max_new_tokens = sampler['max_new_tokens']
    gen_cfg.top_p = sampler['top_p']
    gen_cfg.top_k = sampler['top_k']
    gen_cfg.repetition_penalty = sampler['repetition_penalty']
    gen_cfg.temperature = sampler['temperature']
    gen_cfg.do_sample = True
    gen_cfg.num_beams = 1
    gen_cfg.num_return_sequences = 1
    gen_cfg.remove_invalid_values = True
    outputs = model.generate(
        inputs=input_ids,
        generation_config = gen_cfg,
    )
    new_text = tokenizer.batch_decode(outputs[:, input_ids.shape[1]:], skip_special_tokens=True)[0]
    new_tokens = len(outputs[0]) - prompt_tokens
    return new_text, prompt_tokens, new_tokens

def MambaQuery(query, sampler, model, tokenizer):

    sc = [ tokenizer.get_token_id('<PAD>') ]
    if sampler['stop_conditions']:
        for spt in sampler['stop_conditions']:
            sc = sc + [ tokenizer.get_token_id(spt) ]

    tokens = tokenizer.encode(query)

    input_ids = torch.LongTensor(tokens).unsqueeze(0).cuda()
    prompt_tokens = len(input_ids[0])

    output_ids = model.generate(
        input_ids=input_ids,
        max_length=prompt_tokens + sampler['max_new_tokens'],
        temperature=sampler['temperature'],
        min_p=sampler['min_p'],
        repetition_penalty = sampler['repetition_penalty'],
        cg=True,
        eos_token_ids=sc
    )
    gen_text = tokenizer.decode(output_ids[0], hide_special_tokens=sampler['hide_special_tokens'])
    old_text = ""
    for msg in query:
        if 'content' in msg:
            old_text += msg['content']
    new_text = gen_text.replace(old_text, "")
    new_tokens = len(output_ids[0]) - prompt_tokens
    return new_text, prompt_tokens, new_tokens
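Note: the three query helpers above take their decoding parameters from a plain `sampler` dict rather than keyword arguments. A minimal sketch of such a dict for Exl2Query, built only from the keys the function actually reads (the concrete values are illustrative, not taken from the repository):

# Illustrative sampler settings; every key below is one Exl2Query accesses.
sampler = {
    "temperature": 0.7,
    "top_k": 40,
    "top_p": 0.9,
    "min_p": 0.05,
    "repetition_penalty": 1.1,
    "max_new_tokens": 512,
    "stop_conditions": [],        # extra stop token ids, added on top of EOS
    "add_bos": True,
    "add_eos": False,
    "encode_special_tokens": True,
}

TfQuery reads a subset of the same keys (temperature, top_k, top_p, repetition_penalty, max_new_tokens). MambaQuery additionally expects 'hide_special_tokens', and its 'stop_conditions' entries are token strings resolved through tokenizer.get_token_id() rather than raw ids.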
@@ -0,0 +1,63 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from bltzr import Tokenizer
from peft import PeftModel
from .mixin import GenerationMixin

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Tokenizer,
    ExLlamaV2Lora,
)
from exllamav2.generator import (
    ExLlamaV2StreamingGenerator,
    ExLlamaV2Sampler
)

def LoadExl2Model(model_dir, context_length=None, lora_dir=None):
    # Initialize model and cache
    config = ExLlamaV2Config()
    config.model_dir = model_dir
    config.prepare()
    if context_length is not None and context_length != 0:
        config.max_seq_len = context_length

    model = ExLlamaV2(config)
    print("Loading model: " + model_dir)
    model.load()
    tokenizer = ExLlamaV2Tokenizer(config)
    cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded)
    lora = None
    if lora_dir is not None:
        lora = ExLlamaV2Lora.from_directory(model, lora_dir)
    # Initialize generator
    generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
    # Make sure CUDA is initialized so we can measure performance
    generator.warmup()
    return { "model": model, "generator": generator, "tokenizer": tokenizer, "cache": cache, "lora": lora, "type": "exl2" }

def LoadTfModel(model_dir, context_length=None, lora_dir=None, trust_remote_code=False):
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=trust_remote_code)
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model = AutoModelForCausalLM.from_pretrained(model_dir, device_map='auto', quantization_config=nf4_config, trust_remote_code=trust_remote_code, attn_implementation="flash_attention_2")
    print(model.generation_config)
    model.eval()
    if lora_dir is not None:
        model = PeftModel.from_pretrained(model, lora_dir)

    return { "model": model, "tokenizer": tokenizer, "type": "tf" }

class CustomAutoModelForCausalLM(AutoModelForCausalLM, GenerationMixin):
    pass

def LoadMambaModel(model_dir):
    tokenizer = Tokenizer()
    model = CustomAutoModelForCausalLM.from_pretrained(model_dir)
    return { "model": model, "tokenizer": tokenizer, "type": "mamba" }