Initial commit
epicfilemcnulty committed May 12, 2024
0 parents commit b8e95c6
Showing 9 changed files with 1,259 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
__pycache__/
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
619 changes: 619 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

14 changes: 14 additions & 0 deletions pyproject.toml
@@ -0,0 +1,14 @@
[build-system]
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"

[project]
name = "lilush_llm_proxy"
authors = [{name = "Vladimir Zorin", email = "[email protected]"}]
license = {file = "LICENSE"}
classifiers = ["License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)"]
dynamic = ["version", "description"]
dependencies = [ 'torch', 'numpy', 'bltzr', 'peft', 'bottle' ]

[project.urls]
Home = "https://github.com/epicfilemcnulty/lilush-llm-proxy"
9 changes: 9 additions & 0 deletions serve.py
@@ -0,0 +1,9 @@
import argparse
from lilush_llm_proxy import Serve

# Address and port the proxy listens on; the defaults bind to localhost only.
parser = argparse.ArgumentParser()
parser.add_argument("--ip", type=str, default="127.0.0.1", required=False)
parser.add_argument("-p", "--port", type=int, default=8013, required=False)
args = parser.parse_args()

Serve(args.ip, args.port)
8 changes: 8 additions & 0 deletions src/lilush_llm_proxy/__init__.py
@@ -0,0 +1,8 @@
""" Lilush LLM Proxy """

__version__ = "0.1.0"

from .loader import *
from .generation import *
from .server import Serve
from .mixin import *
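Because loader.py and generation.py are pulled in with wildcard imports, the package's public surface includes at least the following names from the files in this commit (server.py and mixin.py are not shown in this view):

from lilush_llm_proxy import (
    Serve,            # HTTP entry point (server.py)
    LoadExl2Model,    # loader.py
    LoadTfModel,
    LoadMambaModel,
    Exl2Query,        # generation.py
    TfQuery,
    MambaQuery,
)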
86 changes: 86 additions & 0 deletions src/lilush_llm_proxy/generation.py
@@ -0,0 +1,86 @@
import torch
from exllamav2.generator import ExLlamaV2Sampler
from transformers import GenerationConfig

def Exl2Query(query, sampler, tokenizer, generator, lora):

    # Always stop on the model's EOS token, plus any extra stop token ids supplied by the caller.
    stop_conditions = [tokenizer.eos_token_id]
    if sampler['stop_conditions']:
        for tid in sampler['stop_conditions']:
            stop_conditions.append(tid)

    settings = ExLlamaV2Sampler.Settings()
    settings.temperature = sampler['temperature']
    settings.top_k = sampler['top_k']
    settings.top_p = sampler['top_p']
    settings.min_p = sampler['min_p']
    settings.token_repetition_penalty = sampler['repetition_penalty']

    input_ids = tokenizer.encode(query, add_bos = sampler['add_bos'], add_eos = sampler['add_eos'], encode_special_tokens = sampler['encode_special_tokens'])
    prompt_tokens = input_ids.shape[-1]

    # Stream the completion chunk by chunk until EOS or the token budget is hit.
    generator.set_stop_conditions(stop_conditions)
    generator.begin_stream(input_ids, settings, loras = lora)
    generated_tokens = 0
    new_text = ""
    while True:
        chunk, eos, tokens = generator.stream()
        generated_tokens += 1
        new_text += chunk
        if eos or generated_tokens == sampler['max_new_tokens']:
            break

    stop_reason = "eos" if eos else "length"
    return new_text, prompt_tokens, generated_tokens, stop_reason

def TfQuery(query, sampler, model, tokenizer):

    input_ids = tokenizer(query, return_tensors='pt').input_ids.to('cuda')
    prompt_tokens = len(input_ids[0])
    # Start from the model's own generation config and override the sampling knobs we expose.
    gen_cfg = GenerationConfig.from_model_config(model.config)
    gen_cfg.max_new_tokens = sampler['max_new_tokens']
    gen_cfg.top_p = sampler['top_p']
    gen_cfg.top_k = sampler['top_k']
    gen_cfg.repetition_penalty = sampler['repetition_penalty']
    gen_cfg.temperature = sampler['temperature']
    gen_cfg.do_sample = True
    gen_cfg.num_beams = 1
    gen_cfg.num_return_sequences = 1
    gen_cfg.remove_invalid_values = True
    outputs = model.generate(
        inputs=input_ids,
        generation_config=gen_cfg,
    )
    # Decode only the newly generated tokens, skipping the prompt.
    new_text = tokenizer.batch_decode(outputs[:, input_ids.shape[1]:], skip_special_tokens=True)[0]
    new_tokens = len(outputs[0]) - prompt_tokens
    return new_text, prompt_tokens, new_tokens

def MambaQuery(query, sampler, model, tokenizer):

    # Stop on the <PAD> token, plus any extra stop tokens supplied by the caller (by token name).
    sc = [tokenizer.get_token_id('<PAD>')]
    if sampler['stop_conditions']:
        for spt in sampler['stop_conditions']:
            sc.append(tokenizer.get_token_id(spt))

    tokens = tokenizer.encode(query)

    input_ids = torch.LongTensor(tokens).unsqueeze(0).cuda()
    prompt_tokens = len(input_ids[0])

    output_ids = model.generate(
        input_ids=input_ids,
        max_length=prompt_tokens + sampler['max_new_tokens'],
        temperature=sampler['temperature'],
        min_p=sampler['min_p'],
        repetition_penalty=sampler['repetition_penalty'],
        cg=True,
        eos_token_ids=sc
    )
    gen_text = tokenizer.decode(output_ids[0], hide_special_tokens=sampler['hide_special_tokens'])
    # The decoded text includes the prompt, so strip the original message contents to keep only the completion.
    old_text = ""
    for msg in query:
        if 'content' in msg:
            old_text += msg['content']
    new_text = gen_text.replace(old_text, "")
    new_tokens = len(output_ids[0]) - prompt_tokens
    return new_text, prompt_tokens, new_tokens
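All three query functions read their settings from a plain sampler dictionary rather than keyword arguments. A minimal sketch of such a dictionary, with keys taken from the lookups above (the values are illustrative, not defaults shipped with the project):

sampler = {
    "temperature": 0.7,
    "top_k": 40,
    "top_p": 0.9,
    "min_p": 0.05,
    "repetition_penalty": 1.1,
    "max_new_tokens": 512,
    "stop_conditions": [],            # extra stop token ids (Exl2Query) or token names (MambaQuery)
    "add_bos": True,                  # Exl2Query only
    "add_eos": False,                 # Exl2Query only
    "encode_special_tokens": True,    # Exl2Query only
    "hide_special_tokens": True,      # MambaQuery only
}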
63 changes: 63 additions & 0 deletions src/lilush_llm_proxy/loader.py
@@ -0,0 +1,63 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from bltzr import Tokenizer
from peft import PeftModel
from .mixin import GenerationMixin

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Tokenizer,
    ExLlamaV2Lora,
)
from exllamav2.generator import (
    ExLlamaV2StreamingGenerator,
    ExLlamaV2Sampler
)

def LoadExl2Model(model_dir, context_length=None, lora_dir=None):
    # Initialize model and cache
    config = ExLlamaV2Config()
    config.model_dir = model_dir
    config.prepare()
    if context_length is not None and context_length != 0:
        config.max_seq_len = context_length

    model = ExLlamaV2(config)
    print("Loading model: " + model_dir)
    model.load()
    tokenizer = ExLlamaV2Tokenizer(config)
    cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded)
    lora = None
    if lora_dir is not None:
        lora = ExLlamaV2Lora.from_directory(model, lora_dir)
    # Initialize generator
    generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
    # Make sure CUDA is initialized so we can measure performance
    generator.warmup()
    return { "model": model, "generator": generator, "tokenizer": tokenizer, "cache": cache, "lora": lora, "type": "exl2" }

def LoadTfModel(model_dir, context_length=None, lora_dir=None, trust_remote_code=False):
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=trust_remote_code)
    # Load the base model in 4-bit NF4 with bfloat16 compute to cut VRAM usage.
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model = AutoModelForCausalLM.from_pretrained(model_dir, device_map='auto', quantization_config=nf4_config, trust_remote_code=trust_remote_code, attn_implementation="flash_attention_2")
    print(model.generation_config)
    model.eval()
    # Optionally attach a LoRA adapter on top of the quantized base model.
    if lora_dir is not None:
        model = PeftModel.from_pretrained(model, lora_dir)

    return { "model": model, "tokenizer": tokenizer, "type": "tf" }

# Combine the HF auto class with the project's GenerationMixin (mixin.py); used below to load Mamba models.
class CustomAutoModelForCausalLM(AutoModelForCausalLM, GenerationMixin):
    pass

def LoadMambaModel(model_dir):
    tokenizer = Tokenizer()
    model = CustomAutoModelForCausalLM.from_pretrained(model_dir)
    return { "model": model, "tokenizer": tokenizer, "type": "mamba" }