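"""llminvoker.py

Fans a chat conversation out to one or more LLM backends (OpenAI, Anthropic,
Google Gemini, Hugging Face inference endpoints, Ollama, and local DirectML
models) and collects each model's response, plus GPU time and token
throughput for the local models.

Expects a local `keys` module exposing OPENAI_API_KEY, ANTHROPIC_API_KEY,
GOOGLE_API_KEY, and HUGGINGFACE_API_KEY.
"""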
import concurrent.futures

from langchain.chains import ConversationChain
from langchain.memory import ConversationTokenBufferMemory
from langchain.prompts.prompt import PromptTemplate
from langchain_anthropic import ChatAnthropic
from langchain_community.chat_models import ChatOllama
from langchain_community.llms import HuggingFaceTextGenInference
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI, HarmBlockThreshold, HarmCategory
from langchain_openai import ChatOpenAI

import keys
from gputimekeeper import GPUTimeKeeper
from mylocalmodels import LocalModelInterface


def remove_prefix(text, prefix="Ollama: "):
    if text.startswith(prefix):
        return text[len(prefix):]
    return text
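
# Example: remove_prefix("Ollama: llama3") -> "llama3"; strings without the
# prefix are returned unchanged.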


def clean_chat_history(chat_history, model_specs):
    """Remove the LLM friendly-name prefixes (inserted for readability) from
    the chat history; returns a list of (human, ai) 2-tuples."""
    new_chat_history = []
    for message, response in chat_history:
        for model_name, specs in model_specs.items():
            prefix = specs['friendly_name'] + ": "
            if response.startswith(prefix):
                response = response[len(prefix):]
                break
        new_chat_history.append((message, response))
    return new_chat_history
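
# Example (hypothetical spec): with
#   model_specs = {"gpt-4-turbo-preview": {"friendly_name": "GPT-4"}}
# clean_chat_history([("hi", "GPT-4: hello")], model_specs) returns
# [("hi", "hello")].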


async def execute_llms(model_names, model_specs, temperature, sys_msg, cleaned_chat_histories):
    """Run each requested model against its own cleaned chat history.

    cleaned_chat_histories is a list of lists; each element of the outer list
    corresponds (in order) to an element of model_names, and each inner list
    contains (human, ai) 2-tuples.
    """
    futures = {}
    first_local_model_handled = False
    # Remote models all run concurrently in the thread pool, but only the first
    # local model joins them; the rest run serially afterwards, presumably so
    # local models never contend for the GPU at the same time.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for i, model_name in enumerate(model_names):
            is_local = model_specs[model_name]["is_local"]
            if not is_local or not first_local_model_handled:
                args = (
                    cleaned_chat_histories[i],
                    model_name,
                    model_specs[model_name]['maximum_context_length_tokens'],
                    model_specs[model_name]['maximum_output_tokens'],
                    sys_msg,
                    temperature,
                )
                futures[model_name] = executor.submit(chat_bot_backend, *args)
                if is_local:
                    first_local_model_handled = True
    results = {model_name: future.result() for model_name, future in futures.items()}
    # Any local models that were skipped above now run one at a time.
    for i, model_name in enumerate(model_names):
        if model_specs[model_name]["is_local"] and model_name not in results:
            args = (
                cleaned_chat_histories[i],
                model_name,
                model_specs[model_name]['maximum_context_length_tokens'],
                model_specs[model_name]['maximum_output_tokens'],
                sys_msg,
                temperature,
            )
            results[model_name] = chat_bot_backend(*args)
    # chat_bot_backend returns (response_text, gpu_time_ms, token_count);
    # GPU time and token count are only meaningful for local models.
    results_simple = {model_name: results[model_name][0] for model_name in results}
    gpuTimes = {
        model_specs[model_name]['friendly_name']: results[model_name][1]
        for model_name in results
        if model_specs[model_name]["is_local"]
    }
    tokens = {
        model_specs[model_name]['friendly_name']: results[model_name][2]
        for model_name in results
        if model_specs[model_name]["is_local"]
    }
    tokens_second = {
        name: round(tokens[name] / (gpuTimes[name] / 1000), 2)
        for name in tokens
        if name in gpuTimes and gpuTimes[name] != 0
    }
    # gpuTimes/tokens/tokens_second are keyed by friendly name, so the lookups
    # below must use the friendly name, not the model_specs key.
    formatted_models = {}
    for model_name, model_info in model_specs.items():
        fname = model_info['friendly_name']
        if model_info['is_used'] and model_info['is_local'] and fname in tokens_second:
            formatted_models[fname] = (
                f"{tokens[fname]} tokens, {gpuTimes[fname] / 1000:.2f} seconds, "
                f"{tokens_second[fname]} tokens/second"
            )
    formatted_labels = [f"{key}: {value}" for key, value in formatted_models.items()]
    formatted_label_str = "\n".join(formatted_labels)
    return results_simple, formatted_label_str
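
# Hypothetical usage sketch (spec values assumed; the keys match what this
# module reads):
#   model_specs = {
#       "gpt-4-turbo-preview": {
#           "friendly_name": "GPT-4", "is_local": False, "is_used": True,
#           "maximum_context_length_tokens": 4000, "maximum_output_tokens": 2000,
#       },
#   }
#   responses, gpu_label = await execute_llms(
#       ["gpt-4-turbo-preview"], model_specs, 0.5,
#       "You are a helpful assistant.", [[("Hello", "")]])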


def chat_bot_backend(conversation, model_name, max_context_tokens=2000,
                     max_output_tokens=2000, assistantPrompt=None, temperatureIn=.5):
    GPU_UTILIZATION_THRESHOLD = .8
    # Define the base template. Note that {systemPrompt} is filled in below,
    # while the doubled braces survive .format() as LangChain placeholders.
    base_template = """{systemPrompt}
Current conversation:
{{history}}
Human: {{input}}
AI:"""
    # If assistantPrompt is provided, use it as the system prompt; otherwise
    # fall back to a default.
    if assistantPrompt:
        system_prompt_content = assistantPrompt
    else:
        system_prompt_content = "This is a conversation with an AI."  # Default prompt, can be adjusted.
    # Integrate the actual system prompt content into the template.
    template = base_template.format(systemPrompt=system_prompt_content)
    # Initialize the prompt template with the correct variables and the formatted template.
    PROMPT = PromptTemplate(input_variables=["history", "input"], template=template)
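    # With the default system prompt, the rendered template reads:
    #   This is a conversation with an AI.
    #   Current conversation:
    #   {history}
    #   Human: {input}
    #   AI: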
    if model_name.startswith("text-") or model_name.startswith("gpt-"):
        llm = ChatOpenAI(openai_api_key=keys.OPENAI_API_KEY, model_name=model_name,
                         max_tokens=max_output_tokens, temperature=temperatureIn)
    elif model_name.startswith("claude-"):
        llm = ChatAnthropic(model=model_name, anthropic_api_key=keys.ANTHROPIC_API_KEY,
                            max_tokens_to_sample=max_output_tokens, temperature=temperatureIn)
    elif model_name.startswith("gemini-"):
        safetySettings = {
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        }
        llm = ChatGoogleGenerativeAI(model=model_name, google_api_key=keys.GOOGLE_API_KEY,
                                     max_output_tokens=max_output_tokens,
                                     safety_settings=safetySettings, temperature=temperatureIn)
    elif model_name.startswith("Mixtral"):
        ENDPOINT_URL = 'https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1'
        llm = HuggingFaceTextGenInference(
            inference_server_url=ENDPOINT_URL,
            max_new_tokens=max_output_tokens,
            top_k=50,
            temperature=0.1,  # pinned here rather than taking temperatureIn
            repetition_penalty=1.03,
            server_kwargs={
                "headers": {
                    "Authorization": f"Bearer {keys.HUGGINGFACE_API_KEY}",
                    "Content-Type": "application/json",
                }
            },
        )
    elif model_name.startswith("OpenELM"):
        # NOTE: this branch currently reuses the Mixtral inference endpoint;
        # point ENDPOINT_URL at an OpenELM deployment if one is available.
        ENDPOINT_URL = 'https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1'
        llm = HuggingFaceTextGenInference(
            inference_server_url=ENDPOINT_URL,
            max_new_tokens=max_output_tokens,
            top_k=50,
            temperature=0.1,
            repetition_penalty=1.03,
            server_kwargs={
                "headers": {
                    "Authorization": f"Bearer {keys.HUGGINGFACE_API_KEY}",
                    "Content-Type": "application/json",
                }
            },
        )
    elif model_name.startswith("Ollama:"):
        inModel = remove_prefix(model_name)
        llm = ChatOllama(model=inModel, temperature=temperatureIn)
    elif model_name.startswith("OnnxDML:") or model_name.startswith("TorchDML:"):
        llm = LocalModelInterface(model_type=model_name, temperature=temperatureIn,
                                  max_output_tokens=max_output_tokens)
    else:
        raise ValueError(f"Unsupported model: {model_name}")
    # Initialize the conversation memory.
    memory = ConversationTokenBufferMemory(llm=llm, max_token_limit=max_context_tokens)
    # Replay the prior turns into memory; the final tuple holds the new input.
    for user_input, agent_response in conversation[:-1]:
        memory.save_context({"input": user_input}, {"output": agent_response})
    # Get the latest user input.
    latest_user_input = conversation[-1][0]
    # Initialize the conversation chain.
    conversation_chain = ConversationChain(prompt=PROMPT, llm=llm, memory=memory, verbose=True)
    # Start the GPU timer, then generate the agent's response.
    aGPUTime = GPUTimeKeeper()
    aGPUTime.start_timer(GPU_UTILIZATION_THRESHOLD)
    agent_response = conversation_chain.predict(input=latest_user_input)
    gpuTime = aGPUTime.stop_timer()
    tokenCount = llm.get_num_tokens(agent_response)
    return agent_response, gpuTime, tokenCount


if __name__ == "__main__":
    # Define a test conversation: (human, ai) tuples, with the final tuple
    # carrying the new human input and an empty response.
    test_conversation = [
        ["Hi, I need assistance with my account.", "Sure, I can help with that. What is the issue?"],
        ["I forgot my password.", "I understand. I can help you reset your password. Do you have access to your email?"],
        ["Yes, I have access to my email.", ""],
    ]
    # Pick one model name for this test; the alternatives are left commented out.
    # test_model_name = "claude-3-opus-20240229"
    # test_model_name = "gpt-4-turbo-preview"
    # test_model_name = "gemini-pro"
    # test_model_name = "Mixtral-8x7B-Instruct-v0.1"
    # test_model_name = "Ollama: llama3"
    # test_model_name = "Mylocal: llmt1"  # no matching branch in chat_bot_backend; would raise ValueError
    test_model_name = "Ollama: phi"
    test_max_tokens = 2000
    # Run the chat bot backend with the test parameters.
    test_response = chat_bot_backend(test_conversation, test_model_name, test_max_tokens,
                                     assistantPrompt="System: You are a novel maker")
    # Print out the response from the chat bot.
    print("Test response from the chat bot:", test_response)