From 5cd12ba331e7545a1b8793c9efbeda5852ddc6bc Mon Sep 17 00:00:00 2001 From: Sven Knoblauch Date: Wed, 9 Oct 2024 11:01:12 +0200 Subject: [PATCH] add changes for lora adapter support and /v1/models endpoint --- src/engine.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/engine.py b/src/engine.py index 985a9cc..b6e91d3 100644 --- a/src/engine.py +++ b/src/engine.py @@ -11,7 +11,8 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, ErrorResponse -from vllm.entrypoints.openai.serving_engine import BaseModelPath +from vllm.entrypoints.openai.serving_engine import BaseModelPath, LoRAModulePath + from utils import DummyRequest, JobInput, BatchSize, create_error_response from constants import DEFAULT_MAX_CONCURRENCY, DEFAULT_BATCH_SIZE, DEFAULT_BATCH_SIZE_GROWTH_FACTOR, DEFAULT_MIN_BATCH_SIZE @@ -128,13 +129,23 @@ async def _initialize_engines(self): self.base_model_paths = [ BaseModelPath(name=self.engine_args.model, model_path=self.engine_args.model) ] + + lora_modules = os.getenv('LORA_MODULES', None) + if lora_modules is not None: + try: + lora_modules = json.loads(lora_modules) + lora_modules = LoRAModulePath(**lora_modules) + except: + lora_modules = None + + self.chat_engine = OpenAIServingChat( engine_client=self.llm, model_config=self.model_config, base_model_paths=self.base_model_paths, response_role=self.response_role, chat_template=self.tokenizer.tokenizer.chat_template, - lora_modules=None, + lora_modules=[lora_modules], prompt_adapters=None, request_logger=None ) @@ -142,7 +153,7 @@ async def _initialize_engines(self): engine_client=self.llm, model_config=self.model_config, base_model_paths=self.base_model_paths, - lora_modules=[], + lora_modules=[lora_modules], prompt_adapters=None, request_logger=None ) @@ -158,9 +169,6 @@ async def generate(self, openai_request: JobInput): async def _handle_model_request(self): models = await self.chat_engine.show_available_models() - fixed_model = models.data[0] - fixed_model.id = self.served_model_name - models.data = [fixed_model] return models.model_dump() async def _handle_chat_or_completion_request(self, openai_request: JobInput):