From 9cb9336cf5979046389a06c7c1f56d69b3064f83 Mon Sep 17 00:00:00 2001
From: pandyamarut
Date: Fri, 9 Aug 2024 12:01:42 -0700
Subject: [PATCH] update vllm version 0.5.4

Signed-off-by: pandyamarut
---
Notes:
  - Bumps the pinned vLLM wheel from 0.5.3.post1 to 0.5.4.
  - Passes the engine to OpenAIServingChat / OpenAIServingCompletion as
    async_engine_client instead of engine.
  - DISABLE_LOG_STATS, DISABLE_LOG_REQUESTS and GPU_MEMORY_UTILIZATION are
    now read from the environment; the two logging flags default to False
    when unset (they were previously hard-coded to True).
  - GPU_MEMORY_UTILIZATION is parsed with float() because it is a
    fractional value such as 0.9; int() would truncate the 0.9 default to 0
    and raise ValueError on a string like "0.9".
  - The gemma-2 FLASHINFER attention-backend override is commented out.

 Dockerfile         |  2 +-
 src/engine.py      |  4 ++--
 src/engine_args.py | 12 ++++++------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 089b4de..0029433 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -12,7 +12,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install --upgrade -r /requirements.txt
 
 # Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer
-RUN python3 -m pip install vllm==0.5.3.post1 && \
+RUN python3 -m pip install vllm==0.5.4 && \
     python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3
 
 # Setup for Option 2: Building the Image with the Model included
diff --git a/src/engine.py b/src/engine.py
index 6962027..0c20dc3 100644
--- a/src/engine.py
+++ b/src/engine.py
@@ -126,7 +126,7 @@ async def _initialize_engines(self):
         self.model_config = await self.llm.get_model_config()
 
         self.chat_engine = OpenAIServingChat(
-            engine=self.llm,
+            async_engine_client=self.llm,
             model_config=self.model_config,
             served_model_names=[self.served_model_name],
             response_role=self.response_role,
@@ -136,7 +136,7 @@ async def _initialize_engines(self):
             request_logger=None
         )
         self.completion_engine = OpenAIServingCompletion(
-            engine=self.llm,
+            async_engine_client=self.llm,
             model_config=self.model_config,
             served_model_names=[self.served_model_name],
             lora_modules=[],
diff --git a/src/engine_args.py b/src/engine_args.py
index 42c7d6d..0436de5 100644
--- a/src/engine_args.py
+++ b/src/engine_args.py
@@ -13,9 +13,9 @@
 }
 
 DEFAULT_ARGS = {
-    "disable_log_stats": True,
-    "disable_log_requests": True,
-    "gpu_memory_utilization": 0.9,
+    "disable_log_stats": os.getenv('DISABLE_LOG_STATS', 'False').lower() == 'true',
+    "disable_log_requests": os.getenv('DISABLE_LOG_REQUESTS', 'False').lower() == 'true',
+    "gpu_memory_utilization": float(os.getenv('GPU_MEMORY_UTILIZATION', 0.9)),
     "pipeline_parallel_size": int(os.getenv('PIPELINE_PARALLEL_SIZE', 1)),
     "tensor_parallel_size": int(os.getenv('TENSOR_PARALLEL_SIZE', 1)),
     "served_model_name": os.getenv('SERVED_MODEL_NAME', None),
@@ -162,8 +162,8 @@ def get_engine_args():
         args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"))
         logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. Please use MAX_SEQ_LEN_TO_CAPTURE instead.")
 
-    if "gemma-2" in args.get("model", "").lower():
-        os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
-        logging.info("Using FLASHINFER for gemma-2 model.")
+    # if "gemma-2" in args.get("model", "").lower():
+    #     os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+    #     logging.info("Using FLASHINFER for gemma-2 model.")
 
     return AsyncEngineArgs(**args)