Merge pull request #96 from runpod-workers/rel-v0.5.4
update vllm version 0.5.4
pandyamarut authored Aug 9, 2024
2 parents eb75a3a + 571ef2b commit 7f46582
Showing 4 changed files with 13 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -12,7 +12,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade -r /requirements.txt

# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer
-RUN python3 -m pip install vllm==0.5.3.post1 && \
+RUN python3 -m pip install vllm==0.5.4 && \
python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3

# Setup for Option 2: Building the Image with the Model included
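The only change to the image is the pinned vLLM version. When rebuilding, a runtime sanity check can confirm the installed package matches the pin; this is a hypothetical snippet for illustration, not part of the repository:

```python
# Hypothetical sanity check (not part of this commit): verify the vLLM version
# installed in the image matches the pin in the Dockerfile above.
import importlib.metadata

EXPECTED_VLLM_VERSION = "0.5.4"  # assumption: keep in sync with the Dockerfile pin

installed = importlib.metadata.version("vllm")
if installed != EXPECTED_VLLM_VERSION:
    raise RuntimeError(f"Expected vLLM {EXPECTED_VLLM_VERSION}, got {installed}")
```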
6 changes: 4 additions & 2 deletions README.md
@@ -18,8 +18,10 @@ Deploy OpenAI-Compatible Blazing-Fast LLM Endpoints powered by the [vLLM](https:
### 1. UI for Deploying vLLM Worker on RunPod console:
![Demo of Deploying vLLM Worker on RunPod console with new UI](media/ui_demo.gif)

-### 2. Worker vLLM `v1.1` with vLLM `0.5.3` now available under `stable` tags
-Update v1.1 is now available, use the image tag `runpod/worker-v1-vllm:stable-cuda12.1.0`.
+### 2. Worker vLLM `v1.2.0` with vLLM `0.5.4` now available under `stable` tags
+**[Note]**: The current stable Docker image still runs vLLM v0.5.3; it will be updated soon.
+
+Update v1.1.0 is now available; use the image tag `runpod/worker-v1-vllm:stable-cuda12.1.0`.

### 3. OpenAI-Compatible [Embedding Worker](https://github.com/runpod-workers/worker-infinity-embedding) Released
Deploy your own OpenAI-compatible Serverless Endpoint on RunPod with multiple embedding models and fast inference for RAG and more!
4 changes: 2 additions & 2 deletions src/engine.py
@@ -126,7 +126,7 @@ async def _initialize_engines(self):
self.model_config = await self.llm.get_model_config()

self.chat_engine = OpenAIServingChat(
-engine=self.llm,
+async_engine_client=self.llm,
model_config=self.model_config,
served_model_names=[self.served_model_name],
response_role=self.response_role,
@@ -136,7 +136,7 @@ async def _initialize_engines(self):
request_logger=None
)
self.completion_engine = OpenAIServingCompletion(
-engine=self.llm,
+async_engine_client=self.llm,
model_config=self.model_config,
served_model_names=[self.served_model_name],
lora_modules=[],
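The only change in `engine.py` is the keyword under which the async engine is handed to vLLM's OpenAI serving classes, which vLLM 0.5.4 renamed from `engine` to `async_engine_client`. A minimal sketch of how code that must run against both versions could pick the right keyword; the helper below is illustrative and not part of this commit:

```python
import inspect

from vllm.entrypoints.openai.serving_chat import OpenAIServingChat


def engine_kwarg_for(serving_cls) -> str:
    """Return the keyword this vLLM version expects for the async engine.

    vLLM 0.5.4 renamed the parameter from `engine` to `async_engine_client`;
    earlier 0.5.x releases still use `engine`.
    """
    params = inspect.signature(serving_cls.__init__).parameters
    return "async_engine_client" if "async_engine_client" in params else "engine"


# Usage inside _initialize_engines(), keeping the other kwargs from the diff:
#   self.chat_engine = OpenAIServingChat(
#       **{engine_kwarg_for(OpenAIServingChat): self.llm},
#       model_config=self.model_config,
#       ...
#   )
```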
12 changes: 6 additions & 6 deletions src/engine_args.py
@@ -13,9 +13,9 @@
}

DEFAULT_ARGS = {
"disable_log_stats": True,
"disable_log_requests": True,
"gpu_memory_utilization": 0.9,
"disable_log_stats": os.getenv('DISABLE_LOG_STATS', 'False').lower() == 'true',
"disable_log_requests": os.getenv('DISABLE_LOG_REQUESTS', 'False').lower() == 'true',
"gpu_memory_utilization": float(os.getenv('GPU_MEMORY_UTILIZATION', 0.95)),
"pipeline_parallel_size": int(os.getenv('PIPELINE_PARALLEL_SIZE', 1)),
"tensor_parallel_size": int(os.getenv('TENSOR_PARALLEL_SIZE', 1)),
"served_model_name": os.getenv('SERVED_MODEL_NAME', None),
@@ -162,8 +162,8 @@ def get_engine_args():
args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"))
logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. Please use MAX_SEQ_LEN_TO_CAPTURE instead.")

if "gemma-2" in args.get("model", "").lower():
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
logging.info("Using FLASHINFER for gemma-2 model.")
# if "gemma-2" in args.get("model", "").lower():
# os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
# logging.info("Using FLASHINFER for gemma-2 model.")

return AsyncEngineArgs(**args)
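With the gemma-2 special case commented out, the worker no longer forces the FlashInfer attention backend for those models. If that behavior is still wanted, the backend can be selected through vLLM's `VLLM_ATTENTION_BACKEND` environment variable before the engine is created; a minimal sketch, assuming it is set in the deployment environment (shown here in Python for illustration):

```python
import os

# Assumption: opting back into FlashInfer for gemma-2 style models by hand,
# since the worker no longer sets this automatically.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
```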
