diff --git a/Dockerfile b/Dockerfile index 089b4de..0029433 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade -r /requirements.txt # Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer -RUN python3 -m pip install vllm==0.5.3.post1 && \ +RUN python3 -m pip install vllm==0.5.4 && \ python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3 # Setup for Option 2: Building the Image with the Model included diff --git a/README.md b/README.md index 1633a74..a6eba8e 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,10 @@ Deploy OpenAI-Compatible Blazing-Fast LLM Endpoints powered by the [vLLM](https: ### 1. UI for Deploying vLLM Worker on RunPod console: ![Demo of Deploying vLLM Worker on RunPod console with new UI](media/ui_demo.gif) -### 2. Worker vLLM `v1.1` with vLLM `0.5.3` now available under `stable` tags -Update v1.1 is now available, use the image tag `runpod/worker-v1-vllm:stable-cuda12.1.0`. +### 2. Worker vLLM `v1.2.0` with vLLM `0.5.4` now available under `stable` tags +**[Note]**: Current stable docker image version still runs vLLM v0.5.3. It will be updated soon. + +Update v1.1.0 is now available, use the image tag `runpod/worker-v1-vllm:stable-cuda12.1.0`. ### 3. OpenAI-Compatible [Embedding Worker](https://github.com/runpod-workers/worker-infinity-embedding) Released Deploy your own OpenAI-compatible Serverless Endpoint on RunPod with multiple embedding models and fast inference for RAG and more! 
diff --git a/src/engine.py b/src/engine.py index 6962027..0c20dc3 100644 --- a/src/engine.py +++ b/src/engine.py @@ -126,7 +126,7 @@ async def _initialize_engines(self): self.model_config = await self.llm.get_model_config() self.chat_engine = OpenAIServingChat( - engine=self.llm, + async_engine_client=self.llm, model_config=self.model_config, served_model_names=[self.served_model_name], response_role=self.response_role, @@ -136,7 +136,7 @@ async def _initialize_engines(self): request_logger=None ) self.completion_engine = OpenAIServingCompletion( - engine=self.llm, + async_engine_client=self.llm, model_config=self.model_config, served_model_names=[self.served_model_name], lora_modules=[], diff --git a/src/engine_args.py b/src/engine_args.py index 42c7d6d..2f37696 100644 --- a/src/engine_args.py +++ b/src/engine_args.py @@ -13,9 +13,9 @@ } DEFAULT_ARGS = { - "disable_log_stats": True, - "disable_log_requests": True, - "gpu_memory_utilization": 0.9, + "disable_log_stats": os.getenv('DISABLE_LOG_STATS', 'False').lower() == 'true', + "disable_log_requests": os.getenv('DISABLE_LOG_REQUESTS', 'False').lower() == 'true', + "gpu_memory_utilization": float(os.getenv('GPU_MEMORY_UTILIZATION', 0.95)), "pipeline_parallel_size": int(os.getenv('PIPELINE_PARALLEL_SIZE', 1)), "tensor_parallel_size": int(os.getenv('TENSOR_PARALLEL_SIZE', 1)), "served_model_name": os.getenv('SERVED_MODEL_NAME', None), @@ -162,8 +162,8 @@ def get_engine_args(): args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE")) logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. Please use MAX_SEQ_LEN_TO_CAPTURE instead.") - if "gemma-2" in args.get("model", "").lower(): - os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER" - logging.info("Using FLASHINFER for gemma-2 model.") + # if "gemma-2" in args.get("model", "").lower(): + # os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER" + # logging.info("Using FLASHINFER for gemma-2 model.") return AsyncEngineArgs(**args)