Merge pull request #96 from runpod-workers/rel-v0.5.4
update vllm version 0.5.4
pandyamarut authored Aug 9, 2024
2 parents eb75a3a + 571ef2b commit 7f46582
Showing 4 changed files with 13 additions and 11 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -12,7 +12,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade -r /requirements.txt

# Install vLLM (switching back to pip installs since issues that required building fork are fixed and space optimization is not as important since caching) and FlashInfer
-RUN python3 -m pip install vllm==0.5.3.post1 && \
+RUN python3 -m pip install vllm==0.5.4 && \
python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3

# Setup for Option 2: Building the Image with the Model included
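The only change to the image is the pinned vLLM version. When rebuilding, a runtime sanity check can confirm the installed package matches the pin; this is a hypothetical snippet for illustration, not part of the repository:

```python
# Hypothetical sanity check (not part of this commit): verify the vLLM version
# installed in the image matches the pin in the Dockerfile above.
import importlib.metadata

EXPECTED_VLLM_VERSION = "0.5.4"  # assumption: keep in sync with the Dockerfile pin

installed = importlib.metadata.version("vllm")
if installed != EXPECTED_VLLM_VERSION:
    raise RuntimeError(f"Expected vLLM {EXPECTED_VLLM_VERSION}, got {installed}")
```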
6 changes: 4 additions & 2 deletions README.md
@@ -18,8 +18,10 @@ Deploy OpenAI-Compatible Blazing-Fast LLM Endpoints powered by the [vLLM](https:
### 1. UI for Deploying vLLM Worker on RunPod console:
![Demo of Deploying vLLM Worker on RunPod console with new UI](media/ui_demo.gif)

-### 2. Worker vLLM `v1.1` with vLLM `0.5.3` now available under `stable` tags
-Update v1.1 is now available, use the image tag `runpod/worker-v1-vllm:stable-cuda12.1.0`.
+### 2. Worker vLLM `v1.2.0` with vLLM `0.5.4` now available under `stable` tags
+**[Note]**: The current stable Docker image still runs vLLM v0.5.3; it will be updated soon.
+
+Update v1.1.0 is now available; use the image tag `runpod/worker-v1-vllm:stable-cuda12.1.0`.

### 3. OpenAI-Compatible [Embedding Worker](https://github.com/runpod-workers/worker-infinity-embedding) Released
Deploy your own OpenAI-compatible Serverless Endpoint on RunPod with multiple embedding models and fast inference for RAG and more!
4 changes: 2 additions & 2 deletions src/engine.py
@@ -126,7 +126,7 @@ async def _initialize_engines(self):
self.model_config = await self.llm.get_model_config()

self.chat_engine = OpenAIServingChat(
-engine=self.llm,
+async_engine_client=self.llm,
model_config=self.model_config,
served_model_names=[self.served_model_name],
response_role=self.response_role,
@@ -136,7 +136,7 @@ async def _initialize_engines(self):
request_logger=None
)
self.completion_engine = OpenAIServingCompletion(
-engine=self.llm,
+async_engine_client=self.llm,
model_config=self.model_config,
served_model_names=[self.served_model_name],
lora_modules=[],
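The only change in `engine.py` is the keyword under which the async engine is handed to vLLM's OpenAI serving classes, which vLLM 0.5.4 renamed from `engine` to `async_engine_client`. A minimal sketch of how code that must run against both versions could pick the right keyword; the helper below is illustrative and not part of this commit:

```python
import inspect

from vllm.entrypoints.openai.serving_chat import OpenAIServingChat


def engine_kwarg_for(serving_cls) -> str:
    """Return the keyword this vLLM version expects for the async engine.

    vLLM 0.5.4 renamed the parameter from `engine` to `async_engine_client`;
    earlier 0.5.x releases still use `engine`.
    """
    params = inspect.signature(serving_cls.__init__).parameters
    return "async_engine_client" if "async_engine_client" in params else "engine"


# Usage inside _initialize_engines(), keeping the other kwargs from the diff:
#   self.chat_engine = OpenAIServingChat(
#       **{engine_kwarg_for(OpenAIServingChat): self.llm},
#       model_config=self.model_config,
#       ...
#   )
```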
12 changes: 6 additions & 6 deletions src/engine_args.py
@@ -13,9 +13,9 @@
}

DEFAULT_ARGS = {
"disable_log_stats": True,
"disable_log_requests": True,
"gpu_memory_utilization": 0.9,
"disable_log_stats": os.getenv('DISABLE_LOG_STATS', 'False').lower() == 'true',
"disable_log_requests": os.getenv('DISABLE_LOG_REQUESTS', 'False').lower() == 'true',
"gpu_memory_utilization": float(os.getenv('GPU_MEMORY_UTILIZATION', 0.95)),
"pipeline_parallel_size": int(os.getenv('PIPELINE_PARALLEL_SIZE', 1)),
"tensor_parallel_size": int(os.getenv('TENSOR_PARALLEL_SIZE', 1)),
"served_model_name": os.getenv('SERVED_MODEL_NAME', None),
@@ -162,8 +162,8 @@ def get_engine_args():
args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"))
logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. Please use MAX_SEQ_LEN_TO_CAPTURE instead.")

if "gemma-2" in args.get("model", "").lower():
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
logging.info("Using FLASHINFER for gemma-2 model.")
# if "gemma-2" in args.get("model", "").lower():
# os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
# logging.info("Using FLASHINFER for gemma-2 model.")

return AsyncEngineArgs(**args)
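With the gemma-2 special case commented out, the worker no longer forces the FlashInfer attention backend for those models. If that behavior is still wanted, the backend can be selected through vLLM's `VLLM_ATTENTION_BACKEND` environment variable before the engine is created; a minimal sketch, assuming it is set in the deployment environment (shown here in Python for illustration):

```python
import os

# Assumption: opting back into FlashInfer for gemma-2 style models by hand,
# since the worker no longer sets this automatically.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
```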
