Skip to content

Commit

Permalink
update vllm version 0.5.4
Browse files Browse the repository at this point in the history
Signed-off-by: pandyamarut <[email protected]>
  • Loading branch information
pandyamarut committed Aug 9, 2024
1 parent f023f57 commit 9cb9336
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade -r /requirements.txt

# Install vLLM and FlashInfer. vLLM is installed from pip again: the issues that required building a fork are fixed, and space optimization is less important now that caching is used.
RUN python3 -m pip install vllm==0.5.3.post1 && \
RUN python3 -m pip install vllm==0.5.4 && \
python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3

# Setup for Option 2: Building the Image with the Model included
Expand Down
4 changes: 2 additions & 2 deletions src/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ async def _initialize_engines(self):
self.model_config = await self.llm.get_model_config()

self.chat_engine = OpenAIServingChat(
engine=self.llm,
async_engine_client=self.llm,
model_config=self.model_config,
served_model_names=[self.served_model_name],
response_role=self.response_role,
Expand All @@ -136,7 +136,7 @@ async def _initialize_engines(self):
request_logger=None
)
self.completion_engine = OpenAIServingCompletion(
engine=self.llm,
async_engine_client=self.llm,
model_config=self.model_config,
served_model_names=[self.served_model_name],
lora_modules=[],
Expand Down
12 changes: 6 additions & 6 deletions src/engine_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
}

DEFAULT_ARGS = {
"disable_log_stats": True,
"disable_log_requests": True,
"gpu_memory_utilization": 0.9,
"disable_log_stats": os.getenv('DISABLE_LOG_STATS', 'False').lower() == 'true',
"disable_log_requests": os.getenv('DISABLE_LOG_REQUESTS', 'False').lower() == 'true',
"gpu_memory_utilization": int(os.getenv('GPU_MEMORY_UTILIZATION', 0.9)),
"pipeline_parallel_size": int(os.getenv('PIPELINE_PARALLEL_SIZE', 1)),
"tensor_parallel_size": int(os.getenv('TENSOR_PARALLEL_SIZE', 1)),
"served_model_name": os.getenv('SERVED_MODEL_NAME', None),
Expand Down Expand Up @@ -162,8 +162,8 @@ def get_engine_args():
args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"))
logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. Please use MAX_SEQ_LEN_TO_CAPTURE instead.")

if "gemma-2" in args.get("model", "").lower():
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
logging.info("Using FLASHINFER for gemma-2 model.")
# if "gemma-2" in args.get("model", "").lower():
# os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
# logging.info("Using FLASHINFER for gemma-2 model.")

return AsyncEngineArgs(**args)

0 comments on commit 9cb9336

Please sign in to comment.