Revert "align vllm hpu version to latest vllm-fork (opea-project#860)"
This reverts commit 786cabe.
cameronmorin committed Nov 8, 2024
1 parent 3cf4a59 commit 2c2e279
Showing 13 changed files with 77 additions and 30 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/docker/compose/llms-compose-cd.yaml
@@ -23,6 +23,10 @@ services:
     build:
       dockerfile: comps/llms/text-generation/vllm/llama_index/Dockerfile
     image: ${REGISTRY:-opea}/llm-vllm-llamaindex:${TAG:-latest}
+  llm-vllm-llamaindex-hpu:
+    build:
+      dockerfile: comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu
+    image: ${REGISTRY:-opea}/llm-vllm-llamaindex-hpu:${TAG:-latest}
   llm-predictionguard:
     build:
       dockerfile: comps/llms/text-generation/predictionguard/Dockerfile
4 changes: 4 additions & 0 deletions .github/workflows/docker/compose/llms-compose.yaml
@@ -24,6 +24,10 @@ services:
     build:
       dockerfile: comps/llms/text-generation/vllm/langchain/Dockerfile
     image: ${REGISTRY:-opea}/llm-vllm:${TAG:-latest}
+  llm-vllm-hpu:
+    build:
+      dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu
+    image: ${REGISTRY:-opea}/llm-vllm-hpu:${TAG:-latest}
   llm-vllm-ray:
     build:
       dockerfile: comps/llms/text-generation/vllm/ray/Dockerfile
@@ -0,0 +1,22 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 AS hpu

RUN git clone https://github.com/HabanaAI/vllm-fork.git /workspace/vllm

# COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install --no-cache-dir -v -r requirements-hpu.txt

ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

CMD ["/bin/bash"]
@@ -30,11 +30,7 @@ fi

 # Build the docker image for vLLM based on the hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    git clone https://github.com/HabanaAI/vllm-fork.git
-    cd ./vllm-fork/
-    docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
-    cd ..
-    rm -rf vllm-fork
+    docker build -f Dockerfile.intel_hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
 else
     git clone https://github.com/vllm-project/vllm.git
     cd ./vllm/
@@ -38,7 +38,7 @@ volume=$PWD/data

 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm:hpu --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture "
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm:cpu --model $model_name --host 0.0.0.0 --port 80
 fi
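For reference, the HPU branch above starts vLLM's OpenAI-compatible API server on the mapped host port, so a plain completion request is enough to verify the container came up. Below is a minimal smoke-test sketch that reuses the script's own $port_number and $model_name variables; the prompt and token budget are arbitrary.

# Smoke-test the OpenAI-compatible endpoint exposed by the vllm-service container.
curl http://localhost:${port_number}/v1/completions \
    -H "Content-Type: application/json" \
    -d "{\"model\": \"${model_name}\", \"prompt\": \"What is deep learning?\", \"max_tokens\": 32}"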
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
+    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
   llm:
     image: opea/llm-vllm:latest
     container_name: llm-vllm-gaudi-server
@@ -0,0 +1,24 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 AS hpu
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
ENV LANG=en_US.UTF-8
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
service ssh restart
USER user
WORKDIR /root

RUN pip install --no-cache-dir --upgrade-strategy eager optimum[habana]

RUN pip install --no-cache-dir -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d

RUN pip install --no-cache-dir setuptools

ENV PT_HPU_LAZY_ACC_PAR_MODE=0

ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

CMD ["/bin/bash"]
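Because this image pip-installs a pinned vllm-fork revision rather than building it from source, a lightweight sanity check on a freshly built image is simply to confirm that the package resolved. This is a sketch that assumes the image was tagged opea/vllm:hpu, as in the accompanying build script; no HPU device is needed for the check.

# Print the installed vllm package metadata from inside the image.
docker run --rm opea/vllm:hpu pip show vllm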
@@ -30,11 +30,7 @@ fi

 # Build the docker image for vLLM based on the hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    git clone https://github.com/HabanaAI/vllm-fork.git
-    cd ./vllm-fork/
-    docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
-    cd ..
-    rm -rf vllm-fork
+    docker build -f docker/Dockerfile.intel_hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
 else
     git clone https://github.com/vllm-project/vllm.git
     cd ./vllm/
@@ -38,7 +38,7 @@ volume=$PWD/data

 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm:hpu --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture "
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm:cpu --model $model_name --host 0.0.0.0 --port 80
 fi
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
+    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
   llm:
     image: opea/llm-vllm-llamaindex:latest
     container_name: llm-vllm-gaudi-server
3 changes: 1 addition & 2 deletions comps/llms/text-generation/vllm/llama_index/llm.py
@@ -66,8 +66,7 @@ async def stream_generator():

         return StreamingResponse(stream_generator(), media_type="text/event-stream")
     else:
-        response = await llm.acomplete(input.query)
-        response = response.text
+        response = await llm.acomplete(input.query).text
         if logflag:
             logger.info(response)
         return GeneratedDoc(text=response, prompt=input.query)
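For context, the wrapper microservice that this llm.py implements is normally exercised with a plain REST call once it is running. The sketch below follows the usual OPEA LLM microservice conventions; the host, port, and payload fields are illustrative rather than taken from this diff.

# Query the LLM microservice wrapper (9000 is the conventional port for OPEA LLM microservices).
curl http://localhost:9000/v1/chat/completions \
    -X POST \
    -H "Content-Type: application/json" \
    -d '{"query": "What is deep learning?", "max_new_tokens": 32, "streaming": false}'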
@@ -8,11 +8,12 @@ WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')

 function build_docker_images() {
-    ## Build VLLM docker
-    cd $WORKPATH
-    git clone https://github.com/HabanaAI/vllm-fork.git
-    cd vllm-fork/
-    docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g .
+    ## Build VLLM Ray docker
+    cd $WORKPATH/comps/llms/text-generation/vllm/langchain/dependency
+    docker build \
+        -f Dockerfile.intel_hpu \
+        --no-cache -t opea/vllm-hpu:comps \
+        --shm-size=128g .
     if [ $? -ne 0 ]; then
         echo "opea/vllm-hpu built fail"
         exit 1
@@ -47,7 +48,7 @@ function start_service() {
         --ipc=host \
         -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
         opea/vllm-hpu:comps \
-        --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+        /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"

     export vLLM_ENDPOINT="http://${ip_address}:${port_number}"
     docker run -d --rm \
@@ -64,7 +65,7 @@ function start_service() {
     until [[ "$n" -ge 120 ]] || [[ $ready == true ]]; do
         docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
         n=$((n+1))
-        if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then
+        if grep -q Connected ${WORKPATH}/tests/test-comps-vllm-service.log; then
             break
         fi
         sleep 5s
@@ -8,11 +8,12 @@ WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')

 function build_docker_images() {
-    ## Build VLLM docker
-    cd $WORKPATH
-    git clone https://github.com/HabanaAI/vllm-fork.git
-    cd vllm-fork/
-    docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g .
+    ## Build VLLM Ray docker
+    cd $WORKPATH/comps/llms/text-generation/vllm/llama_index/dependency
+    docker build \
+        -f Dockerfile.intel_hpu \
+        --no-cache -t opea/vllm-hpu:comps \
+        --shm-size=128g .
     if [ $? -ne 0 ]; then
         echo "opea/vllm-hpu built fail"
         exit 1
@@ -47,7 +48,7 @@ function start_service() {
         --ipc=host \
         -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
         opea/vllm-hpu:comps \
-        --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+        /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"

     export vLLM_ENDPOINT="http://${ip_address}:${port_number}"
     docker run -d --rm \
@@ -64,7 +65,7 @@ function start_service() {
     until [[ "$n" -ge 120 ]] || [[ $ready == true ]]; do
         docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
         n=$((n+1))
-        if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then
+        if grep -q Connected ${WORKPATH}/tests/test-comps-vllm-service.log; then
             break
         fi
         sleep 5s
