From 2c2e2797e30f10854d9743d6883aaabfb776816f Mon Sep 17 00:00:00 2001
From: Cameron Morin
Date: Fri, 8 Nov 2024 01:21:35 +0000
Subject: [PATCH] Revert "align vllm hpu version to latest vllm-fork (#860)"

This reverts commit 786cabe57db330f94b3a95fd0579e6430d7a729f.
---
 .../docker/compose/llms-compose-cd.yaml       |  4 ++++
 .../docker/compose/llms-compose.yaml          |  4 ++++
 .../langchain/dependency/Dockerfile.intel_hpu | 22 +++++++++++++++++
 .../langchain/dependency/build_docker_vllm.sh |  6 +----
 .../dependency/launch_vllm_service.sh         |  2 +-
 .../vllm/langchain/docker_compose_llm.yaml    |  2 +-
 .../dependency/Dockerfile.intel_hpu           | 24 +++++++++++++++++++
 .../dependency/build_docker_vllm.sh           |  6 +----
 .../dependency/launch_vllm_service.sh         |  2 +-
 .../vllm/llama_index/docker_compose_llm.yaml  |  2 +-
 .../text-generation/vllm/llama_index/llm.py   |  3 +--
 ...-generation_vllm_langchain_on_intel_hpu.sh | 15 ++++++------
 ...generation_vllm_llamaindex_on_intel_hpu.sh | 15 ++++++------
 13 files changed, 77 insertions(+), 30 deletions(-)
 create mode 100644 comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu
 create mode 100644 comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu

diff --git a/.github/workflows/docker/compose/llms-compose-cd.yaml b/.github/workflows/docker/compose/llms-compose-cd.yaml
index 7dff6d5c61..c33bc0f3f7 100644
--- a/.github/workflows/docker/compose/llms-compose-cd.yaml
+++ b/.github/workflows/docker/compose/llms-compose-cd.yaml
@@ -23,6 +23,10 @@ services:
     build:
       dockerfile: comps/llms/text-generation/vllm/llama_index/Dockerfile
     image: ${REGISTRY:-opea}/llm-vllm-llamaindex:${TAG:-latest}
+  llm-vllm-llamaindex-hpu:
+    build:
+      dockerfile: comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu
+    image: ${REGISTRY:-opea}/llm-vllm-llamaindex-hpu:${TAG:-latest}
   llm-predictionguard:
     build:
       dockerfile: comps/llms/text-generation/predictionguard/Dockerfile
diff --git a/.github/workflows/docker/compose/llms-compose.yaml b/.github/workflows/docker/compose/llms-compose.yaml
index c7ea529cda..904f7e1e71 100644
--- a/.github/workflows/docker/compose/llms-compose.yaml
+++ b/.github/workflows/docker/compose/llms-compose.yaml
@@ -24,6 +24,10 @@ services:
     build:
       dockerfile: comps/llms/text-generation/vllm/langchain/Dockerfile
     image: ${REGISTRY:-opea}/llm-vllm:${TAG:-latest}
+  llm-vllm-hpu:
+    build:
+      dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu
+    image: ${REGISTRY:-opea}/llm-vllm-hpu:${TAG:-latest}
   llm-vllm-ray:
     build:
       dockerfile: comps/llms/text-generation/vllm/ray/Dockerfile
diff --git a/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu b/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu
new file mode 100644
index 0000000000..f3703e4e74
--- /dev/null
+++ b/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu
@@ -0,0 +1,22 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 AS hpu
+
+RUN git clone https://github.com/HabanaAI/vllm-fork.git /workspace/vllm
+
+# COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install --no-cache-dir -v -r requirements-hpu.txt
+
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+CMD ["/bin/bash"]
diff --git a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh
index da7ee3aaab..a47bd23bf2 100644
--- a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh
+++ b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh
@@ -30,11 +30,7 @@ fi
 
 # Build the docker image for vLLM based on the hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    git clone https://github.com/HabanaAI/vllm-fork.git
-    cd ./vllm-fork/
-    docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
-    cd ..
-    rm -rf vllm-fork
+    docker build -f Dockerfile.intel_hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
 else
     git clone https://github.com/vllm-project/vllm.git
     cd ./vllm/
diff --git a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh
index a5b2ceb3b0..0d97eeb478 100644
--- a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh
@@ -38,7 +38,7 @@ volume=$PWD/data
 
 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm:hpu --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture "
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm:cpu --model $model_name --host 0.0.0.0 --port 80
 fi
diff --git a/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml
index acb620d164..cd1e3cf54a 100644
--- a/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
+    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
   llm:
     image: opea/llm-vllm:latest
     container_name: llm-vllm-gaudi-server
diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu b/comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu
new file mode 100644
index 0000000000..8166f471e5
--- /dev/null
+++ b/comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu
@@ -0,0 +1,24 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 AS hpu
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+ENV LANG=en_US.UTF-8
+RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+    service ssh restart
+USER user
+WORKDIR /root
+
+RUN pip install --no-cache-dir --upgrade-strategy eager optimum[habana]
+
+RUN pip install --no-cache-dir -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d
+
+RUN pip install --no-cache-dir setuptools
+
+ENV PT_HPU_LAZY_ACC_PAR_MODE=0
+
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
+CMD ["/bin/bash"]
diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh
index 8b37fe048a..b4a13d5fb3 100644
--- a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh
+++ b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh
@@ -30,11 +30,7 @@ fi
 
 # Build the docker image for vLLM based on the hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    git clone https://github.com/HabanaAI/vllm-fork.git
-    cd ./vllm-fork/
-    docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
-    cd ..
-    rm -rf vllm-fork
+    docker build -f docker/Dockerfile.intel_hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
 else
     git clone https://github.com/vllm-project/vllm.git
     cd ./vllm/
diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh
index bdf46889fd..0c7ed90de4 100644
--- a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh
@@ -38,7 +38,7 @@ volume=$PWD/data
 
 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm:hpu --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture "
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm:cpu --model $model_name --host 0.0.0.0 --port 80
 fi
diff --git a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml
index 94358acc66..f754a13d50 100644
--- a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
+    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
   llm:
     image: opea/llm-vllm-llamaindex:latest
     container_name: llm-vllm-gaudi-server
diff --git a/comps/llms/text-generation/vllm/llama_index/llm.py b/comps/llms/text-generation/vllm/llama_index/llm.py
index 76afa24a98..55bcec7dc6 100644
--- a/comps/llms/text-generation/vllm/llama_index/llm.py
+++ b/comps/llms/text-generation/vllm/llama_index/llm.py
@@ -66,8 +66,7 @@ async def stream_generator():
 
         return StreamingResponse(stream_generator(), media_type="text/event-stream")
     else:
-        response = await llm.acomplete(input.query)
-        response = response.text
+        response = await llm.acomplete(input.query).text
         if logflag:
             logger.info(response)
         return GeneratedDoc(text=response, prompt=input.query)
diff --git a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
index 5024b0c93a..6ecf5d2d6c 100644
--- a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
@@ -8,11 +8,12 @@ WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
 
 function build_docker_images() {
-    ## Build VLLM docker
-    cd $WORKPATH
-    git clone https://github.com/HabanaAI/vllm-fork.git
-    cd vllm-fork/
-    docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g .
+    ## Build VLLM Ray docker
+    cd $WORKPATH/comps/llms/text-generation/vllm/langchain/dependency
+    docker build \
+        -f Dockerfile.intel_hpu \
+        --no-cache -t opea/vllm-hpu:comps \
+        --shm-size=128g .
     if [ $? -ne 0 ]; then
         echo "opea/vllm-hpu built fail"
         exit 1
@@ -47,7 +48,7 @@ function start_service() {
         --ipc=host \
         -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
         opea/vllm-hpu:comps \
-        --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+        /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"
 
     export vLLM_ENDPOINT="http://${ip_address}:${port_number}"
     docker run -d --rm \
@@ -64,7 +65,7 @@ function start_service() {
     until [[ "$n" -ge 120 ]] || [[ $ready == true ]]; do
         docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
         n=$((n+1))
-        if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then
+        if grep -q Connected ${WORKPATH}/tests/test-comps-vllm-service.log; then
             break
         fi
         sleep 5s
diff --git a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
index 724e523e79..ca67a00f4e 100644
--- a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
@@ -8,11 +8,12 @@ WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
 
 function build_docker_images() {
-    ## Build VLLM docker
-    cd $WORKPATH
-    git clone https://github.com/HabanaAI/vllm-fork.git
-    cd vllm-fork/
-    docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g .
+    ## Build VLLM Ray docker
+    cd $WORKPATH/comps/llms/text-generation/vllm/llama_index/dependency
+    docker build \
+        -f Dockerfile.intel_hpu \
+        --no-cache -t opea/vllm-hpu:comps \
+        --shm-size=128g .
     if [ $? -ne 0 ]; then
         echo "opea/vllm-hpu built fail"
         exit 1
@@ -47,7 +48,7 @@ function start_service() {
         --ipc=host \
         -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
         opea/vllm-hpu:comps \
-        --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+        /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"
 
     export vLLM_ENDPOINT="http://${ip_address}:${port_number}"
     docker run -d --rm \
@@ -64,7 +65,7 @@ function start_service() {
     until [[ "$n" -ge 120 ]] || [[ $ready == true ]]; do
         docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
         n=$((n+1))
-        if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then
+        if grep -q Connected ${WORKPATH}/tests/test-comps-vllm-service.log; then
             break
         fi
         sleep 5s