From f6d46017bb93052e56404168b2e057b08f934d62 Mon Sep 17 00:00:00 2001 From: XinyaoWa Date: Fri, 24 Jan 2025 18:48:38 +0800 Subject: [PATCH] Update readme for LLM comps and related third parties (#1234) * Update readme for LLM comps and related third parties Update readmes for doc-summarization, faq-generation, text-generation, tgi and vllm Signed-off-by: Xinyao Wang --- comps/llms/src/doc-summarization/README.md | 30 +- comps/llms/src/faq-generation/README.md | 26 +- comps/llms/src/text-generation/README.md | 155 ++++++--- .../llms/src/text-generation/README_native.md | 16 +- .../text-generation/README_predictionguard.md | 5 +- .../src/text-generation/README_textgen.md | 318 ------------------ .../ollama/README.md} | 19 -- comps/third_parties/tgi/README.md | 30 ++ comps/third_parties/vllm/README.md | 32 +- .../docker_compose/launch_vllm_service.sh | 44 +++ 10 files changed, 241 insertions(+), 434 deletions(-) delete mode 100644 comps/llms/src/text-generation/README_textgen.md rename comps/{llms/src/text-generation/README_ollama.md => third_parties/ollama/README.md} (76%) create mode 100644 comps/third_parties/tgi/README.md create mode 100644 comps/third_parties/vllm/deployment/docker_compose/launch_vllm_service.sh diff --git a/comps/llms/src/doc-summarization/README.md b/comps/llms/src/doc-summarization/README.md index 31431c0a76..77e0969d2a 100644 --- a/comps/llms/src/doc-summarization/README.md +++ b/comps/llms/src/doc-summarization/README.md @@ -17,7 +17,6 @@ export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" export LLM_MODEL_ID=${your_hf_llm_model} export MAX_INPUT_TOKENS=2048 export MAX_TOTAL_TOKENS=4096 -export DocSum_COMPONENT_NAME="OpeaDocSumTgi" # or "OpeaDocSumvLLM" ``` Please make sure MAX_TOTAL_TOKENS should be larger than (MAX_INPUT_TOKENS + max_new_tokens + 50), 50 is reserved prompt length. @@ -26,15 +25,15 @@ Please make sure MAX_TOTAL_TOKENS should be larger than (MAX_INPUT_TOKENS + max_ Step 1: Prepare backend LLM docker image. -If you want to use vLLM backend, refer to [vLLM](../../../third_parties/vllm/src) to build vLLM docker images first. +If you want to use vLLM backend, refer to [vLLM](../../../third_parties/vllm/) to build vLLM docker images first. No need for TGI. -Step 2: Build FaqGen docker image. +Step 2: Build DocSum docker image. ```bash cd ../../../../ -docker build -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/summarization/Dockerfile . +docker build -t opea/llm-docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/doc-summarization/Dockerfile . ``` ### 1.3 Run Docker @@ -49,11 +48,12 @@ You can choose one as needed. ### 1.3.1 Run Docker with CLI (Option A) Step 1: Start the backend LLM service -Please refer to [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](../../../third_parties/vllm/deployment/docker_compose/) guideline to start a backend LLM service. +Please refer to [TGI](../../../third_parties/tgi) or [vLLM](../../../third_parties/vllm) guideline to start a backend LLM service. 
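+
+As a reference, a minimal TGI backend on a CPU/Xeon host could be launched directly with Docker as shown below (an illustrative sketch only: the image tag, container name, and data path are assumptions, and the official compose files may pass additional flags):
+
+```bash
+docker run -d --name tgi-server \
+  -p ${LLM_ENDPOINT_PORT}:80 \
+  -v ./data:/data \
+  --shm-size 1g \
+  -e HF_TOKEN=${HF_TOKEN} \
+  ghcr.io/huggingface/text-generation-inference:2.4.0 \
+  --model-id ${LLM_MODEL_ID} \
+  --max-input-tokens ${MAX_INPUT_TOKENS} \
+  --max-total-tokens ${MAX_TOTAL_TOKENS}
+```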
Step 2: Start the DocSum microservices ```bash +export DocSum_COMPONENT_NAME="OpeaDocSumTgi" # or "OpeaDocSumvLLM" docker run -d \ --name="llm-docsum-server" \ -p 9000:9000 \ @@ -71,20 +71,16 @@ docker run -d \ ### 1.3.2 Run Docker with Docker Compose (Option B) -```bash -cd ../../deployment/docker_compose/ - -# Backend is TGI on xeon -docker compose -f doc-summarization_tgi.yaml up -d - -# Backend is TGI on gaudi -# docker compose -f doc-summarization_tgi_on_intel_hpu.yaml up -d +Set `service_name` to match backend service. -# Backend is vLLM on xeon -# docker compose -f doc-summarization_vllm.yaml up -d +```bash +export service_name="docsum-tgi" +# export service_name="docsum-tgi-gaudi" +# export service_name="docsum-vllm" +# export service_name="docsum-vllm-gaudi" -# Backend is vLLM on gaudi -# docker compose -f doc-summarization_vllm_on_intel_hpu.yaml up -d +cd ../../deployment/docker_compose/ +docker compose -f compose_doc-summarization.yaml up ${service_name} -d ``` ## πŸš€3. Consume LLM Service diff --git a/comps/llms/src/faq-generation/README.md b/comps/llms/src/faq-generation/README.md index a1969bc98c..32cc3e87b5 100644 --- a/comps/llms/src/faq-generation/README.md +++ b/comps/llms/src/faq-generation/README.md @@ -15,14 +15,13 @@ export FAQ_PORT=9000 export HF_TOKEN=${your_hf_api_token} export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}" export LLM_MODEL_ID=${your_hf_llm_model} -export FAQGen_COMPONENT_NAME="OpeaFaqGenTgi" # or "vllm" ``` ### 1.2 Build Docker Image Step 1: Prepare backend LLM docker image. -If you want to use vLLM backend, refer to [vLLM](../../../third_parties/vllm/src) to build vLLM docker images first. +If you want to use vLLM backend, refer to [vLLM](../../../third_parties/vllm) to build vLLM docker images first. No need for TGI. @@ -45,11 +44,12 @@ You can choose one as needed. #### 1.3.1 Run Docker with CLI (Option A) Step 1: Start the backend LLM service -Please refer to [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](../../../third_parties/vllm/deployment/docker_compose/) guideline to start a backend LLM service. +Please refer to [TGI](../../../third_parties/tgi) or [vLLM](../../../third_parties/vllm) guideline to start a backend LLM service. Step 2: Start the FaqGen microservices ```bash +export FAQGen_COMPONENT_NAME="OpeaFaqGenTgi" # or "OpeaFaqGenvLLM" docker run -d \ --name="llm-faqgen-server" \ -p 9000:9000 \ @@ -65,20 +65,16 @@ docker run -d \ #### 1.3.2 Run Docker with Docker Compose (Option B) -```bash -cd ../../deployment/docker_compose/ - -# Backend is TGI on xeon -docker compose -f faq-generation_tgi.yaml up -d +Set `service_name` to match backend service. -# Backend is TGI on gaudi -# docker compose -f faq-generation_tgi_on_intel_hpu.yaml up -d - -# Backend is vLLM on xeon -# docker compose -f faq-generation_vllm.yaml up -d +```bash +export service_name="faqgen-tgi" +# export service_name="faqgen-tgi-gaudi" +# export service_name="faqgen-vllm" +# export service_name="faqgen-vllm-gaudi" -# Backend is vLLM on gaudi -# docker compose -f faq-generation_vllm_on_intel_hpu.yaml up -d +cd ../../deployment/docker_compose/ +docker compose -f compose_faq-generation.yaml up ${service_name} -d ``` ## πŸš€2. 
Consume LLM Service

diff --git a/comps/llms/src/text-generation/README.md b/comps/llms/src/text-generation/README.md
index c3be5362a4..360c459dc1 100644
--- a/comps/llms/src/text-generation/README.md
+++ b/comps/llms/src/text-generation/README.md
@@ -1,67 +1,131 @@
-# TGI LLM Microservice
+# LLM Text Generation Microservice
 
-[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.
+This microservice, designed for Language Model Inference (LLM), processes input consisting of a query string and associated reranked documents. It constructs a prompt based on the query and documents, which is then used to perform inference with a large language model. The service delivers the inference results as output.
 
-## πŸš€1. Start Microservice with Python (Option 1)
+A prerequisite for using this microservice is that users must have an LLM text generation service (e.g., TGI or vLLM) already running. Users need to set the LLM service's endpoint into an environment variable. The microservice utilizes this endpoint to create an LLM object, enabling it to communicate with the LLM service for executing language model operations.
 
-To start the LLM microservice, you need to install python packages first.
+Overall, this microservice offers a streamlined way to integrate large language model inference into applications, requiring minimal setup from the user beyond initiating a TGI/vLLM service and configuring the necessary environment variables. This allows for the seamless processing of queries and documents to generate intelligent, context-aware responses.
 
-### 1.1 Install Requirements
+## Validated LLM Models
 
-```bash
-pip install -r requirements.txt
-```
+| Model                       | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi |
+| --------------------------- | --------- | -------- | ---------- |
+| [Intel/neural-chat-7b-v3-3] | βœ“         | βœ“        | βœ“          |
+| [Llama-2-7b-chat-hf]        | βœ“         | βœ“        | βœ“          |
+| [Llama-2-70b-chat-hf]       | βœ“         | -        | βœ“          |
+| [Meta-Llama-3-8B-Instruct]  | βœ“         | βœ“        | βœ“          |
+| [Meta-Llama-3-70B-Instruct] | βœ“         | -        | βœ“          |
+| [Phi-3]                     | x         | Limit 4K | Limit 4K   |
+
+## Supported Integrations
 
-### 1.2 Start 3rd-party TGI Service
+This microservice supports the following backend LLM services as integrations. TGI, vLLM, and Ollama are covered in this README; for the others, please refer to their corresponding READMEs.
 
-Please refer to [3rd-party TGI](../../../third_parties/tgi/deployment/docker_compose/) to start a LLM endpoint and verify.
+- TGI
+- vLLM
+- Ollama
+- [Bedrock](./README_bedrock.md)
+- [Native](./README_native.md), based on Optimum Habana
+- [Predictionguard](./README_predictionguard.md)
 
-### 1.3 Start LLM Service with Python Script
+## Clone OPEA GenAIComps
+
+Clone this repository at your desired location and set an environment variable for easy setup and usage throughout the instructions.
 
 ```bash
-export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
-python llm.py
+git clone https://github.com/opea-project/GenAIComps.git
+
+export OPEA_GENAICOMPS_ROOT=$(pwd)/GenAIComps
 ```
 
-## πŸš€2. Start Microservice with Docker (Option 2)
+## Prerequisites
 
-If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a TGI/vLLM service with docker.
+For TGI/vLLM, you must create a user account with [HuggingFace] and obtain permission to use the gated LLM models by adhering to the guidelines provided on the respective model's webpage. The environment variable `LLM_MODEL_ID` should be set to the HuggingFace model id, and `HF_TOKEN` is your HuggingFace account's "User Access Token".
 
-### 2.1 Setup Environment Variables
+## πŸš€Start Microservice with Docker
 
-In order to start TGI and LLM services, you need to setup the following environment variables first.
+In order to start the microservice with docker, you need to build the docker image first.
 
-```bash
-export HF_TOKEN=${your_hf_api_token}
-export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
-export LLM_MODEL_ID=${your_hf_llm_model}
-```
+### 1. Build Docker Image
+
+#### 1.1 Prepare backend LLM docker image
+
+If you want to use the vLLM backend, refer to [vLLM](../../../third_parties/vllm/) to build the vLLM docker images first.
 
-### 2.2 Build Docker Image
+
+No need for TGI or Ollama.
+
+#### 1.2 Prepare TextGen docker image
 
 ```bash
-cd ../../../../
-docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
+# Build the microservice docker
+cd ${OPEA_GENAICOMPS_ROOT}
+
+docker build \
+  --build-arg https_proxy=$https_proxy \
+  --build-arg http_proxy=$http_proxy \
+  -t opea/llm-textgen:latest \
+  -f comps/llms/src/text-generation/Dockerfile .
 ```
 
+### 2. Start LLM Service with the Built Image
+
 To start a docker container, you have two options:
 
 - A. Run Docker with CLI
 - B. Run Docker with Docker Compose
 
-You can choose one as needed.
+You can choose one as needed. If you start the LLM microservice with docker compose, the `compose_text-generation.yaml` file will automatically start both the backend endpoint and the microservice containers.
 
-### 2.3 Run Docker with CLI (Option A)
+#### 2.1 Setup Environment Variables
+
+In order to start the services, you need to set up the following environment variables first.
+
+```bash
+export LLM_ENDPOINT_PORT=8008
+export TEXTGEN_PORT=9000
+export host_ip=${host_ip}
+export HF_TOKEN=${HF_TOKEN}
+export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+```
+
+#### 2.2 Run Docker with CLI (Option A)
+
+Step 1: Start the backend LLM service
+
+Please refer to the [TGI](../../../third_parties/tgi/), [vLLM](../../../third_parties/vllm/), or [Ollama](../../../third_parties/ollama/) guidelines to start a backend LLM service.
+
+Step 2: Start the TextGen microservice
 
 ```bash
-docker run -d --name="llm-tgi-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HF_TOKEN=$HF_TOKEN opea/llm-textgen:latest
+export LLM_COMPONENT_NAME="OpeaTextGenService"
+docker run \
+  --name="llm-textgen-server" \
+  -p $TEXTGEN_PORT:9000 \
+  --ipc=host \
+  -e http_proxy=$http_proxy \
+  -e https_proxy=$https_proxy \
+  -e no_proxy=${no_proxy} \
+  -e LLM_ENDPOINT=$LLM_ENDPOINT \
+  -e HF_TOKEN=$HF_TOKEN \
+  -e LLM_MODEL_ID=$LLM_MODEL_ID \
+  -e LLM_COMPONENT_NAME=$LLM_COMPONENT_NAME \
+  opea/llm-textgen:latest
 ```
 
-### 2.4 Run Docker with Docker Compose (Option B)
+#### 2.3 Run Docker with Docker Compose (Option B)
+
+Set `service_name` to match backend service.
```bash -cd comps/llms/deployment/docker_compose/ -docker compose -f text-generation_tgi.yaml up -d +export service_name="textgen-service-tgi" +# export service_name="textgen-service-tgi-gaudi" +# export service_name="textgen-service-vllm" +# export service_name="textgen-service-vllm-gaudi" +# export service_name="textgen-service-ollama" + +cd ../../deployment/docker_compose/ +docker compose -f compose_text-generation.yaml up ${service_name} -d ``` ## πŸš€3. Consume LLM Service @@ -69,12 +133,12 @@ docker compose -f text-generation_tgi.yaml up -d ### 3.1 Check Service Status ```bash -curl http://${your_ip}:9000/v1/health_check\ +curl http://${host_ip}:${TEXTGEN_PORT}/v1/health_check\ -X GET \ -H 'Content-Type: application/json' ``` -### 3.2 Consume LLM Service +### 3.1 Verify microservice You can set the following model parameters according to your actual needs, such as `max_tokens`, `stream`. @@ -82,32 +146,29 @@ The `stream` parameter determines the format of the data returned by the API. It ```bash # stream mode -curl http://${your_ip}:9000/v1/chat/completions \ +curl http://${host_ip}:${TEXTGEN_PORT}/v1/chat/completions \ -X POST \ -d '{"model": "${LLM_MODEL_ID}", "messages": "What is Deep Learning?", "max_tokens":17}' \ -H 'Content-Type: application/json' -curl http://${your_ip}:9000/v1/chat/completions \ +curl http://${host_ip}:${TEXTGEN_PORT}/v1/chat/completions \ -X POST \ -d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ -H 'Content-Type: application/json' #Non-stream mode -curl http://${your_ip}:9000/v1/chat/completions \ +curl http://${host_ip}:${TEXTGEN_PORT}/v1/chat/completions \ -X POST \ -d '{"model": "${LLM_MODEL_ID}", "messages": "What is Deep Learning?", "max_tokens":17, "stream":false}' \ -H 'Content-Type: application/json' ``` -For parameters in Chat mode, please refer to [OpenAI API](https://platform.openai.com/docs/api-reference/chat/create) - -### 4. Validated Model + -| Model | TGI | -| ------------------------- | --- | -| Intel/neural-chat-7b-v3-3 | βœ“ | -| Llama-2-7b-chat-hf | βœ“ | -| Llama-2-70b-chat-hf | βœ“ | -| Meta-Llama-3-8B-Instruct | βœ“ | -| Meta-Llama-3-70B-Instruct | βœ“ | -| Phi-3 | βœ“ | +[Intel/neural-chat-7b-v3-3]: https://huggingface.co/Intel/neural-chat-7b-v3-3 +[Llama-2-7b-chat-hf]: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf +[Llama-2-70b-chat-hf]: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf +[Meta-Llama-3-8B-Instruct]: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct +[Meta-Llama-3-70B-Instruct]: https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct +[Phi-3]: https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3 +[HuggingFace]: https://huggingface.co/ diff --git a/comps/llms/src/text-generation/README_native.md b/comps/llms/src/text-generation/README_native.md index 8f197cbff7..ba574be027 100644 --- a/comps/llms/src/text-generation/README_native.md +++ b/comps/llms/src/text-generation/README_native.md @@ -4,24 +4,24 @@ LLM Native microservice uses [optimum-habana](https://github.com/huggingface/opt ## πŸš€1. Start Microservice -If you start an LLM microservice with docker, the `docker_compose_llm.yaml` file will automatically start a Native LLM service with docker. - ### 1.1 Setup Environment Variables In order to start Native LLM service, you need to setup the following environment variables first. -For LLM model, both `Qwen` and `Falcon3` models are supported. 
Users can set different models by changing the `LLM_NATIVE_MODEL` below. +For LLM model, both `Qwen` and `Falcon3` models are supported. Users can set different models by changing the `LLM_MODEL_ID` below. ```bash -export LLM_NATIVE_MODEL="Qwen/Qwen2-7B-Instruct" +export LLM_MODEL_ID="Qwen/Qwen2-7B-Instruct" export HF_TOKEN="your_huggingface_token" +export TEXTGEN_PORT=10512 +export host_ip=${host_ip} ``` ### 1.2 Build Docker Image ```bash cd ../../../../../ -docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile . +docker build -t opea/llm-textgen-gaudi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile.intel_hpu . ``` To start a docker container, you have two options: @@ -34,13 +34,15 @@ You can choose one as needed. ### 1.3 Run Docker with CLI (Option A) ```bash -docker run -d --runtime=habana --name="llm-native-server" -p 9000:9000 -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e TOKENIZERS_PARALLELISM=false -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e LLM_NATIVE_MODEL=${LLM_NATIVE_MODEL} opea/llm-native:latest +docker run -d --runtime=habana --name="llm-native-server" -p 9000:9000 -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e TOKENIZERS_PARALLELISM=false -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e LLM_MODEL_ID=${LLM_MODEL_ID} opea/llm-textgen-gaudi:latest ``` ### 1.4 Run Docker with Docker Compose (Option B) ```bash -docker compose -f docker_compose_llm.yaml up -d +export service_name="textgen-native-gaudi" +cd comps/llms/deployment/docker_compose +docker compose -f compose_text-generation.yaml up ${service_name} -d ``` ## πŸš€2. Consume LLM Service diff --git a/comps/llms/src/text-generation/README_predictionguard.md b/comps/llms/src/text-generation/README_predictionguard.md index 32b46fe250..06680a98f0 100644 --- a/comps/llms/src/text-generation/README_predictionguard.md +++ b/comps/llms/src/text-generation/README_predictionguard.md @@ -7,7 +7,10 @@ ### Run the Predictionguard Microservice ```bash -docker run -d -p 9000:9000 -e PREDICTIONGUARD_API_KEY=$PREDICTIONGUARD_API_KEY --name llm-textgen-predictionguard opea/llm-textgen-predictionguard:latest +export service_name="textgen-predictionguard" + +cd comps/llms/deployment/docker_compose/ +docker compose -f compose_text-generation.yaml up ${service_name} -d ``` ## Consume the Prediction Guard Microservice diff --git a/comps/llms/src/text-generation/README_textgen.md b/comps/llms/src/text-generation/README_textgen.md deleted file mode 100644 index 2c12e1cfc4..0000000000 --- a/comps/llms/src/text-generation/README_textgen.md +++ /dev/null @@ -1,318 +0,0 @@ -# LLM Microservice - -This microservice, designed for Language Model Inference (LLM), processes input consisting of a query string and associated reranked documents. It constructs a prompt based on the query and documents, which is then used to perform inference with a large language model. The service delivers the inference results as output. - -A prerequisite for using this microservice is that users must have a LLM text generation service (etc., TGI, vLLM) already running. Users need to set the LLM service's endpoint into an environment variable. 
The microservice utilizes this endpoint to create an LLM object, enabling it to communicate with the LLM service for executing language model operations. - -Overall, this microservice offers a streamlined way to integrate large language model inference into applications, requiring minimal setup from the user beyond initiating a TGI/vLLM service and configuring the necessary environment variables. This allows for the seamless processing of queries and documents to generate intelligent, context-aware responses. - -## Validated LLM Models - -| Model | TGI-Gaudi | vLLM-CPU | vLLM-Gaudi | -| --------------------------- | --------- | -------- | ---------- | -| [Intel/neural-chat-7b-v3-3] | βœ“ | βœ“ | βœ“ | -| [Llama-2-7b-chat-hf] | βœ“ | βœ“ | βœ“ | -| [Llama-2-70b-chat-hf] | βœ“ | - | βœ“ | -| [Meta-Llama-3-8B-Instruct] | βœ“ | βœ“ | βœ“ | -| [Meta-Llama-3-70B-Instruct] | βœ“ | - | βœ“ | -| [Phi-3] | x | Limit 4K | Limit 4K | - -## Clone OPEA GenAIComps - -Clone this repository at your desired location and set an environment variable for easy setup and usage throughout the instructions. - -```bash -git clone https://github.com/opea-project/GenAIComps.git - -export OPEA_GENAICOMPS_ROOT=$(pwd)/GenAIComps -``` - -## Prerequisites - -You must create a user account with [HuggingFace] and obtain permission to use the gated LLM models by adhering to the guidelines provided on the respective model's webpage. The environment variables `LLM_MODEL` would be the HuggingFace model id and the `HF_TOKEN` is your HuggugFace account's "User Access Token". - -## πŸš€1. Start Microservice with Python (Option 1) - -To start the LLM microservice, you need to install python packages first. - -### 1.1 Install Requirements - -```bash -# Install opea-comps -pip install opea-comps - -# Install requirements from comps/llms -cd ${OPEA_GENAICOMPS_ROOT}/comps/llms - -pip install -r requirements.txt -``` - -### 1.2 Start LLM Service with Python Script - -#### 1.2.1 Start the TGI Service - -Install the requirements for TGI Service - -```bash -cd ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/tgi - -pip install -r requirements.txt -``` - -Execute the docker run command to initiate the backend, along with the Python script that launches the microservice. - -```bash -export TGI_HOST_IP=$(hostname -I | awk '{print $1}') # This sets IP of the current machine -export LLM_MODEL=${your_hf_llm_model} -export DATA_DIR=$HOME/data # Location to download the model -export HF_TOKEN=${your_hf_api_token} - -# Initiate the backend -docker run -d \ - -p 8008:80 \ - -e HF_TOKEN=${HF_TOKEN} \ - -v ${DATA_DIR}:/data \ - --name tgi_service \ - --shm-size 1g \ - ghcr.io/huggingface/text-generation-inference:1.4 \ - --model-id ${LLM_MODEL} - -# Start the microservice with an endpoint as the above docker run command -export TGI_LLM_ENDPOINT="http://${TGI_HOST_IP}:8008" - -python llm.py -``` - -#### 1.2.2 Start the vLLM Service - -Install the requirements for vLLM Service - -```bash -cd ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/langchain - -pip install -r requirements.txt -``` - -Execute the docker run command to initiate the backend, along with the Python script that launches the microservice. 
- -```bash -export vLLM_HOST_IP=$(hostname -I | awk '{print $1}') # This sets IP of the current machine -export LLM_MODEL=${your_hf_llm_model} -export DATA_DIR=$HOME/data # Location to download the model -export HF_TOKEN=${your_hf_api_token} - -# Build the image first as opea/vllm-cpu -bash ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh cpu - -# Initiate the backend -docker run -d -it \ - --name vllm_service \ - -p 8008:80 \ - -e HF_TOKEN=${HF_TOKEN} \ - -e VLLM_CPU_KVCACHE_SPACE=40 \ - -v ${DATA_DIR}:/data \ - opea/vllm-cpu:latest \ - --model ${LLM_MODEL} \ - --port 80 - -# Start the microservice with an endpoint as the above docker run command -export vLLM_ENDPOINT="http://${vLLM_HOST_IP}:8008" - -python llm.py -``` - -## πŸš€2. Start Microservice with Docker (Option 2) - -In order to start the microservices with docker, you need to build the docker images first for the microservice. - -### 2.1 Build Docker Image - -```bash -# Build the microservice docker -cd ${OPEA_GENAICOMPS_ROOT} - -docker build \ - --build-arg https_proxy=$https_proxy \ - --build-arg http_proxy=$http_proxy \ - -t opea/llm:latest \ - -f comps/llms/src/text-generation/Dockerfile . -``` - -### 2.2 Start LLM Service with the built image - -To start a docker container, you have two options: - -- A. Run Docker with CLI -- B. Run Docker with Docker Compose - -You can choose one as needed. If you start an LLM microservice with docker compose, the `docker_compose_llm.yaml` file will automatically start both endpoint and the microservice docker. - -#### 2.2.1 Setup Environment Variables - -In order to start TGI and LLM services, you need to setup the following environment variables first. - -```bash -export HF_TOKEN=${your_hf_api_token} -export TGI_LLM_ENDPOINT="http://${your_ip}:8008" -export LLM_MODEL=${your_hf_llm_model} -export DATA_DIR=$HOME/data -``` - -In order to start vLLM and LLM services, you need to setup the following environment variables first. - -```bash -export HF_TOKEN=${your_hf_api_token} -export vLLM_LLM_ENDPOINT="http://${your_ip}:8008" -export LLM_MODEL=${your_hf_llm_model} -``` - -### 2.3 Run Docker with CLI (Option A) - -#### 2.3.1 TGI - -Start TGI endpoint. - -```bash -docker run -d \ - -p 8008:80 \ - -e HF_TOKEN=${HF_TOKEN} \ - -v ${DATA_DIR}:/data \ - --name tgi_service \ - --shm-size 1g \ - ghcr.io/huggingface/text-generation-inference:1.4 \ - --model-id ${LLM_MODEL} -``` - -Start TGI microservice - -```bash -docker run -d \ - --name="llm-tgi-server" \ - -p 9000:9000 \ - --ipc=host \ - -e http_proxy=$http_proxy \ - -e https_proxy=$https_proxy \ - -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT \ - -e HF_TOKEN=$HF_TOKEN \ - opea/llm-textgen:latest -``` - -#### 2.3.2 vLLM - -Start vllm endpoint. - -```bash -bash ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh -``` - -Start vllm microservice. 
- -```bash -docker run \ - --name="llm-vllm-server" \ - -p 9000:9000 \ - --ipc=host \ - -e http_proxy=$http_proxy \ - -e https_proxy=$https_proxy \ - -e no_proxy=${no_proxy} \ - -e vLLM_LLM_ENDPOINT=$vLLM_LLM_ENDPOINT \ - -e HF_TOKEN=$HF_TOKEN \ - -e LLM_MODEL=$LLM_MODEL \ - opea/llm-textgen:latest -``` - -### 2.4 Run Docker with Docker Compose (Option B) - -#### 2.4.1 TGI - -```bash -cd ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/tgi -docker compose -f docker_compose_llm.yaml up -d -``` - -#### 2.4.2 vLLM - -```bash -cd ${OPEA_GENAICOMPS_ROOT}/comps/llms/text-generation/vllm/langchain -docker compose -f docker_compose_llm.yaml up -d -``` - -## πŸš€3. Consume LLM Service - -### 3.1 Check Service Status - -```bash -curl http://${your_ip}:9000/v1/health_check\ - -X GET \ - -H 'Content-Type: application/json' -``` - -### 3.2 Verify the LLM Service - -#### 3.2.1 Verify the TGI Service - -```bash -curl http://${your_ip}:8008/v1/chat/completions \ - -X POST \ - -d '{"model": ${your_hf_llm_model}, "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ - -H 'Content-Type: application/json' -``` - -#### 3.2.2 Verify the vLLM Service - -```bash -curl http://${host_ip}:8008/v1/chat/completions \ - -X POST \ - -H "Content-Type: application/json" \ - -d '{"model": ${your_hf_llm_model}, "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -``` - -### 3.3 Consume LLM Service - -You can set the following model parameters according to your actual needs, such as `max_tokens`, `stream`. - -The `stream` parameter determines the format of the data returned by the API. It will return text string with `stream=false`, return text stream flow with `stream=true`. - -```bash -# non-stream mode -curl http://${your_ip}:9000/v1/chat/completions \ - -X POST \ - -H 'Content-Type: application/json' \ - -d '{ - "query":"What is Deep Learning?", - "max_tokens":17, - "top_k":10, - "top_p":0.95, - "typical_p":0.95, - "temperature":0.01, - "repetition_penalty":1.03, - "stream":false - }' - - -# stream mode -curl http://${your_ip}:9000/v1/chat/completions \ - -X POST \ - -H 'Content-Type: application/json' \ - -d '{ - "query":"What is Deep Learning?", - "max_tokens":17, - "top_k":10, - "top_p":0.95, - "typical_p":0.95, - "temperature":0.01, - "repetition_penalty":1.03, - "stream":true - }' - -``` - - - -[Intel/neural-chat-7b-v3-3]: https://huggingface.co/Intel/neural-chat-7b-v3-3 -[Llama-2-7b-chat-hf]: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf -[Llama-2-70b-chat-hf]: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf -[Meta-Llama-3-8B-Instruct]: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct -[Meta-Llama-3-70B-Instruct]: https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct -[Phi-3]: https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3 -[HuggingFace]: https://huggingface.co/ diff --git a/comps/llms/src/text-generation/README_ollama.md b/comps/third_parties/ollama/README.md similarity index 76% rename from comps/llms/src/text-generation/README_ollama.md rename to comps/third_parties/ollama/README.md index c18c8febc5..c82e5c6cea 100644 --- a/comps/llms/src/text-generation/README_ollama.md +++ b/comps/third_parties/ollama/README.md @@ -53,22 +53,3 @@ curl --noproxy "*" http://localhost:11434/api/generate -d '{ "prompt":"Why is the sky blue?" 
 }'
-
-## Build Docker Image
-
-```bash
-cd ../../../../
-docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
-```
-
-## Run the Ollama Microservice
-
-```bash
-docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT="http://localhost:11434" -e LLM_MODEL_ID="llama3" opea/llm-textgen:latest
-```
-
-## Consume the Ollama Microservice
-
-```bash
-curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -H 'Content-Type: application/json'
-```
diff --git a/comps/third_parties/tgi/README.md b/comps/third_parties/tgi/README.md
new file mode 100644
index 0000000000..e12f6d34da
--- /dev/null
+++ b/comps/third_parties/tgi/README.md
@@ -0,0 +1,30 @@
+# TGI LLM Microservice
+
+[Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.
+
+## Start TGI with docker compose
+
+Set up the environment variables.
+
+```bash
+export LLM_ENDPOINT_PORT=8008
+export host_ip=${host_ip}
+export HF_TOKEN=${HF_TOKEN}
+export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export MAX_INPUT_TOKENS=1024
+export MAX_TOTAL_TOKENS=2048
+```
+
+Run TGI on Xeon.
+
+```bash
+cd deployment/docker_compose
+docker compose -f compose.yaml up tgi-server -d
+```
+
+Run TGI on Gaudi.
+
+```bash
+cd deployment/docker_compose
+docker compose -f compose.yaml up tgi-gaudi-server -d
+```
diff --git a/comps/third_parties/vllm/README.md b/comps/third_parties/vllm/README.md
index eb8dd4efbf..baf71b08b4 100644
--- a/comps/third_parties/vllm/README.md
+++ b/comps/third_parties/vllm/README.md
@@ -5,21 +5,17 @@
 ## πŸš€1. Set up Environment Variables
 
 ```bash
-export HF_TOKEN=
-export vLLM_ENDPOINT="http://${your_ip}:8008"
-export LLM_MODEL="meta-llama/Meta-Llama-3-8B-Instruct"
+export LLM_ENDPOINT_PORT=8008
+export host_ip=${host_ip}
+export HF_TOKEN=${HF_TOKEN}
+export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
+export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
 ```
 
 For gated models such as `LLAMA-2`, you will have to pass the environment HF_TOKEN. Please follow this link [huggingface token](https://huggingface.co/docs/hub/security-tokens) to get the access token and export `HF_TOKEN` environment with the token.
 
 ## πŸš€2. Set up vLLM Service
 
-First of all, go to the server folder for vllm.
-
-```bash
-cd dependency
-```
-
 ### 2.1 vLLM on CPU
 
 First let's enable VLLM on CPU.
@@ -32,7 +28,7 @@ bash ./build_docker_vllm.sh
 
 The `build_docker_vllm` accepts one parameter `hw_mode` to specify the hardware mode of the service, with the default being `cpu`, and the optional selection can be `hpu`.
 
-#### Launch vLLM service
+#### Launch vLLM service with scripts
 
 ```bash
 bash ./launch_vllm_service.sh
@@ -44,6 +40,13 @@ If you want to customize the port or model_name, can run:
 bash ./launch_vllm_service.sh ${port_number} ${model_name}
 ```
 
+#### Launch vLLM service with docker compose
+
+```bash
+cd deployment/docker_compose
+docker compose -f compose.yaml up vllm-server -d
+```
+
 ### 2.2 vLLM on Gaudi
 
 Then we show how to enable VLLM on Gaudi.
@@ -58,6 +61,15 @@ Set `hw_mode` to `hpu`.
 
 #### Launch vLLM service on single node
 
+1. Option 1: Use docker compose for a quick deploy.
+
+```bash
+cd deployment/docker_compose
+docker compose -f compose.yaml up vllm-gaudi-server -d
+```
+
+2. Option 2: Use the launch script to set parameters.
+
 For small model, we can just use single node.
 
 ```bash
diff --git a/comps/third_parties/vllm/deployment/docker_compose/launch_vllm_service.sh b/comps/third_parties/vllm/deployment/docker_compose/launch_vllm_service.sh
new file mode 100644
index 0000000000..83ecd67530
--- /dev/null
+++ b/comps/third_parties/vllm/deployment/docker_compose/launch_vllm_service.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Set default values
+default_port=8008
+default_model=$LLM_MODEL
+default_hw_mode="cpu"
+default_parallel_number=1
+default_block_size=128
+default_max_num_seqs=256
+default_max_seq_len_to_capture=2048
+
+# Assign arguments to variables
+port_number=${1:-$default_port}
+model_name=${2:-$default_model}
+hw_mode=${3:-$default_hw_mode}
+parallel_number=${4:-$default_parallel_number}
+block_size=${5:-$default_block_size}
+max_num_seqs=${6:-$default_max_num_seqs}
+max_seq_len_to_capture=${7:-$default_max_seq_len_to_capture}
+
+# Check that no more than the supported number of arguments are provided
+if [ "$#" -gt 7 ]; then
+    echo "Usage: $0 [port_number] [model_name] [hw_mode] [parallel_number] [block_size] [max_num_seqs] [max_seq_len_to_capture]"
+    echo "port_number: The port number assigned to the vLLM endpoint, with the default being 8008."
+    echo "model_name: The model name utilized for LLM, with the default taken from the LLM_MODEL environment variable."
+    echo "hw_mode: The hardware mode utilized for LLM, with the default set to 'cpu', and the optional selection can be 'hpu'"
+    echo "parallel_number: parallel nodes number for 'hpu' mode"
+    echo "block_size: default set to 128 for better performance on HPU"
+    echo "max_num_seqs: default set to 256 for better performance on HPU"
+    echo "max_seq_len_to_capture: default set to 2048 for better performance on HPU"
+    exit 1
+fi
+
+# Set the volume variable
+volume=$PWD/data
+
+# Build the Docker run command based on hardware mode
+if [ "$hw_mode" = "hpu" ]; then
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$http_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-gaudi:latest --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq-len-to-capture $max_seq_len_to_capture
+else
+    docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$http_proxy -e HF_TOKEN=${HF_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm-cpu:latest --model $model_name --host 0.0.0.0 --port 80
+fi
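+
+# Example usage (illustrative; the model name below is an assumption, not a default):
+#   bash launch_vllm_service.sh                                        # CPU endpoint on port 8008 serving $LLM_MODEL
+#   bash launch_vllm_service.sh 8008 Intel/neural-chat-7b-v3-3 hpu 1   # Gaudi endpoint with tensor-parallel size 1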