-
Notifications
You must be signed in to change notification settings - Fork 796
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Docker compose scripts for remote adapters (#241)
* tgi docker compose * path * wait for tgi server to start before starting server * update provider-id * move scripts to distribution/ folder * add readme * readme
- Loading branch information
Showing
3 changed files
with
129 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Docker Compose Scripts | ||
|
||
This folder contains scripts to enable starting a distribution using `docker compose`. | ||
|
||
|
||
#### Example: TGI Inference Adapter | ||
``` | ||
$ cd llama_stack/distribution/docker/tgi | ||
$ ls | ||
compose.yaml tgi-run.yaml | ||
$ docker compose up | ||
``` | ||
|
||
Running `docker compose up` first starts the TGI server and then starts the Llama Stack distribution server, which connects to TGI as its remote inference provider. You should see output similar to the following:
``` | ||
[text-generation-inference] | 2024-10-15T18:56:33.810397Z INFO text_generation_router::server: router/src/server.rs:1813: Using config Some(Llama) | ||
[text-generation-inference] | 2024-10-15T18:56:33.810448Z WARN text_generation_router::server: router/src/server.rs:1960: Invalid hostname, defaulting to 0.0.0.0 | ||
[text-generation-inference] | 2024-10-15T18:56:33.864143Z INFO text_generation_router::server: router/src/server.rs:2353: Connected | ||
INFO: Started server process [1] | ||
INFO: Waiting for application startup. | ||
INFO: Application startup complete. | ||
INFO: Uvicorn running on http://[::]:5000 (Press CTRL+C to quit) | ||
``` | ||
|
||
To stop the servers and remove their containers:
``` | ||
docker compose down | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# Docker Compose stack: a TGI (text-generation-inference) server plus a
# Llama Stack distribution server that uses it as its remote inference
# provider. Both services use host networking, so they bind directly to host
# ports 5009 (TGI) and 5000 (Llama Stack).
services:
  text-generation-inference:
    image: ghcr.io/huggingface/text-generation-inference:latest
    # Host networking: the server listens on the host's port 5009 directly.
    # No `ports:` mapping is declared because port mappings must not be
    # combined with `network_mode: host`.
    network_mode: "host"
    volumes:
      # Mounts the host's Hugging Face cache directory at /data.
      - $HOME/.cache/huggingface:/data
    devices:
      - nvidia.com/gpu=all
    environment:
      - CUDA_VISIBLE_DEVICES=0
      # All Hugging Face cache locations point at the mounted /data volume.
      - HF_HOME=/data
      - HF_DATASETS_CACHE=/data
      - HF_MODULES_CACHE=/data
      - HF_HUB_CACHE=/data
    # Every argument is a quoted string so that values such as "on" and
    # "false" are not turned into YAML booleans.
    command:
      - "--dtype"
      - "bfloat16"
      - "--usage-stats"
      - "on"
      - "--sharded"
      - "false"
      - "--model-id"
      - "meta-llama/Llama-3.1-8B-Instruct"
      - "--port"
      - "5009"
      - "--cuda-memory-fraction"
      - "0.3"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # that's the closest analogue to --gpus; provide
              # an integer amount of devices or 'all'
              count: 1
              # Devices are reserved using a list of capabilities, making
              # capabilities the only required field. A device MUST
              # satisfy all the requested capabilities for a successful
              # reservation.
              capabilities: [gpu]
    runtime: nvidia
    healthcheck:
      # The probe runs inside this container, so `localhost` reaches the
      # server regardless of network mode; the Compose service name is not
      # expected to resolve under host networking.
      test: ["CMD", "curl", "-f", "http://localhost:5009/health"]
      interval: 5s
      timeout: 5s
      retries: 30

  llamastack-local-cpu:
    # Start only after the TGI healthcheck above reports healthy.
    depends_on:
      text-generation-inference:
        condition: service_healthy
    image: llamastack-local-cpu
    # Host networking: the server listens on the host's port 5000 directly,
    # so no `ports:` mapping is declared (see the note on the TGI service).
    network_mode: "host"
    volumes:
      - ~/.llama:/root/.llama
      # Link to TGI run.yaml file
      - ./tgi-run.yaml:/root/llamastack-run-tgi.yaml
    # Hack: wait for TGI server to start before starting docker
    entrypoint: bash -c "sleep 60; python -m llama_stack.distribution.server.server --yaml_config /root/llamastack-run-tgi.yaml"
    deploy:
      # `restart_policy` is a `deploy` sub-key in the Compose specification;
      # it is not a valid service-level attribute.
      restart_policy:
        condition: on-failure
        delay: 3s
        max_attempts: 5
        window: 60s
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Llama Stack run configuration for a distribution whose inference provider
# is a remote TGI server; every other provider below is `meta-reference`.
version: "2"
built_at: "2024-10-08T17:40:45.325529"
image_name: local
docker_image: null
conda_env: local

apis:
  - shields
  - agents
  - models
  - memory
  - memory_banks
  - inference
  - safety

providers:
  inference:
    - provider_id: tgi0
      provider_type: remote::tgi
      config:
        # Port 5009 matches the `--port` passed to the TGI container in
        # compose.yaml.
        url: http://127.0.0.1:5009

  safety:
    - provider_id: meta0
      provider_type: meta-reference
      config:
        llama_guard_shield:
          model: Llama-Guard-3-1B
          excluded_categories: []
          disable_input_check: false
          disable_output_check: false
        prompt_guard_shield:
          model: Prompt-Guard-86M

  memory:
    - provider_id: meta0
      provider_type: meta-reference
      config: {}

  agents:
    - provider_id: meta0
      provider_type: meta-reference
      config:
        persistence_store:
          namespace: null
          type: sqlite
          # Quoted so the leading `~` is always read as part of a string path.
          db_path: "~/.llama/runtime/kvstore.db"

  telemetry:
    - provider_id: meta0
      provider_type: meta-reference
      config: {}