Unverified Commit f8b0a5a9 authored by Tanmay Verma's avatar Tanmay Verma Committed by GitHub
Browse files

feat: Add trtllm deploy examples for k8s (#2133)


Co-authored-by: default avatarBiswa Panda <biswa.panda@gmail.com>
parent f3868b1f
......@@ -185,6 +185,65 @@ For comprehensive instructions on multinode serving, see the [multinode-examples
### Speculative Decoding
- **[Llama 4 Maverick Instruct + Eagle Speculative Decoding](./llama4_plus_eagle.md)**
### Kubernetes Deployment
For Kubernetes deployment, YAML manifests are provided in the `deploy/` directory. These define DynamoGraphDeployment resources for various configurations:
- `agg.yaml` - Aggregated serving
- `agg_router.yaml` - Aggregated serving with KV routing
- `disagg.yaml` - Disaggregated serving
- `disagg_router.yaml` - Disaggregated serving with KV routing
#### Prerequisites
- **Dynamo Cloud**: Follow the [Quickstart Guide](../../../docs/guides/dynamo_deploy/quickstart.md) to deploy Dynamo Cloud first.
- **Container Images**: The deployment files currently require access to `nvcr.io/nvidian/nim-llm-dev/trtllm-runtime`. If you don't have access, build and push your own image:
```bash
./container/build.sh --framework tensorrtllm
# Tag and push to your container registry
# Update the image references in the YAML files
```
- **Port Forwarding**: After deployment, forward the frontend service to access the API:
```bash
kubectl port-forward deployment/trtllm-v1-disagg-frontend-<pod-uuid-info> 8080:8000
```
#### Deploy to Kubernetes
Example with disagg:
Export the NAMESPACE you used in your Dynamo Cloud Installation.
```bash
cd dynamo
cd components/backends/trtllm/deploy
kubectl apply -f disagg.yaml -n $NAMESPACE
```
To change `DYN_LOG` level, edit the yaml file by adding
```yaml
...
spec:
envs:
- name: DYN_LOG
value: "debug" # or other log levels
...
```
### Client
See [client](../llm/README.md#client) section to learn how to send request to the deployment.
NOTE: To send a request to a multi-node deployment, target the node which is running `dynamo-run in=http`.
### Benchmarking
To benchmark your deployment with GenAI-Perf, see this utility script, configuring the
`model` name and `host` based on your deployment: [perf.sh](../../benchmarks/llm/perf.sh)
## Disaggregation Strategy
The disaggregation strategy controls how requests are distributed between the prefill and decode workers in a disaggregated deployment.
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: trtllm-agg
spec:
services:
Frontend:
dynamoNamespace: trtllm-agg
componentType: main
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 10
replicas: 1
resources:
requests:
cpu: "5"
memory: "10Gi"
limits:
cpu: "5"
memory: "10Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.frontend --http-port 8000"
TRTLLMWorker:
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
dynamoNamespace: trtllm-agg
componentType: worker
replicas: 1
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
args:
- "python3"
- "-m"
- "dynamo.trtllm"
- "--model-path"
- "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
- "--served-model-name"
- "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
- "--extra-engine-args"
- "engine_configs/agg.yaml"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: trtllm-agg-router
spec:
services:
Frontend:
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 5
dynamoNamespace: trtllm-agg-router
componentType: main
replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.frontend --http-port 8000 --router-mode kv"
TRTLLMWorker:
envFromSecret: hf-token-secret
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
dynamoNamespace: trtllm-agg-router
componentType: worker
replicas: 2
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
args:
- "python3"
- "-m"
- "dynamo.trtllm"
- "--model-path"
- "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
- "--served-model-name"
- "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
- "--extra-engine-args"
- "engine_configs/agg.yaml"
- "--publish-events-and-metrics"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: trtllm-disagg
spec:
services:
Frontend:
dynamoNamespace: trtllm-disagg
componentType: main
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 10
replicas: 1
resources:
requests:
cpu: "5"
memory: "10Gi"
limits:
cpu: "5"
memory: "10Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.frontend --http-port 8000"
TRTLLMPrefillWorker:
dynamoNamespace: trtllm-disagg
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first 2>&1 | tee /tmp/trtllm.log"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
TRTLLMDecodeWorker:
dynamoNamespace: trtllm-disagg
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first 2>&1 | tee /tmp/trtllm.log"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: trtllm-v1-disagg-router
spec:
services:
Frontend:
dynamoNamespace: trtllm-v1-disagg-router
componentType: main
livenessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/sh
- -c
- 'curl -s http://localhost:8000/health | jq -e ".status == \"healthy\""'
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 3
failureThreshold: 10
replicas: 1
resources:
requests:
cpu: "5"
memory: "10Gi"
limits:
cpu: "5"
memory: "10Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.frontend --http-port 8000 --router-mode kv"
TRTLLMPrefillWorker:
dynamoNamespace: trtllm-v1-disagg-router
envFromSecret: hf-token-secret
componentType: worker
replicas: 2
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics 2>&1 | tee /tmp/trtllm.log"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
TRTLLMDecodeWorker:
dynamoNamespace: trtllm-v1-disagg-router
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
livenessProbe:
httpGet:
path: /live
port: 9090
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/trtllm-runtime:dep-233.17
workingDir: /workspace/components/backends/trtllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first 2>&1 | tee /tmp/trtllm.log"
envs:
- name: DYN_SYSTEM_ENABLED
value: "true"
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
value: "[\"generate\"]"
- name: DYN_SYSTEM_PORT
value: "9090"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment