Unverified commit d669547a, authored by Tanmay Verma and committed by GitHub
Browse files

chore: Advance deepseek wideep and qwen-235b recipes to 1.0.1 TRTLLM version (#7479)

parent 8bee4ac8
...@@ -22,6 +22,13 @@ spec: ...@@ -22,6 +22,13 @@ spec:
env: env:
- name: HF_HUB_ENABLE_HF_TRANSFER - name: HF_HUB_ENABLE_HF_TRANSFER
value: "1" value: "1"
# Optional: create with: kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=<token> -n <namespace>
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: HF_TOKEN
optional: true
args: args:
- | - |
set -eux set -eux
......
...@@ -11,6 +11,13 @@ ...@@ -11,6 +11,13 @@
# kubectl apply -f deploy.yaml -n <namespace> # kubectl apply -f deploy.yaml -n <namespace>
# 4. To benchmark the service, run: # 4. To benchmark the service, run:
# kubectl apply -f perf.yaml -n <namespace> # kubectl apply -f perf.yaml -n <namespace>
#
# NOTE (empty /v1/models): If the frontend returns "data": [] from /v1/models, check frontend logs.
# The frontend discovery watcher may treat the workers' --model-path (a local path like
# /model-cache/deepseek-r1-fp4) as a HuggingFace model ID and fail with 404. Mounting the
# model-cache volume on the Frontend and pointing HF_HOME at it lets the frontend resolve
# local paths when the runtime supports it. Otherwise, use a HuggingFace model ID for
# --model-path and set HF_HOME on the workers (with the model downloaded in the HF cache layout).
# ConfigMap for prefill engine configuration # ConfigMap for prefill engine configuration
# This configuration sets up a DEP 4 prefill worker # This configuration sets up a DEP 4 prefill worker
...@@ -122,11 +129,17 @@ spec: ...@@ -122,11 +129,17 @@ spec:
Frontend: Frontend:
componentType: frontend componentType: frontend
replicas: 1 replicas: 1
volumeMounts:
- name: model-cache
mountPoint: /model-cache
extraPodSpec: extraPodSpec:
tolerations: [] tolerations: []
affinity: {} affinity: {}
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0 image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
env:
- name: HF_HOME
value: /model-cache
args: args:
- | - |
python3 -m dynamo.frontend --http-port 8000 python3 -m dynamo.frontend --http-port 8000
...@@ -158,7 +171,7 @@ spec: ...@@ -158,7 +171,7 @@ spec:
tolerations: [] tolerations: []
affinity: {} affinity: {}
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0 image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
# NOTE: If your PVCs (Persistent Volume Claims) are really slow, # NOTE: If your PVCs (Persistent Volume Claims) are really slow,
# you might need to increase 'failureThreshold' below to allow more time for startup # you might need to increase 'failureThreshold' below to allow more time for startup
...@@ -216,7 +229,7 @@ spec: ...@@ -216,7 +229,7 @@ spec:
tolerations: [] tolerations: []
affinity: {} affinity: {}
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0 image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
# NOTE: If your PVCs (Persistent Volume Claims) are really slow, # NOTE: If your PVCs (Persistent Volume Claims) are really slow,
# you might need to increase 'failureThreshold' below to allow more time for startup # you might need to increase 'failureThreshold' below to allow more time for startup
......
...@@ -13,6 +13,7 @@ spec: ...@@ -13,6 +13,7 @@ spec:
labels: labels:
app: deepseek-r1-bench app: deepseek-r1-bench
spec: spec:
tolerations: []
affinity: affinity:
podAntiAffinity: podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution: requiredDuringSchedulingIgnoredDuringExecution:
......
...@@ -42,6 +42,7 @@ spec: ...@@ -42,6 +42,7 @@ spec:
componentType: frontend componentType: frontend
replicas: 1 replicas: 1
extraPodSpec: extraPodSpec:
tolerations: []
affinity: affinity:
podAntiAffinity: podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution: requiredDuringSchedulingIgnoredDuringExecution:
...@@ -53,7 +54,7 @@ spec: ...@@ -53,7 +54,7 @@ spec:
- qwen3-235b-a22b-agg-frontend - qwen3-235b-a22b-agg-frontend
topologyKey: kubernetes.io/hostname topologyKey: kubernetes.io/hostname
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0 image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
args: args:
- python3 -m dynamo.frontend --router-mode kv --http-port 8000 - python3 -m dynamo.frontend --router-mode kv --http-port 8000
command: command:
...@@ -65,6 +66,7 @@ spec: ...@@ -65,6 +66,7 @@ spec:
sharedMemory: sharedMemory:
size: 256Gi size: 256Gi
extraPodSpec: extraPodSpec:
tolerations: []
affinity: affinity:
nodeAffinity: nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution: requiredDuringSchedulingIgnoredDuringExecution:
...@@ -94,7 +96,7 @@ spec: ...@@ -94,7 +96,7 @@ spec:
--max-num-tokens 8192 \ --max-num-tokens 8192 \
--max-seq-len 8192 \ --max-seq-len 8192 \
--extra-engine-args "${ENGINE_ARGS}" --extra-engine-args "${ENGINE_ARGS}"
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0 image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
workingDir: /workspace/components/backends/trtllm workingDir: /workspace/components/backends/trtllm
volumeMounts: volumeMounts:
- name: agg-config - name: agg-config
......
...@@ -24,6 +24,9 @@ data: ...@@ -24,6 +24,9 @@ data:
max_batch_size: 2 max_batch_size: 2
disable_overlap_scheduler: true disable_overlap_scheduler: true
print_iter_log: false print_iter_log: false
moe_config:
backend: DEEPGEMM
max_num_tokens: 8192
--- ---
apiVersion: v1 apiVersion: v1
kind: ConfigMap kind: ConfigMap
...@@ -49,6 +52,9 @@ data: ...@@ -49,6 +52,9 @@ data:
max_batch_size: 512 max_batch_size: 512
disable_overlap_scheduler: false disable_overlap_scheduler: false
print_iter_log: false print_iter_log: false
moe_config:
backend: DEEPGEMM
max_num_tokens: 8192
--- ---
apiVersion: nvidia.com/v1alpha1 apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment kind: DynamoGraphDeployment
...@@ -64,6 +70,7 @@ spec: ...@@ -64,6 +70,7 @@ spec:
componentType: frontend componentType: frontend
replicas: 1 replicas: 1
extraPodSpec: extraPodSpec:
tolerations: []
affinity: affinity:
podAntiAffinity: podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution: requiredDuringSchedulingIgnoredDuringExecution:
...@@ -92,6 +99,7 @@ spec: ...@@ -92,6 +99,7 @@ spec:
sharedMemory: sharedMemory:
size: 256Gi size: 256Gi
extraPodSpec: extraPodSpec:
tolerations: []
affinity: affinity:
nodeAffinity: nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution: requiredDuringSchedulingIgnoredDuringExecution:
...@@ -147,6 +155,7 @@ spec: ...@@ -147,6 +155,7 @@ spec:
sharedMemory: sharedMemory:
size: 256Gi size: 256Gi
extraPodSpec: extraPodSpec:
tolerations: []
affinity: affinity:
nodeAffinity: nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution: requiredDuringSchedulingIgnoredDuringExecution:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment