Unverified commit d669547a, authored by Tanmay Verma and committed by GitHub
Browse files

chore: Advance deepseek wideep and qwen-235b recipes to 1.0.1 TRTLLM version (#7479)

parent 8bee4ac8
......@@ -22,6 +22,13 @@ spec:
env:
- name: HF_HUB_ENABLE_HF_TRANSFER
value: "1"
# Optional: create with: kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=<token> -n <namespace>
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: HF_TOKEN
optional: true
args:
- |
set -eux
......
......@@ -11,6 +11,13 @@
# kubectl apply -f deploy.yaml -n <namespace>
# 4. To benchmark the service, run:
# kubectl apply -f perf.yaml -n <namespace>
#
# NOTE (empty /v1/models): If the frontend returns "data": [] from /v1/models, check the frontend logs.
# The frontend discovery watcher may treat the workers' --model-path (a local path such as
# /model-cache/deepseek-r1-fp4) as a HuggingFace model ID and fail with a 404. Mounting the
# model-cache volume on the Frontend and setting HF_HOME to the mount point lets the frontend
# resolve local paths when the runtime supports it. Otherwise, pass a HuggingFace model ID as
# --model-path and set HF_HOME on the workers (with the model already downloaded in the HF cache layout).
# ConfigMap for prefill engine configuration
# This configuration sets up a DEP 4 prefill worker
......@@ -122,11 +129,17 @@ spec:
Frontend:
componentType: frontend
replicas: 1
volumeMounts:
- name: model-cache
mountPoint: /model-cache
extraPodSpec:
tolerations: []
affinity: {}
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
env:
- name: HF_HOME
value: /model-cache
args:
- |
python3 -m dynamo.frontend --http-port 8000
......@@ -158,7 +171,7 @@ spec:
tolerations: []
affinity: {}
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
workingDir: /workspace/components/backends/trtllm
# NOTE: If your PVCs (Persistent Volume Claims) are really slow,
# you might need to increase 'failureThreshold' below to allow more time for startup
......@@ -216,7 +229,7 @@ spec:
tolerations: []
affinity: {}
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
workingDir: /workspace/components/backends/trtllm
# NOTE: If your PVCs (Persistent Volume Claims) are really slow,
# you might need to increase 'failureThreshold' below to allow more time for startup
......
......@@ -13,6 +13,7 @@ spec:
labels:
app: deepseek-r1-bench
spec:
tolerations: []
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
......
......@@ -42,6 +42,7 @@ spec:
componentType: frontend
replicas: 1
extraPodSpec:
tolerations: []
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
......@@ -53,7 +54,7 @@ spec:
- qwen3-235b-a22b-agg-frontend
topologyKey: kubernetes.io/hostname
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
args:
- python3 -m dynamo.frontend --router-mode kv --http-port 8000
command:
......@@ -65,6 +66,7 @@ spec:
sharedMemory:
size: 256Gi
extraPodSpec:
tolerations: []
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
......@@ -94,7 +96,7 @@ spec:
--max-num-tokens 8192 \
--max-seq-len 8192 \
--extra-engine-args "${ENGINE_ARGS}"
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
workingDir: /workspace/components/backends/trtllm
volumeMounts:
- name: agg-config
......
......@@ -24,6 +24,9 @@ data:
max_batch_size: 2
disable_overlap_scheduler: true
print_iter_log: false
moe_config:
backend: DEEPGEMM
max_num_tokens: 8192
---
apiVersion: v1
kind: ConfigMap
......@@ -49,6 +52,9 @@ data:
max_batch_size: 512
disable_overlap_scheduler: false
print_iter_log: false
moe_config:
backend: DEEPGEMM
max_num_tokens: 8192
---
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
......@@ -64,6 +70,7 @@ spec:
componentType: frontend
replicas: 1
extraPodSpec:
tolerations: []
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
......@@ -92,6 +99,7 @@ spec:
sharedMemory:
size: 256Gi
extraPodSpec:
tolerations: []
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
......@@ -147,6 +155,7 @@ spec:
sharedMemory:
size: 256Gi
extraPodSpec:
tolerations: []
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment