chore: Advance deepseek wideep and qwen-235b recipes to 1.0.1 TRTLLM version (#7479)

d669547a · Tanmay Verma · GitHub · 8bee4ac8 · d669547a · d669547a
Unverified Commit d669547a authored Mar 17, 2026 by Tanmay Verma Committed by GitHub Mar 17, 2026
5 changed files
--- a/recipes/deepseek-r1/model-cache/model-download.yaml
+++ b/recipes/deepseek-r1/model-cache/model-download.yaml
@@ -22,6 +22,13 @@ spec:
          env:
            - name: HF_HUB_ENABLE_HF_TRANSFER
              value: "1"
+            # Optional: Hugging Face token. Create the secret with: kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=<token> -n <namespace>
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: HF_TOKEN
+                  optional: true
          args:
            - |
              set -eux

--- a/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml
+++ b/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml
@@ -11,6 +11,13 @@
 #    kubectl apply -f deploy.yaml -n <namespace>
 # 4. To benchmark the service, run:
 #    kubectl apply -f perf.yaml -n <namespace>
+#
+# NOTE (empty /v1/models): If the frontend returns "data": [] from /v1/models, check the frontend logs.
+#    The frontend discovery watcher may treat the workers' --model-path (a local path such as
+#    /model-cache/deepseek-r1-fp4) as a HuggingFace model ID and fail with a 404. Mounting the
+#    model-cache volume on the Frontend and setting HF_HOME lets the frontend resolve local paths
+#    when the runtime supports it. Otherwise, pass a HuggingFace model ID as --model-path and set
+#    HF_HOME on the workers (with the model downloaded in the HF cache layout).
 # ConfigMap for prefill engine configuration
 # This configuration sets up a DEP 4 prefill worker
@@ -122,11 +129,17 @@ spec:
    Frontend:
      componentType: frontend
      replicas: 1
+      volumeMounts:
+        - name: model-cache
+          mountPoint: /model-cache
      extraPodSpec:
        tolerations: []
        affinity: {}
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
+          env:
+            - name: HF_HOME
+              value: /model-cache
          args:
          - |
            python3 -m dynamo.frontend --http-port 8000
@@ -158,7 +171,7 @@ spec:
        tolerations: []
        affinity: {}
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
          workingDir: /workspace/components/backends/trtllm
          # NOTE: If your PVCs (Persistent Volume Claims) are really slow,
          #       you might need to increase 'failureThreshold' below to allow more time for startup
@@ -216,7 +229,7 @@ spec:
        tolerations: []
        affinity: {}
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
          workingDir: /workspace/components/backends/trtllm
          # NOTE: If your PVCs (Persistent Volume Claims) are really slow,
          #       you might need to increase 'failureThreshold' below to allow more time for startup

--- a/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/perf.yaml
+++ b/recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/perf.yaml
@@ -13,6 +13,7 @@ spec:
      labels:
        app: deepseek-r1-bench
    spec:
+      tolerations: []
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:

--- a/recipes/qwen3-235b-a22b-fp8/trtllm/agg/deploy.yaml
+++ b/recipes/qwen3-235b-a22b-fp8/trtllm/agg/deploy.yaml
@@ -42,6 +42,7 @@ spec:
      componentType: frontend
      replicas: 1
      extraPodSpec:
+        tolerations: []
        affinity:
          podAntiAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
@@ -53,7 +54,7 @@ spec:
                    - qwen3-235b-a22b-agg-frontend
              topologyKey: kubernetes.io/hostname
        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
          args:
            - python3 -m dynamo.frontend --router-mode kv --http-port 8000
          command:
@@ -65,6 +66,7 @@ spec:
      sharedMemory:
        size: 256Gi
      extraPodSpec:
+        tolerations: []
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
@@ -94,7 +96,7 @@ spec:
              --max-num-tokens 8192 \
              --max-seq-len 8192 \
              --extra-engine-args "${ENGINE_ARGS}"
-          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
+          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
          workingDir: /workspace/components/backends/trtllm
          volumeMounts:
            - name: agg-config

--- a/recipes/qwen3-235b-a22b-fp8/trtllm/disagg/deploy.yaml
+++ b/recipes/qwen3-235b-a22b-fp8/trtllm/disagg/deploy.yaml
@@ -24,6 +24,9 @@ data:
      max_batch_size: 2
    disable_overlap_scheduler: true
    print_iter_log: false
+    moe_config:
+      backend: DEEPGEMM
+      max_num_tokens: 8192
 ---
 apiVersion: v1
 kind: ConfigMap
@@ -49,6 +52,9 @@ data:
      max_batch_size: 512
    disable_overlap_scheduler: false
    print_iter_log: false
+    moe_config:
+      backend: DEEPGEMM
+      max_num_tokens: 8192
 ---
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
@@ -64,6 +70,7 @@ spec:
      componentType: frontend
      replicas: 1
      extraPodSpec:
+        tolerations: []
        affinity:
          podAntiAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
@@ -92,6 +99,7 @@ spec:
      sharedMemory:
        size: 256Gi
      extraPodSpec:
+        tolerations: []
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
@@ -147,6 +155,7 @@ spec:
      sharedMemory:
        size: 256Gi
      extraPodSpec:
+        tolerations: []
        affinity:
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution: