Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
d669547a
Unverified
Commit
d669547a
authored
Mar 17, 2026
by
Tanmay Verma
Committed by
GitHub
Mar 17, 2026
Browse files
chore: Advance deepseek wideep and qwen-235b recipes to 1.0.1 TRTLLM version (#7479)
parent
8bee4ac8
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
37 additions
and
5 deletions
+37
-5
recipes/deepseek-r1/model-cache/model-download.yaml
recipes/deepseek-r1/model-cache/model-download.yaml
+7
-0
recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml
recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml
+16
-3
recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/perf.yaml
recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/perf.yaml
+1
-0
recipes/qwen3-235b-a22b-fp8/trtllm/agg/deploy.yaml
recipes/qwen3-235b-a22b-fp8/trtllm/agg/deploy.yaml
+4
-2
recipes/qwen3-235b-a22b-fp8/trtllm/disagg/deploy.yaml
recipes/qwen3-235b-a22b-fp8/trtllm/disagg/deploy.yaml
+9
-0
No files found.
recipes/deepseek-r1/model-cache/model-download.yaml
View file @
d669547a
...
...
@@ -22,6 +22,13 @@ spec:
env
:
-
name
:
HF_HUB_ENABLE_HF_TRANSFER
value
:
"
1"
# Optional: create the secret with `kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=<token> -n <namespace>`
-
name
:
HF_TOKEN
valueFrom
:
secretKeyRef
:
name
:
hf-token-secret
key
:
HF_TOKEN
optional
:
true
args
:
-
|
set -eux
...
...
recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml
View file @
d669547a
...
...
@@ -11,6 +11,13 @@
# kubectl apply -f deploy.yaml -n <namespace>
# 4. To benchmark the service, run:
# kubectl apply -f perf.yaml -n <namespace>
#
# NOTE (empty /v1/models): If the frontend returns "data": [] from /v1/models, check the frontend logs.
# The frontend discovery watcher may treat the workers' --model-path (a local path like
# /model-cache/deepseek-r1-fp4) as a HuggingFace model ID and fail with a 404. Mounting the
# model-cache on the Frontend and setting HF_HOME allows the frontend to resolve local paths when
# the runtime supports it. Otherwise, use a HuggingFace model ID for --model-path and set HF_HOME
# on the workers (with the model downloaded in the HF cache layout).
# ConfigMap for prefill engine configuration
# This configuration sets up a DEP 4 prefill worker
...
...
@@ -122,11 +129,17 @@ spec:
Frontend
:
componentType
:
frontend
replicas
:
1
volumeMounts
:
-
name
:
model-cache
mountPoint
:
/model-cache
extraPodSpec
:
tolerations
:
[]
affinity
:
{}
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.0
image
:
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
env
:
-
name
:
HF_HOME
value
:
/model-cache
args
:
-
|
python3 -m dynamo.frontend --http-port 8000
...
...
@@ -158,7 +171,7 @@ spec:
tolerations
:
[]
affinity
:
{}
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:
0.8.0
image
:
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:
1.0.1
workingDir
:
/workspace/components/backends/trtllm
# NOTE: If your PVCs (Persistent Volume Claims) are really slow,
# you might need to increase 'failureThreshold' below to allow more time for startup
...
...
@@ -216,7 +229,7 @@ spec:
tolerations
:
[]
affinity
:
{}
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:
0.8.0
image
:
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:
1.0.1
workingDir
:
/workspace/components/backends/trtllm
# NOTE: If your PVCs (Persistent Volume Claims) are really slow,
# you might need to increase 'failureThreshold' below to allow more time for startup
...
...
recipes/deepseek-r1/trtllm/disagg/wide_ep/gb200/perf.yaml
View file @
d669547a
...
...
@@ -13,6 +13,7 @@ spec:
labels
:
app
:
deepseek-r1-bench
spec
:
tolerations
:
[]
affinity
:
podAntiAffinity
:
requiredDuringSchedulingIgnoredDuringExecution
:
...
...
recipes/qwen3-235b-a22b-fp8/trtllm/agg/deploy.yaml
View file @
d669547a
...
...
@@ -42,6 +42,7 @@ spec:
componentType
:
frontend
replicas
:
1
extraPodSpec
:
tolerations
:
[]
affinity
:
podAntiAffinity
:
requiredDuringSchedulingIgnoredDuringExecution
:
...
...
@@ -53,7 +54,7 @@ spec:
-
qwen3-235b-a22b-agg-frontend
topologyKey
:
kubernetes.io/hostname
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:
0.8.0
image
:
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:
1.0.1
args
:
-
python3 -m dynamo.frontend --router-mode kv --http-port
8000
command
:
...
...
@@ -65,6 +66,7 @@ spec:
sharedMemory
:
size
:
256Gi
extraPodSpec
:
tolerations
:
[]
affinity
:
nodeAffinity
:
requiredDuringSchedulingIgnoredDuringExecution
:
...
...
@@ -94,7 +96,7 @@ spec:
--max-num-tokens 8192 \
--max-seq-len 8192 \
--extra-engine-args "${ENGINE_ARGS}"
image
:
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:
0.8.0
image
:
nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:
1.0.1
workingDir
:
/workspace/components/backends/trtllm
volumeMounts
:
-
name
:
agg-config
...
...
recipes/qwen3-235b-a22b-fp8/trtllm/disagg/deploy.yaml
View file @
d669547a
...
...
@@ -24,6 +24,9 @@ data:
max_batch_size: 2
disable_overlap_scheduler: true
print_iter_log: false
moe_config:
backend: DEEPGEMM
max_num_tokens: 8192
---
apiVersion
:
v1
kind
:
ConfigMap
...
...
@@ -49,6 +52,9 @@ data:
max_batch_size: 512
disable_overlap_scheduler: false
print_iter_log: false
moe_config:
backend: DEEPGEMM
max_num_tokens: 8192
---
apiVersion
:
nvidia.com/v1alpha1
kind
:
DynamoGraphDeployment
...
...
@@ -64,6 +70,7 @@ spec:
componentType
:
frontend
replicas
:
1
extraPodSpec
:
tolerations
:
[]
affinity
:
podAntiAffinity
:
requiredDuringSchedulingIgnoredDuringExecution
:
...
...
@@ -92,6 +99,7 @@ spec:
sharedMemory
:
size
:
256Gi
extraPodSpec
:
tolerations
:
[]
affinity
:
nodeAffinity
:
requiredDuringSchedulingIgnoredDuringExecution
:
...
...
@@ -147,6 +155,7 @@ spec:
sharedMemory
:
size
:
256Gi
extraPodSpec
:
tolerations
:
[]
affinity
:
nodeAffinity
:
requiredDuringSchedulingIgnoredDuringExecution
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment