Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
157714aa
Unverified
Commit
157714aa
authored
Jul 29, 2025
by
Hongkuan Zhou
Committed by
GitHub
Jul 29, 2025
Browse files
chore: add instructions to modify SLA to profile_sla doc; update component name (#2167)
parent
8248a116
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
93 additions
and
47 deletions
+93
-47
benchmarks/profiler/deploy/profile_sla_job.yaml
benchmarks/profiler/deploy/profile_sla_job.yaml
+14
-5
benchmarks/profiler/utils/config.py
benchmarks/profiler/utils/config.py
+26
-26
components/backends/vllm/deploy/disagg_planner.yaml
components/backends/vllm/deploy/disagg_planner.yaml
+2
-2
components/planner/src/dynamo/planner/defaults.py
components/planner/src/dynamo/planner/defaults.py
+4
-2
components/planner/src/dynamo/planner/utils/planner_core.py
components/planner/src/dynamo/planner/utils/planner_core.py
+16
-4
docs/architecture/pre_deployment_profiling.md
docs/architecture/pre_deployment_profiling.md
+31
-8
No files found.
benchmarks/profiler/deploy/profile_sla_job.yaml
View file @
157714aa
...
...
@@ -14,11 +14,8 @@ spec:
image
:
${DOCKER_IMAGE}
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
2"
memory
:
"
4Gi"
cpu
:
"
16"
memory
:
"
10Gi"
env
:
-
name
:
HUGGING_FACE_HUB_TOKEN
valueFrom
:
...
...
@@ -37,6 +34,18 @@ spec:
-
/workspace/profiling_results
-
--namespace
-
${NAMESPACE}
-
--min-num-gpus-per-engine
-
"
1"
-
--max-num-gpus-per-engine
-
"
8"
-
--isl
-
"
3000"
-
--osl
-
"
150"
-
--ttft
-
"
200"
-
--itl
-
"
20"
volumeMounts
:
-
name
:
output-volume
mountPath
:
/workspace/profiling_results
...
...
benchmarks/profiler/utils/config.py
View file @
157714aa
...
...
@@ -89,16 +89,16 @@ class VllmV1ConfigModifier:
if
target
==
"prefill"
:
# convert prefill worker into decode worker
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
]
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
_k8s_name
]
del
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
_k8s_name
]
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
break_arguments
(
args
)
...
...
@@ -112,18 +112,18 @@ class VllmV1ConfigModifier:
if
"--no-enable-prefix-caching"
not
in
args
:
args
=
append_argument
(
args
,
"--no-enable-prefix-caching"
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
elif
target
==
"decode"
:
# delete prefill worker
del
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
_k8s_name
]
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
break_arguments
(
args
)
...
...
@@ -134,13 +134,13 @@ class VllmV1ConfigModifier:
if
"--no-enable-prefix-caching"
in
args
:
args
.
remove
(
"--no-enable-prefix-caching"
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
# set num workers to 1
decode_worker_config
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
]
decode_worker_config
[
"replicas"
]
=
1
...
...
@@ -150,16 +150,16 @@ class VllmV1ConfigModifier:
def
set_config_tp_size
(
cls
,
config
:
dict
,
tp_size
:
int
):
config
=
deepcopy
(
config
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"resources"
][
"requests"
][
"gpu"
]
=
str
(
tp_size
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"resources"
][
"limits"
][
"gpu"
]
=
str
(
tp_size
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"resources"
][
"requests"
][
"gpu"
]
=
str
(
tp_size
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"resources"
][
"limits"
][
"gpu"
]
=
str
(
tp_size
)
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
break_arguments
(
args
)
...
...
@@ -169,15 +169,15 @@ class VllmV1ConfigModifier:
except
ValueError
:
args
=
append_argument
(
args
,
[
"--tensor-parallel-size"
,
str
(
tp_size
)])
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
return
config
@
classmethod
def
get_model_name
(
cls
,
config
:
dict
)
->
str
:
worker_name
=
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
worker_name
=
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
args
=
config
[
"spec"
][
"services"
][
worker_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
...
...
components/backends/vllm/deploy/disagg_planner.yaml
View file @
157714aa
...
...
@@ -141,7 +141,7 @@ spec:
-
-c
args
:
-
"
python3
-m
dynamo.planner.prometheus"
backend
:
VllmDecodeWorker
:
dynamoNamespace
:
vllm-disagg-planner
envFromSecret
:
hf-token-secret
componentType
:
worker
...
...
@@ -191,7 +191,7 @@ spec:
-
-c
args
:
-
"
python3
-m
dynamo.vllm
--model
Qwen/Qwen3-0.6B
2>&1
|
tee
/tmp/vllm.log"
prefill
:
VllmPrefillWorker
:
dynamoNamespace
:
vllm-disagg-planner
envFromSecret
:
hf-token-secret
componentType
:
worker
...
...
components/planner/src/dynamo/planner/defaults.py
View file @
157714aa
...
...
@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):
class
VllmComponentName
:
prefill_worker
=
"prefill"
prefill_worker_k8s_name
=
"VllmPrefillWorker"
prefill_worker_component_name
=
"prefill"
prefill_worker_endpoint
=
"generate"
decode_worker
=
"backend"
decode_worker_k8s_name
=
"VllmDecodeWorker"
decode_worker_component_name
=
"backend"
decode_worker_endpoint
=
"generate"
...
...
components/planner/src/dynamo/planner/utils/planner_core.py
View file @
157714aa
...
...
@@ -106,7 +106,11 @@ class Planner:
if
self
.
prefill_client
is
None
:
self
.
prefill_client
=
(
await
self
.
runtime
.
namespace
(
self
.
namespace
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker_component_name
)
.
endpoint
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
...
...
@@ -127,7 +131,11 @@ class Planner:
if
self
.
workers_client
is
None
:
self
.
workers_client
=
(
await
self
.
runtime
.
namespace
(
self
.
namespace
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_component_name
)
.
endpoint
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_endpoint
)
...
...
@@ -300,8 +308,12 @@ class Planner:
if
not
self
.
args
.
no_operation
:
target_replicas
=
{
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
:
next_num_p
,
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
:
next_num_d
,
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker_k8s_name
:
next_num_p
,
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_k8s_name
:
next_num_d
,
}
await
self
.
connector
.
set_component_replicas
(
target_replicas
,
blocking
=
False
)
...
...
docs/architecture/pre_deployment_profiling.md
View file @
157714aa
...
...
@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
# in the project's root folder
./container/build.sh
--framework
VLLM
# Tag and push to your container registry
export
DOCKER_IMAGE
=
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
# or your own Dynamo image
# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
# Modify this yaml to profile different models
export
DGD_CONFIG_FILE
=
/workspace/components/backends/vllm/deploy/disagg.yaml
# or your own disagg config file
```
Replace the
`image`
within
`profile_sla_job.yaml`
with the tag of the image you pushed.
**Step 2: Run profiling (required)**
**Step 2: Set SLA target**
Edit
`$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml`
to set the target ISL, OSL, TTFT, and ITL.
```
yaml
spec
:
template
:
spec
:
containers
:
-
name
:
profile-sla
args
:
-
--isl
-
"
3000"
# average ISL is 3000 tokens
-
--osl
-
"
150"
# average OSL is 150 tokens
-
--ttft
-
"
200"
# target TTFT is 200ms
-
--itl
-
"
20"
# target ITL is 20ms
```
**Step 3: Run profiling (required)**
```
bash
cd
$DYNAMO_HOME
/benchmarks/profiler/deploy
envsubst < profiling_pvc.yaml | kubectl apply
-f
-
envsubst < profile_sla_sa.yaml | kubectl apply
-f
-
envsubst < profile_sla_rbac.yaml | kubectl apply
-f
-
envsubst < profile_sla_binding.yaml | kubectl apply
-f
-
export
DOCKER_IMAGE
=
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
# or your own image
# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
export
DGD_CONFIG_FILE
=
/workspace/components/backends/vllm/deploy/disagg.yaml
# or your own disagg config file
envsubst < profile_sla_job.yaml | kubectl apply
-f
-
```
**Step 3: Wait for profiling to complete**
**Step 4: Wait for profiling to complete**
```
bash
kubectl get
jobs
-n
$NAMESPACE
kubectl logs job/profile-sla
-n
$NAMESPACE
...
...
@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r
1.
**Create a temporary pod to access the PVC:**
```
bash
kubectl run temp-access
--image
=
alpine:latest
--rm
-it
--restart
=
Never
\
--overrides
=
'{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["sh"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}'
\
kubectl run temp-access
--image
=
alpine:latest
--restart
=
Never
\
--overrides
=
'{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}'
\
-n
$NAMESPACE
```
2.
**Inside the temporary pod, navigate to the results directory:**
```
bash
kubectl
exec
-it
temp-access
-n
$NAMESPACE
--
sh
cd
/workspace/profiling_results
ls
-la
```
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment