Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
157714aa
Unverified
Commit
157714aa
authored
Jul 29, 2025
by
Hongkuan Zhou
Committed by
GitHub
Jul 29, 2025
Browse files
chore: add instructions to modify SLA to profile_sla doc; update component name (#2167)
parent
8248a116
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
93 additions
and
47 deletions
+93
-47
benchmarks/profiler/deploy/profile_sla_job.yaml
benchmarks/profiler/deploy/profile_sla_job.yaml
+14
-5
benchmarks/profiler/utils/config.py
benchmarks/profiler/utils/config.py
+26
-26
components/backends/vllm/deploy/disagg_planner.yaml
components/backends/vllm/deploy/disagg_planner.yaml
+2
-2
components/planner/src/dynamo/planner/defaults.py
components/planner/src/dynamo/planner/defaults.py
+4
-2
components/planner/src/dynamo/planner/utils/planner_core.py
components/planner/src/dynamo/planner/utils/planner_core.py
+16
-4
docs/architecture/pre_deployment_profiling.md
docs/architecture/pre_deployment_profiling.md
+31
-8
No files found.
benchmarks/profiler/deploy/profile_sla_job.yaml
View file @
157714aa
...
@@ -14,11 +14,8 @@ spec:
...
@@ -14,11 +14,8 @@ spec:
image
:
${DOCKER_IMAGE}
image
:
${DOCKER_IMAGE}
resources
:
resources
:
requests
:
requests
:
cpu
:
"
1"
cpu
:
"
16"
memory
:
"
2Gi"
memory
:
"
10Gi"
limits
:
cpu
:
"
2"
memory
:
"
4Gi"
env
:
env
:
-
name
:
HUGGING_FACE_HUB_TOKEN
-
name
:
HUGGING_FACE_HUB_TOKEN
valueFrom
:
valueFrom
:
...
@@ -37,6 +34,18 @@ spec:
...
@@ -37,6 +34,18 @@ spec:
-
/workspace/profiling_results
-
/workspace/profiling_results
-
--namespace
-
--namespace
-
${NAMESPACE}
-
${NAMESPACE}
-
--min-num-gpus-per-engine
-
"
1"
-
--max-num-gpus-per-engine
-
"
8"
-
--isl
-
"
3000"
-
--osl
-
"
150"
-
--ttft
-
"
200"
-
--itl
-
"
20"
volumeMounts
:
volumeMounts
:
-
name
:
output-volume
-
name
:
output-volume
mountPath
:
/workspace/profiling_results
mountPath
:
/workspace/profiling_results
...
...
benchmarks/profiler/utils/config.py
View file @
157714aa
...
@@ -89,16 +89,16 @@ class VllmV1ConfigModifier:
...
@@ -89,16 +89,16 @@ class VllmV1ConfigModifier:
if
target
==
"prefill"
:
if
target
==
"prefill"
:
# convert prefill worker into decode worker
# convert prefill worker into decode worker
config
[
"spec"
][
"services"
][
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
]
=
config
[
"spec"
][
"services"
][
]
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
_k8s_name
]
]
del
config
[
"spec"
][
"services"
][
del
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
_k8s_name
]
]
args
=
config
[
"spec"
][
"services"
][
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
break_arguments
(
args
)
args
=
break_arguments
(
args
)
...
@@ -112,18 +112,18 @@ class VllmV1ConfigModifier:
...
@@ -112,18 +112,18 @@ class VllmV1ConfigModifier:
if
"--no-enable-prefix-caching"
not
in
args
:
if
"--no-enable-prefix-caching"
not
in
args
:
args
=
append_argument
(
args
,
"--no-enable-prefix-caching"
)
args
=
append_argument
(
args
,
"--no-enable-prefix-caching"
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
config
[
"spec"
][
"services"
][
"extraPodSpec"
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
elif
target
==
"decode"
:
elif
target
==
"decode"
:
# delete prefill worker
# delete prefill worker
del
config
[
"spec"
][
"services"
][
del
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
_k8s_name
]
]
args
=
config
[
"spec"
][
"services"
][
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
break_arguments
(
args
)
args
=
break_arguments
(
args
)
...
@@ -134,13 +134,13 @@ class VllmV1ConfigModifier:
...
@@ -134,13 +134,13 @@ class VllmV1ConfigModifier:
if
"--no-enable-prefix-caching"
in
args
:
if
"--no-enable-prefix-caching"
in
args
:
args
.
remove
(
"--no-enable-prefix-caching"
)
args
.
remove
(
"--no-enable-prefix-caching"
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
config
[
"spec"
][
"services"
][
"extraPodSpec"
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
# set num workers to 1
# set num workers to 1
decode_worker_config
=
config
[
"spec"
][
"services"
][
decode_worker_config
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
]
]
decode_worker_config
[
"replicas"
]
=
1
decode_worker_config
[
"replicas"
]
=
1
...
@@ -150,16 +150,16 @@ class VllmV1ConfigModifier:
...
@@ -150,16 +150,16 @@ class VllmV1ConfigModifier:
def
set_config_tp_size
(
cls
,
config
:
dict
,
tp_size
:
int
):
def
set_config_tp_size
(
cls
,
config
:
dict
,
tp_size
:
int
):
config
=
deepcopy
(
config
)
config
=
deepcopy
(
config
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
config
[
"spec"
][
"services"
][
"resources"
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"requests"
][
"gpu"
]
=
str
(
tp_size
)
][
"resources"
][
"requests"
][
"gpu"
]
=
str
(
tp_size
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
config
[
"spec"
][
"services"
][
"resources"
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"limits"
][
"gpu"
]
=
str
(
tp_size
)
][
"resources"
][
"limits"
][
"gpu"
]
=
str
(
tp_size
)
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
args
=
config
[
"spec"
][
"services"
][
"extraPodSpec"
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"mainContainer"
][
"args"
]
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
break_arguments
(
args
)
args
=
break_arguments
(
args
)
...
@@ -169,15 +169,15 @@ class VllmV1ConfigModifier:
...
@@ -169,15 +169,15 @@ class VllmV1ConfigModifier:
except
ValueError
:
except
ValueError
:
args
=
append_argument
(
args
,
[
"--tensor-parallel-size"
,
str
(
tp_size
)])
args
=
append_argument
(
args
,
[
"--tensor-parallel-size"
,
str
(
tp_size
)])
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
config
[
"spec"
][
"services"
][
"extraPodSpec"
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
return
config
return
config
@
classmethod
@
classmethod
def
get_model_name
(
cls
,
config
:
dict
)
->
str
:
def
get_model_name
(
cls
,
config
:
dict
)
->
str
:
worker_name
=
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
worker_name
=
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
args
=
config
[
"spec"
][
"services"
][
worker_name
][
"extraPodSpec"
][
"mainContainer"
][
args
=
config
[
"spec"
][
"services"
][
worker_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
"args"
]
]
...
...
components/backends/vllm/deploy/disagg_planner.yaml
View file @
157714aa
...
@@ -141,7 +141,7 @@ spec:
...
@@ -141,7 +141,7 @@ spec:
-
-c
-
-c
args
:
args
:
-
"
python3
-m
dynamo.planner.prometheus"
-
"
python3
-m
dynamo.planner.prometheus"
backend
:
VllmDecodeWorker
:
dynamoNamespace
:
vllm-disagg-planner
dynamoNamespace
:
vllm-disagg-planner
envFromSecret
:
hf-token-secret
envFromSecret
:
hf-token-secret
componentType
:
worker
componentType
:
worker
...
@@ -191,7 +191,7 @@ spec:
...
@@ -191,7 +191,7 @@ spec:
-
-c
-
-c
args
:
args
:
-
"
python3
-m
dynamo.vllm
--model
Qwen/Qwen3-0.6B
2>&1
|
tee
/tmp/vllm.log"
-
"
python3
-m
dynamo.vllm
--model
Qwen/Qwen3-0.6B
2>&1
|
tee
/tmp/vllm.log"
p
refill
:
VllmP
refill
Worker
:
dynamoNamespace
:
vllm-disagg-planner
dynamoNamespace
:
vllm-disagg-planner
envFromSecret
:
hf-token-secret
envFromSecret
:
hf-token-secret
componentType
:
worker
componentType
:
worker
...
...
components/planner/src/dynamo/planner/defaults.py
View file @
157714aa
...
@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):
...
@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):
class
VllmComponentName
:
class
VllmComponentName
:
prefill_worker
=
"prefill"
prefill_worker_k8s_name
=
"VllmPrefillWorker"
prefill_worker_component_name
=
"prefill"
prefill_worker_endpoint
=
"generate"
prefill_worker_endpoint
=
"generate"
decode_worker
=
"backend"
decode_worker_k8s_name
=
"VllmDecodeWorker"
decode_worker_component_name
=
"backend"
decode_worker_endpoint
=
"generate"
decode_worker_endpoint
=
"generate"
...
...
components/planner/src/dynamo/planner/utils/planner_core.py
View file @
157714aa
...
@@ -106,7 +106,11 @@ class Planner:
...
@@ -106,7 +106,11 @@ class Planner:
if
self
.
prefill_client
is
None
:
if
self
.
prefill_client
is
None
:
self
.
prefill_client
=
(
self
.
prefill_client
=
(
await
self
.
runtime
.
namespace
(
self
.
namespace
)
await
self
.
runtime
.
namespace
(
self
.
namespace
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker_component_name
)
.
endpoint
(
.
endpoint
(
WORKER_COMPONENT_NAMES
[
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
self
.
args
.
backend
...
@@ -127,7 +131,11 @@ class Planner:
...
@@ -127,7 +131,11 @@ class Planner:
if
self
.
workers_client
is
None
:
if
self
.
workers_client
is
None
:
self
.
workers_client
=
(
self
.
workers_client
=
(
await
self
.
runtime
.
namespace
(
self
.
namespace
)
await
self
.
runtime
.
namespace
(
self
.
namespace
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_component_name
)
.
endpoint
(
.
endpoint
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_endpoint
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_endpoint
)
)
...
@@ -300,8 +308,12 @@ class Planner:
...
@@ -300,8 +308,12 @@ class Planner:
if
not
self
.
args
.
no_operation
:
if
not
self
.
args
.
no_operation
:
target_replicas
=
{
target_replicas
=
{
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
:
next_num_p
,
WORKER_COMPONENT_NAMES
[
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
:
next_num_d
,
self
.
args
.
backend
].
prefill_worker_k8s_name
:
next_num_p
,
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_k8s_name
:
next_num_d
,
}
}
await
self
.
connector
.
set_component_replicas
(
target_replicas
,
blocking
=
False
)
await
self
.
connector
.
set_component_replicas
(
target_replicas
,
blocking
=
False
)
...
...
docs/architecture/pre_deployment_profiling.md
View file @
157714aa
...
@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
...
@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
# in the project's root folder
# in the project's root folder
./container/build.sh
--framework
VLLM
./container/build.sh
--framework
VLLM
# Tag and push to your container registry
# Tag and push to your container registry
export
DOCKER_IMAGE
=
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
# or your own dynamoimage
# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
# Modify this yaml to profile different models
export
DGD_CONFIG_FILE
=
/workspace/components/backends/vllm/deploy/disagg.yaml
# or your own disagg config file
```
```
Replace the
`image`
within
`profile_sla_job.yaml`
with the tag of the image you pushed.
Replace the
`image`
within
`profile_sla_job.yaml`
with the tag of the image you pushed.
**Step 2: Run profiling (required)**
**Step 2: Set SLA target**
Edit
`$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml`
to set the target ISL, OSL, TTFT, and ITL.
```
yaml
spec
:
template
:
spec
:
containers
:
-
name
:
profile-sla
args
:
-
--isl
-
"
3000"
# average ISL is 3000 tokens
-
--osl
-
"
150"
# average OSL is 150 tokens
-
--ttft
-
"
200"
# target TTFT is 200ms
-
--itl
-
"
20"
# target ITL is 20ms
```
**Step 3: Run profiling (required)**
```
bash
```
bash
cd
$DYNAMO_HOME
/benchmarks/profiler/deploy
cd
$DYNAMO_HOME
/benchmarks/profiler/deploy
envsubst < profiling_pvc.yaml | kubectl apply
-f
-
envsubst < profiling_pvc.yaml | kubectl apply
-f
-
envsubst < profile_sla_sa.yaml | kubectl apply
-f
-
envsubst < profile_sla_sa.yaml | kubectl apply
-f
-
envsubst < profile_sla_rbac.yaml | kubectl apply
-f
-
envsubst < profile_sla_rbac.yaml | kubectl apply
-f
-
envsubst < profile_sla_binding.yaml | kubectl apply
-f
-
envsubst < profile_sla_binding.yaml | kubectl apply
-f
-
export
DOCKER_IMAGE
=
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
# or your own image
# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
export
DGD_CONFIG_FILE
=
/workspace/components/backends/vllm/deploy/disagg.yaml
# or your own disagg config file
envsubst < profile_sla_job.yaml | kubectl apply
-f
-
envsubst < profile_sla_job.yaml | kubectl apply
-f
-
```
```
**Step
3
: Wait for profiling to complete**
**Step
4
: Wait for profiling to complete**
```
bash
```
bash
kubectl get
jobs
-n
$NAMESPACE
kubectl get
jobs
-n
$NAMESPACE
kubectl logs job/profile-sla
-n
$NAMESPACE
kubectl logs job/profile-sla
-n
$NAMESPACE
...
@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r
...
@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r
1.
**Create a temporary pod to access the PVC:**
1.
**Create a temporary pod to access the PVC:**
```
bash
```
bash
kubectl run temp-access
--image
=
alpine:latest
--rm
-it
--restart
=
Never
\
kubectl run temp-access
--image
=
alpine:latest
--restart
=
Never
\
--overrides
=
'{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["
sh
"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}'
\
--overrides
=
'{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["
tail","-f","/dev/null
"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}'
\
-n
$NAMESPACE
-n
$NAMESPACE
```
```
2.
**Inside the temporary pod, navigate to the results directory:**
2.
**Inside the temporary pod, navigate to the results directory:**
```
bash
```
bash
kubectl
exec
-it
temp-access
-n
$NAMESPACE
--
sh
cd
/workspace/profiling_results
cd
/workspace/profiling_results
ls
-la
ls
-la
```
```
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment