Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
157714aa
"lib/llm/src/vscode:/vscode.git/clone" did not exist on "b2605a8e64bdbc8ecd9933259caaff7b78307c7d"
Unverified
Commit
157714aa
authored
Jul 29, 2025
by
Hongkuan Zhou
Committed by
GitHub
Jul 29, 2025
Browse files
chore: add instructions to modify SLA to profile_sla doc; update component name (#2167)
parent
8248a116
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
93 additions
and
47 deletions
+93
-47
benchmarks/profiler/deploy/profile_sla_job.yaml
benchmarks/profiler/deploy/profile_sla_job.yaml
+14
-5
benchmarks/profiler/utils/config.py
benchmarks/profiler/utils/config.py
+26
-26
components/backends/vllm/deploy/disagg_planner.yaml
components/backends/vllm/deploy/disagg_planner.yaml
+2
-2
components/planner/src/dynamo/planner/defaults.py
components/planner/src/dynamo/planner/defaults.py
+4
-2
components/planner/src/dynamo/planner/utils/planner_core.py
components/planner/src/dynamo/planner/utils/planner_core.py
+16
-4
docs/architecture/pre_deployment_profiling.md
docs/architecture/pre_deployment_profiling.md
+31
-8
No files found.
benchmarks/profiler/deploy/profile_sla_job.yaml
View file @
157714aa
...
...
@@ -14,11 +14,8 @@ spec:
image
:
${DOCKER_IMAGE}
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
2"
memory
:
"
4Gi"
cpu
:
"
16"
memory
:
"
10Gi"
env
:
-
name
:
HUGGING_FACE_HUB_TOKEN
valueFrom
:
...
...
@@ -37,6 +34,18 @@ spec:
-
/workspace/profiling_results
-
--namespace
-
${NAMESPACE}
-
--min-num-gpus-per-engine
-
"
1"
-
--max-num-gpus-per-engine
-
"
8"
-
--isl
-
"
3000"
-
--osl
-
"
150"
-
--ttft
-
"
200"
-
--itl
-
"
20"
volumeMounts
:
-
name
:
output-volume
mountPath
:
/workspace/profiling_results
...
...
benchmarks/profiler/utils/config.py
View file @
157714aa
...
...
@@ -89,16 +89,16 @@ class VllmV1ConfigModifier:
if
target
==
"prefill"
:
# convert prefill worker into decode worker
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
]
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
_k8s_name
]
del
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
_k8s_name
]
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
break_arguments
(
args
)
...
...
@@ -112,18 +112,18 @@ class VllmV1ConfigModifier:
if
"--no-enable-prefix-caching"
not
in
args
:
args
=
append_argument
(
args
,
"--no-enable-prefix-caching"
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
elif
target
==
"decode"
:
# delete prefill worker
del
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
prefill_worker
_k8s_name
]
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
break_arguments
(
args
)
...
...
@@ -134,13 +134,13 @@ class VllmV1ConfigModifier:
if
"--no-enable-prefix-caching"
in
args
:
args
.
remove
(
"--no-enable-prefix-caching"
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
# set num workers to 1
decode_worker_config
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
]
decode_worker_config
[
"replicas"
]
=
1
...
...
@@ -150,16 +150,16 @@ class VllmV1ConfigModifier:
def
set_config_tp_size
(
cls
,
config
:
dict
,
tp_size
:
int
):
config
=
deepcopy
(
config
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"resources"
][
"requests"
][
"gpu"
]
=
str
(
tp_size
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"resources"
][
"limits"
][
"gpu"
]
=
str
(
tp_size
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"resources"
][
"requests"
][
"gpu"
]
=
str
(
tp_size
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"resources"
][
"limits"
][
"gpu"
]
=
str
(
tp_size
)
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
args
=
break_arguments
(
args
)
...
...
@@ -169,15 +169,15 @@ class VllmV1ConfigModifier:
except
ValueError
:
args
=
append_argument
(
args
,
[
"--tensor-parallel-size"
,
str
(
tp_size
)])
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
config
[
"spec"
][
"services"
][
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker_k8s_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
=
join_arguments
(
args
)
return
config
@
classmethod
def
get_model_name
(
cls
,
config
:
dict
)
->
str
:
worker_name
=
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
worker_name
=
WORKER_COMPONENT_NAMES
[
"vllm"
].
decode_worker
_k8s_name
args
=
config
[
"spec"
][
"services"
][
worker_name
][
"extraPodSpec"
][
"mainContainer"
][
"args"
]
...
...
components/backends/vllm/deploy/disagg_planner.yaml
View file @
157714aa
...
...
@@ -141,7 +141,7 @@ spec:
-
-c
args
:
-
"
python3
-m
dynamo.planner.prometheus"
backend
:
VllmDecodeWorker
:
dynamoNamespace
:
vllm-disagg-planner
envFromSecret
:
hf-token-secret
componentType
:
worker
...
...
@@ -191,7 +191,7 @@ spec:
-
-c
args
:
-
"
python3
-m
dynamo.vllm
--model
Qwen/Qwen3-0.6B
2>&1
|
tee
/tmp/vllm.log"
prefill
:
VllmPrefillWorker
:
dynamoNamespace
:
vllm-disagg-planner
envFromSecret
:
hf-token-secret
componentType
:
worker
...
...
components/planner/src/dynamo/planner/defaults.py
View file @
157714aa
...
...
@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):
class
VllmComponentName
:
prefill_worker
=
"prefill"
prefill_worker_k8s_name
=
"VllmPrefillWorker"
prefill_worker_component_name
=
"prefill"
prefill_worker_endpoint
=
"generate"
decode_worker
=
"backend"
decode_worker_k8s_name
=
"VllmDecodeWorker"
decode_worker_component_name
=
"backend"
decode_worker_endpoint
=
"generate"
...
...
components/planner/src/dynamo/planner/utils/planner_core.py
View file @
157714aa
...
...
@@ -106,7 +106,11 @@ class Planner:
if
self
.
prefill_client
is
None
:
self
.
prefill_client
=
(
await
self
.
runtime
.
namespace
(
self
.
namespace
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker_component_name
)
.
endpoint
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
...
...
@@ -127,7 +131,11 @@ class Planner:
if
self
.
workers_client
is
None
:
self
.
workers_client
=
(
await
self
.
runtime
.
namespace
(
self
.
namespace
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_component_name
)
.
endpoint
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_endpoint
)
...
...
@@ -300,8 +308,12 @@ class Planner:
if
not
self
.
args
.
no_operation
:
target_replicas
=
{
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
:
next_num_p
,
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
:
next_num_d
,
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker_k8s_name
:
next_num_p
,
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_k8s_name
:
next_num_d
,
}
await
self
.
connector
.
set_component_replicas
(
target_replicas
,
blocking
=
False
)
...
...
docs/architecture/pre_deployment_profiling.md
View file @
157714aa
...
...
@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
# in the project's root folder
./container/build.sh
--framework
VLLM
# Tag and push to your container registry
export
DOCKER_IMAGE
=
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
# or your own dynamoimage
# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
# Modify this yaml to profile different models
export
DGD_CONFIG_FILE
=
/workspace/components/backends/vllm/deploy/disagg.yaml
# or your own disagg config file
```
Replace the
`image`
within
`profile_sla_job.yaml`
with the tag of the image you pushed.
**Step 2: Run profiling (required)**
**Step 2: Set SLA target**
Edit
`$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml`
to set the target ISL, OSL, TTFT, and ITL.
```
yaml
spec
:
template
:
spec
:
containers
:
-
name
:
profile-sla
args
:
-
--isl
-
"
3000"
# average ISL is 3000 tokens
-
--osl
-
"
150"
# average OSL is 150 tokens
-
--ttft
-
"
200"
# target TTFT is 200ms
-
--itl
-
"
20"
# target ITL is 20ms
```
**Step 3: Run profiling (required)**
```
bash
cd
$DYNAMO_HOME
/benchmarks/profiler/deploy
envsubst < profiling_pvc.yaml | kubectl apply
-f
-
envsubst < profile_sla_sa.yaml | kubectl apply
-f
-
envsubst < profile_sla_rbac.yaml | kubectl apply
-f
-
envsubst < profile_sla_binding.yaml | kubectl apply
-f
-
export
DOCKER_IMAGE
=
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
# or your own image
# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
export
DGD_CONFIG_FILE
=
/workspace/components/backends/vllm/deploy/disagg.yaml
# or your own disagg config file
envsubst < profile_sla_job.yaml | kubectl apply
-f
-
```
**Step 3: Wait for profiling to complete**
**Step 4: Wait for profiling to complete**
```
bash
kubectl get
jobs
-n
$NAMESPACE
kubectl logs job/profile-sla
-n
$NAMESPACE
...
...
@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r
1.
**Create a temporary pod to access the PVC:**
```
bash
kubectl run temp-access
--image
=
alpine:latest
--rm
-it
--restart
=
Never
\
--overrides
=
'{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["sh"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}'
\
kubectl run temp-access
--image
=
alpine:latest
--restart
=
Never
\
--overrides
=
'{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}'
\
-n
$NAMESPACE
```
2.
**Inside the temporary pod, navigate to the results directory:**
```
bash
kubectl
exec
-it
temp-access
-n
$NAMESPACE
--
sh
cd
/workspace/profiling_results
ls
-la
```
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment