Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2bed47eb
Unverified
Commit
2bed47eb
authored
Jun 30, 2025
by
Hongkuan Zhou
Committed by
GitHub
Jun 30, 2025
Browse files
feat: support sla planner in vllm_v1 example (#1680)
parent
92f06b0e
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
153 additions
and
10 deletions
+153
-10
components/planner/src/dynamo/planner/defaults.py
components/planner/src/dynamo/planner/defaults.py
+5
-0
components/planner/src/dynamo/planner/planner_sla.py
components/planner/src/dynamo/planner/planner_sla.py
+1
-0
components/planner/src/dynamo/planner/utils/planner_core.py
components/planner/src/dynamo/planner/utils/planner_core.py
+27
-9
container/Dockerfile.vllm_v1
container/Dockerfile.vllm_v1
+17
-0
container/deps/requirements.txt
container/deps/requirements.txt
+1
-0
docs/architecture/sla_planner.md
docs/architecture/sla_planner.md
+6
-1
examples/vllm_v1/components/frontend.py
examples/vllm_v1/components/frontend.py
+4
-0
examples/vllm_v1/configs/disagg_planner.yaml
examples/vllm_v1/configs/disagg_planner.yaml
+64
-0
examples/vllm_v1/graphs/disagg_planner.py
examples/vllm_v1/graphs/disagg_planner.py
+28
-0
No files found.
components/planner/src/dynamo/planner/defaults.py
View file @
2bed47eb
...
@@ -18,6 +18,7 @@
...
@@ -18,6 +18,7 @@
class
BasePlannerDefaults
:
class
BasePlannerDefaults
:
namespace
=
"dynamo"
namespace
=
"dynamo"
environment
=
"local"
environment
=
"local"
backend
=
"vllm_v0"
no_operation
=
False
no_operation
=
False
log_dir
=
None
log_dir
=
None
adjustment_interval
=
180
# in seconds
adjustment_interval
=
180
# in seconds
...
@@ -48,12 +49,16 @@ class SLAPlannerDefaults(BasePlannerDefaults):
...
@@ -48,12 +49,16 @@ class SLAPlannerDefaults(BasePlannerDefaults):
class VllmV0ComponentName:
    """Worker component and endpoint names for the vLLM v0 example graph.

    The planner core looks these up through ``WORKER_COMPONENT_NAMES[backend]``
    both to build its runtime clients
    (``namespace(...).component(<worker>).endpoint(<worker_endpoint>).client()``)
    and to name the component passed to the connector's
    ``add_component`` / ``remove_component`` calls.
    """

    # Component name of the prefill worker.
    prefill_worker = "PrefillWorker"
    # Endpoint of the prefill worker component used for the planner's prefill client.
    prefill_worker_endpoint = "mock"
    # Component name of the decode worker.
    decode_worker = "VllmWorker"
    # Endpoint of the decode worker component used for the planner's workers client.
    decode_worker_endpoint = "generate"
class VllmV1ComponentName:
    """Worker component and endpoint names for the vLLM v1 example graph.

    The planner core looks these up through ``WORKER_COMPONENT_NAMES[backend]``
    both to build its runtime clients
    (``namespace(...).component(<worker>).endpoint(<worker_endpoint>).client()``)
    and to name the component passed to the connector's
    ``add_component`` / ``remove_component`` calls.
    """

    # Component name of the prefill worker.
    prefill_worker = "VllmPrefillWorker"
    # Endpoint of the prefill worker component used for the planner's prefill client.
    prefill_worker_endpoint = "generate"
    # Component name of the decode worker.
    decode_worker = "VllmDecodeWorker"
    # Endpoint of the decode worker component used for the planner's workers client.
    decode_worker_endpoint = "generate"
WORKER_COMPONENT_NAMES
=
{
WORKER_COMPONENT_NAMES
=
{
...
...
components/planner/src/dynamo/planner/planner_sla.py
View file @
2bed47eb
...
@@ -64,6 +64,7 @@ class Planner:
...
@@ -64,6 +64,7 @@ class Planner:
environment
=
config_instance
.
get
(
environment
=
config_instance
.
get
(
"environment"
,
SLAPlannerDefaults
.
environment
"environment"
,
SLAPlannerDefaults
.
environment
),
),
backend
=
config_instance
.
get
(
"backend"
,
SLAPlannerDefaults
.
backend
),
no_operation
=
config_instance
.
get
(
no_operation
=
config_instance
.
get
(
"no-operation"
,
SLAPlannerDefaults
.
no_operation
"no-operation"
,
SLAPlannerDefaults
.
no_operation
),
),
...
...
components/planner/src/dynamo/planner/utils/planner_core.py
View file @
2bed47eb
...
@@ -22,7 +22,7 @@ from dataclasses import dataclass
...
@@ -22,7 +22,7 @@ from dataclasses import dataclass
from
typing
import
Optional
from
typing
import
Optional
from
dynamo.planner
import
KubernetesConnector
,
LocalConnector
from
dynamo.planner
import
KubernetesConnector
,
LocalConnector
from
dynamo.planner.defaults
import
SLAPlannerDefaults
from
dynamo.planner.defaults
import
WORKER_COMPONENT_NAMES
,
SLAPlannerDefaults
from
dynamo.planner.utils.load_predictor
import
LOAD_PREDICTORS
from
dynamo.planner.utils.load_predictor
import
LOAD_PREDICTORS
from
dynamo.planner.utils.perf_interpolation
import
(
from
dynamo.planner.utils.perf_interpolation
import
(
DecodeInterpolator
,
DecodeInterpolator
,
...
@@ -93,8 +93,12 @@ class Planner:
...
@@ -93,8 +93,12 @@ class Planner:
if
self
.
prefill_client
is
None
:
if
self
.
prefill_client
is
None
:
self
.
prefill_client
=
(
self
.
prefill_client
=
(
await
self
.
runtime
.
namespace
(
self
.
namespace
)
await
self
.
runtime
.
namespace
(
self
.
namespace
)
.
component
(
"PrefillWorker"
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
)
.
endpoint
(
"mock"
)
.
endpoint
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker_endpoint
)
.
client
()
.
client
()
)
)
# TODO: remove this sleep after rust client() is blocking until watching state
# TODO: remove this sleep after rust client() is blocking until watching state
...
@@ -110,8 +114,10 @@ class Planner:
...
@@ -110,8 +114,10 @@ class Planner:
if
self
.
workers_client
is
None
:
if
self
.
workers_client
is
None
:
self
.
workers_client
=
(
self
.
workers_client
=
(
await
self
.
runtime
.
namespace
(
self
.
namespace
)
await
self
.
runtime
.
namespace
(
self
.
namespace
)
.
component
(
"VllmWorker"
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
)
.
endpoint
(
"generate"
)
.
endpoint
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_endpoint
)
.
client
()
.
client
()
)
)
# TODO: remove this sleep after rust client() is blocking until watching state
# TODO: remove this sleep after rust client() is blocking until watching state
...
@@ -270,17 +276,29 @@ class Planner:
...
@@ -270,17 +276,29 @@ class Planner:
# TODO: add a check to avoid scaling before the previous scaling is completed
# TODO: add a check to avoid scaling before the previous scaling is completed
if
next_num_p
>
len
(
self
.
p_endpoints
):
if
next_num_p
>
len
(
self
.
p_endpoints
):
for
_
in
range
(
next_num_p
-
len
(
self
.
p_endpoints
)):
for
_
in
range
(
next_num_p
-
len
(
self
.
p_endpoints
)):
self
.
connector
.
add_component
(
"PrefillWorker"
,
blocking
=
False
)
self
.
connector
.
add_component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
,
blocking
=
False
,
)
elif
next_num_p
<
len
(
self
.
p_endpoints
):
elif
next_num_p
<
len
(
self
.
p_endpoints
):
for
_
in
range
(
len
(
self
.
p_endpoints
)
-
next_num_p
):
for
_
in
range
(
len
(
self
.
p_endpoints
)
-
next_num_p
):
self
.
connector
.
remove_component
(
"PrefillWorker"
,
blocking
=
False
)
self
.
connector
.
remove_component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
,
blocking
=
False
,
)
if
next_num_d
>
len
(
self
.
d_endpoints
):
if
next_num_d
>
len
(
self
.
d_endpoints
):
for
_
in
range
(
next_num_d
-
len
(
self
.
d_endpoints
)):
for
_
in
range
(
next_num_d
-
len
(
self
.
d_endpoints
)):
self
.
connector
.
add_component
(
"VllmWorker"
,
blocking
=
False
)
self
.
connector
.
add_component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
,
blocking
=
False
,
)
elif
next_num_d
<
len
(
self
.
d_endpoints
):
elif
next_num_d
<
len
(
self
.
d_endpoints
):
for
_
in
range
(
len
(
self
.
d_endpoints
)
-
next_num_d
):
for
_
in
range
(
len
(
self
.
d_endpoints
)
-
next_num_d
):
self
.
connector
.
remove_component
(
"VllmWorker"
,
blocking
=
False
)
self
.
connector
.
remove_component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
,
blocking
=
False
,
)
async
def
run
(
self
):
async
def
run
(
self
):
"""Main loop for the planner"""
"""Main loop for the planner"""
...
...
container/Dockerfile.vllm_v1
View file @
2bed47eb
...
@@ -211,6 +211,23 @@ RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
...
@@ -211,6 +211,23 @@ RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
RUN ln -sf /bin/bash /bin/sh
RUN ln -sf /bin/bash /bin/sh
# Install prometheus
# Release to fetch; can be overridden at build time with --build-arg PROM_VERSION=<version>.
ARG PROM_VERSION=3.4.1
# Tools needed to download the release over HTTPS and unpack it; the apt package
# lists are removed in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends ca-certificates curl tar && \
    rm -rf /var/lib/apt/lists/*
# Map the dpkg architecture name to the Prometheus release platform suffix,
# download and unpack the matching tarball into /tmp, install only the
# `prometheus` binary into /usr/local/bin, then delete the unpacked directory.
# NOTE(review): no `set -o pipefail` is visible here, so a failed download is only
# caught because tar rejects the empty/truncated stream - confirm this is intended.
RUN deb_arch="$(dpkg --print-architecture)" && \
    if [ "$deb_arch" = "amd64" ]; then prom_platform=linux-amd64; \
    elif [ "$deb_arch" = "arm64" ]; then prom_platform=linux-arm64; \
    else echo "Unsupported architecture: $deb_arch" && exit 1; \
    fi && \
    prom_dir="prometheus-${PROM_VERSION}.${prom_platform}" && \
    curl -fsSL "https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/${prom_dir}.tar.gz" \
    | tar -xz -C /tmp && \
    mv "/tmp/${prom_dir}/prometheus" /usr/local/bin/ && \
    chmod +x /usr/local/bin/prometheus && \
    rm -rf "/tmp/${prom_dir}"
### BUILDS ###
### BUILDS ###
# Rust build/dev dependencies
# Rust build/dev dependencies
...
...
container/deps/requirements.txt
View file @
2bed47eb
...
@@ -37,6 +37,7 @@ pynvml
...
@@ -37,6 +37,7 @@ pynvml
pyright
pyright
PyYAML
PyYAML
scikit-learn
scikit-learn
scipy<1.14.0 # Pin scipy version for pmdarima compatibility
sentencepiece
sentencepiece
tensorboard==2.19.0
tensorboard==2.19.0
tensorboardX==2.6.2.2
tensorboardX==2.6.2.2
...
...
docs/architecture/sla_planner.md
View file @
2bed47eb
...
@@ -140,8 +140,13 @@ SLA-planner and prometheus server are provided as common components that can be
...
@@ -140,8 +140,13 @@ SLA-planner and prometheus server are provided as common components that can be
-
Link
`Planner`
and
`Prometheus`
in the graph.
-
Link
`Planner`
and
`Prometheus`
in the graph.
-
Add
`Planner`
and
`Prometheus`
configurations in the config file.
-
Add
`Planner`
and
`Prometheus`
configurations in the config file.
A
`vllm_v0`
example is available for reference
:
We provide examples for
`vllm_v0`
and
`vllm_v1`
:
```
bash
```
bash
# vllm_v0
cd
$DYNAMO_HOME
/examples/vllm_v0
cd
$DYNAMO_HOME
/examples/vllm_v0
dynamo serve graphs.disagg_planner:Frontend
-f
./configs/disagg_planner.yaml
dynamo serve graphs.disagg_planner:Frontend
-f
./configs/disagg_planner.yaml
# vllm_v1
cd
$DYNAMO_HOME
/examples/vllm_v1
dynamo serve graphs.disagg_planner:Frontend
-f
./configs/disagg_planner.yaml
```
```
\ No newline at end of file
examples/vllm_v1/components/frontend.py
View file @
2bed47eb
...
@@ -22,6 +22,8 @@ from fastapi import FastAPI
...
@@ -22,6 +22,8 @@ from fastapi import FastAPI
from
pydantic
import
BaseModel
from
pydantic
import
BaseModel
import
dynamo.sdk
as
sdk
import
dynamo.sdk
as
sdk
from
dynamo.planner.planner_sla
import
Planner
from
dynamo.planner.prometheus
import
Prometheus
from
dynamo.sdk
import
depends
,
service
from
dynamo.sdk
import
depends
,
service
from
dynamo.sdk.lib.config
import
ServiceConfig
from
dynamo.sdk.lib.config
import
ServiceConfig
from
dynamo.sdk.lib.image
import
DYNAMO_IMAGE
from
dynamo.sdk.lib.image
import
DYNAMO_IMAGE
...
@@ -59,6 +61,8 @@ class FrontendConfig(BaseModel):
...
@@ -59,6 +61,8 @@ class FrontendConfig(BaseModel):
)
)
class
Frontend
:
class
Frontend
:
worker
=
depends
(
SimpleLoadBalancer
)
worker
=
depends
(
SimpleLoadBalancer
)
planner
=
depends
(
Planner
)
prometheus
=
depends
(
Prometheus
)
def
__init__
(
self
):
def
__init__
(
self
):
"""Initialize Frontend service with HTTP server and model configuration."""
"""Initialize Frontend service with HTTP server and model configuration."""
...
...
examples/vllm_v1/configs/disagg_planner.yaml
0 → 100644
View file @
2bed47eb
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE(review): indentation was lost when this file was extracted; the nesting
# below is reconstructed from key order and should be confirmed against the
# repository copy of examples/vllm_v1/configs/disagg_planner.yaml.

# Values shared between services; a service pulls them in by listing the keys
# under its own `common-configs`.
Common:
  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  kv-transfer-config: '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B

Frontend:
  endpoint: dynamo.SimpleLoadBalancer.generate_disagg
  port: 8000
  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B

SimpleLoadBalancer:
  enable_disagg: true
  common-configs: [model, kv-transfer-config, served_model_name]

VllmPrefillWorker:
  enforce-eager: true
  ServiceArgs:
    workers: 1
    resources:
      gpu: '1'
  common-configs: [model, kv-transfer-config, served_model_name]

VllmDecodeWorker:
  enforce-eager: true
  ServiceArgs:
    workers: 1
    resources:
      gpu: '1'
  common-configs: [model, kv-transfer-config, served_model_name]

# Prometheus scrape settings: one job for Prometheus itself (localhost:9090)
# and one for the frontend, whose target port matches Frontend.port above.
Prometheus:
  global:
    scrape_interval: 5s
  scrape_configs:
    - job_name: 'prometheus'
      static_configs:
        - targets: ['localhost:9090']
    - job_name: 'frontend'
      static_configs:
        - targets: ['localhost:8000']

# SLA planner settings. `backend` is the key used to look up worker component
# names in WORKER_COMPONENT_NAMES.
# NOTE(review): units of ttft/itl and isl/osl are not stated in this file - confirm
# against the planner documentation before changing them.
Planner:
  backend: "vllm_v1"
  adjustment-interval: 180
  profile-results-dir: "/workspace/examples/profiling_results"
  isl: 3000
  osl: 150
  ttft: 0.5
  itl: 0.05
  load-predictor: "arima"
\ No newline at end of file
examples/vllm_v1/graphs/disagg_planner.py
0 → 100644
View file @
2bed47eb
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.simple_load_balancer import SimpleLoadBalancer
from components.worker import VllmDecodeWorker, VllmPrefillWorker

from dynamo.planner.planner_sla import Planner
from dynamo.planner.prometheus import Prometheus

# Serving path: Frontend is linked to SimpleLoadBalancer, and the object that
# link() returns is then linked to the prefill and the decode worker in turn.
load_balancer = Frontend.link(SimpleLoadBalancer)
load_balancer.link(VllmPrefillWorker)
load_balancer.link(VllmDecodeWorker)

# Planner and Prometheus are linked directly to Frontend, alongside the
# serving path above.
Frontend.link(Planner)
Frontend.link(Prometheus)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment