Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2bed47eb
"lib/vscode:/vscode.git/clone" did not exist on "dcbccbcd2ea52d5a0762eb5834718af00317c8e6"
Unverified
Commit
2bed47eb
authored
Jun 30, 2025
by
Hongkuan Zhou
Committed by
GitHub
Jun 30, 2025
Browse files
feat: support sla planner in vllm_v1 example (#1680)
parent
92f06b0e
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
153 additions
and
10 deletions
+153
-10
components/planner/src/dynamo/planner/defaults.py
components/planner/src/dynamo/planner/defaults.py
+5
-0
components/planner/src/dynamo/planner/planner_sla.py
components/planner/src/dynamo/planner/planner_sla.py
+1
-0
components/planner/src/dynamo/planner/utils/planner_core.py
components/planner/src/dynamo/planner/utils/planner_core.py
+27
-9
container/Dockerfile.vllm_v1
container/Dockerfile.vllm_v1
+17
-0
container/deps/requirements.txt
container/deps/requirements.txt
+1
-0
docs/architecture/sla_planner.md
docs/architecture/sla_planner.md
+6
-1
examples/vllm_v1/components/frontend.py
examples/vllm_v1/components/frontend.py
+4
-0
examples/vllm_v1/configs/disagg_planner.yaml
examples/vllm_v1/configs/disagg_planner.yaml
+64
-0
examples/vllm_v1/graphs/disagg_planner.py
examples/vllm_v1/graphs/disagg_planner.py
+28
-0
No files found.
components/planner/src/dynamo/planner/defaults.py
View file @
2bed47eb
...
@@ -18,6 +18,7 @@
...
@@ -18,6 +18,7 @@
class
BasePlannerDefaults
:
class
BasePlannerDefaults
:
namespace
=
"dynamo"
namespace
=
"dynamo"
environment
=
"local"
environment
=
"local"
backend
=
"vllm_v0"
no_operation
=
False
no_operation
=
False
log_dir
=
None
log_dir
=
None
adjustment_interval
=
180
# in seconds
adjustment_interval
=
180
# in seconds
...
@@ -48,12 +49,16 @@ class SLAPlannerDefaults(BasePlannerDefaults):
...
@@ -48,12 +49,16 @@ class SLAPlannerDefaults(BasePlannerDefaults):
class
VllmV0ComponentName
:
class
VllmV0ComponentName
:
prefill_worker
=
"PrefillWorker"
prefill_worker
=
"PrefillWorker"
prefill_worker_endpoint
=
"mock"
decode_worker
=
"VllmWorker"
decode_worker
=
"VllmWorker"
decode_worker_endpoint
=
"generate"
class
VllmV1ComponentName
:
class
VllmV1ComponentName
:
prefill_worker
=
"VllmPrefillWorker"
prefill_worker
=
"VllmPrefillWorker"
prefill_worker_endpoint
=
"generate"
decode_worker
=
"VllmDecodeWorker"
decode_worker
=
"VllmDecodeWorker"
decode_worker_endpoint
=
"generate"
WORKER_COMPONENT_NAMES
=
{
WORKER_COMPONENT_NAMES
=
{
...
...
components/planner/src/dynamo/planner/planner_sla.py
View file @
2bed47eb
...
@@ -64,6 +64,7 @@ class Planner:
...
@@ -64,6 +64,7 @@ class Planner:
environment
=
config_instance
.
get
(
environment
=
config_instance
.
get
(
"environment"
,
SLAPlannerDefaults
.
environment
"environment"
,
SLAPlannerDefaults
.
environment
),
),
backend
=
config_instance
.
get
(
"backend"
,
SLAPlannerDefaults
.
backend
),
no_operation
=
config_instance
.
get
(
no_operation
=
config_instance
.
get
(
"no-operation"
,
SLAPlannerDefaults
.
no_operation
"no-operation"
,
SLAPlannerDefaults
.
no_operation
),
),
...
...
components/planner/src/dynamo/planner/utils/planner_core.py
View file @
2bed47eb
...
@@ -22,7 +22,7 @@ from dataclasses import dataclass
...
@@ -22,7 +22,7 @@ from dataclasses import dataclass
from
typing
import
Optional
from
typing
import
Optional
from
dynamo.planner
import
KubernetesConnector
,
LocalConnector
from
dynamo.planner
import
KubernetesConnector
,
LocalConnector
from
dynamo.planner.defaults
import
SLAPlannerDefaults
from
dynamo.planner.defaults
import
WORKER_COMPONENT_NAMES
,
SLAPlannerDefaults
from
dynamo.planner.utils.load_predictor
import
LOAD_PREDICTORS
from
dynamo.planner.utils.load_predictor
import
LOAD_PREDICTORS
from
dynamo.planner.utils.perf_interpolation
import
(
from
dynamo.planner.utils.perf_interpolation
import
(
DecodeInterpolator
,
DecodeInterpolator
,
...
@@ -93,8 +93,12 @@ class Planner:
...
@@ -93,8 +93,12 @@ class Planner:
if
self
.
prefill_client
is
None
:
if
self
.
prefill_client
is
None
:
self
.
prefill_client
=
(
self
.
prefill_client
=
(
await
self
.
runtime
.
namespace
(
self
.
namespace
)
await
self
.
runtime
.
namespace
(
self
.
namespace
)
.
component
(
"PrefillWorker"
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
)
.
endpoint
(
"mock"
)
.
endpoint
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker_endpoint
)
.
client
()
.
client
()
)
)
# TODO: remove this sleep after rust client() is blocking until watching state
# TODO: remove this sleep after rust client() is blocking until watching state
...
@@ -110,8 +114,10 @@ class Planner:
...
@@ -110,8 +114,10 @@ class Planner:
if
self
.
workers_client
is
None
:
if
self
.
workers_client
is
None
:
self
.
workers_client
=
(
self
.
workers_client
=
(
await
self
.
runtime
.
namespace
(
self
.
namespace
)
await
self
.
runtime
.
namespace
(
self
.
namespace
)
.
component
(
"VllmWorker"
)
.
component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
)
.
endpoint
(
"generate"
)
.
endpoint
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker_endpoint
)
.
client
()
.
client
()
)
)
# TODO: remove this sleep after rust client() is blocking until watching state
# TODO: remove this sleep after rust client() is blocking until watching state
...
@@ -270,17 +276,29 @@ class Planner:
...
@@ -270,17 +276,29 @@ class Planner:
# TODO: add a check to avoid scaling before the previous scaling is completed
# TODO: add a check to avoid scaling before the previous scaling is completed
if
next_num_p
>
len
(
self
.
p_endpoints
):
if
next_num_p
>
len
(
self
.
p_endpoints
):
for
_
in
range
(
next_num_p
-
len
(
self
.
p_endpoints
)):
for
_
in
range
(
next_num_p
-
len
(
self
.
p_endpoints
)):
self
.
connector
.
add_component
(
"PrefillWorker"
,
blocking
=
False
)
self
.
connector
.
add_component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
,
blocking
=
False
,
)
elif
next_num_p
<
len
(
self
.
p_endpoints
):
elif
next_num_p
<
len
(
self
.
p_endpoints
):
for
_
in
range
(
len
(
self
.
p_endpoints
)
-
next_num_p
):
for
_
in
range
(
len
(
self
.
p_endpoints
)
-
next_num_p
):
self
.
connector
.
remove_component
(
"PrefillWorker"
,
blocking
=
False
)
self
.
connector
.
remove_component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
prefill_worker
,
blocking
=
False
,
)
if
next_num_d
>
len
(
self
.
d_endpoints
):
if
next_num_d
>
len
(
self
.
d_endpoints
):
for
_
in
range
(
next_num_d
-
len
(
self
.
d_endpoints
)):
for
_
in
range
(
next_num_d
-
len
(
self
.
d_endpoints
)):
self
.
connector
.
add_component
(
"VllmWorker"
,
blocking
=
False
)
self
.
connector
.
add_component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
,
blocking
=
False
,
)
elif
next_num_d
<
len
(
self
.
d_endpoints
):
elif
next_num_d
<
len
(
self
.
d_endpoints
):
for
_
in
range
(
len
(
self
.
d_endpoints
)
-
next_num_d
):
for
_
in
range
(
len
(
self
.
d_endpoints
)
-
next_num_d
):
self
.
connector
.
remove_component
(
"VllmWorker"
,
blocking
=
False
)
self
.
connector
.
remove_component
(
WORKER_COMPONENT_NAMES
[
self
.
args
.
backend
].
decode_worker
,
blocking
=
False
,
)
async
def
run
(
self
):
async
def
run
(
self
):
"""Main loop for the planner"""
"""Main loop for the planner"""
...
...
container/Dockerfile.vllm_v1
View file @
2bed47eb
...
@@ -211,6 +211,23 @@ RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
...
@@ -211,6 +211,23 @@ RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
RUN ln -sf /bin/bash /bin/sh
RUN ln -sf /bin/bash /bin/sh
# Install prometheus
ARG PROM_VERSION=3.4.1
RUN apt-get update && apt-get install -y --no-install-recommends \
curl tar ca-certificates && \
rm -rf /var/lib/apt/lists/*
RUN ARCH=$(dpkg --print-architecture) && \
case "$ARCH" in \
amd64) PLATFORM=linux-amd64 ;; \
arm64) PLATFORM=linux-arm64 ;; \
*) echo "Unsupported architecture: $ARCH" && exit 1 ;; \
esac && \
curl -fsSL https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.${PLATFORM}.tar.gz \
| tar -xz -C /tmp && \
mv /tmp/prometheus-${PROM_VERSION}.${PLATFORM}/prometheus /usr/local/bin/ && \
chmod +x /usr/local/bin/prometheus && \
rm -rf /tmp/prometheus-${PROM_VERSION}.${PLATFORM}
### BUILDS ###
### BUILDS ###
# Rust build/dev dependencies
# Rust build/dev dependencies
...
...
container/deps/requirements.txt
View file @
2bed47eb
...
@@ -37,6 +37,7 @@ pynvml
...
@@ -37,6 +37,7 @@ pynvml
pyright
pyright
PyYAML
PyYAML
scikit-learn
scikit-learn
scipy<1.14.0 # Pin scipy version for pmdarima compatibility
sentencepiece
sentencepiece
tensorboard==2.19.0
tensorboard==2.19.0
tensorboardX==2.6.2.2
tensorboardX==2.6.2.2
...
...
docs/architecture/sla_planner.md
View file @
2bed47eb
...
@@ -140,8 +140,13 @@ SLA-planner and prometheus server are provided as common components that can be
...
@@ -140,8 +140,13 @@ SLA-planner and prometheus server are provided as common components that can be
-
Link
`Planner`
and
`Prometheus`
in the graph.
-
Link
`Planner`
and
`Prometheus`
in the graph.
-
Add
`Planner`
and
`Prometheus`
configurations in the config file.
-
Add
`Planner`
and
`Prometheus`
configurations in the config file.
A
`vllm_v0`
example is available for reference
:
We provide examples for
`vllm_v0`
and
`vllm_v1`
:
```
bash
```
bash
# vllm_v0
cd
$DYNAMO_HOME
/examples/vllm_v0
cd
$DYNAMO_HOME
/examples/vllm_v0
dynamo serve graphs.disagg_planner:Frontend
-f
./configs/disagg_planner.yaml
dynamo serve graphs.disagg_planner:Frontend
-f
./configs/disagg_planner.yaml
# vllm_v1
cd
$DYNAMO_HOME
/examples/vllm_v1
dynamo serve graphs.disagg_planner:Frontend
-f
./configs/disagg_planner.yaml
```
```
\ No newline at end of file
examples/vllm_v1/components/frontend.py
View file @
2bed47eb
...
@@ -22,6 +22,8 @@ from fastapi import FastAPI
...
@@ -22,6 +22,8 @@ from fastapi import FastAPI
from
pydantic
import
BaseModel
from
pydantic
import
BaseModel
import
dynamo.sdk
as
sdk
import
dynamo.sdk
as
sdk
from
dynamo.planner.planner_sla
import
Planner
from
dynamo.planner.prometheus
import
Prometheus
from
dynamo.sdk
import
depends
,
service
from
dynamo.sdk
import
depends
,
service
from
dynamo.sdk.lib.config
import
ServiceConfig
from
dynamo.sdk.lib.config
import
ServiceConfig
from
dynamo.sdk.lib.image
import
DYNAMO_IMAGE
from
dynamo.sdk.lib.image
import
DYNAMO_IMAGE
...
@@ -59,6 +61,8 @@ class FrontendConfig(BaseModel):
...
@@ -59,6 +61,8 @@ class FrontendConfig(BaseModel):
)
)
class
Frontend
:
class
Frontend
:
worker
=
depends
(
SimpleLoadBalancer
)
worker
=
depends
(
SimpleLoadBalancer
)
planner
=
depends
(
Planner
)
prometheus
=
depends
(
Prometheus
)
def
__init__
(
self
):
def
__init__
(
self
):
"""Initialize Frontend service with HTTP server and model configuration."""
"""Initialize Frontend service with HTTP server and model configuration."""
...
...
examples/vllm_v1/configs/disagg_planner.yaml
0 → 100644
View file @
2bed47eb
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
kv-transfer-config
:
'
{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
Frontend
:
endpoint
:
dynamo.SimpleLoadBalancer.generate_disagg
port
:
8000
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
SimpleLoadBalancer
:
enable_disagg
:
true
common-configs
:
[
model
,
kv-transfer-config
,
served_model_name
]
VllmPrefillWorker
:
enforce-eager
:
true
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
kv-transfer-config
,
served_model_name
]
VllmDecodeWorker
:
enforce-eager
:
true
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
kv-transfer-config
,
served_model_name
]
Prometheus
:
global
:
scrape_interval
:
5s
scrape_configs
:
-
job_name
:
'
prometheus'
static_configs
:
-
targets
:
[
'
localhost:9090'
]
-
job_name
:
'
frontend'
static_configs
:
-
targets
:
[
'
localhost:8000'
]
Planner
:
backend
:
"
vllm_v1"
adjustment-interval
:
180
profile-results-dir
:
"
/workspace/examples/profiling_results"
isl
:
3000
osl
:
150
ttft
:
0.5
itl
:
0.05
load-predictor
:
"
arima"
\ No newline at end of file
examples/vllm_v1/graphs/disagg_planner.py
0 → 100644
View file @
2bed47eb
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
components.frontend
import
Frontend
from
components.simple_load_balancer
import
SimpleLoadBalancer
from
components.worker
import
VllmDecodeWorker
,
VllmPrefillWorker
from
dynamo.planner.planner_sla
import
Planner
from
dynamo.planner.prometheus
import
Prometheus
load_balancer
=
Frontend
.
link
(
SimpleLoadBalancer
)
load_balancer
.
link
(
VllmPrefillWorker
)
load_balancer
.
link
(
VllmDecodeWorker
)
Frontend
.
link
(
Planner
)
Frontend
.
link
(
Prometheus
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment