Unverified Commit 2bed47eb authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

feat: support sla planner in vllm_v1 example (#1680)

parent 92f06b0e
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
class BasePlannerDefaults: class BasePlannerDefaults:
namespace = "dynamo" namespace = "dynamo"
environment = "local" environment = "local"
backend = "vllm_v0"
no_operation = False no_operation = False
log_dir = None log_dir = None
adjustment_interval = 180 # in seconds adjustment_interval = 180 # in seconds
...@@ -48,12 +49,16 @@ class SLAPlannerDefaults(BasePlannerDefaults): ...@@ -48,12 +49,16 @@ class SLAPlannerDefaults(BasePlannerDefaults):
class VllmV0ComponentName: class VllmV0ComponentName:
prefill_worker = "PrefillWorker" prefill_worker = "PrefillWorker"
prefill_worker_endpoint = "mock"
decode_worker = "VllmWorker" decode_worker = "VllmWorker"
decode_worker_endpoint = "generate"
class VllmV1ComponentName: class VllmV1ComponentName:
prefill_worker = "VllmPrefillWorker" prefill_worker = "VllmPrefillWorker"
prefill_worker_endpoint = "generate"
decode_worker = "VllmDecodeWorker" decode_worker = "VllmDecodeWorker"
decode_worker_endpoint = "generate"
WORKER_COMPONENT_NAMES = { WORKER_COMPONENT_NAMES = {
......
...@@ -64,6 +64,7 @@ class Planner: ...@@ -64,6 +64,7 @@ class Planner:
environment=config_instance.get( environment=config_instance.get(
"environment", SLAPlannerDefaults.environment "environment", SLAPlannerDefaults.environment
), ),
backend=config_instance.get("backend", SLAPlannerDefaults.backend),
no_operation=config_instance.get( no_operation=config_instance.get(
"no-operation", SLAPlannerDefaults.no_operation "no-operation", SLAPlannerDefaults.no_operation
), ),
......
...@@ -22,7 +22,7 @@ from dataclasses import dataclass ...@@ -22,7 +22,7 @@ from dataclasses import dataclass
from typing import Optional from typing import Optional
from dynamo.planner import KubernetesConnector, LocalConnector from dynamo.planner import KubernetesConnector, LocalConnector
from dynamo.planner.defaults import SLAPlannerDefaults from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SLAPlannerDefaults
from dynamo.planner.utils.load_predictor import LOAD_PREDICTORS from dynamo.planner.utils.load_predictor import LOAD_PREDICTORS
from dynamo.planner.utils.perf_interpolation import ( from dynamo.planner.utils.perf_interpolation import (
DecodeInterpolator, DecodeInterpolator,
...@@ -93,8 +93,12 @@ class Planner: ...@@ -93,8 +93,12 @@ class Planner:
if self.prefill_client is None: if self.prefill_client is None:
self.prefill_client = ( self.prefill_client = (
await self.runtime.namespace(self.namespace) await self.runtime.namespace(self.namespace)
.component("PrefillWorker") .component(WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker)
.endpoint("mock") .endpoint(
WORKER_COMPONENT_NAMES[
self.args.backend
].prefill_worker_endpoint
)
.client() .client()
) )
# TODO: remove this sleep after rust client() is blocking until watching state # TODO: remove this sleep after rust client() is blocking until watching state
...@@ -110,8 +114,10 @@ class Planner: ...@@ -110,8 +114,10 @@ class Planner:
if self.workers_client is None: if self.workers_client is None:
self.workers_client = ( self.workers_client = (
await self.runtime.namespace(self.namespace) await self.runtime.namespace(self.namespace)
.component("VllmWorker") .component(WORKER_COMPONENT_NAMES[self.args.backend].decode_worker)
.endpoint("generate") .endpoint(
WORKER_COMPONENT_NAMES[self.args.backend].decode_worker_endpoint
)
.client() .client()
) )
# TODO: remove this sleep after rust client() is blocking until watching state # TODO: remove this sleep after rust client() is blocking until watching state
...@@ -270,17 +276,29 @@ class Planner: ...@@ -270,17 +276,29 @@ class Planner:
# TODO: add a check to avoid scaling before the previous scaling is completed # TODO: add a check to avoid scaling before the previous scaling is completed
if next_num_p > len(self.p_endpoints): if next_num_p > len(self.p_endpoints):
for _ in range(next_num_p - len(self.p_endpoints)): for _ in range(next_num_p - len(self.p_endpoints)):
self.connector.add_component("PrefillWorker", blocking=False) self.connector.add_component(
WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker,
blocking=False,
)
elif next_num_p < len(self.p_endpoints): elif next_num_p < len(self.p_endpoints):
for _ in range(len(self.p_endpoints) - next_num_p): for _ in range(len(self.p_endpoints) - next_num_p):
self.connector.remove_component("PrefillWorker", blocking=False) self.connector.remove_component(
WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker,
blocking=False,
)
if next_num_d > len(self.d_endpoints): if next_num_d > len(self.d_endpoints):
for _ in range(next_num_d - len(self.d_endpoints)): for _ in range(next_num_d - len(self.d_endpoints)):
self.connector.add_component("VllmWorker", blocking=False) self.connector.add_component(
WORKER_COMPONENT_NAMES[self.args.backend].decode_worker,
blocking=False,
)
elif next_num_d < len(self.d_endpoints): elif next_num_d < len(self.d_endpoints):
for _ in range(len(self.d_endpoints) - next_num_d): for _ in range(len(self.d_endpoints) - next_num_d):
self.connector.remove_component("VllmWorker", blocking=False) self.connector.remove_component(
WORKER_COMPONENT_NAMES[self.args.backend].decode_worker,
blocking=False,
)
async def run(self): async def run(self):
"""Main loop for the planner""" """Main loop for the planner"""
......
...@@ -211,6 +211,23 @@ RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig ...@@ -211,6 +211,23 @@ RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
RUN ln -sf /bin/bash /bin/sh RUN ln -sf /bin/bash /bin/sh
# Install prometheus
# The release to download is selected by PROM_VERSION; override it with
# `--build-arg PROM_VERSION=...` to pick a different upstream release.
ARG PROM_VERSION=3.4.1
# Tools needed by the download step below (curl for the fetch, tar for
# extraction, ca-certificates for HTTPS). The apt package lists are removed
# in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
curl tar ca-certificates && \
rm -rf /var/lib/apt/lists/*
# Map the Debian architecture name onto the platform suffix used in the
# Prometheus release tarball name; any architecture other than amd64/arm64
# aborts the build. The tarball is streamed straight into tar (no archive is
# written to disk), only the `prometheus` binary is moved to /usr/local/bin,
# and the extracted directory is then deleted.
# NOTE(review): the download is not checksum-verified — confirm that is acceptable.
RUN ARCH=$(dpkg --print-architecture) && \
case "$ARCH" in \
amd64) PLATFORM=linux-amd64 ;; \
arm64) PLATFORM=linux-arm64 ;; \
*) echo "Unsupported architecture: $ARCH" && exit 1 ;; \
esac && \
curl -fsSL https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.${PLATFORM}.tar.gz \
| tar -xz -C /tmp && \
mv /tmp/prometheus-${PROM_VERSION}.${PLATFORM}/prometheus /usr/local/bin/ && \
chmod +x /usr/local/bin/prometheus && \
rm -rf /tmp/prometheus-${PROM_VERSION}.${PLATFORM}
### BUILDS ### ### BUILDS ###
# Rust build/dev dependencies # Rust build/dev dependencies
......
...@@ -37,6 +37,7 @@ pynvml ...@@ -37,6 +37,7 @@ pynvml
pyright pyright
PyYAML PyYAML
scikit-learn scikit-learn
scipy<1.14.0 # Pin scipy version for pmdarima compatibility
sentencepiece sentencepiece
tensorboard==2.19.0 tensorboard==2.19.0
tensorboardX==2.6.2.2 tensorboardX==2.6.2.2
......
...@@ -140,8 +140,13 @@ SLA-planner and prometheus server are provided as common components that can be ...@@ -140,8 +140,13 @@ SLA-planner and prometheus server are provided as common components that can be
- Link `Planner` and `Prometheus` in the graph. - Link `Planner` and `Prometheus` in the graph.
- Add `Planner` and `Prometheus` configurations in the config file. - Add `Planner` and `Prometheus` configurations in the config file.
A `vllm_v0` example is available for reference: We provide examples for `vllm_v0` and `vllm_v1`:
```bash ```bash
# vllm_v0
cd $DYNAMO_HOME/examples/vllm_v0 cd $DYNAMO_HOME/examples/vllm_v0
dynamo serve graphs.disagg_planner:Frontend -f ./configs/disagg_planner.yaml dynamo serve graphs.disagg_planner:Frontend -f ./configs/disagg_planner.yaml
# vllm_v1
cd $DYNAMO_HOME/examples/vllm_v1
dynamo serve graphs.disagg_planner:Frontend -f ./configs/disagg_planner.yaml
``` ```
\ No newline at end of file
...@@ -22,6 +22,8 @@ from fastapi import FastAPI ...@@ -22,6 +22,8 @@ from fastapi import FastAPI
from pydantic import BaseModel from pydantic import BaseModel
import dynamo.sdk as sdk import dynamo.sdk as sdk
from dynamo.planner.planner_sla import Planner
from dynamo.planner.prometheus import Prometheus
from dynamo.sdk import depends, service from dynamo.sdk import depends, service
from dynamo.sdk.lib.config import ServiceConfig from dynamo.sdk.lib.config import ServiceConfig
from dynamo.sdk.lib.image import DYNAMO_IMAGE from dynamo.sdk.lib.image import DYNAMO_IMAGE
...@@ -59,6 +61,8 @@ class FrontendConfig(BaseModel): ...@@ -59,6 +61,8 @@ class FrontendConfig(BaseModel):
) )
class Frontend: class Frontend:
worker = depends(SimpleLoadBalancer) worker = depends(SimpleLoadBalancer)
planner = depends(Planner)
prometheus = depends(Prometheus)
def __init__(self): def __init__(self):
"""Initialize Frontend service with HTTP server and model configuration.""" """Initialize Frontend service with HTTP server and model configuration."""
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Disaggregated vllm_v1 serving configuration with the SLA planner and a
# Prometheus server enabled.

# Values shared between services; a service lists the keys it wants in its
# own `common-configs` entry.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
kv-transfer-config: '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# HTTP frontend. The port set here is the same one the Prometheus 'frontend'
# scrape job below targets.
Frontend:
endpoint: dynamo.SimpleLoadBalancer.generate_disagg
port: 8000
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
SimpleLoadBalancer:
enable_disagg: true
common-configs: [model, kv-transfer-config, served_model_name]
# Prefill worker: one worker with one GPU.
VllmPrefillWorker:
enforce-eager: true
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, kv-transfer-config, served_model_name]
# Decode worker: one worker with one GPU.
VllmDecodeWorker:
enforce-eager: true
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, kv-transfer-config, served_model_name]
# Prometheus server settings: scrape itself (localhost:9090) and the
# frontend (localhost:8000) every 5 seconds.
Prometheus:
global:
scrape_interval: 5s
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'frontend'
static_configs:
- targets: ['localhost:8000']
# SLA planner settings.
# `backend` must be "vllm_v1" here so the planner addresses the
# VllmPrefillWorker / VllmDecodeWorker components defined above.
# NOTE(review): adjustment-interval is presumably in seconds; isl/osl look like
# input/output sequence lengths and ttft/itl like latency targets — confirm
# meanings and units against the planner documentation.
Planner:
backend: "vllm_v1"
adjustment-interval: 180
profile-results-dir: "/workspace/examples/profiling_results"
isl: 3000
osl: 150
ttft: 0.5
itl: 0.05
load-predictor: "arima"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.simple_load_balancer import SimpleLoadBalancer
from components.worker import VllmDecodeWorker, VllmPrefillWorker
from dynamo.planner.planner_sla import Planner
from dynamo.planner.prometheus import Prometheus
# Wire up the disaggregated serving graph with planner support.
# Frontend is linked to SimpleLoadBalancer, and the object returned by that
# link() call is then linked to the prefill and decode workers.
# NOTE(review): assumes link() returns a handle whose further link() calls
# attach dependencies to SimpleLoadBalancer — confirm against the SDK.
load_balancer = Frontend.link(SimpleLoadBalancer)
load_balancer.link(VllmPrefillWorker)
load_balancer.link(VllmDecodeWorker)
# Planner and Prometheus are linked directly to Frontend rather than to the
# load balancer.
Frontend.link(Planner)
Frontend.link(Prometheus)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment