"tests/vscode:/vscode.git/clone" did not exist on "906a19cdb06b390f3dde287b06a3fe26c03a45e5"
Unverified Commit 2bed47eb authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

feat: support sla planner in vllm_v1 example (#1680)

parent 92f06b0e
......@@ -18,6 +18,7 @@
class BasePlannerDefaults:
namespace = "dynamo"
environment = "local"
backend = "vllm_v0"
no_operation = False
log_dir = None
adjustment_interval = 180 # in seconds
......@@ -48,12 +49,16 @@ class SLAPlannerDefaults(BasePlannerDefaults):
class VllmV0ComponentName:
    """Worker component and endpoint names for the vLLM v0 example.

    The planner looks these up through ``WORKER_COMPONENT_NAMES[backend]`` and
    uses them in ``.component(<worker>).endpoint(<worker_endpoint>)`` when it
    creates its clients, and passes the worker names to the connector when it
    adds or removes components.

    NOTE(review): presumably this is the entry registered for the "vllm_v0"
    backend -- confirm against WORKER_COMPONENT_NAMES.
    """

    # Prefill worker component and the endpoint the planner's client uses.
    prefill_worker = "PrefillWorker"
    prefill_worker_endpoint = "mock"
    # Decode worker component and the endpoint the planner's client uses.
    decode_worker = "VllmWorker"
    decode_worker_endpoint = "generate"
class VllmV1ComponentName:
    """Worker component and endpoint names for the vLLM v1 example.

    The planner looks these up through ``WORKER_COMPONENT_NAMES[backend]`` and
    uses them in ``.component(<worker>).endpoint(<worker_endpoint>)`` when it
    creates its clients, and passes the worker names to the connector when it
    adds or removes components.

    NOTE(review): presumably this is the entry registered for the "vllm_v1"
    backend -- confirm against WORKER_COMPONENT_NAMES.
    """

    # Prefill worker component and the endpoint the planner's client uses.
    prefill_worker = "VllmPrefillWorker"
    prefill_worker_endpoint = "generate"
    # Decode worker component and the endpoint the planner's client uses.
    decode_worker = "VllmDecodeWorker"
    decode_worker_endpoint = "generate"
WORKER_COMPONENT_NAMES = {
......
......@@ -64,6 +64,7 @@ class Planner:
environment=config_instance.get(
"environment", SLAPlannerDefaults.environment
),
backend=config_instance.get("backend", SLAPlannerDefaults.backend),
no_operation=config_instance.get(
"no-operation", SLAPlannerDefaults.no_operation
),
......
......@@ -22,7 +22,7 @@ from dataclasses import dataclass
from typing import Optional
from dynamo.planner import KubernetesConnector, LocalConnector
from dynamo.planner.defaults import SLAPlannerDefaults
from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SLAPlannerDefaults
from dynamo.planner.utils.load_predictor import LOAD_PREDICTORS
from dynamo.planner.utils.perf_interpolation import (
DecodeInterpolator,
......@@ -93,8 +93,12 @@ class Planner:
if self.prefill_client is None:
self.prefill_client = (
await self.runtime.namespace(self.namespace)
.component("PrefillWorker")
.endpoint("mock")
.component(WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker)
.endpoint(
WORKER_COMPONENT_NAMES[
self.args.backend
].prefill_worker_endpoint
)
.client()
)
# TODO: remove this sleep after rust client() is blocking until watching state
......@@ -110,8 +114,10 @@ class Planner:
if self.workers_client is None:
self.workers_client = (
await self.runtime.namespace(self.namespace)
.component("VllmWorker")
.endpoint("generate")
.component(WORKER_COMPONENT_NAMES[self.args.backend].decode_worker)
.endpoint(
WORKER_COMPONENT_NAMES[self.args.backend].decode_worker_endpoint
)
.client()
)
# TODO: remove this sleep after rust client() is blocking until watching state
......@@ -270,17 +276,29 @@ class Planner:
# TODO: add a check to avoid scaling before the previous scaling is completed
if next_num_p > len(self.p_endpoints):
for _ in range(next_num_p - len(self.p_endpoints)):
self.connector.add_component("PrefillWorker", blocking=False)
self.connector.add_component(
WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker,
blocking=False,
)
elif next_num_p < len(self.p_endpoints):
for _ in range(len(self.p_endpoints) - next_num_p):
self.connector.remove_component("PrefillWorker", blocking=False)
self.connector.remove_component(
WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker,
blocking=False,
)
if next_num_d > len(self.d_endpoints):
for _ in range(next_num_d - len(self.d_endpoints)):
self.connector.add_component("VllmWorker", blocking=False)
self.connector.add_component(
WORKER_COMPONENT_NAMES[self.args.backend].decode_worker,
blocking=False,
)
elif next_num_d < len(self.d_endpoints):
for _ in range(len(self.d_endpoints) - next_num_d):
self.connector.remove_component("VllmWorker", blocking=False)
self.connector.remove_component(
WORKER_COMPONENT_NAMES[self.args.backend].decode_worker,
blocking=False,
)
async def run(self):
"""Main loop for the planner"""
......
......@@ -211,6 +211,23 @@ RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
RUN ln -sf /bin/bash /bin/sh
# Install prometheus
# Prometheus release to download; declared as an ARG so it can be overridden
# with --build-arg.
ARG PROM_VERSION=3.4.1
# Tools used by the download step below (curl, tar, CA certificates). The apt
# package lists are removed in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
curl tar ca-certificates && \
rm -rf /var/lib/apt/lists/*
# Map the dpkg architecture (amd64/arm64) to the matching Prometheus release
# platform suffix and abort the build on any other architecture. The release
# tarball is streamed from GitHub into /tmp, only the `prometheus` binary is
# moved to /usr/local/bin, and the extracted directory is deleted afterwards.
RUN ARCH=$(dpkg --print-architecture) && \
case "$ARCH" in \
amd64) PLATFORM=linux-amd64 ;; \
arm64) PLATFORM=linux-arm64 ;; \
*) echo "Unsupported architecture: $ARCH" && exit 1 ;; \
esac && \
curl -fsSL https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.${PLATFORM}.tar.gz \
| tar -xz -C /tmp && \
mv /tmp/prometheus-${PROM_VERSION}.${PLATFORM}/prometheus /usr/local/bin/ && \
chmod +x /usr/local/bin/prometheus && \
rm -rf /tmp/prometheus-${PROM_VERSION}.${PLATFORM}
### BUILDS ###
# Rust build/dev dependencies
......
......@@ -37,6 +37,7 @@ pynvml
pyright
PyYAML
scikit-learn
scipy<1.14.0 # Pin scipy version for pmdarima compatibility
sentencepiece
tensorboard==2.19.0
tensorboardX==2.6.2.2
......
......@@ -140,8 +140,13 @@ SLA-planner and prometheus server are provided as common components that can be
- Link `Planner` and `Prometheus` in the graph.
- Add `Planner` and `Prometheus` configurations in the config file.
A `vllm_v0` example is available for reference:
We provide examples for `vllm_v0` and `vllm_v1`:
```bash
# vllm_v0
cd $DYNAMO_HOME/examples/vllm_v0
dynamo serve graphs.disagg_planner:Frontend -f ./configs/disagg_planner.yaml
# vllm_v1
cd $DYNAMO_HOME/examples/vllm_v1
dynamo serve graphs.disagg_planner:Frontend -f ./configs/disagg_planner.yaml
```
\ No newline at end of file
......@@ -22,6 +22,8 @@ from fastapi import FastAPI
from pydantic import BaseModel
import dynamo.sdk as sdk
from dynamo.planner.planner_sla import Planner
from dynamo.planner.prometheus import Prometheus
from dynamo.sdk import depends, service
from dynamo.sdk.lib.config import ServiceConfig
from dynamo.sdk.lib.image import DYNAMO_IMAGE
......@@ -59,6 +61,8 @@ class FrontendConfig(BaseModel):
)
class Frontend:
worker = depends(SimpleLoadBalancer)
planner = depends(Planner)
prometheus = depends(Prometheus)
def __init__(self):
"""Initialize Frontend service with HTTP server and model configuration."""
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE(review): nesting below was reconstructed from a flattened copy of this
# file (key order plus the usual Prometheus / ServiceArgs layout) -- confirm
# against the original before relying on it.

# Values shared by several services; presumably pulled in through the
# `common-configs` lists below -- confirm.
Common:
  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
  kv-transfer-config: '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B

Frontend:
  endpoint: dynamo.SimpleLoadBalancer.generate_disagg
  port: 8000
  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B

SimpleLoadBalancer:
  enable_disagg: true
  common-configs: [model, kv-transfer-config, served_model_name]

VllmPrefillWorker:
  enforce-eager: true
  ServiceArgs:
    workers: 1
    resources:
      gpu: '1'
  common-configs: [model, kv-transfer-config, served_model_name]

VllmDecodeWorker:
  enforce-eager: true
  ServiceArgs:
    workers: 1
    resources:
      gpu: '1'
  common-configs: [model, kv-transfer-config, served_model_name]

# Two scrape jobs: 'prometheus' targets localhost:9090 and 'frontend' targets
# localhost:8000, which is the port configured under Frontend above.
Prometheus:
  global:
    scrape_interval: 5s
  scrape_configs:
    - job_name: 'prometheus'
      static_configs:
        - targets: ['localhost:9090']
    - job_name: 'frontend'
      static_configs:
        - targets: ['localhost:8000']

# `backend` is read by the planner to pick the worker component and endpoint
# names it connects to and scales.
Planner:
  backend: "vllm_v1"
  adjustment-interval: 180
  profile-results-dir: "/workspace/examples/profiling_results"
  isl: 3000
  osl: 150
  ttft: 0.5
  itl: 0.05
  load-predictor: "arima"
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.simple_load_balancer import SimpleLoadBalancer
from components.worker import VllmDecodeWorker, VllmPrefillWorker
from dynamo.planner.planner_sla import Planner
from dynamo.planner.prometheus import Prometheus
# Service graph wiring for the disaggregated example with planner.
# Frontend is linked to SimpleLoadBalancer; both vLLM workers are then linked
# on the object that call returns.
load_balancer = Frontend.link(SimpleLoadBalancer)
load_balancer.link(VllmPrefillWorker)
load_balancer.link(VllmDecodeWorker)
# Planner and Prometheus are linked directly on Frontend.
Frontend.link(Planner)
Frontend.link(Prometheus)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment