feat: SLA-based Planner (#1420)

Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
Co-authored-by: Alec <35311602+alec-flowers@users.noreply.github.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

feat: SLA-based Planner (#1420)
Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
Co-authored-by: Alec <35311602+alec-flowers@users.noreply.github.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
3f53a78e · Hongkuan Zhou · GitHub · 219cfa1f · 3f53a78e · 3f53a78e
Unverified commit 3f53a78e, authored Jun 13, 2025 by Hongkuan Zhou; committed by GitHub on Jun 13, 2025.
4 changed files
--- a/examples/llm/components/planner_service.py
+++ b/examples/llm/components/planner_service.py
@@ -19,7 +19,7 @@ import logging
 from pydantic import BaseModel

 from components.planner import start_planner  # type: ignore[attr-defined]
-from dynamo.planner.defaults import PlannerDefaults
+from dynamo.planner.defaults import LoadPlannerDefaults
 from dynamo.runtime.logging import configure_dynamo_logging
 from dynamo.sdk import async_on_start, dynamo_context, endpoint, service
 from dynamo.sdk.core.protocol.interface import ComponentType
@@ -56,44 +56,46 @@ class Planner:

        self.args = argparse.Namespace(
            namespace=self.namespace,
-            environment=config_instance.get("environment", PlannerDefaults.environment),
+            environment=config_instance.get(
+                "environment", LoadPlannerDefaults.environment
+            ),
            no_operation=config_instance.get(
-                "no-operation", PlannerDefaults.no_operation
+                "no-operation", LoadPlannerDefaults.no_operation
            ),
-            log_dir=config_instance.get("log-dir", PlannerDefaults.log_dir),
+            log_dir=config_instance.get("log-dir", LoadPlannerDefaults.log_dir),
            adjustment_interval=config_instance.get(
-                "adjustment-interval", PlannerDefaults.adjustment_interval
+                "adjustment-interval", LoadPlannerDefaults.adjustment_interval
            ),
            metric_pulling_interval=config_instance.get(
-                "metric-pulling-interval", PlannerDefaults.metric_pulling_interval
+                "metric-pulling-interval", LoadPlannerDefaults.metric_pulling_interval
            ),
            max_gpu_budget=config_instance.get(
-                "max-gpu-budget", PlannerDefaults.max_gpu_budget
+                "max-gpu-budget", LoadPlannerDefaults.max_gpu_budget
            ),
            min_endpoint=config_instance.get(
-                "min-endpoint", PlannerDefaults.min_endpoint
+                "min-endpoint", LoadPlannerDefaults.min_endpoint
            ),
            decode_kv_scale_up_threshold=config_instance.get(
                "decode-kv-scale-up-threshold",
-                PlannerDefaults.decode_kv_scale_up_threshold,
+                LoadPlannerDefaults.decode_kv_scale_up_threshold,
            ),
            decode_kv_scale_down_threshold=config_instance.get(
                "decode-kv-scale-down-threshold",
-                PlannerDefaults.decode_kv_scale_down_threshold,
+                LoadPlannerDefaults.decode_kv_scale_down_threshold,
            ),
            prefill_queue_scale_up_threshold=config_instance.get(
                "prefill-queue-scale-up-threshold",
-                PlannerDefaults.prefill_queue_scale_up_threshold,
+                LoadPlannerDefaults.prefill_queue_scale_up_threshold,
            ),
            prefill_queue_scale_down_threshold=config_instance.get(
                "prefill-queue-scale-down-threshold",
-                PlannerDefaults.prefill_queue_scale_down_threshold,
+                LoadPlannerDefaults.prefill_queue_scale_down_threshold,
            ),
            decode_engine_num_gpu=config_instance.get(
-                "decode-engine-num-gpu", PlannerDefaults.decode_engine_num_gpu
+                "decode-engine-num-gpu", LoadPlannerDefaults.decode_engine_num_gpu
            ),
            prefill_engine_num_gpu=config_instance.get(
-                "prefill-engine-num-gpu", PlannerDefaults.prefill_engine_num_gpu
+                "prefill-engine-num-gpu", LoadPlannerDefaults.prefill_engine_num_gpu
            ),
        )


--- a/examples/vllm_v0/components/frontend.py
+++ b/examples/vllm_v0/components/frontend.py
@@ -22,6 +22,8 @@ from fastapi import FastAPI
 from pydantic import BaseModel

 from dynamo import sdk
+from dynamo.planner.planner_sla import Planner
+from dynamo.planner.prometheus import Prometheus
 from dynamo.sdk import depends, service
 from dynamo.sdk.lib.config import ServiceConfig
 from dynamo.sdk.lib.image import DYNAMO_IMAGE
@@ -60,6 +62,8 @@ class FrontendConfig(BaseModel):
 )
 class Frontend:
    worker = depends(VllmWorker)
+    planner = depends(Planner)
+    prometheus = depends(Prometheus)

    def __init__(self):
        """Initialize Frontend service with HTTP server and model configuration."""

--- a/examples/vllm_v0/configs/disagg_planner.yaml
+++ b/examples/vllm_v0/configs/disagg_planner.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+Common:
+  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  block-size: 64
+  max-model-len: 16384
+  kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
+
+Frontend:
+  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  endpoint: dynamo.VllmWorker.generate
+  port: 8000
+  router: round-robin
+  common-configs: [block-size]
+
+VllmWorker:
+  remote-prefill: true
+  conditional-disagg: true
+  max-local-prefill-length: 10
+  max-prefill-queue-size: 2
+  enable-prefix-caching: true
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: 1
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+
+PrefillWorker:
+  max-num-batched-tokens: 16384
+  ServiceArgs:
+    workers: 1
+    resources:
+      gpu: 1
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
+
+Prometheus:
+  global:
+    scrape_interval: 5s
+  scrape_configs:
+    - job_name: 'prometheus'
+      static_configs:
+        - targets: ['localhost:9090']
+    - job_name: 'frontend'
+      static_configs:
+        - targets: ['localhost:8000']
+
+Planner:
+  adjustment-interval: 180
+  profile-results-dir: "/workspace/examples/profiling_results"
+  isl: 3000
+  osl: 150
+  ttft: 0.5
+  itl: 0.05
+  load-predictor: "arima"
--- a/examples/vllm_v0/graphs/disagg_planner.py
+++ b/examples/vllm_v0/graphs/disagg_planner.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from components.frontend import Frontend
+from components.prefill_worker import PrefillWorker
+from components.worker import VllmWorker
+
+from dynamo.planner.planner_sla import Planner
+from dynamo.planner.prometheus import Prometheus
+
+Frontend.link(VllmWorker).link(PrefillWorker)
+Frontend.link(Planner)
+Frontend.link(Prometheus)