1. `local` - uses circus to start/stop worker subprocesses
2. `kubernetes` - uses the kubernetes API to adjust replicas of the DynamoGraphDeployment resource, which automatically scales the corresponding worker pods up or down
## Local Backend (LocalPlanner)
### Testing
For manual testing, you can use the `controller_test.py` file to add or remove components after you've run a serve command on a Dynamo pipeline where the planner is linked.
## Kubernetes Backend
The Kubernetes backend works by updating the replicas count of the DynamoGraphDeployment custom resource. When the planner determines that workers need to be scaled up or down based on workload metrics, it uses the Kubernetes API to patch the DynamoGraphDeployment resource specification, changing the replicas count for the appropriate worker component. The Kubernetes operator then reconciles this change by creating or terminating the necessary pods. This provides a seamless autoscaling experience in Kubernetes environments without requiring manual intervention.
The Kubernetes backend is used automatically by the planner when your pipeline is deployed with `dynamo deployment create`. By default, the planner runs in no-op mode: it monitors metrics but takes no scaling actions. To enable actual scaling, also specify `--Planner.no-operation=false`.
There are two additional rules set by the planner to prevent over-compensation:
1. We do not scale up prefill workers if the prefill queue size is estimated to drop below `--prefill-queue-scale-up-threshold` within the next `NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD=3` adjustment intervals, following the trend observed in the current adjustment interval.
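As a sketch, this buffer-period rule amounts to a simple linear trend projection. The function and variable names below are illustrative assumptions, not the planner's actual implementation:

```python
# Hypothetical sketch of the prefill scale-up guard described above.
# Names and the linear-trend model are assumptions, not the planner's code.

NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD = 3

def should_scale_up_prefill(queue_size: float,
                            queue_delta_per_interval: float,
                            scale_up_threshold: float) -> bool:
    """Skip scale-up if the current trend would drain the queue below
    the threshold within the buffer period anyway."""
    if queue_size <= scale_up_threshold:
        return False  # not above the threshold in the first place
    projected = (queue_size
                 + queue_delta_per_interval * NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD)
    if projected < scale_up_threshold:
        return False  # queue is already trending down fast enough
    return True
```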
## Usage
The planner is started automatically as part of Dynamo pipelines when running `dynamo serve`. You can configure the planner like any other component in your pipeline, either via YAML configuration or through CLI arguments.
Usage:
```bash
# Configure the planner through YAML configuration
dynamo serve graphs.disagg:Frontend -f disagg.yaml
# disagg.yaml
# ...
# Planner:
# environment: local
# no-operation: false
# log-dir: log/planner
# Configure the planner through CLI arguments
dynamo serve graphs.disagg:Frontend -f disagg.yaml --Planner.environment=local --Planner.no-operation=false --Planner.log-dir=log/planner
```
The planner accepts the following configuration options:
* `namespace` (str, default: "dynamo"): Namespace planner will look at
* `served-model-name` (str, default: "vllm"): Model name that is being served
* `no-operation` (bool, default: false): Do not make any adjustments, just observe the metrics and log to tensorboard.
* `adjustment-interval` (int, default: 30): Interval in seconds between scaling adjustments
* `metric-pulling-interval` (int, default: 1): Interval in seconds between metric pulls
* `max-gpu-budget` (int, default: 8): Maximum number of GPUs to use, planner will not scale up more than this number of GPUs for prefill plus decode workers
* `min-gpu-budget` (int, default: 1): Minimum number of GPUs to use, planner will not scale down below this number of GPUs for prefill or decode workers
* `decode-kv-scale-up-threshold` (float, default: 0.9): KV cache utilization threshold to scale up decode workers
* `decode-kv-scale-down-threshold` (float, default: 0.5): KV cache utilization threshold to scale down decode workers
* `prefill-queue-scale-up-threshold` (float, default: 0.5): Queue utilization threshold to scale up prefill workers
* `prefill-queue-scale-down-threshold` (float, default: 0.2): Queue utilization threshold to scale down prefill workers
* `decode-engine-num-gpu` (int, default: 1): Number of GPUs per decode engine
* `prefill-engine-num-gpu` (int, default: 1): Number of GPUs per prefill engine
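To illustrate how the KV-cache utilization thresholds above interact, here is a minimal sketch of a threshold-based decision; the function name and structure are assumptions for illustration, not the planner's real code:

```python
# Illustrative only: how the decode KV-cache thresholds drive a scaling
# decision. The function name and return convention are assumptions.

def decode_scaling_decision(kv_utilization: float,
                            scale_up_threshold: float = 0.9,
                            scale_down_threshold: float = 0.5) -> int:
    """Return +1 to add a decode worker, -1 to remove one, 0 to hold."""
    if kv_utilization > scale_up_threshold:
        return 1
    if kv_utilization < scale_down_threshold:
        return -1
    return 0
```

Utilization between the two thresholds is a deliberate dead band: it prevents the planner from oscillating between scale-up and scale-down on small metric fluctuations.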
Alternatively, you can run the planner as a standalone Python process. The configuration options above can be passed in directly as CLI arguments:
* `--adjustment-interval` (int, default: 30): Interval in seconds between scaling adjustments
* `--metric-pulling-interval` (int, default: 1): Interval in seconds between metric pulls
* `--max-gpu-budget` (int, default: 8): Maximum number of GPUs to use, planner will not scale up more than this number of GPUs for prefill plus decode workers
* `--min-gpu-budget` (int, default: 1): Minimum number of GPUs to use, planner will not scale down below this number of GPUs for prefill or decode workers
* `--decode-kv-scale-up-threshold` (float, default: 0.9): KV cache utilization threshold to scale up decode workers
* `--decode-kv-scale-down-threshold` (float, default: 0.5): KV cache utilization threshold to scale down decode workers
* `--prefill-queue-scale-up-threshold` (float, default: 0.5): Queue utilization threshold to scale up prefill workers
* `--prefill-queue-scale-down-threshold` (float, default: 0.2): Queue utilization threshold to scale down prefill workers
* `--decode-engine-num-gpu` (int, default: 1): Number of GPUs per decode engine
* `--prefill-engine-num-gpu` (int, default: 1): Number of GPUs per prefill engine
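The GPU budget options bound every scaling decision. A minimal sketch of that constraint check, with illustrative names (not the actual planner logic):

```python
# Hypothetical sketch of how the GPU budget flags above constrain scaling.
# Names and the exact check are assumptions; the real planner may differ.

def fits_gpu_budget(prefill_workers: int, decode_workers: int,
                    prefill_engine_num_gpu: int = 1,
                    decode_engine_num_gpu: int = 1,
                    max_gpu_budget: int = 8,
                    min_gpu_budget: int = 1) -> bool:
    """Return True if the proposed worker counts fit the GPU budget:
    prefill plus decode GPUs must not exceed max_gpu_budget, and each
    worker type must keep at least min_gpu_budget GPUs."""
    prefill_gpus = prefill_workers * prefill_engine_num_gpu
    decode_gpus = decode_workers * decode_engine_num_gpu
    if prefill_gpus + decode_gpus > max_gpu_budget:
        return False
    if prefill_gpus < min_gpu_budget or decode_gpus < min_gpu_budget:
        return False
    return True
```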
1. `local` - uses circus to start/stop worker subprocesses
2. `kubernetes` - uses kubernetes to scale up/down the number of worker pods by updating the replicas count of the DynamoGraphDeployment resource
### Local Backend
> [!NOTE]
> At the moment, the planner works best if your initial replicas per worker is 1. This is because if you specify replicas > 1 when you initially start `dynamo serve`, the current implementation in `serving.py` starts each process in the same watcher.
### Kubernetes Backend
The Kubernetes backend works by updating the replicas count of the DynamoGraphDeployment custom resource. When the planner detects the need to scale up or down a specific worker type, it uses the Kubernetes API to patch the DynamoGraphDeployment resource, modifying the replicas count for the appropriate component. The Kubernetes operator then reconciles this change by creating or removing the necessary pods. This provides a seamless scaling experience in Kubernetes environments without requiring manual intervention.
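As a rough sketch, the patch described above could be built and applied with the official Kubernetes Python client. The CRD group, version, resource name, and the `services` field path below are assumptions about the DynamoGraphDeployment schema, not documented values:

```python
# Illustrative sketch of the replicas patch described above. The resource
# group/version/plural and the "services" field path are assumptions about
# the DynamoGraphDeployment CRD, not a documented schema.

def build_replicas_patch(component: str, replicas: int) -> dict:
    """Build a merge-patch body setting the replicas count for one component."""
    return {"spec": {"services": {component: {"replicas": replicas}}}}

# Applying it with the official Kubernetes Python client might look like:
#
#   from kubernetes import client, config
#   config.load_kube_config()
#   api = client.CustomObjectsApi()
#   api.patch_namespaced_custom_object(
#       group="nvidia.com",              # assumed group
#       version="v1alpha1",              # assumed version
#       namespace="dynamo",
#       plural="dynamographdeployments",
#       name="my-deployment",            # hypothetical deployment name
#       body=build_replicas_patch("VllmWorker", 3),
#   )
```

The operator's reconciliation loop then observes the changed replicas count and creates or terminates pods to match.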
```
> [!NOTE]
> The planner component is enabled by default for all deployment architectures but is set to no-op mode. This means the planner observes metrics but doesn't take scaling actions. To enable active scaling, you can add `--Planner.no-operation=false` to your `dynamo serve` command. For more details, see the [Planner documentation](../../components/planner/README.md).
### Example architectures
_Note_: For a non-dockerized deployment, first export `DYNAMO_HOME` to point to the dynamo repository root, e.g. `export DYNAMO_HOME=$(pwd)`
dynamo deployment create $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg.yaml
```
**Note**: Optionally add `--Planner.no-operation=false` at the end of the deployment command to enable the planner component to take scaling actions on your deployment.
### Testing the Deployment
Once the deployment is complete, you can test it using: