feat: deploy SLA profiler to k8s (#2030)

Co-authored-by: hongkuan <hongkuanz@nvidia.com> Co-authored-by: mohammedabdulwahhab <furkhan324@berkeley.edu> Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>

feat: deploy SLA profiler to k8s (#2030)
Co-authored-by: hongkuan <hongkuanz@nvidia.com> Co-authored-by: mohammedabdulwahhab <furkhan324@berkeley.edu> Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>
fe718fd2 · hhzhang16 · GitHub · ba3ac235 · fe718fd2 · fe718fd2
Unverified Commit fe718fd2 authored Jul 24, 2025 by hhzhang16 Committed by GitHub Jul 24, 2025
20 changed files
--- a/benchmarks/profiler/deploy/profile_sla_binding.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_binding.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Binds the namespaced Role "profile-sla-role" to the ServiceAccount
+# "profile-sla-sa" so that pods running under that account receive the
+# role's permissions inside ${NAMESPACE}.
+# NOTE(review): ${NAMESPACE} is a placeholder; presumably it is substituted
+# (e.g. with envsubst) before the manifest is applied - confirm.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: profile-sla-binding
+  namespace: ${NAMESPACE}
+subjects:
+- kind: ServiceAccount
+  name: profile-sla-sa
+  namespace: ${NAMESPACE}
+roleRef:
+  kind: Role
+  name: profile-sla-role
+  apiGroup: rbac.authorization.k8s.io
--- a/benchmarks/profiler/deploy/profile_sla_job.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_job.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# One-shot Job that runs benchmarks/profiler/profile_sla.py inside the cluster
+# under the "profile-sla-sa" ServiceAccount and writes its results to the
+# "profiling-pvc" volume.
+# NOTE(review): ${NAMESPACE}, ${DOCKER_IMAGE} and ${DGD_CONFIG_FILE} are
+# placeholders; presumably they are substituted (e.g. with envsubst) before
+# the manifest is applied - confirm.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: profile-sla
+  namespace: ${NAMESPACE}
+spec:
+  template:
+    spec:
+      serviceAccountName: profile-sla-sa
+      containers:
+      - name: profile-sla
+        image: ${DOCKER_IMAGE}
+        resources:
+          requests:
+            cpu: "1"
+            memory: "2Gi"
+          limits:
+            cpu: "2"
+            memory: "4Gi"
+        env:
+          # Token is read from key HF_TOKEN of the pre-existing secret "hf-token-secret"
+          - name: HUGGING_FACE_HUB_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: hf-token-secret
+                key: HF_TOKEN
+          # NATS / etcd addresses are derived from the namespace name
+          - name: NATS_SERVER
+            value: nats://${NAMESPACE}-nats:4222
+          - name: ETCD_ENDPOINTS
+            value: ${NAMESPACE}-etcd:2379
+        command: ["python", "/workspace/benchmarks/profiler/profile_sla.py"]
+        args:
+          - --config
+          - ${DGD_CONFIG_FILE}
+          # --output-dir matches the mountPath of output-volume below, so results land on the PVC
+          - --output-dir
+          - /workspace/profiling_results
+          - --namespace
+          - ${NAMESPACE}
+        volumeMounts:
+          - name: output-volume
+            mountPath: /workspace/profiling_results
+      # Never restart the container in place on failure
+      restartPolicy: Never
+      volumes:
+        - name: output-volume
+          persistentVolumeClaim:
+            claimName: profiling-pvc
+  # No retries: a failed pod marks the Job as failed
+  backoffLimit: 0
--- a/benchmarks/profiler/deploy/profile_sla_rbac.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_rbac.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Namespaced Role listing the API permissions granted to the SLA profiler:
+# managing DynamoGraphDeployment resources, listing pods and reading pod logs.
+# NOTE(review): ${NAMESPACE} is a placeholder; presumably it is substituted
+# (e.g. with envsubst) before the manifest is applied - confirm.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: profile-sla-role
+  namespace: ${NAMESPACE}
+rules:
+  # DynamoGraphDeployment custom resources - needed for create/get/delete operations
+  - apiGroups: ["nvidia.com"]
+    resources: ["dynamographdeployments"]
+    verbs: ["get", "create", "delete"]
+  # Pods - needed for listing pods by label selector and getting logs
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["list"]
+  # Logs are a separate subresource of pods and need their own rule
+  - apiGroups: [""]
+    resources: ["pods/log"]
+    verbs: ["get"]
--- a/benchmarks/profiler/deploy/profile_sla_sa.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_sa.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# ServiceAccount referenced by the profile-sla Job (serviceAccountName) and
+# by the profile-sla-binding RoleBinding.
+# NOTE(review): ${NAMESPACE} is a placeholder; presumably it is substituted
+# (e.g. with envsubst) before the manifest is applied - confirm.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: profile-sla-sa
+  namespace: ${NAMESPACE}
+# Pull secret attached to pods that use this account; the secret
+# "nvcr-imagepullsecret" must already exist in the namespace.
+imagePullSecrets:
+  - name: nvcr-imagepullsecret
--- a/benchmarks/profiler/deploy/profiling_pvc.yaml
+++ b/benchmarks/profiler/deploy/profiling_pvc.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Persistent storage claimed by the profile-sla Job (claimName: profiling-pvc)
+# and mounted there at /workspace/profiling_results.
+# NOTE(review): ${NAMESPACE} is a placeholder; presumably it is substituted
+# (e.g. with envsubst) before the manifest is applied - confirm.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: profiling-pvc
+  namespace: ${NAMESPACE}
+spec:
+  # No storageClassName is set, so the cluster's default storage class (if any) applies
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 50Gi
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
--- a/benchmarks/profiler/utils/config.py
+++ b/benchmarks/profiler/utils/config.py
@@ -14,8 +14,11 @@
 # limitations under the License.
 import logging
+from copy import deepcopy
 from typing import Literal
+from utils.defaults import DEFAULT_MODEL_NAME, DYNAMO_RUN_DEFAULT_PORT
 from dynamo.planner.defaults import WORKER_COMPONENT_NAMES
 logger = logging.getLogger(__name__)
@@ -29,192 +32,184 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)
-class VllmV0ConfigModifier:
+def break_arguments(args: list[str]) -> list[str]:
-    @classmethod
+    ans = []
-    def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict:
+    if isinstance(args, str):
-        config = config.copy()
+        ans = args.split(" ")
+    else:
-        # disable planner
+        for arg in args:
-        if "Planner" in config:
+            ans.extend(arg.split(" "))
-            config["Planner"]["no-operation"] = True
+    return ans
-        if target == "prefill":
-            if WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker in config:
-                # make PrefillWorker into VllmWorker
-                del config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker]
-                config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker] = config[
-                    WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker
-                ]
-                del config[WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker]
-            # to profile prefill, we disable prefix caching
+def join_arguments(args: list[str]) -> list[str]:
-            config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][
+    return [" ".join(args)]
-                "enable-prefix-caching"
-            ] = False
-        elif target == "decode":
-            if WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker in config:
-                del config[WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker]
-            # to profile prefill, we enable prefix caching to pass the prefill stage
-            config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][
-                "enable-prefix-caching"
-            ] = True
-        # set num workers to 1
+def append_argument(args: list[str], to_append) -> list[str]:
-        config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker]["ServiceArgs"][
+    idx = find_arg_index(args)
-            "workers"
+    if isinstance(to_append, list):
-        ] = 1
+        args[idx:idx] = to_append
+    else:
-        # set PP to 1
+        args.insert(idx, to_append)
-        if (
+    return args
-            "pipeline-parallel-size"
-            in config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker]
-            and config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][
-                "pipeline-parallel-size"
-            ]
-            > 1
-        ):
-            logger.warning("Currently we only support TP, setting PP to 1")
-            config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][
-                "pipeline-parallel-size"
-            ] = 1
-        # always local prefill
-        config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][
-            "remote-prefill"
-        ] = False
-        config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][
-            "conditional-disagg"
-        ] = False
-        return config
-    @classmethod
+def find_arg_index(args: list[str]) -> int:
-    def set_config_tp_size(cls, config: dict, tp_size: int):
+    # find the correct index to insert an argument
-        config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][
+    idx = len(args)
-            "tensor-parallel-size"
-        ] = tp_size
-        config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker]["ServiceArgs"][
-            "resources"
-        ]["gpu"] = tp_size
-        return config
-    @classmethod
+    try:
-    def get_model_name(cls, config: dict) -> str:
+        new_idx = args.index("|")
-        if "Common" in config and "served_model_name" in config["Common"]:
+        idx = min(idx, new_idx)
-            return config["Common"]["served_model_name"]
+    except ValueError:
-        else:
+        pass
-            return config["Frontend"]["served_model_name"]
-    @classmethod
-    def get_port(cls, config: dict) -> int:
-        if "Common" in config and "port" in config["Common"]:
-            return config["Common"]["port"]
-        else:
-            return config["Frontend"]["port"]
-    @classmethod
-    def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
    try:
-            with open(dynamo_log_fn, "r") as f:
+        new_idx = args.index("2>&1")
-                for line in f:
+        idx = min(idx, new_idx)
-                    if "Maximum concurrency for" in line:
+    except ValueError:
-                        line = line.strip().split("Maximum concurrency for ")[1]
+        pass
-                        token_count = int(line.split(" tokens per request: ")[0])
-                        concurrency = float(line.split(" tokens per request: ")[1][:-1])
-                        logger.info(
+    return idx
-                            f"Found KV cache info: {token_count} x {concurrency} = {int(token_count * concurrency)}"
-                        )
-                        return int(token_count * concurrency)
-        except Exception as e:
-            logger.warning(
-                f"Failed to parse KV cache size from line: {line}. Error: {e}"
-            )
-        return 0
 class VllmV1ConfigModifier:
    @classmethod
    def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict:
-        config = config.copy()
+        config = deepcopy(config)
-        # disable planner
+        # set metadata name
-        if "Planner" in config:
+        config["metadata"]["name"] = "vllm-v1-agg"
-            config["Planner"]["no-operation"] = True
-        # turn-off disagg
+        # disable planner
-        config["SimpleLoadBalancer"]["enable_disagg"] = False
+        if "Planner" in config["spec"]["services"]:
+            del config["spec"]["services"]["Planner"]
        if target == "prefill":
-            if WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker in config:
+            # convert prefill worker into decode worker
-                # make VllmPrefillWorker into VllmDecodeWorker
+            config["spec"]["services"][
-                del config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]
+                WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
-                config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker] = config[
+            ] = config["spec"]["services"][
+                WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
+            ]
+            del config["spec"]["services"][
                WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
            ]
-                del config[WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker]
-            # to profile prefill, we disable prefix caching
+            args = config["spec"]["services"][
-            config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+                WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
-                "enable-prefix-caching"
+            ]["extraPodSpec"]["mainContainer"]["args"]
-            ] = False
+            args = break_arguments(args)
+            # remove --is-prefill-worker flag
+            args.remove("--is-prefill-worker")
+            # disable prefix caching
+            if "--enable-prefix-caching" in args:
+                args.remove("--enable-prefix-caching")
+            if "--no-enable-prefix-caching" not in args:
+                args = append_argument(args, "--no-enable-prefix-caching")
+            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+                "extraPodSpec"
+            ]["mainContainer"]["args"] = join_arguments(args)
        elif target == "decode":
-            if WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker in config:
+            # delete prefill worker
-                del config[WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker]
+            del config["spec"]["services"][
+                WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
+            ]
-            # to profile prefill, we enable prefix caching to pass the prefill stage
+            args = config["spec"]["services"][
-            config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+                WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
-                "enable-prefix-caching"
+            ]["extraPodSpec"]["mainContainer"]["args"]
-            ] = True
+            args = break_arguments(args)
+            # enable prefix caching
+            if "--enable-prefix-caching" not in args:
+                args = append_argument(args, "--enable-prefix-caching")
+            if "--no-enable-prefix-caching" in args:
+                args.remove("--no-enable-prefix-caching")
+            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+                "extraPodSpec"
+            ]["mainContainer"]["args"] = join_arguments(args)
        # set num workers to 1
-        config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["ServiceArgs"][
+        decode_worker_config = config["spec"]["services"][
-            "workers"
+            WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
-        ] = 1
-        # set PP to 1
-        if (
-            "pipeline-parallel-size"
-            in config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]
-            and config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
-                "pipeline-parallel-size"
        ]
-            > 1
+        decode_worker_config["replicas"] = 1
-        ):
-            logger.warning("Currently we only support TP, setting PP to 1")
-            config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
-                "pipeline-parallel-size"
-            ] = 1
        return config
    @classmethod
    def set_config_tp_size(cls, config: dict, tp_size: int):
-        config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+        config = deepcopy(config)
-            "tensor-parallel-size"
-        ] = tp_size
+        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
-        config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["ServiceArgs"][
            "resources"
-        ]["gpu"] = tp_size
+        ]["requests"]["gpu"] = str(tp_size)
+        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+            "resources"
+        ]["limits"]["gpu"] = str(tp_size)
+        args = config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
+        ]["extraPodSpec"]["mainContainer"]["args"]
+        args = break_arguments(args)
+        try:
+            idx = args.index("--tensor-parallel-size")
+            args[idx + 1] = str(tp_size)
+        except ValueError:
+            args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])
+        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+            "extraPodSpec"
+        ]["mainContainer"]["args"] = join_arguments(args)
        return config
    @classmethod
    def get_model_name(cls, config: dict) -> str:
-        if "Common" in config and "served_model_name" in config["Common"]:
+        worker_name = WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
-            return config["Common"]["served_model_name"]
+        args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][
-        else:
+            "args"
-            return config["Frontend"]["served_model_name"]
+        ]
+        args = break_arguments(args)
+        for i, arg in enumerate(args):
+            if arg == "--model" and i + 1 < len(args):
+                return args[i + 1]
+        logger.warning(
+            f"Model name not found in configuration args, using default model name: {DEFAULT_MODEL_NAME}"
+        )
+        return DEFAULT_MODEL_NAME
    @classmethod
    def get_port(cls, config: dict) -> int:
-        if "Common" in config and "port" in config["Common"]:
+        args = config["spec"]["services"]["Frontend"]["extraPodSpec"]["mainContainer"][
-            return config["Common"]["port"]
+            "args"
-        else:
+        ]
-            return config["Frontend"]["port"]
+        args = break_arguments(args)
+        try:
+            idx = args.index("--http-port")
+            return int(args[idx + 1])
+        except ValueError:
+            logger.warning(
+                f"Port not found in configuration args, using default port: {DYNAMO_RUN_DEFAULT_PORT}"
+            )
+            return DYNAMO_RUN_DEFAULT_PORT
    @classmethod
    def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
+        # TODO
        try:
            with open(dynamo_log_fn, "r") as f:
                for line in f:
@@ -237,6 +232,5 @@ class VllmV1ConfigModifier:
 CONFIG_MODIFIERS = {
-    "vllm_v0": VllmV0ConfigModifier,
    "vllm_v1": VllmV1ConfigModifier,
 }
--- a/benchmarks/profiler/utils/defaults.py
+++ b/benchmarks/profiler/utils/defaults.py
@@ -29,3 +29,6 @@ DECODE_NUM_REQUESTS_RANGE = [
    450,
    500,
 ]
+DEFAULT_MODEL_NAME = "Qwen/Qwen3-0.6B"
+DYNAMO_RUN_DEFAULT_PORT = 8000
--- a/benchmarks/profiler/utils/dynamo_deployment.py
+++ b/benchmarks/profiler/utils/dynamo_deployment.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import asyncio
+import time
+from pathlib import Path
+from typing import Optional, Union
+import aiofiles
+import httpx  # added for HTTP requests
+import kubernetes_asyncio as kubernetes
+import yaml
+from kubernetes_asyncio import client, config
+# Example chat completion request for testing deployments.
+# DynamoDeploymentClient.check_chat_completion posts this payload to
+# /v1/chat/completions with "model" set to the client's model name; the
+# "model" value below is only the default. The request is non-streaming and
+# asks for at most 30 completion tokens.
+EXAMPLE_CHAT_REQUEST = {
+    "model": "Qwen/Qwen3-0.6B",
+    "messages": [
+        {
+            "role": "user",
+            "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden.",
+        }
+    ],
+    "stream": False,
+    "max_tokens": 30,
+}
+class DynamoDeploymentClient:
+    """
+    Async helper that creates, monitors, probes and deletes one
+    DynamoGraphDeployment custom resource (group nvidia.com, version v1alpha1).
+    The Kubernetes API objects are built by create_deployment(); the other
+    cluster-facing methods use them and therefore expect create_deployment()
+    to have been called first.
+    """
+    def __init__(
+        self,
+        namespace: str,
+        model_name: str = "Qwen/Qwen3-0.6B",
+        deployment_name: str = "vllm-v1-agg",
+        frontend_port: int = 8000,
+        base_log_dir: Optional[str] = None,
+        service_name: Optional[str] = None,
+    ):
+        """
+        Initialize the client with the namespace and deployment name.
+        Args:
+            namespace: The Kubernetes namespace
+            model_name: Model name sent in the test chat completion request
+            deployment_name: Name of the deployment, defaults to vllm-v1-agg
+            frontend_port: Port used when building the frontend service URL
+            base_log_dir: Base directory for storing logs, defaults to ./logs if not specified
+            service_name: Service name for connecting to the service, defaults to {deployment_name}-frontend
+        """
+        self.namespace = namespace
+        self.deployment_name = deployment_name
+        self.model_name = model_name
+        self.service_name = service_name or f"{deployment_name}-frontend"
+        self.components: list[str] = []  # Will store component names from CR
+        self.deployment_spec: Optional[
+            dict
+        ] = None  # Will store the full deployment spec
+        self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs")
+        self.frontend_port = frontend_port
+    async def _init_kubernetes(self):
+        """
+        Load cluster credentials and build the Kubernetes API clients.
+        In-cluster configuration is tried first; if that raises, the local
+        kube config file is loaded instead.
+        """
+        try:
+            # Try in-cluster config first (for pods with service accounts)
+            config.load_incluster_config()
+        except Exception:
+            # Fallback to kube config file (for local development).
+            # kubernetes_asyncio's load_kube_config is a coroutine: without the
+            # await the configuration would never actually be loaded.
+            await config.load_kube_config()
+        self.k8s_client = client.ApiClient()
+        self.custom_api = client.CustomObjectsApi(self.k8s_client)
+        self.core_api = client.CoreV1Api(self.k8s_client)
+    def get_service_url(self) -> str:
+        """
+        Get the service URL using Kubernetes service DNS.
+        Returns:
+            http://{service_name}.{namespace}.svc.cluster.local:{frontend_port}
+        """
+        service_url = f"http://{self.service_name}.{self.namespace}.svc.cluster.local:{self.frontend_port}"
+        print(f"Using service URL: {service_url}")
+        return service_url
+    async def create_deployment(self, deployment: Union[dict, str]):
+        """
+        Create a DynamoGraphDeployment from either a dict or yaml file path.
+        The spec's metadata name and namespace are overwritten with this
+        client's values; a dict argument is modified in place.
+        Args:
+            deployment: Either a dict containing the deployment spec or a path to a yaml file
+        Raises:
+            kubernetes.client.rest.ApiException: For any API error other than
+                409 (already exists), which is only reported.
+        """
+        await self._init_kubernetes()
+        if isinstance(deployment, str):
+            # Load from yaml file
+            async with aiofiles.open(deployment, "r") as f:
+                content = await f.read()
+            self.deployment_spec = yaml.safe_load(content)
+        else:
+            self.deployment_spec = deployment
+        # Extract component names
+        self.components = [
+            svc.lower() for svc in self.deployment_spec["spec"]["services"].keys()
+        ]
+        # Ensure name and namespace are set correctly
+        self.deployment_spec["metadata"]["name"] = self.deployment_name
+        self.deployment_spec["metadata"]["namespace"] = self.namespace
+        try:
+            await self.custom_api.create_namespaced_custom_object(
+                group="nvidia.com",
+                version="v1alpha1",
+                namespace=self.namespace,
+                plural="dynamographdeployments",
+                body=self.deployment_spec,
+            )
+            print(f"Successfully created deployment {self.deployment_name}")
+        except kubernetes.client.rest.ApiException as e:
+            if e.status == 409:  # Already exists
+                print(f"Deployment {self.deployment_name} already exists")
+            else:
+                print(f"Failed to create deployment {self.deployment_name}: {e}")
+                raise
+    async def wait_for_deployment_ready(self, timeout: int = 1800):
+        """
+        Wait for the custom resource to be ready, polling every 20 seconds.
+        Errors raised while polling are printed and polling continues.
+        Args:
+            timeout: Maximum time to wait in seconds, default to 30 mins (image pulling can take a while)
+        Returns:
+            True once the Ready condition is "True" and the state is "successful".
+        Raises:
+            TimeoutError: If the deployment is not ready within the timeout.
+        """
+        start_time = time.time()
+        # TODO: A little brittle, also should output intermediate status every so often.
+        while (time.time() - start_time) < timeout:
+            try:
+                status = await self.custom_api.get_namespaced_custom_object(
+                    group="nvidia.com",
+                    version="v1alpha1",
+                    namespace=self.namespace,
+                    plural="dynamographdeployments",
+                    name=self.deployment_name,
+                )
+                # Check both conditions:
+                # 1. Ready condition is True
+                # 2. State is successful
+                status_obj = status.get("status", {})
+                conditions = status_obj.get("conditions", [])
+                current_state = status_obj.get("state", "unknown")
+                print(f"Current deployment state: {current_state}")
+                print(f"Current conditions: {conditions}")
+                print(f"Elapsed time: {time.time() - start_time:.1f}s / {timeout}s")
+                ready_condition = any(
+                    condition.get("type") == "Ready"
+                    and condition.get("status") == "True"
+                    for condition in conditions
+                )
+                state_successful = current_state == "successful"
+                if ready_condition and state_successful:
+                    print(
+                        "Deployment is ready: Ready condition is True and state is successful"
+                    )
+                    return True
+                else:
+                    print(
+                        f"Deployment not ready yet - Ready condition: {ready_condition}, State successful: {state_successful}"
+                    )
+            except kubernetes.client.rest.ApiException as e:
+                print(f"API Exception while checking deployment status: {e}")
+                print(f"Status code: {e.status}, Reason: {e.reason}")
+            except Exception as e:
+                print(f"Unexpected exception while checking deployment status: {e}")
+            await asyncio.sleep(20)
+        raise TimeoutError("Deployment failed to become ready within timeout")
+    async def check_chat_completion(self):
+        """
+        Test the deployment with a chat completion request using httpx.
+        Returns:
+            The raw response body text.
+        Raises:
+            httpx.HTTPStatusError: If the service answers with an error status.
+        """
+        # Build a per-call payload so the shared module-level template is not mutated
+        request_body = {**EXAMPLE_CHAT_REQUEST, "model": self.model_name}
+        base_url = self.get_service_url()
+        url = f"{base_url}/v1/chat/completions"
+        # Named http_client so the imported kubernetes `client` module is not shadowed
+        async with httpx.AsyncClient() as http_client:
+            response = await http_client.post(url, json=request_body)
+            response.raise_for_status()
+            return response.text
+    async def get_deployment_logs(self):
+        """
+        Get logs from all pods in the deployment, organized by component.
+        Logs are written to {base_log_dir}/{deployment_name}/{component}/{i}.log,
+        where i is the pod's index in the list returned for that component.
+        API errors for a single pod's logs are printed and that pod is skipped.
+        """
+        # Create logs directory
+        base_dir = self.base_log_dir / self.deployment_name
+        base_dir.mkdir(parents=True, exist_ok=True)
+        for component in self.components:
+            component_dir = base_dir / component
+            component_dir.mkdir(exist_ok=True)
+            # List pods for this component using the selector label
+            # nvidia.com/selector: deployment-name-component
+            label_selector = (
+                f"nvidia.com/selector={self.deployment_name}-{component.lower()}"
+            )
+            pods = await self.core_api.list_namespaced_pod(
+                namespace=self.namespace, label_selector=label_selector
+            )
+            # Get logs for each pod
+            for i, pod in enumerate(pods.items):
+                try:
+                    logs = await self.core_api.read_namespaced_pod_log(
+                        name=pod.metadata.name, namespace=self.namespace
+                    )
+                    async with aiofiles.open(component_dir / f"{i}.log", "w") as f:
+                        await f.write(logs)
+                except kubernetes.client.rest.ApiException as e:
+                    print(f"Error getting logs for pod {pod.metadata.name}: {e}")
+    async def delete_deployment(self):
+        """
+        Delete the DynamoGraphDeployment CR.
+        Raises:
+            kubernetes.client.rest.ApiException: For any API error other than
+                404 (already deleted), which is ignored.
+        """
+        try:
+            await self.custom_api.delete_namespaced_custom_object(
+                group="nvidia.com",
+                version="v1alpha1",
+                namespace=self.namespace,
+                plural="dynamographdeployments",
+                name=self.deployment_name,
+            )
+        except kubernetes.client.rest.ApiException as e:
+            if e.status != 404:  # Ignore if already deleted
+                raise
+async def cleanup_remaining_deployments(deployment_clients, namespace):
+    """
+    Best-effort deletion of every tracked deployment client.
+    Each client's delete_deployment() is awaited in turn. Failures never
+    propagate: errors whose text mentions 404 / "not found" are logged as
+    already deleted, anything else is logged as an error.
+    Args:
+        deployment_clients: Clients exposing deployment_name and an async delete_deployment()
+        namespace: Not used by this function
+    """
+    import logging
+    log = logging.getLogger(__name__)
+    if deployment_clients:
+        log.info(f"Cleaning up {len(deployment_clients)} remaining deployments...")
+    else:
+        log.info("No deployments to clean up")
+        return
+    for tracked in deployment_clients:
+        try:
+            log.info(f"Attempting to delete deployment {tracked.deployment_name}...")
+            await tracked.delete_deployment()
+            log.info(f"Successfully deleted deployment {tracked.deployment_name}")
+        except Exception as exc:
+            # If deployment doesn't exist (404), that's fine - it was already cleaned up
+            already_gone = "404" in str(exc) or "not found" in str(exc).lower()
+            if not already_gone:
+                log.error(
+                    f"Failed to delete deployment {tracked.deployment_name}: {exc}"
+                )
+                continue
+            log.info(f"Deployment {tracked.deployment_name} was already deleted")
+async def main():
+    """
+    Command-line smoke test for DynamoDeploymentClient.
+    Creates the deployment from the given YAML file, waits for readiness,
+    sends one chat completion request, saves the pod logs, and always
+    attempts to delete the deployment afterwards.
+    """
+    parser = argparse.ArgumentParser(
+        description="Deploy and manage DynamoGraphDeployment CRDs"
+    )
+    parser.add_argument(
+        "--namespace",
+        "-n",
+        required=True,
+        # The argument is required, so no default is advertised
+        help="Kubernetes namespace to deploy to",
+    )
+    parser.add_argument(
+        "--yaml-file",
+        "-f",
+        required=True,
+        help="Path to the DynamoGraphDeployment YAML file",
+    )
+    parser.add_argument(
+        "--log-dir",
+        "-l",
+        default="/tmp/dynamo_logs",
+        help="Base directory for logs (default: /tmp/dynamo_logs)",
+    )
+    parser.add_argument(
+        "--service-name",
+        "-s",
+        help="Service name for connecting to the service (default: {deployment_name}-frontend)",
+    )
+    args = parser.parse_args()
+    # Example usage with parsed arguments.
+    # Named deployment_client so the imported kubernetes `client` module is not shadowed.
+    deployment_client = DynamoDeploymentClient(
+        namespace=args.namespace,
+        base_log_dir=args.log_dir,
+        service_name=args.service_name,
+    )
+    try:
+        # Create deployment from yaml file
+        await deployment_client.create_deployment(args.yaml_file)
+        # Wait for deployment to be ready
+        print("Waiting for deployment to be ready...")
+        await deployment_client.wait_for_deployment_ready()
+        print("Deployment is ready!")
+        # Test chat completion
+        print("Testing chat completion...")
+        response = await deployment_client.check_chat_completion()
+        print(f"Chat completion response: {response}")
+        # Get logs
+        print("Getting deployment logs...")
+        await deployment_client.get_deployment_logs()
+        print(
+            f"Logs have been saved to {deployment_client.base_log_dir / deployment_client.deployment_name}!"
+        )
+    finally:
+        # Cleanup runs whether or not the steps above succeeded
+        print("Cleaning up deployment...")
+        await deployment_client.delete_deployment()
+        print("Deployment deleted!")
+# Script entry point. Example invocation (namespace, YAML path and log
+# directory are illustrative values):
+# uv run benchmarks/profiler/utils/dynamo_deployment.py -n mo-dyn-cloud -f ./examples/vllm/deploy/agg.yaml -l ./client_logs
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/benchmarks/profiler/utils/genai_perf.py
+++ b/benchmarks/profiler/utils/genai_perf.py
@@ -34,7 +34,7 @@ def _get_common_genai_perf_cmd(
    artifact_dir,
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-    port=8000,
+    base_url="http://localhost:8000",
 ):
    return [
        "genai-perf",
@@ -49,7 +49,7 @@ def _get_common_genai_perf_cmd(
        "/v1/chat/completions",
        "--streaming",
        "--url",
-        f"http://localhost:{port}",
+        base_url,
        "--extra-inputs",
        "ignore_eos:true",
        "--extra-inputs",
@@ -69,13 +69,13 @@ def get_prefill_genai_perf_cmd(
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    osl=5,
-    port=8000,
+    base_url="http://localhost:8000",
 ):
    return _get_common_genai_perf_cmd(
        artifact_dir,
        seed,
        model,
-        port,
+        base_url,
    ) + [
        "--synthetic-input-tokens-mean",
        str(isl),
@@ -103,13 +103,13 @@ def get_decode_genai_perf_cmd(
    num_request,
    seed=100,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
-    port=8000,
+    base_url="http://localhost:8000",
 ):
    return _get_common_genai_perf_cmd(
        artifact_dir,
        seed,
        model,
-        port,
+        base_url,
    ) + [
        "--synthetic-input-tokens-mean",
        str(isl),
@@ -146,11 +146,15 @@ def get_gap_result(artifact_dir: str) -> dict:
        return json.load(f)
-def benchmark_prefill(isl, genai_perf_artifact_dir, model_name, port):
+def benchmark_prefill(
+    isl, genai_perf_artifact_dir, model_name, base_url="http://localhost:8000"
+):
    logger.info(f"Running genai-perf with isl {isl}")
    genai_perf_cmd = get_prefill_genai_perf_cmd(
-        isl, genai_perf_artifact_dir, model=model_name, port=port
+        isl, genai_perf_artifact_dir, model=model_name, base_url=base_url
    )
+    print(f"genai-perf cmd: {genai_perf_cmd}")
+    # import pdb; pdb.set_trace()
    gap_process = subprocess.Popen(
        genai_perf_cmd,
        stdout=subprocess.PIPE,
@@ -169,12 +173,20 @@ def benchmark_prefill(isl, genai_perf_artifact_dir, model_name, port):
        return None
-def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name, port):
+def benchmark_decode(
+    isl,
+    osl,
+    num_request,
+    genai_perf_artifact_dir,
+    model_name,
+    base_url="http://localhost:8000",
+):
    logger.info(f"Profiling decode with num_request {num_request}...")
    # first warm-up the engine by pre-computing all prefill tokens
    # we use the same random seed to make sure the prompt is the same
    seed = random.randint(0, 1000000)
    genai_perf_cmd = get_decode_genai_perf_cmd(
        isl,
        osl,
@@ -182,7 +194,7 @@ def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name,
        num_request,
        seed=seed,
        model=model_name,
-        port=port,
+        base_url=base_url,
    )
    gap_process = subprocess.Popen(
        genai_perf_cmd,
@@ -199,7 +211,7 @@ def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name,
        num_request,
        seed=seed,
        model=model_name,
-        port=port,
+        base_url=base_url,
    )
    gap_process = subprocess.Popen(
        genai_perf_cmd,

--- a/benchmarks/profiler/utils/profile_cache.py
+++ b/benchmarks/profiler/utils/profile_cache.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import json
+import logging
+import os
+import re
+from typing import List, Optional, Tuple
+logger = logging.getLogger(__name__)
+def check_prefill_results_exist(output_dir: str, tp_size: int, isl: int) -> bool:
+    """Check whether usable prefill results already exist for a given TP size.
+
+    Looks for
+    ``{output_dir}/prefill_tp{tp_size}/gap_isl{isl}/*/profile_export_genai_perf.json``
+    and inspects the first match returned by ``glob``.
+
+    Args:
+        output_dir: Root directory that holds the per-configuration work directories.
+        tp_size: TP size used to name the work directory.
+        isl: Input sequence length used to name the genai-perf artifact directory.
+
+    Returns:
+        True if the inspected result file is readable JSON whose
+        ``time_to_first_token`` entry is a mapping containing ``avg``.
+        False in every other case (missing directory, no result file,
+        unreadable file, invalid JSON, or unexpected JSON shape).
+    """
+    work_dir = f"{output_dir}/prefill_tp{tp_size}"
+    result_file = f"{work_dir}/gap_isl{isl}/*/profile_export_genai_perf.json"
+    # Check if the work directory exists
+    if not os.path.exists(work_dir):
+        return False
+    # Look for the genai-perf result file
+    result_files = glob.glob(result_file)
+    if not result_files:
+        return False
+    # Verify the result file has valid data
+    try:
+        with open(result_files[0], "r") as f:
+            data = json.load(f)
+    except (OSError, ValueError):
+        # OSError covers missing/unreadable files; ValueError covers both
+        # json.JSONDecodeError and UnicodeDecodeError. Either way the entry
+        # is unusable, so it is reported as "no existing results".
+        return False
+    # Guard the shape explicitly: a null or non-mapping metric must yield
+    # False rather than raising TypeError from the ``in`` test.
+    ttft_stats = data.get("time_to_first_token") if isinstance(data, dict) else None
+    if isinstance(ttft_stats, dict) and "avg" in ttft_stats:
+        logger.info(
+            f"Found existing prefill results for TP{tp_size} at {result_files[0]}"
+        )
+        return True
+    return False
+def check_decode_results_exist(
+    output_dir: str, tp_size: int, isl: int, osl: int
+) -> bool:
+    """Check whether usable decode results already exist for a given TP size.
+
+    Scans every file matching
+    ``{output_dir}/decode_tp{tp_size}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json``
+    and succeeds as soon as one of them contains ``inter_token_latency.avg``.
+
+    Args:
+        output_dir: Root directory that holds the per-configuration work directories.
+        tp_size: TP size used to name the work directory.
+        isl: Input sequence length embedded in the artifact directory name.
+        osl: Output sequence length embedded in the artifact directory name.
+
+    Returns:
+        True if at least one matching result file is readable JSON whose
+        ``inter_token_latency`` entry is a mapping containing ``avg``;
+        False otherwise.
+    """
+    work_dir = f"{output_dir}/decode_tp{tp_size}"
+    # Check if the work directory exists
+    if not os.path.exists(work_dir):
+        return False
+    # Look for at least one decode result file
+    result_pattern = (
+        f"{work_dir}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json"
+    )
+    result_files = glob.glob(result_pattern)
+    if not result_files:
+        return False
+    # Verify at least one result file has valid data. Every candidate is
+    # tried so that a single corrupt file does not hide valid siblings.
+    for candidate in result_files:
+        try:
+            with open(candidate, "r") as f:
+                data = json.load(f)
+        except (OSError, ValueError):
+            # Unreadable file or invalid JSON/encoding: try the next one.
+            continue
+        # A null or non-mapping metric must be skipped, not raise TypeError.
+        itl_stats = data.get("inter_token_latency") if isinstance(data, dict) else None
+        if isinstance(itl_stats, dict) and "avg" in itl_stats:
+            logger.info(
+                f"Found existing decode results for TP{tp_size} at {candidate} (and {len(result_files)-1} others)"
+            )
+            return True
+    return False
+def load_existing_prefill_results(
+    output_dir: str, tp_size: int, isl: int
+) -> Tuple[Optional[float], Optional[float]]:
+    """Load existing prefill results from disk.
+
+    Reads the first file matching
+    ``{output_dir}/prefill_tp{tp_size}/gap_isl{isl}/*/profile_export_genai_perf.json``.
+
+    Args:
+        output_dir: Root directory that holds the per-configuration work directories.
+        tp_size: TP size; names the work directory and divides the throughput.
+        isl: Input sequence length; names the artifact directory and is the
+            numerator of the throughput computation.
+
+    Returns:
+        ``(ttft, thpt_per_gpu)`` where ``ttft`` is ``time_to_first_token.avg``
+        from the result file and ``thpt_per_gpu = isl / ttft / tp_size * 1000``.
+        ``(None, None)`` when no file matches or the file is unreadable,
+        malformed, or yields an unusable value (e.g. a zero or null ``avg``).
+    """
+    work_dir = f"{output_dir}/prefill_tp{tp_size}"
+    result_file = f"{work_dir}/gap_isl{isl}/*/profile_export_genai_perf.json"
+    result_files = glob.glob(result_file)
+    if not result_files:
+        return None, None
+    try:
+        with open(result_files[0], "r") as f:
+            data = json.load(f)
+        ttft = data["time_to_first_token"]["avg"]
+        # NOTE(review): the * 1000 suggests ttft is in milliseconds, giving
+        # tokens per second per GPU -- confirm against the genai-perf export.
+        thpt_per_gpu = isl / ttft / tp_size * 1000
+    except (OSError, ValueError, KeyError, TypeError, ZeroDivisionError):
+        # OSError/ValueError: unreadable or non-JSON file.
+        # KeyError/TypeError: metric missing, null, or of an unexpected shape.
+        # ZeroDivisionError: an ``avg`` (or tp_size) of 0 cannot give a throughput.
+        return None, None
+    return ttft, thpt_per_gpu
+def load_existing_decode_results(
+    output_dir: str, tp_size: int, isl: int, osl: int
+) -> List[Tuple[float, float, int]]:
+    """Load existing decode results from disk.
+
+    Reads every file matching
+    ``{output_dir}/decode_tp{tp_size}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json``.
+
+    Args:
+        output_dir: Root directory that holds the per-configuration work directories.
+        tp_size: TP size; names the work directory and divides the throughput.
+        isl: Input sequence length embedded in the artifact directory name.
+        osl: Output sequence length embedded in the artifact directory name.
+
+    Returns:
+        A list of ``(itl, thpt_per_gpu, concurrency)`` tuples, one per usable
+        result file, in the order ``glob`` returns them. ``itl`` is
+        ``inter_token_latency.avg``, ``thpt_per_gpu`` is
+        ``output_token_throughput.avg / tp_size`` and ``concurrency`` is the
+        number following ``gap_request`` in the artifact directory name.
+        Unreadable or malformed files are skipped.
+    """
+    work_dir = f"{output_dir}/decode_tp{tp_size}"
+    result_pattern = (
+        f"{work_dir}/gap_request*_isl{isl}_osl{osl}_n*/*/profile_export_genai_perf.json"
+    )
+    result_files = glob.glob(result_pattern)
+    decode_results = []
+    for result_file in result_files:
+        try:
+            with open(result_file, "r") as f:
+                data = json.load(f)
+            itl = data["inter_token_latency"]["avg"]
+            thpt_per_gpu = data["output_token_throughput"]["avg"] / tp_size
+        except (OSError, ValueError, KeyError, TypeError):
+            # Unreadable file, invalid JSON, or missing/null/non-numeric metric.
+            continue
+        # Extract concurrency from the artifact directory name only (two levels
+        # above the JSON file). Searching the whole path could pick up a
+        # "gap_request<N>_" fragment that happens to appear in output_dir.
+        artifact_dir_name = os.path.basename(
+            os.path.dirname(os.path.dirname(result_file))
+        )
+        match = re.match(r"gap_request(\d+)_", artifact_dir_name)
+        if match:
+            concurrency = int(match.group(1))
+            decode_results.append((itl, thpt_per_gpu, concurrency))
+    return decode_results
--- a/components/backends/vllm/README.md
+++ b/components/backends/vllm/README.md
@@ -138,6 +138,17 @@ cd ~/dynamo/components/backends/vllm/deploy
 kubectl apply -f disagg.yaml
 ```
+To change the `DYN_LOG` level, edit the YAML file and add the following:
+```yaml
+...
+spec:
+  envs:
+    - name: DYN_LOG
+      value: "debug" # or other log levels
+  ...
+```
 ### Testing the Deployment
 Send a test request to verify your deployment:

--- a/components/backends/vllm/deploy/agg.yaml
+++ b/components/backends/vllm/deploy/agg.yaml
@@ -4,7 +4,7 @@
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: vllm-v1-agg
+  name: vllm-agg
 spec:
  services:
    Frontend:
@@ -26,7 +26,7 @@ spec:
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
-      dynamoNamespace: vllm-v1-agg
+      dynamoNamespace: vllm-agg
      componentType: main
      replicas: 1
      resources:
@@ -38,7 +38,7 @@ spec:
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 -m dynamo.frontend --http-port 8000"
@@ -63,7 +63,7 @@ spec:
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
-      dynamoNamespace: vllm-v1-agg
+      dynamoNamespace: vllm-agg
      componentType: worker
      replicas: 1
      resources:
@@ -77,7 +77,7 @@ spec:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh

--- a/components/backends/vllm/deploy/agg_router.yaml
+++ b/components/backends/vllm/deploy/agg_router.yaml
@@ -4,7 +4,7 @@
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: vllm-v1-agg-router
+  name: vllm-agg-router
 spec:
  services:
    Frontend:
@@ -26,7 +26,7 @@ spec:
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
-      dynamoNamespace: vllm-v1-agg-router
+      dynamoNamespace: vllm-agg-router
      componentType: main
      replicas: 1
      resources:
@@ -38,7 +38,7 @@ spec:
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 -m dynamo.frontend --http-port 8000 --router-mode kv"
@@ -63,7 +63,7 @@ spec:
        periodSeconds: 60
        timeoutSeconds: 30
        failureThreshold: 10
-      dynamoNamespace: vllm-v1-agg-router
+      dynamoNamespace: vllm-agg-router
      componentType: worker
      replicas: 2
      resources:
@@ -77,7 +77,7 @@ spec:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh

--- a/components/backends/vllm/deploy/disagg.yaml
+++ b/components/backends/vllm/deploy/disagg.yaml
@@ -4,11 +4,11 @@
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: vllm-v1-disagg
+  name: vllm-disagg
 spec:
  services:
    Frontend:
-      dynamoNamespace: vllm-v1-disagg
+      dynamoNamespace: vllm-disagg
      componentType: main
      replicas: 1
      livenessProbe:
@@ -31,19 +31,19 @@ spec:
        failureThreshold: 10
      resources:
        requests:
-          cpu: "1"
+          cpu: "32"
-          memory: "2Gi"
+          memory: "10Gi"
        limits:
-          cpu: "1"
+          cpu: "32"
-          memory: "2Gi"
+          memory: "10Gi"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 -m dynamo.frontend --http-port 8000"
    VllmDecodeWorker:
-      dynamoNamespace: vllm-v1-disagg
+      dynamoNamespace: vllm-disagg
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
@@ -68,16 +68,16 @@ spec:
        failureThreshold: 10
      resources:
        requests:
-          cpu: "10"
+          cpu: "32"
-          memory: "20Gi"
+          memory: "40Gi"
          gpu: "1"
        limits:
-          cpu: "10"
+          cpu: "32"
-          memory: "20Gi"
+          memory: "40Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
@@ -85,7 +85,7 @@ spec:
          args:
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
    VllmPrefillWorker:
-      dynamoNamespace: vllm-v1-disagg
+      dynamoNamespace: vllm-disagg
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
@@ -110,16 +110,16 @@ spec:
        failureThreshold: 10
      resources:
        requests:
-          cpu: "10"
+          cpu: "32"
-          memory: "20Gi"
+          memory: "40Gi"
          gpu: "1"
        limits:
-          cpu: "10"
+          cpu: "32"
-          memory: "20Gi"
+          memory: "40Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh

--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -4,11 +4,11 @@
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
-  name: vllm-v1-disagg-planner
+  name: vllm-disagg-planner
 spec:
  services:
    Frontend:
-      dynamoNamespace: vllm-v1-disagg-planner
+      dynamoNamespace: vllm-disagg-planner
      componentType: main
      replicas: 1
      livenessProbe:
@@ -38,12 +38,12 @@ spec:
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 -m dynamo.frontend --http-port 8000"
    VllmDecodeWorker:
-      dynamoNamespace: vllm-v1-disagg-planner
+      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
@@ -77,12 +77,12 @@ spec:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
    VllmPrefillWorker:
-      dynamoNamespace: vllm-v1-disagg-planner
+      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
      replicas: 1
@@ -116,7 +116,7 @@ spec:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh

--- a/components/backends/vllm/deploy/disagg_router.yaml
+++ b/components/backends/vllm/deploy/disagg_router.yaml
@@ -38,7 +38,7 @@ spec:
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          args:
            - "python3 -m dynamo.frontend --http-port 8000 --router-mode kv"
@@ -77,7 +77,7 @@ spec:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
@@ -119,7 +119,7 @@ spec:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh

--- a/components/planner/README.md
+++ b/components/planner/README.md
@@ -34,7 +34,7 @@ PYTHONPATH=/workspace/examples/llm python components/planner.py --namespace <nam
 ## Local Backend (LocalPlanner)
-The LocalPlanner is built on top of circus, which is what we use to manage component subprocesses when running dynamo serve. LocalPlanner allows the planner component to scale workers up and down based on system metrics.
+The LocalPlanner is built on top of circus, which is what we use to manage component subprocesses when running the frontend and workers. LocalPlanner allows the planner component to scale workers up and down based on system metrics.
 **Current limitations**
 1. Single node only
@@ -78,7 +78,7 @@ The planner architecture is designed to be simple and extensible:
 ### Statefile
-The statefile maintains the current state of all running workers and is used by the LocalPlanner to track and modify the deployment. It's stored at `~/.dynamo/state/{namespace}.json` (or in the directory specified by `DYN_LOCAL_STATE_DIR`). The statefile is automatically created when you run dynamo serve and is cleaned up when the arbiter terminates. Each worker is identified as `{namespace}_{component_name}` with an optional numeric suffix for additional instances.
+The statefile maintains the current state of all running workers and is used by the LocalPlanner to track and modify the deployment. It's stored at `~/.dynamo/state/{namespace}.json` (or in the directory specified by `DYN_LOCAL_STATE_DIR`). The statefile is automatically created when you run the frontend with workers and is cleaned up when the arbiter terminates. Each worker is identified as `{namespace}_{component_name}` with an optional numeric suffix for additional instances.
 #### Example: Adding and Removing Workers

--- a/components/planner/src/dynamo/planner/kubernetes_connector.py
+++ b/components/planner/src/dynamo/planner/kubernetes_connector.py
@@ -77,3 +77,22 @@ class KubernetesConnector(PlannerConnector):
    def _get_graph_deployment_name(self, deployment: dict) -> str:
        """Get the name of the graph deployment"""
        return deployment["metadata"]["name"]
+if __name__ == "__main__":
+    # Manual command-line entry point: add or remove one component through
+    # KubernetesConnector in the given namespace.
+    import argparse
+    import asyncio
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--namespace", type=str, default="dynamo")
+    # --action is required: without it no task would be created below and the
+    # script would fail with NameError instead of a usage message.
+    parser.add_argument(
+        "--action", type=str, choices=["add", "remove"], required=True
+    )
+    parser.add_argument("--component", type=str, default="planner")
+    parser.add_argument("--blocking", action="store_true")
+    args = parser.parse_args()
+    connector = KubernetesConnector(args.namespace)
+    if args.action == "add":
+        task = connector.add_component(args.component, args.blocking)
+    else:
+        # argparse restricts --action to "add" or "remove", so this is "remove".
+        task = connector.remove_component(args.component, args.blocking)
+    asyncio.run(task)
--- a/container/deps/requirements.txt
+++ b/container/deps/requirements.txt
@@ -14,13 +14,16 @@
 # limitations under the License.
 accelerate==1.6.0
+aiofiles
 av==15.0.0
 fastapi==0.115.6
 ftfy
 genai-perf==0.0.15
 grpcio-tools==1.66.0
 httpx
+kr8s
 kubernetes==32.0.1
+kubernetes_asyncio
 matplotlib
 msgspec
 mypy
@@ -43,5 +46,6 @@ sentencepiece
 tensorboard==2.19.0
 tensorboardX==2.6.2.2
 transformers
+types-aiofiles
 types-PyYAML
 uvicorn