feat: add benchmarking guide (#2620)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

feat: add benchmarking guide (#2620)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
699996e4 · hhzhang16 · GitHub · 3c4adde5 · 699996e4 · 699996e4
Unverified Commit 699996e4 authored Aug 29, 2025 by hhzhang16 Committed by GitHub Aug 29, 2025
19 changed files
--- a/deploy/utils/__init__.py
+++ b/deploy/utils/__init__.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Package marker for deploy.utils
--- a/benchmarks/profiler/download_pvc_results.py
+++ b/benchmarks/profiler/download_pvc_results.py
@@ -16,42 +16,51 @@
 # limitations under the License.

 """
-PVC Results Download Script
+PVC Results Download Script (generic)

-This script downloads all relevant profiling results from the profiling PVC to a local directory.
-It creates the necessary access pod, downloads the files, and cleans up automatically.
+Downloads files from a specified folder path inside a Kubernetes PVC into a local directory.
+Creates an access pod, copies files, and exits. You can optionally exclude YAML configs.

 Usage:
-    python3 download_pvc_results.py --namespace <namespace> --output-dir <local_directory> [--no-config]
-
-Examples:
-    # Download to ./results directory
-    python3 download_pvc_results.py --namespace <namespace> --output-dir ./results
-
-    # Download to specific directory
-    python3 download_pvc_results.py --namespace <namespace> --output-dir /home/user/profiling_data
-
-    # Download without configuration files
-    python3 download_pvc_results.py --namespace <namespace> --output-dir ./results --no-config
+    python3 download_pvc_results.py --namespace <namespace> --output-dir <local_directory> \
+        --folder </absolute/folder/in/pvc> [--no-config]
 """

 import argparse
 import subprocess
 import sys
-import time
 from pathlib import Path
 from typing import List

-from utils.kubernetes import check_kubectl_access, deploy_access_pod, run_command
+try:
+    from deploy.utils.kubernetes import (
+        check_kubectl_access,
+        cleanup_access_pod,
+        deploy_access_pod,
+        run_command,
+    )
+except ModuleNotFoundError:
+    # Allow running as a script: add repo root to sys.path
+    repo_root = Path(__file__).resolve().parents[2]
+    sys.path.insert(0, str(repo_root))
+    from deploy.utils.kubernetes import (
+        check_kubectl_access,
+        cleanup_access_pod,
+        deploy_access_pod,
+        run_command,
+    )


 def list_pvc_contents(
-    namespace: str, pod_name: str, skip_config: bool = False
+    namespace: str, pod_name: str, base_folder: str, skip_config: bool = False
 ) -> List[str]:
-    """List contents of the PVC to identify relevant files."""
+    """List contents of the PVC to identify files.
+
+    Lists all files under base_folder. If skip_config is True, excludes *.yaml and *.yml.
+    """
    print("Scanning PVC contents...")

-    # Build find command with optional config file exclusion
+    # Build find command: all files
    find_cmd = [
        "kubectl",
        "exec",
@@ -60,44 +69,28 @@ def list_pvc_contents(
        namespace,
        "--",
        "find",
-        "/profiling_results",
+        base_folder,
        "-type",
        "f",
-        "-name",
-        "*.png",
-        "-o",
-        "-name",
-        "*.npz",
    ]

-    # Add config file patterns if not skipping them
-    if not skip_config:
-        find_cmd.extend(
-            [
-                "-o",
-                "-name",
-                "*.yaml",
-                "-o",
-                "-name",
-                "*.yml",
-            ]
-        )
+    # Exclude YAML files when requested
+    if skip_config:
+        find_cmd.extend(["-not", "-name", "*.yaml", "-not", "-name", "*.yml"])

    try:
        result = run_command(find_cmd, capture_output=True)
-
        files = [f.strip() for f in result.stdout.split("\n") if f.strip()]
        config_note = " (excluding config files)" if skip_config else ""
-        print(f"Found {len(files)} relevant files to download{config_note}")
+        print(f"Found {len(files)} files to download{config_note}")
        return files
-
    except subprocess.CalledProcessError:
        print("ERROR: Failed to list PVC contents")
        sys.exit(1)


 def download_files(
-    namespace: str, pod_name: str, files: List[str], output_dir: Path
+    namespace: str, pod_name: str, files: List[str], output_dir: Path, base_folder: str
 ) -> None:
    """Download relevant files from PVC to local directory."""
    if not files:
@@ -113,8 +106,13 @@ def download_files(

    for file_path in files:
        try:
-            # Determine relative path and create local structure
-            rel_path = file_path.replace("/profiling_results/", "")
+            # Determine relative path and create local structure based on base_folder
+            prefix = base_folder.rstrip("/") + "/"
+            rel_path = (
+                file_path[len(prefix) :]
+                if file_path.startswith(prefix)
+                else file_path.lstrip("/")
+            )

            # Validate relative path
            if ".." in rel_path or rel_path.startswith("/"):
@@ -154,143 +152,6 @@ def download_files(
    print(f"✓ Download completed: {downloaded} successful, {failed} failed")


-def download_summary_files(
-    namespace: str, pod_name: str, output_dir: Path, skip_config: bool = False
-) -> None:
-    """Download key summary files that might not match the pattern."""
-    summary_files = [
-        "/profiling_results/prefill_performance.png",
-        "/profiling_results/decode_performance.png",
-    ]
-
-    # Add config files if not skipping them
-    if not skip_config:
-        summary_files.append(
-            "/profiling_results/disagg.yaml"
-        )  # In case it was injected
-
-    print("Downloading summary files...")
-
-    for file_path in summary_files:
-        try:
-            # Check if file exists first using subprocess.run directly
-            result = subprocess.run(
-                [
-                    "kubectl",
-                    "exec",
-                    pod_name,
-                    "-n",
-                    namespace,
-                    "--",
-                    "test",
-                    "-f",
-                    file_path,
-                ],
-                capture_output=True,
-                text=True,
-                check=False,
-            )
-
-            if result.returncode != 0:
-                # File doesn't exist, skip silently
-                continue
-
-            # File exists, download it
-            rel_path = file_path.replace("/profiling_results/", "")
-
-            # Validate relative path
-            if ".." in rel_path or rel_path.startswith("/"):
-                print(
-                    f"  ⚠️  Skipped {file_path.split('/')[-1]}: potentially unsafe path"
-                )
-                continue
-
-            local_file = output_dir / rel_path
-
-            # Ensure the file is within output_dir
-            if not local_file.resolve().is_relative_to(output_dir.resolve()):
-                print(
-                    f"  ⚠️  Skipped {file_path.split('/')[-1]}: outside output directory"
-                )
-                continue
-
-            local_file.parent.mkdir(parents=True, exist_ok=True)
-
-            run_command(
-                [
-                    "kubectl",
-                    "cp",
-                    f"{namespace}/{pod_name}:{file_path}",
-                    str(local_file),
-                ],
-                capture_output=True,
-            )
-
-            print(f"  ✓ {rel_path}")
-
-        except Exception as e:
-            # File doesn't exist or failed to download, skip silently
-            print(f"  ⚠️  Skipped {file_path.split('/')[-1]}: {e}")
-            pass
-
-
-def cleanup_access_pod(namespace: str, pod_name: str) -> None:
-    """Clean up the access pod (let it auto-delete via activeDeadlineSeconds)."""
-    print(f"ℹ️  Access pod '{pod_name}' will auto-delete in 5 minutes")
-    print(f"   To delete immediately: kubectl delete pod {pod_name} -n {namespace}")
-
-
-def generate_readme(output_dir: Path, file_count: int) -> None:
-    """Generate a README file explaining the downloaded contents."""
-    readme_content = f"""# Profiling Results
-
-Downloaded {file_count} files from profiling PVC.
-
-## File Structure
-
-### Performance Plots
- `prefill_performance.png` - Main prefill performance across TP sizes
- `decode_performance.png` - Main decode performance across TP sizes
-
-### Interpolation Data
- `selected_prefill_interpolation/raw_data.npz` - Prefill performance data
- `selected_prefill_interpolation/*.png` - Prefill interpolation plots
- `selected_decode_interpolation/raw_data.npz` - Decode performance data
- `selected_decode_interpolation/*.png` - Decode interpolation plots
-
-### Configuration Files
- `disagg.yaml` - DynamoGraphDeployment configuration used for profiling
-
-### Individual TP Results
- `prefill_tp*/` - Individual tensor parallelism profiling results
- `decode_tp*/` - Individual tensor parallelism profiling results
-
-## Loading Data
-
-To load the .npz data files in Python:
-
-```python
-import numpy as np
-
-# Load prefill data
-prefill_data = np.load('selected_prefill_interpolation/raw_data.npz')
-print("Prefill data keys:", list(prefill_data.keys()))
-
-# Load decode data
-decode_data = np.load('selected_decode_interpolation/raw_data.npz')
-print("Decode data keys:", list(decode_data.keys()))
-```
-
-Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}
-"""
-
-    readme_path = output_dir / "README.md"
-    with open(readme_path, "w") as f:
-        f.write(readme_content)
-
-    print("📝 Generated README.md with download summary")
-
-
 def main():
    parser = argparse.ArgumentParser(
        description="Download profiling results from PVC to local directory",
@@ -318,6 +179,11 @@ def main():
        action="store_true",
        help="Skip downloading configuration files (*.yaml, *.yml)",
    )
+    parser.add_argument(
+        "--folder",
+        required=True,
+        help="Absolute folder path in the PVC to download, e.g. /profiling_results or /benchmarking_results",
+    )

    args = parser.parse_args()

@@ -329,19 +195,13 @@ def main():

    # Deploy access pod
    pod_name = deploy_access_pod(args.namespace)
-
+    try:
        # List and download files
-    files = list_pvc_contents(args.namespace, pod_name, args.no_config)
-    download_files(args.namespace, pod_name, files, args.output_dir)
-
-    # Download additional summary files
-    download_summary_files(args.namespace, pod_name, args.output_dir, args.no_config)
-
-    # Generate README
-    generate_readme(args.output_dir, len(files))
-
-    # Cleanup info
-    cleanup_access_pod(args.namespace, pod_name)
+        files = list_pvc_contents(args.namespace, pod_name, args.folder, args.no_config)
+        download_files(args.namespace, pod_name, files, args.output_dir, args.folder)
+    finally:
+        # Cleanup
+        cleanup_access_pod(args.namespace)

    print("\n✅ Download completed!")
    print(f"📁 Results available at: {args.output_dir.absolute()}")

--- a/benchmarks/profiler/utils/dynamo_deployment.py
+++ b/benchmarks/profiler/utils/dynamo_deployment.py
@@ -15,6 +15,10 @@

 import argparse
 import asyncio
+import os
+import re
+import subprocess
+import sys
 import time
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
@@ -39,6 +43,38 @@ EXAMPLE_CHAT_REQUEST = {
 }


+class ProgressDisplay:
+    """Helper class for cleaner progress display during deployment waiting"""
+
+    def __init__(self, verbose: bool = False):
+        self.verbose = verbose
+        self.last_message = ""
+        self.spinner_chars = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
+        self.spinner_idx = 0
+
+    def update(self, message: str, newline: bool = False):
+        """Update progress display"""
+        if self.verbose or newline:
+            print(message)
+        else:
+            # Clear previous line and write new message
+            sys.stdout.write(f"\r\033[K{message}")
+            sys.stdout.flush()
+            self.last_message = message
+
+    def spinner(self) -> str:
+        """Get next spinner character"""
+        char = self.spinner_chars[self.spinner_idx]
+        self.spinner_idx = (self.spinner_idx + 1) % len(self.spinner_chars)
+        return char
+
+    def finish(self, message: str):
+        """Finish with a final message"""
+        if not self.verbose and self.last_message:
+            sys.stdout.write("\r\033[K")  # Clear the line
+        print(message)
+
+
 class DynamoDeploymentClient:
    def __init__(
        self,
@@ -68,20 +104,84 @@ class DynamoDeploymentClient:
        ] = None  # Will store the full deployment spec
        self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs")
        self.frontend_port = frontend_port
+        self.port_forward_process: Optional[subprocess.Popen[bytes]] = None

-    def _init_kubernetes(self):
+    async def _init_kubernetes(self):
        """Initialize kubernetes client"""
        try:
            # Try in-cluster config first (for pods with service accounts)
            config.load_incluster_config()
        except Exception:
            # Fallback to kube config file (for local development)
-            config.load_kube_config()
+            await config.load_kube_config()

        self.k8s_client = client.ApiClient()
        self.custom_api = client.CustomObjectsApi(self.k8s_client)
        self.core_api = client.CoreV1Api(self.k8s_client)

+    def port_forward_frontend(self, local_port: int = 8000, quiet: bool = False) -> str:
+        """
+        Port forward the frontend service to a local port.
+
+        Args:
+            local_port: Local port to forward to (default: 8000)
+            quiet: If True, suppress kubectl port-forward output messages (default: False)
+        """
+        cmd = [
+            "kubectl",
+            "port-forward",
+            f"svc/{self.service_name}",
+            f"{local_port}:{self.frontend_port}",
+            "-n",
+            self.namespace,
+        ]
+
+        print(f"Starting port forward: {' '.join(cmd)}")
+
+        # Configure output redirection based on quiet flag
+        if quiet:
+            # Suppress kubectl's "Handling connection for..." messages
+            stdout = subprocess.DEVNULL
+            stderr = subprocess.DEVNULL
+        else:
+            stdout = None
+            stderr = None
+
+        # Start port forward in background
+        try:
+            self.port_forward_process = subprocess.Popen(
+                cmd, stdout=stdout, stderr=stderr
+            )
+        except FileNotFoundError as e:
+            raise RuntimeError(
+                "kubectl not found in PATH; required for port-forwarding"
+            ) from e
+
+        # Wait a moment for port forward to establish
+        print("Waiting for port forward to establish...")
+        time.sleep(3)
+
+        print(f"Port forward started with PID: {self.port_forward_process.pid}")
+        return f"http://localhost:{local_port}"
+
+    def stop_port_forward(self):
+        """
+        Stop the port forward process.
+        """
+        if self.port_forward_process:
+            print(
+                f"Stopping port forward process (PID: {self.port_forward_process.pid})"
+            )
+            self.port_forward_process.terminate()
+            try:
+                self.port_forward_process.wait(timeout=5)
+                print("Port forward stopped")
+            except subprocess.TimeoutExpired:
+                print("Port forward process did not terminate, killing it")
+                self.port_forward_process.kill()
+                self.port_forward_process.wait()
+            self.port_forward_process = None
+
    def get_service_url(self) -> str:
        """
        Get the service URL using Kubernetes service DNS.
@@ -97,7 +197,7 @@ class DynamoDeploymentClient:
        Args:
            deployment: Either a dict containing the deployment spec or a path to a yaml file
        """
-        self._init_kubernetes()
+        await self._init_kubernetes()

        if isinstance(deployment, str):
            # Load from yaml file
@@ -107,6 +207,11 @@ class DynamoDeploymentClient:
        else:
            self.deployment_spec = deployment

+        # Ensure deployment_spec is properly loaded
+        assert (
+            self.deployment_spec is not None
+        ), "Failed to load deployment specification"
+
        # Extract component names
        self.components = [
            svc.lower() for svc in self.deployment_spec["spec"]["services"].keys()
@@ -139,15 +244,30 @@ class DynamoDeploymentClient:
                print(f"Failed to create deployment {self.deployment_name}: {e}")
                raise

-    async def wait_for_deployment_ready(self, timeout: int = 1800):
+    async def wait_for_deployment_ready(
+        self, timeout: int = 1800, verbose: Optional[bool] = None
+    ):
        """
-        Wait for the custom resource to be ready.
+        Wait for the custom resource to be ready with improved progress display.

        Args:
            timeout: Maximum time to wait in seconds, default to 30 mins (image pulling can take a while)
+            verbose: If True, show detailed status updates. If None, uses DYNAMO_VERBOSE env var.
        """
+        # Allow environment variable to control verbosity
+        if verbose is None:
+            verbose = os.environ.get("DYNAMO_VERBOSE", "false").lower() == "true"
+
+        progress = ProgressDisplay(verbose=verbose)
        start_time = time.time()
-        # TODO: A little brittle, also should output intermediate status every so often.
+        last_status = None
+        last_conditions_str = ""
+        check_interval = 20 if not verbose else 10
+
+        # Initial message
+        if not verbose:
+            print(f"⏳ Waiting for deployment '{self.deployment_name}'...")
+
        while (time.time() - start_time) < timeout:
            try:
                status = await self.custom_api.get_namespaced_custom_object(
@@ -157,57 +277,129 @@ class DynamoDeploymentClient:
                    plural="dynamographdeployments",
                    name=self.deployment_name,
                )
-                # Check both conditions:
-                # 1. Ready condition is True
-                # 2. State is successful
+
                status_obj = status.get("status", {})
                conditions = status_obj.get("conditions", [])
                current_state = status_obj.get("state", "unknown")
+                elapsed = time.time() - start_time

-                print(f"Current deployment state: {current_state}")
-                print(f"Current conditions: {conditions}")
-                print(f"Elapsed time: {time.time() - start_time:.1f}s / {timeout}s")
-
+                # Check readiness
                ready_condition = False
+                ready_message = ""
                for condition in conditions:
-                    if (
-                        condition.get("type") == "Ready"
-                        and condition.get("status") == "True"
-                    ):
-                        ready_condition = True
+                    if condition.get("type") == "Ready":
+                        ready_condition = condition.get("status") == "True"
+                        ready_message = condition.get("message", "")
                        break

-                state_successful = status_obj.get("state") == "successful"
+                state_successful = current_state == "successful"
+
+                # Extract not ready components from message
+                not_ready_components = []
+                if re.search(r"resources not ready:", ready_message, re.IGNORECASE):
+                    match = re.search(r"\[(.*?)\]", ready_message)
+                    if match:
+                        items = match.group(1)
+                        not_ready_components = [
+                            s.strip() for s in re.split(r"[,\s]+", items) if s.strip()
+                        ]
+
+                # Format progress message based on mode
+                if not verbose:
+                    # Concise single-line progress with spinner
+                    spinner = progress.spinner()
+
+                    # Create status string
+                    if not_ready_components:
+                        # Show first 2 components, abbreviate if more
+                        components_str = ", ".join(not_ready_components[:2])
+                        if len(not_ready_components) > 2:
+                            components_str += f" +{len(not_ready_components)-2} more"
+                        status_str = f"Waiting for: {components_str}"
+                    else:
+                        status_str = f"State: {current_state}"
+
+                    # Format time
+                    time_str = f"[{elapsed:.0f}s]"
+
+                    message = f"{spinner} {time_str} {status_str}"
+                    progress.update(message)
+
+                else:
+                    # Verbose mode - show details when status changes
+                    conditions_str = str(conditions)
+                    if (
+                        current_state != last_status
+                        or conditions_str != last_conditions_str
+                    ):
+                        progress.update(f"Current deployment state: {current_state}")
+                        progress.update(f"Current conditions: {conditions}")
+                        progress.update(f"Elapsed time: {elapsed:.1f}s / {timeout}s")
+                        progress.update(
+                            f"Deployment not ready yet - Ready: {ready_condition}, "
+                            f"State successful: {state_successful}"
+                        )
+                        last_status = current_state
+                        last_conditions_str = conditions_str

+                # Check if deployment is ready
                if ready_condition and state_successful:
-                    print(
-                        "Deployment is ready: Ready condition is True and state is successful"
+                    progress.finish(
+                        f"✅ Deployment '{self.deployment_name}' ready after {elapsed:.1f}s"
                    )
                    return True
-                else:
-                    print(
-                        f"Deployment not ready yet - Ready condition: {ready_condition}, State successful: {state_successful}"
-                    )

            except kubernetes.client.rest.ApiException as e:
-                print(f"API Exception while checking deployment status: {e}")
-                print(f"Status code: {e.status}, Reason: {e.reason}")
+                if verbose:
+                    progress.update(
+                        f"API Exception while checking deployment status: {e}",
+                        newline=True,
+                    )
+                    progress.update(
+                        f"Status code: {e.status}, Reason: {e.reason}", newline=True
+                    )
            except Exception as e:
-                print(f"Unexpected exception while checking deployment status: {e}")
-            await asyncio.sleep(20)
-        raise TimeoutError("Deployment failed to become ready within timeout")
+                if verbose:
+                    progress.update(
+                        f"Unexpected exception while checking deployment status: {e}",
+                        newline=True,
+                    )
+
+            await asyncio.sleep(check_interval)
+
+        # Timeout reached
+        progress.finish(
+            f"❌ Deployment '{self.deployment_name}' failed to become ready within {timeout}s"
+        )
+        raise TimeoutError(f"Deployment failed to become ready within {timeout}s")

-    async def check_chat_completion(self):
+    async def check_chat_completion(
+        self,
+        use_port_forward: bool = False,
+        local_port: int = 8000,
+        quiet: bool = True,
+        timeout_s: float = 30.0,
+    ):
        """
        Test the deployment with a chat completion request using httpx.
        """
        EXAMPLE_CHAT_REQUEST["model"] = self.model_name
+
+        # Use cluster DNS when in-cluster; port-forward when requested or when running outside the cluster
+        inside_cluster = bool(os.environ.get("KUBERNETES_SERVICE_HOST"))
        base_url = self.get_service_url()
+        if use_port_forward or not inside_cluster:
+            base_url = self.port_forward_frontend(local_port=local_port, quiet=quiet)
+
        url = f"{base_url}/v1/chat/completions"
-        async with httpx.AsyncClient() as client:
+        try:
+            async with httpx.AsyncClient(timeout=timeout_s) as client:
                response = await client.post(url, json=EXAMPLE_CHAT_REQUEST)
                response.raise_for_status()
                return response.text
+        finally:
+            if use_port_forward or not inside_cluster:
+                self.stop_port_forward()

    async def get_deployment_logs(self):
        """
@@ -257,6 +449,10 @@ class DynamoDeploymentClient:
        except kubernetes.client.rest.ApiException as e:
            if e.status != 404:  # Ignore if already deleted
                raise
+        finally:
+            # Close the kubernetes client session to avoid warnings
+            if hasattr(self, "k8s_client"):
+                await self.k8s_client.close()


 async def cleanup_remaining_deployments(deployment_clients, namespace):
@@ -339,7 +535,7 @@ async def main():

        # Test chat completion
        print("Testing chat completion...")
-        response = await client.check_chat_completion()
+        response = await client.check_chat_completion(use_port_forward=True)
        print(f"Chat completion response: {response}")

        # Get logs

--- a/benchmarks/profiler/inject_disagg_config.py
+++ b/benchmarks/profiler/inject_disagg_config.py
@@ -16,51 +16,55 @@
 # limitations under the License.

 """
-Disagg Config Injection Script
+Manifest Injection Script

-This script copies a DynamoGraphDeployment disagg configuration file into the profiling PVC
-so it can be used by the SLA profiler job. The profiler can then reference this config
-using the DGD_CONFIG_FILE environment variable.
+Copies any Kubernetes manifest file into the PVC for later use by jobs.
+Both the source manifest path and destination path in the PVC are required.

 Usage:
-    python3 inject_disagg_config.py --namespace <namespace> [--disagg-config <path>] [--target-path <path>]
+    python3 inject_manifest.py --namespace <namespace> --src <local_manifest.yaml> --dest <absolute_path_in_pvc>

 Examples:
-    # Use default disagg.yaml from components/backends/vllm/deploy/
-    python3 inject_disagg_config.py --namespace <namespace>
-
-    # Use custom disagg config
-    python3 inject_disagg_config.py --namespace <namespace> --disagg-config ./my-custom-disagg.yaml
-
-    # Use custom target path in PVC
-    python3 inject_disagg_config.py --namespace <namespace> --target-path /profiling_results/custom-disagg.yaml
+    python3 inject_manifest.py --namespace <ns> --src ./my-disagg.yaml --dest /configs/disagg.yaml
+    python3 inject_manifest.py --namespace <ns> --src ./my-agg.yaml    --dest /configs/agg.yaml
 """

 import argparse
 import sys
 from pathlib import Path

-from utils.kubernetes import check_kubectl_access, deploy_access_pod, run_command
+from deploy.utils.kubernetes import (
+    PVC_ACCESS_POD_NAME,
+    check_kubectl_access,
+    cleanup_access_pod,
+    deploy_access_pod,
+    run_command,
+)


-def copy_disagg_config(
-    namespace: str, disagg_config_path: Path, target_path: str
-) -> None:
-    """Copy the disagg config file into the PVC via the access pod."""
-    pod_name = "pvc-access-pod"
+def copy_manifest(namespace: str, manifest_path: Path, target_path: str) -> None:
+    """Copy a manifest file into the PVC via the access pod."""
+    pod_name = PVC_ACCESS_POD_NAME

-    if not disagg_config_path.exists():
-        print(f"ERROR: Disagg config file not found: {disagg_config_path}")
+    if not manifest_path.exists():
+        print(f"ERROR: Manifest file not found: {manifest_path}")
        sys.exit(1)

-    print(f"Copying {disagg_config_path} to {target_path} in PVC...")
+    print(f"Copying {manifest_path} to {target_path} in PVC...")
+
+    # Ensure destination directory exists
+    target_dir = str(Path(target_path).parent)
+    run_command(
+        ["kubectl", "exec", pod_name, "-n", namespace, "--", "mkdir", "-p", target_dir],
+        capture_output=False,
+    )

    # Copy file to pod
    run_command(
        [
            "kubectl",
            "cp",
-            str(disagg_config_path),
+            str(manifest_path),
            f"{namespace}/{pod_name}:{target_path}",
        ],
        capture_output=False,
@@ -72,38 +76,13 @@ def copy_disagg_config(
        capture_output=True,
    )

-    print("✓ Disagg config successfully copied to PVC")
+    print("✓ Manifest successfully copied to PVC")
    print(f"File details: {result.stdout.strip()}")


-def cleanup_access_pod(namespace: str, keep_pod: bool = True) -> None:
-    """Optionally clean up the access pod."""
-    if keep_pod:
-        print("ℹ️  Access pod 'pvc-access-pod' left running for future use")
-        print(
-            f"   To access PVC: kubectl exec -it pvc-access-pod -n {namespace} -- /bin/bash"
-        )
-        print(f"   To delete pod: kubectl delete pod pvc-access-pod -n {namespace}")
-    else:
-        print("Cleaning up access pod...")
-        run_command(
-            [
-                "kubectl",
-                "delete",
-                "pod",
-                "pvc-access-pod",
-                "-n",
-                namespace,
-                "--ignore-not-found",
-            ],
-            capture_output=False,
-        )
-        print("✓ Access pod deleted")
-
-
 def main():
    parser = argparse.ArgumentParser(
-        description="Inject disagg config into profiling PVC",
+        description="Inject a Kubernetes manifest into the PVC",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
@@ -116,36 +95,28 @@ def main():
    )

    parser.add_argument(
-        "--disagg-config",
-        type=Path,
-        default=Path("components/backends/vllm/deploy/disagg.yaml"),
-        help="Path to disagg config file (default: components/backends/vllm/deploy/disagg.yaml)",
+        "--src", required=True, type=Path, help="Path to manifest file to copy"
    )
-
-    parser.add_argument(
-        "--target-path",
-        default="/profiling_results/disagg.yaml",
-        help="Target path in PVC (default: /profiling_results/disagg.yaml)",
-    )
-
    parser.add_argument(
-        "--cleanup",
-        action="store_true",
-        help="Delete the access pod after copying (default: keep running)",
+        "--dest",
+        required=True,
+        help="Absolute target path in PVC (e.g., /profiling_results/agg.yaml)",
    )

    args = parser.parse_args()

    # Validate target_path to prevent directory traversal
-    if not args.target_path.startswith("/profiling_results/"):
-        print("ERROR: Target path must be within /profiling_results/")
+    if not args.dest.startswith("/"):
+        print(
+            "ERROR: Target path must be an absolute path inside the PVC (start with '/')."
+        )
        sys.exit(1)

-    if ".." in args.target_path:
+    if ".." in args.dest:
        print("ERROR: Target path cannot contain '..'")
        sys.exit(1)

-    print("🚀 Disagg Config Injection")
+    print("🚀 Manifest Injection")
    print("=" * 40)

    # Validate inputs
@@ -153,16 +124,14 @@ def main():

    # Deploy access pod
    deploy_access_pod(args.namespace)
-
-    # Copy disagg config
-    copy_disagg_config(args.namespace, args.disagg_config, args.target_path)
-
-    # Cleanup
-    cleanup_access_pod(args.namespace, keep_pod=not args.cleanup)
-
-    print("\n✅ Disagg config injection completed!")
-    print(f"📁 Config available at: {args.target_path}")
-    print(f"🔧 Set DGD_CONFIG_FILE=/workspace{args.target_path} in your profiler job")
+    try:
+        # Copy manifest
+        copy_manifest(args.namespace, args.src, args.dest)
+        print("\n✅ Manifest injection completed!")
+        print(f"📁 File available at: {args.dest}")
+    finally:
+        # Cleanup even on failure
+        cleanup_access_pod(args.namespace)


 if __name__ == "__main__":

--- a/benchmarks/profiler/utils/kubernetes.py
+++ b/benchmarks/profiler/utils/kubernetes.py
@@ -15,10 +15,11 @@

 import subprocess
 import sys
-import time
 from pathlib import Path
 from typing import List

+PVC_ACCESS_POD_NAME = "pvc-access-pod"
+

 def run_command(
    cmd: List[str], capture_output: bool = True
@@ -48,7 +49,6 @@ def check_kubectl_access(namespace: str) -> None:

 def deploy_access_pod(namespace: str) -> str:
    """Deploy the PVC access pod and return pod name."""
-    pod_name = "pvc-access-pod"

    # Check if pod already exists and is running
    try:
@@ -57,7 +57,7 @@ def deploy_access_pod(namespace: str) -> str:
                "kubectl",
                "get",
                "pod",
-                pod_name,
+                PVC_ACCESS_POD_NAME,
                "-n",
                namespace,
                "-o",
@@ -69,17 +69,17 @@ def deploy_access_pod(namespace: str) -> str:
        )

        if result.returncode == 0 and result.stdout.strip() == "Running":
-            print(f"✓ Access pod '{pod_name}' already running")
-            return pod_name
+            print(f"✓ Access pod '{PVC_ACCESS_POD_NAME}' already running")
+            return PVC_ACCESS_POD_NAME
    except Exception:
        # Pod doesn't exist or isn't running
        pass

-    print(f"Deploying access pod '{pod_name}' in namespace '{namespace}'...")
+    print(f"Deploying access pod '{PVC_ACCESS_POD_NAME}' in namespace '{namespace}'...")

    # Get the directory where this script is located
-    script_dir = Path(__file__).parent.parent
-    pod_yaml_path = script_dir / "deploy" / "pvc-access-pod.yaml"
+    script_dir = Path(__file__).parent
+    pod_yaml_path = script_dir / "manifests" / "pvc-access-pod.yaml"

    if not pod_yaml_path.exists():
        print(f"ERROR: Pod YAML not found at {pod_yaml_path}")
@@ -92,36 +92,34 @@ def deploy_access_pod(namespace: str) -> str:
    )

    print("Waiting for pod to be ready...")
-
-    # Wait for pod to be ready (up to 60 seconds)
-    for i in range(60):
-        try:
-            result = subprocess.run(
+    run_command(
        [
            "kubectl",
-                    "get",
-                    "pod",
-                    pod_name,
+            "wait",
+            f"pod/{PVC_ACCESS_POD_NAME}",
            "-n",
            namespace,
-                    "-o",
-                    "jsonpath={.status.phase}",
+            "--for=condition=Ready",
+            "--timeout=60s",
        ],
-                capture_output=True,
-                text=True,
-                check=False,
+        capture_output=False,
    )
-
-            if result.returncode == 0 and result.stdout.strip() == "Running":
    print("✓ Access pod is ready")
-                return pod_name
-
-        except Exception:
-            pass
+    return PVC_ACCESS_POD_NAME

-        time.sleep(1)
-        if i % 10 == 0:
-            print(f"  Still waiting... ({i+1}s)")

-    print("ERROR: Access pod failed to become ready within 60 seconds")
-    sys.exit(1)
+def cleanup_access_pod(namespace: str) -> None:
+    """Delete the PVC access pod from the given namespace.
+
+    The delete is issued with ``--ignore-not-found`` so kubectl does not
+    treat an already-absent pod as an error.
+
+    Args:
+        namespace: Kubernetes namespace the access pod lives in.
+    """
+    print("Cleaning up access pod...")
+    delete_cmd = [
+        "kubectl",
+        "delete",
+        "pod",
+        PVC_ACCESS_POD_NAME,
+        "-n",
+        namespace,
+        "--ignore-not-found",
+    ]
+    # Stream kubectl's own output to the terminal rather than capturing it.
+    run_command(delete_cmd, capture_output=False)
+    print("✓ Access pod deleted")
--- a/benchmarks/profiler/deploy/pvc-access-pod.yaml
+++ b/benchmarks/profiler/deploy/pvc-access-pod.yaml
@@ -37,5 +37,5 @@ spec:
  volumes:
  - name: profiling-storage
    persistentVolumeClaim:
-      claimName: profiling-pvc
+      claimName: dynamo-pvc
  restartPolicy: Never
--- a/benchmarks/profiler/deploy/profiling_pvc.yaml
+++ b/benchmarks/profiler/deploy/profiling_pvc.yaml
@@ -3,7 +3,7 @@
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
-  name: profiling-pvc
+  name: dynamo-pvc
  namespace: ${NAMESPACE}
 spec:
  accessModes:

--- a/benchmarks/profiler/deploy/profile_sla_rbac.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_rbac.yaml
@@ -3,7 +3,7 @@
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
-  name: profile-sla-role
+  name: dynamo-role
  namespace: ${NAMESPACE}
 rules:
  # DynamoGraphDeployment custom resources - needed for create/get/delete operations
@@ -17,3 +17,10 @@ rules:
  - apiGroups: [""]
    resources: ["pods/log"]
    verbs: ["get"]
+  # Services and Deployments - needed for vLLM deployments
+  - apiGroups: [""]
+    resources: ["services"]
+    verbs: ["get", "create", "delete"]
+  - apiGroups: ["apps"]
+    resources: ["deployments"]
+    verbs: ["get", "create", "delete"]
--- a/benchmarks/profiler/deploy/profile_sla_binding.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_binding.yaml
@@ -3,13 +3,13 @@
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
-  name: profile-sla-binding
+  name: dynamo-binding
  namespace: ${NAMESPACE}
 subjects:
 - kind: ServiceAccount
-  name: profile-sla-sa
+  name: dynamo-sa
  namespace: ${NAMESPACE}
 roleRef:
  kind: Role
-  name: profile-sla-role
+  name: dynamo-role
  apiGroup: rbac.authorization.k8s.io
--- a/benchmarks/profiler/deploy/profile_sla_sa.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_sa.yaml
@@ -3,7 +3,7 @@
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: profile-sla-sa
+  name: dynamo-sa
  namespace: ${NAMESPACE}
 imagePullSecrets:
  - name: nvcr-imagepullsecret
--- a/deploy/utils/requirements.txt
+++ b/deploy/utils/requirements.txt
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+# Kubernetes and async dependencies
+aiofiles>=0.8.0
+# Benchmarking dependencies for Dynamo
+genai-perf==0.0.15
+httpx>=0.24.0
+kubernetes-asyncio>=24.0.0
+
+# Plotting and visualization
+matplotlib>=3.5.0
+numpy>=1.21.0
+pandas>=1.3.0
+plotly>=5.0.0
+
+# YAML processing
+pyyaml>=6.0.0
+scipy>=1.7.0
+seaborn>=0.11.0
--- a/deploy/utils/setup_k8s_namespace.sh
+++ b/deploy/utils/setup_k8s_namespace.sh
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Strict mode: abort on a failing command, on use of an unset variable,
+# and when any stage of a pipeline fails.
+set -euo pipefail
+
+# Resolve repo root relative to this script
+# (SCRIPT_DIR is this script's directory; REPO_ROOT is two levels above it)
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# Inputs
+# All are read from the environment; NAMESPACE falls back to "default",
+# every other input falls back to the empty string.
+NAMESPACE="${NAMESPACE:-default}"
+DOCKER_SERVER="${DOCKER_SERVER:-}"
+IMAGE_TAG="${IMAGE_TAG:-}"
+DOCKER_USERNAME="${DOCKER_USERNAME:-}"
+DOCKER_PASSWORD="${DOCKER_PASSWORD:-}"
+HF_TOKEN="${HF_TOKEN:-}"
+# Set by create_or_update_pull_secret once a pull secret has been configured;
+# stays empty otherwise.
+PULL_SECRET_NAME=""
+
+# ANSI color escape sequences used by the logging helpers (NC resets the color)
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+# Logging helpers: print all arguments behind a colored severity tag.
+# printf's %s emits the message verbatim (echo -e would also interpret
+# backslash escapes that happen to appear inside the message); %b expands
+# the escape sequences stored in the color variables.
+log() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$*"; }
+ok() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*"; }
+# Warnings and errors are diagnostics, so they are written to stderr.
+warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*" >&2; }
+err() { printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$*" >&2; }
+
+# Create or update the "docker-imagepullsecret" registry secret in $NAMESPACE.
+#   $1: registry server   $2: registry username   $3: registry password/token
+# Does nothing unless all three arguments are non-empty. After applying the
+# secret, records its name in the global PULL_SECRET_NAME.
+create_or_update_pull_secret() {
+  local server="$1"
+  local user="$2"
+  local pass="$3"
+
+  # Guard clause: incomplete credentials mean there is nothing to create.
+  if [[ -z "$server" || -z "$user" || -z "$pass" ]]; then
+    return 0
+  fi
+
+  log "Creating/updating docker-imagepullsecret for $server in namespace $NAMESPACE"
+  # Render the secret client-side and apply it, so an existing secret is updated.
+  kubectl create secret docker-registry docker-imagepullsecret \
+    --docker-server="$server" \
+    --docker-username="$user" \
+    --docker-password="$pass" \
+    --namespace="$NAMESPACE" \
+    --dry-run=client -o yaml | kubectl apply -f -
+  ok "docker-imagepullsecret configured"
+  PULL_SECRET_NAME="docker-imagepullsecret"
+}
+
+# Print the help text to stdout.
+# The heredoc delimiter is quoted so the text is emitted verbatim: with an
+# unquoted delimiter the trailing backslash in the second example is treated
+# as a line continuation and the two-line command is printed as one line.
+usage() {
+  cat << 'EOF'
+Usage:
+  NAMESPACE=<ns> deploy/utils/setup_k8s_namespace.sh
+  NAMESPACE=<ns> DOCKER_SERVER=<registry> IMAGE_TAG=<tag> [DOCKER_USERNAME=<user>] [DOCKER_PASSWORD=<token>] \
+    deploy/utils/setup_k8s_namespace.sh
+
+Sets up Kubernetes namespace for Dynamo (one-time per namespace):
+  - Creates namespace if absent
+  - Applies common manifests (ServiceAccount, Role, RoleBinding, PVC)
+  - Installs CRDs once per cluster (if not already installed)
+  - If DOCKER_SERVER/IMAGE_TAG are provided:
+      * Builds/pushes a custom operator image with Earthly
+      * Installs/updates the operator Helm release using that image
+      * If credentials (DOCKER_USERNAME/DOCKER_PASSWORD) are provided, creates/updates docker-imagepullsecret
+      * If credentials are not provided, prompts interactively to create the pull secret
+  - Otherwise installs the operator using default image: nvcr.io/nvidia/ai-dynamo/kubernetes-operator:0.4.0
+
+Environment variables:
+  NAMESPACE         Target Kubernetes namespace (default: default)
+  DOCKER_SERVER     Registry server for operator image (optional)
+  IMAGE_TAG         Image tag for operator (optional)
+  DOCKER_USERNAME   Registry username (optional; if provided with DOCKER_PASSWORD, secret is created)
+  DOCKER_PASSWORD   Registry password/token (optional)
+  HF_TOKEN          Hugging Face token; if set, a secret named hf-token-secret is created in the namespace (optional)
+EOF
+}
+
+# Show the help text on request. Checked before anything else so that help
+# works even on a machine without kubectl. "${1:-}" keeps `set -u` happy when
+# the script is run without arguments.
+case "${1:-}" in
+  -h|--help)
+    usage
+    exit 0
+    ;;
+esac
+
+# kubectl is required by every step below.
+if ! command -v kubectl &>/dev/null; then err "kubectl not found"; exit 1; fi
+
+# 1) Ensure namespace exists
+if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then
+  log "Creating namespace $NAMESPACE"
+  kubectl create namespace "$NAMESPACE"
+else
+  log "Namespace $NAMESPACE exists"
+fi
+
+# 2) Apply common manifests
+log "Applying common manifests to namespace $NAMESPACE"
+# Use the already-resolved SCRIPT_DIR so the manifests are found regardless of
+# how the script was invoked.
+for mf in "$SCRIPT_DIR/manifests"/*.yaml; do
+  # envsubst only sees *environment* variables. When NAMESPACE was not set by
+  # the caller it exists only as an unexported shell variable (the "default"
+  # fallback), so pass it explicitly; otherwise ${NAMESPACE} in the manifests
+  # would be replaced by an empty string.
+  NAMESPACE="$NAMESPACE" envsubst < "$mf" | kubectl apply -f -
+done
+ok "Common manifests applied"
+
+# 3) Install CRDs once per cluster (only if not already installed)
+# "Already installed" is detected by looking up the dynamo-crds Helm release
+# in $NAMESPACE.
+if command -v helm &>/dev/null; then
+  if ! helm status dynamo-crds -n "$NAMESPACE" &>/dev/null; then
+    log "Installing CRDs via Helm release dynamo-crds in namespace $NAMESPACE"
+    # The chart path is relative, so run helm from the helm directory.
+    pushd "$REPO_ROOT/deploy/cloud/helm" >/dev/null
+    helm upgrade --install dynamo-crds ./crds/ \
+      --namespace "$NAMESPACE" \
+      --wait \
+      --atomic
+    popd >/dev/null
+    ok "CRDs installed"
+  fi
+else
+  # Match the operator step below: say so when a step is skipped for lack of helm.
+  warn "helm not found; skipping CRD install"
+fi
+
+# 4) Optional: Create Hugging Face token secret if HF_TOKEN provided
+if [[ -n "$HF_TOKEN" ]]; then
+  # Render the secret client-side (--dry-run=client -o yaml) and pipe it to
+  # `kubectl apply`, so an existing hf-token-secret is updated in place.
+  kubectl create secret generic hf-token-secret \
+    --from-literal=HF_TOKEN="$HF_TOKEN" \
+    -n "$NAMESPACE" \
+    --dry-run=client -o yaml | kubectl apply -f -
+  ok "hf-token-secret created/updated"
+fi
+
+# 5) Optional: Create imagePullSecret for private registry if credentials provided or requested
+if [[ -n "$DOCKER_SERVER" ]]; then
+  if [[ -n "$DOCKER_USERNAME" && -n "$DOCKER_PASSWORD" ]]; then
+    # Credentials supplied via the environment: no prompting needed.
+    create_or_update_pull_secret "$DOCKER_SERVER" "$DOCKER_USERNAME" "$DOCKER_PASSWORD"
+  elif [[ -n "$IMAGE_TAG" ]]; then
+    # A custom image will be installed but no credentials were given: ask.
+    echo
+    read -r -p "Do you need image pull credentials for $DOCKER_SERVER (private registry)? [y/N]: " ans
+    if [[ "$ans" =~ ^[Yy]$ ]]; then
+      # The dollar sign is escaped so the literal text $oauthtoken is shown.
+      # Unescaped, it is expanded as a variable inside the double quotes and
+      # `set -u` aborts the script with "oauthtoken: unbound variable".
+      # -r keeps backslashes in the entered credentials intact.
+      read -r -p "Docker username (often '\$oauthtoken' for NGC): " DOCKER_USERNAME
+      # -s hides the typed secret; the echo restores the swallowed newline.
+      read -r -s -p "Docker password/token: " DOCKER_PASSWORD; echo
+      if [[ -n "$DOCKER_USERNAME" && -n "$DOCKER_PASSWORD" ]]; then
+        create_or_update_pull_secret "$DOCKER_SERVER" "$DOCKER_USERNAME" "$DOCKER_PASSWORD"
+      else
+        warn "Username or password empty; skipping secret creation"
+      fi
+    fi
+  fi
+fi
+
+# 6) Operator: Build/push custom image if both vars provided, else use default NGC image
+
+# Install or upgrade the dynamo-platform Helm release with a given operator image.
+#   $1: operator image repository
+#   $2: operator image tag
+#   $3: name of an imagePullSecret to attach (empty string attaches none)
+#   $4: message to print on success
+# Warns and returns without installing when helm is not available.
+install_platform_chart() {
+  local repository="$1" tag="$2" pull_secret="$3" done_msg="$4"
+
+  if ! command -v helm &>/dev/null; then
+    warn "helm not found; skipping helm install"
+    return 0
+  fi
+
+  # Fetch the platform chart's dependencies before installing it.
+  pushd "$REPO_ROOT/deploy/cloud/helm/platform" >/dev/null
+  helm dep build
+  popd >/dev/null
+
+  # The chart path below is relative to the helm directory.
+  pushd "$REPO_ROOT/deploy/cloud/helm" >/dev/null
+  local helm_args=(upgrade dynamo-platform ./platform/ --install --namespace "$NAMESPACE"
+    --set "dynamo-operator.controllerManager.manager.image.repository=${repository}"
+    --set "dynamo-operator.controllerManager.manager.image.tag=${tag}")
+  if [[ -n "$pull_secret" ]]; then
+    helm_args+=(--set "dynamo-operator.imagePullSecrets[0].name=${pull_secret}")
+  fi
+  helm "${helm_args[@]}"
+  popd >/dev/null
+  ok "$done_msg"
+}
+
+if [[ -n "$DOCKER_SERVER" && -n "$IMAGE_TAG" ]]; then
+  # Custom operator image: build and push it first when earthly is available.
+  if command -v earthly &>/dev/null; then
+    log "Building and pushing operator images via earthly"
+    earthly --push +all-docker --DOCKER_SERVER="$DOCKER_SERVER" --IMAGE_TAG="$IMAGE_TAG"
+  else
+    warn "earthly not found; skipping operator build/push"
+  fi
+
+  # PULL_SECRET_NAME is non-empty only if step 5 configured a pull secret.
+  install_platform_chart "${DOCKER_SERVER}/dynamo-operator" "$IMAGE_TAG" \
+    "$PULL_SECRET_NAME" "Helm chart installed/updated"
+else
+  # Use default published image when custom not provided
+  DEFAULT_OPERATOR_IMAGE="nvcr.io/nvidia/ai-dynamo/kubernetes-operator:0.4.0"
+
+  # Only set imagePullSecrets if the referenced secret exists; otherwise rely on SA
+  default_pull_secret=""
+  if kubectl get secret nvcr-imagepullsecret -n "$NAMESPACE" &>/dev/null; then
+    default_pull_secret="nvcr-imagepullsecret"
+  fi
+
+  # Split "repository:tag" at the last colon.
+  install_platform_chart "${DEFAULT_OPERATOR_IMAGE%:*}" "${DEFAULT_OPERATOR_IMAGE##*:}" \
+    "$default_pull_secret" "Helm chart installed/updated with default operator image"
+fi
+
+# 7) Install benchmark dependencies if requirements.txt exists
+REQUIREMENTS_FILE="$SCRIPT_DIR/requirements.txt"
+
+if [[ -f "$REQUIREMENTS_FILE" ]]; then
+  log "Installing benchmark dependencies..."
+  # Prefer uv, then pip3, then pip. Track whether an installer actually ran so
+  # that success is not reported when the installation was skipped.
+  deps_installed=true
+  if command -v uv >/dev/null 2>&1; then
+    uv pip install -r "$REQUIREMENTS_FILE"
+  elif command -v pip3 >/dev/null 2>&1; then
+    pip3 install -r "$REQUIREMENTS_FILE"
+  elif command -v pip >/dev/null 2>&1; then
+    pip install -r "$REQUIREMENTS_FILE"
+  else
+    deps_installed=false
+    warn "No pip/pip3/uv found; skipping benchmark dependency installation"
+    warn "To run benchmarks, manually install: pip install -r $REQUIREMENTS_FILE"
+  fi
+  if [[ "$deps_installed" == true ]]; then
+    ok "Benchmark dependencies installed"
+  fi
+fi
+
+ok "Kubernetes namespace setup complete"
--- a/docs/architecture/load_planner.md
+++ b/docs/architecture/load_planner.md
@@ -24,7 +24,7 @@ There are two additional rules set by planner to prevent over-compensation:

 ## SLA-based Scaling Up/Down Prefill/Decode Workers

-See [Pre-Deployment Profiling](pre_deployment_profiling.md) for more details.
+See [Pre-Deployment Profiling](../benchmarks/pre_deployment_profiling.md) for more details.

 ## Usage


--- a/docs/architecture/planner_intro.rst
+++ b/docs/architecture/planner_intro.rst
@@ -71,6 +71,6 @@ Key features include:
   :hidden:

   Overview <self>
-   Pre-Deployment Profiling <pre_deployment_profiling.md>
+   Pre-Deployment Profiling <../benchmarks/pre_deployment_profiling.md>
+   Load-based Planner <load_planner.md>
   SLA-based Planner <sla_planner.md>
-   Planner Benchmark <../guides/planner_benchmark/README.md>
\ No newline at end of file
--- a/docs/architecture/sla_planner.md
+++ b/docs/architecture/sla_planner.md
@@ -28,7 +28,7 @@ The SLA planner consists of several key components:

 ## Pre-Deployment Profiling

-SLA-based planner requires pre-deployment profiling to operate. See [Pre-Deployment Profiling](pre_deployment_profiling.md) for more details.
+SLA-based planner requires pre-deployment profiling to operate. See [Pre-Deployment Profiling](../benchmarks/pre_deployment_profiling.md) for more details.

 ## Load Prediction


--- a/docs/benchmarks/benchmarking.md
+++ b/docs/benchmarks/benchmarking.md
+<!-- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. -->
+
+# Dynamo Benchmarking Guide
+
+This benchmarking framework lets you compare performance across any combination of:
+- **DynamoGraphDeployments** (automatically deployed from your manifests)
+- **External HTTP endpoints** (existing services, vLLM, TensorRT-LLM, etc.)
+
+You can mix and match these in a single benchmark run using custom labels. Configure your DynamoGraphDeployment manifests for your specific models, hardware, and parallelization needs.
+
+## What This Tool Does
+
+The framework is a wrapper around `genai-perf` that:
+- Deploys user-specified `DynamoGraphDeployments` automatically
+- Benchmarks any HTTP endpoints (no deployment needed)
+- Runs concurrency sweeps across configurable load levels
+- Generates comparison plots with your custom labels
+- Works with any HuggingFace-compatible model on NVIDIA GPUs (H200, H100, A100, etc.)
+- Runs locally and connects to your Kubernetes deployments/endpoints
+
+**Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`)
+
+**Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The actual model loaded is determined by your deployment manifests. Only one model can be benchmarked at a time across all inputs to ensure fair comparison. The default `--model` value in the benchmarking script is `deepseek-ai/DeepSeek-R1-Distill-Llama-8B`, but it must match the model in the manifest(s) and the model deployed at the endpoint(s).
+
+## Prerequisites
+
+1. **Kubernetes cluster with NVIDIA GPUs and Dynamo namespace setup** - You need a Kubernetes cluster with eligible NVIDIA GPUs and a properly configured namespace for Dynamo benchmarking. See the [deploy/utils/README](../../deploy/utils/README.md) for complete setup instructions.
+
+2. **kubectl access** - You need `kubectl` installed and configured to access your Kubernetes cluster. All other required tools (GenAI-Perf, Python, etc.) are included in the Dynamo containers. If you are not working within a Dynamo container, you can install the required packages with `pip install -r deploy/utils/requirements.txt`. *Note: if you are on Ubuntu 22.04 or earlier, you will also need to build perf_analyzer [from source](https://github.com/triton-inference-server/perf_analyzer/blob/main/docs/install.md#build-from-source).*
+
+## Quick Start Examples
+
+The tool can deploy, benchmark, and compare Dynamo deployments (DynamoGraphDeployments) on a Kubernetes cluster, and it can also benchmark and compare separately deployed servers given their URLs. In the examples below, Dynamo deployments are specified by the path to a YAML manifest, and separately deployed servers are specified by URL.
+
+```bash
+export NAMESPACE=benchmarking
+
+# Compare multiple DynamoGraphDeployments of a single backend
+./benchmarks/benchmark.sh --namespace $NAMESPACE \
+   --input agg=components/backends/vllm/deploy/agg.yaml \
+   --input disagg=components/backends/vllm/deploy/disagg.yaml
+
+# Compare different backend types (vLLM vs TensorRT-LLM)
+./benchmarks/benchmark.sh --namespace $NAMESPACE \
+   --input vllm-disagg=components/backends/vllm/deploy/disagg.yaml \
+   --input trtllm-disagg=components/backends/trtllm/deploy/disagg.yaml
+
+# Compare Dynamo deployment vs existing deployment (external endpoint)
+./benchmarks/benchmark.sh --namespace $NAMESPACE \
+   --input dynamo=components/backends/vllm/deploy/disagg.yaml \
+   --input vllm-baseline=http://localhost:8000
+
+# Compare three different configurations
+./benchmarks/benchmark.sh --namespace $NAMESPACE \
+   --input dynamo-agg=components/backends/vllm/deploy/agg.yaml \
+   --input dynamo-disagg=components/backends/vllm/deploy/disagg.yaml \
+   --input external-vllm=http://localhost:8000
+
+# Benchmark single external endpoint
+./benchmarks/benchmark.sh --namespace $NAMESPACE \
+   --input production-api=http://your-api:8000
+
+# Custom model and sequence lengths
+./benchmarks/benchmark.sh --namespace $NAMESPACE \
+   --input my-setup=my-custom-manifest.yaml \
+   --model "meta-llama/Meta-Llama-3-8B" --isl 512 --osl 256
+```
+
+**Key**: Configure your manifests for your specific models, hardware, and parallelization strategy before benchmarking.
+
+### Important: Image Accessibility
+
+Ensure container images in your DynamoGraphDeployment manifests are accessible:
+- **Public images**: Use [Dynamo NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts) public releases
+- **Custom registries**: Configure proper credentials in your Kubernetes namespace
+
+## Configuration and Usage
+
+### Command Line Options
+
+```bash
+./benchmarks/benchmark.sh --namespace NAMESPACE --input <label>=<manifest_path_or_endpoint> [--input <label>=<manifest_path_or_endpoint>]... [OPTIONS]
+
+REQUIRED:
+  -n, --namespace NAMESPACE           Kubernetes namespace
+  --input <label>=<manifest_path_or_endpoint>  Benchmark input with custom label
+                                        - <label>: becomes the name/label in plots
+                                        - <manifest_path_or_endpoint>: either a DynamoGraphDeployment manifest or HTTP endpoint URL
+                                        Can be specified multiple times for comparisons
+
+OPTIONS:
+  -h, --help                    Show help message and examples
+  -m, --model MODEL             Model name for GenAI-Perf configuration and logging (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)
+                                NOTE: This must match the model configured in your deployment manifests and endpoints
+  -i, --isl LENGTH              Input sequence length (default: 2000)
+  -s, --std STDDEV              Input sequence standard deviation (default: 10)
+  -o, --osl LENGTH              Output sequence length (default: 256)
+  -d, --output-dir DIR          Output directory (default: ./benchmarks/results)
+  --verbose                     Enable verbose output
+```
+
+### Important Notes
+
+- **Custom Labels**: Each input must have a unique label that becomes the name in plots and results
+- **Label Restrictions**: Labels can only contain letters, numbers, hyphens, and underscores. The label `plots` is reserved.
+- **Input Types**: Supports DynamoGraphDeployment manifests for automatic deployment, or HTTP endpoints for existing services
+- **Model Parameter**: The `--model` parameter configures GenAI-Perf for testing and logging, not deployment (deployment model is determined by the manifest files)
+- **Standalone Deployments**: For non-Dynamo backends (vLLM, TensorRT-LLM, SGLang, etc.), you must deploy them manually following their respective Kubernetes deployment guides. The benchmarking framework only supports automatic deployment of DynamoGraphDeployments.
+- **Single Model Requirement**: Only one model can be benchmarked at a time across all inputs to ensure fair comparison.
+
+### What Happens During Benchmarking
+
+The script automatically:
+1. **Deploys** each DynamoGraphDeployment configuration to Kubernetes if manifests are passed in
+2. **Benchmarks** using GenAI-Perf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
+3. **Measures** key metrics: latency, throughput, time-to-first-token
+4. **Generates** comparison plots using your custom labels in `./benchmarks/results/plots/`
+5. **Cleans up** deployments when complete
+
+### Results Clearing Behavior
+
+**Important**: The benchmark script automatically clears the output directory before each run to ensure clean, reproducible results. This means:
+- Previous benchmark results in the same output directory will be completely removed
+- Each benchmark run starts with a clean slate
+- Results from different runs are not mixed or accumulated
+
+If you want to preserve results from previous runs, use different output directories with the `--output-dir` flag.
+
+### Using Your Own Models and Configuration
+
+The benchmarking framework supports any HuggingFace-compatible LLM model. To benchmark your own custom deployment:
+
+1. **Edit your deployment YAML files** to specify your model in the `--model` argument of the container command
+2. **Use the corresponding model name** in the benchmark script's `--model` parameter
+
+**Note**: You can override the default sequence lengths (2000/256 tokens) with `--isl` and `--osl` flags if needed for your specific workload.
+
+### Direct Python Execution
+
+For direct control over the benchmark workflow:
+
+```bash
+# Endpoint benchmarking
+python3 -u -m benchmarks.utils.benchmark \
+   --endpoint "http://your-endpoint:8000" \
+   --namespace $NAMESPACE \
+   --isl 2000 \
+   --std 10 \
+   --osl 256 \
+   --output-dir $OUTPUT_DIR
+
+# Deployment benchmarking (any combination)
+python3 -u -m benchmarks.utils.benchmark \
+   --input agg=$AGG_CONFIG \
+   --input disagg=$DISAGG_CONFIG \
+   --namespace $NAMESPACE \
+   --isl 2000 \
+   --std 10 \
+   --osl 256 \
+   --output-dir $OUTPUT_DIR
+
+# Generate plots separately
+python3 -m benchmarks.utils.plot --data-dir $OUTPUT_DIR
+```
+
+### Comparison Limitations
+
+The plotting system supports up to 12 different inputs in a single comparison. If you need to compare more than 12 different deployments/endpoints, consider running separate benchmark sessions or grouping related comparisons together.
+
+### Concurrency Configuration
+
+You can customize the concurrency levels using the CONCURRENCIES environment variable:
+
+```bash
+# Custom concurrency levels
+CONCURRENCIES="1,5,20,50" ./benchmarks/benchmark.sh --namespace $NAMESPACE --input my-test=components/backends/vllm/deploy/disagg.yaml
+
+# Or set permanently
+export CONCURRENCIES="1,2,5,10,25,50,100"
+./benchmarks/benchmark.sh --namespace $NAMESPACE --input test=disagg.yaml
+```
+
+## Understanding Your Results
+
+After benchmarking completes, check `./benchmarks/results/` (or your custom output directory):
+
+### Summary and Plots
+
+```text
+benchmarks/results/
+└── plots/               # Visual comparisons (these are what you want!)
+    ├── SUMMARY.txt                                     # Quick overview of all results
+    ├── p50_inter_token_latency_vs_concurrency.png      # Token generation speed
+    ├── avg_time_to_first_token_vs_concurrency.png      # Response time
+    ├── request_throughput_vs_concurrency.png           # Requests per second
+    ├── efficiency_tok_s_gpu_vs_user.png                # GPU efficiency
+    └── avg_inter_token_latency_vs_concurrency.png      # Average latency
+```
+
+### Data Files
+
+Raw data is organized by deployment/benchmark type and concurrency level:
+
+**For Any Benchmarking (uses your custom labels):**
+```text
+benchmarks/results/
+├── plots/                       # Performance visualization plots
+│   ├── SUMMARY.txt             # Human-readable benchmark summary
+│   ├── p50_inter_token_latency_vs_concurrency.png
+│   ├── avg_inter_token_latency_vs_concurrency.png
+│   ├── request_throughput_vs_concurrency.png
+│   ├── efficiency_tok_s_gpu_vs_user.png
+│   └── avg_time_to_first_token_vs_concurrency.png
+├── <your-label-1>/              # Results for first input (uses your custom label)
+│   ├── c1/                      # Concurrency level 1
+│   │   └── profile_export_genai_perf.json
+│   ├── c2/                      # Concurrency level 2
+│   ├── c5/                      # Concurrency level 5
+│   └── ...                      # Other concurrency levels (10, 50, 100, 250)
+├── <your-label-2>/              # Results for second input (if provided)
+│   └── c*/                      # Same structure as above
+└── <your-label-N>/              # Results for additional inputs
+    └── c*/                      # Same structure as above
+```
+
+**Example with actual labels:**
+```text
+benchmarks/results/
+├── plots/
+├── dynamo-agg/                  # --input dynamo-agg=agg.yaml
+├── dynamo-disagg/               # --input dynamo-disagg=disagg.yaml
+└── external-vllm/               # --input external-vllm=http://localhost:8000
+```
+
+Each concurrency directory contains:
+- **`profile_export_genai_perf.json`** - Structured metrics from GenAI-Perf
+- **`profile_export.json`** - Raw GenAI-Perf results
+- **`inputs.json`** - Generated test inputs
+
+## Customize Benchmarking Behavior
+
+The built-in workflow handles DynamoGraphDeployment deployment, benchmarking with genai-perf, and plot generation automatically. If you want to modify the behavior:
+
+1. **Extend the workflow**: Modify `benchmarks/utils/workflow.py` to add custom deployment types or metrics collection
+
+2. **Generate different plots**: Modify `benchmarks/utils/plot.py` to generate a different set of plots for whatever you wish to visualize.
+
+The `benchmark.sh` script provides a complete end-to-end benchmarking experience. For more granular control, use the Python modules directly.
--- a/docs/architecture/pre_deployment_profiling.md
+++ b/docs/architecture/pre_deployment_profiling.md
@@ -71,22 +71,29 @@ SLA planner can work with any interpolation data that follows the above format.

 ## Running the Profiling Script in Kubernetes

-Set your environment variables:
+Set up your Kubernetes namespace (one-time per namespace). Follow the instructions [here](../../deploy/utils/README.md#kubernetes-setup-one-time-per-namespace). If your namespace is already set up, skip this step.
+
+**Prerequisites**: Ensure all dependencies are installed. If you ran the setup script above, dependencies are already installed. Otherwise, install them manually:
 ```bash
-export NAMESPACE=your-namespace
+pip install -r deploy/utils/requirements.txt
 ```

-**Optional Step 0: add a kubernetes secret**
+### Step 1: Inject your DGD configuration
+
+Use the injector utility to place your DGD manifest into the PVC. The profiling job will read the path you specify.

 ```bash
-kubectl create secret docker-registry nvcr-imagepullsecret \
-  --docker-server=nvcr.io \
-  --docker-username='$oauthtoken' \
-  --docker-password=<nvapi key> \
-  -n $NAMESPACE
+# Inject your disagg manifest
+python3 deploy/utils/inject_manifest.py \
+  --namespace $NAMESPACE \
+  --src components/backends/vllm/deploy/disagg.yaml \
+  --dest /configs/disagg.yaml
+
+# Set the Docker image for the profiling job; any Docker image that contains your script will work.
+export DOCKER_IMAGE=nvcr.io/nvidia/dynamo:latest-vllm
 ```

-**Step 1: Configure container image**
+### Configure container image (optional)

 You have two options for configuring your profiling setup:

@@ -102,13 +109,13 @@ Use the default pre-built image and inject custom configurations via PVC:
 2. **Inject your custom disagg configuration:**
   ```bash
   # Use default disagg.yaml config
-   python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE
+   python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src components/backends/vllm/deploy/disagg.yaml --dest /configs/disagg.yaml

   # Or use a custom disagg config file
-   python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE --disagg-config my-custom-disagg.yaml
+   python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src my-custom-disagg.yaml --dest /configs/disagg.yaml

   # Or specify a custom target path in the PVC
-   python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE --target-path /profiling_results/my-disagg.yaml
+   python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src my-custom-disagg.yaml --dest /profiling_results/my-disagg.yaml
   ```

 3. **Set the config path for the profiling job:**
@@ -123,19 +130,6 @@ This approach allows you to:

 > **Important**: For profiling, disagg configs should be run with Grove disabled by adding the annotation `nvidia.com/enable-grove: "false"` to avoid alpha Grove status issues.

-> **Note**: The default location in the PVC is `/profiling_results/disagg.yaml`. If you don't inject a config, the profiler will fall back to the built-in config at `/workspace/components/backends/vllm/deploy/disagg.yaml`.
-
-**Option B: Build custom image (only if you need code changes)**
-
-Only needed if you require custom code modifications beyond configuration changes:
-```bash
-# in the project's root folder
-./container/build.sh --framework <VLLM/sglang>
-# Tag and push to your container registry
-export DOCKER_IMAGE=<your docker tag>
-export DGD_CONFIG_FILE=<disagg config path> # path to your disagg.yaml file within the DOCKER_IMAGE
-```
-
 **Step 2: Set SLA target**

 Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL. Also, set the backend type to `vllm` or `sglang`. The backend type must match the dynamo deployment in the `DGD_CONFIG_FILE`.
@@ -162,12 +156,7 @@ spec:
 **Step 3: Run profiling (required)**

 ```bash
-cd $DYNAMO_HOME/benchmarks/profiler/deploy
-envsubst < profiling_pvc.yaml | kubectl apply -f -
-envsubst < profile_sla_sa.yaml | kubectl apply -f -
-envsubst < profile_sla_rbac.yaml | kubectl apply -f -
-envsubst < profile_sla_binding.yaml | kubectl apply -f -
-envsubst < profile_sla_job.yaml | kubectl apply -f -
+envsubst < benchmarks/profiler/deploy/profile_sla_job.yaml | kubectl apply -f -
 ```

 **Step 4: Wait for profiling to complete**
@@ -176,40 +165,24 @@ kubectl get jobs -n $NAMESPACE
 kubectl logs job/profile-sla -n $NAMESPACE
 ```

-### RBAC Configuration
-
-The SLA profiling job requires specific Kubernetes permissions to manage DynamoGraphDeployment resources and access namespace information. The RBAC setup consists of:
-
- **`profile_sla_sa.yaml`** - Service account with image pull secret for NVIDIA Container Registry access
- **`profile_sla_rbac.yaml`** - Role defining required permissions for managing deployments and accessing namespace resources
- **`profile_sla_binding.yaml`** - RoleBinding that associates the Role with the service account
-
-All three files are necessary:
-1. The service account provides identity and image pull credentials
-2. The Role defines what operations are allowed
-3. The RoleBinding connects the permissions to the service account
-
 ### Viewing Profiling Results

-After the profiling job completes successfully, the results are stored in the persistent volume claim (PVC) created during Step 2. Here's how to access and view your profiling results:
-
-#### Accessing the Profiling Results PVC
+After the profiling job completes successfully, the results are stored in the persistent volume claim (PVC) created during the Kubernetes namespace setup.

-The profiling results are stored in a PVC named `profiling-pvc`. To access the results:
+To download the results:

-1. **Deploy the PVC access pod (if not already running):**
-   ```bash
-   kubectl apply -f benchmarks/profiler/deploy/pvc-access-pod.yaml -n $NAMESPACE
-   ```
+```bash
+# Download to the ./results directory
+python3 deploy/utils/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --folder /profiling_results

-2. **Access the PVC through the pod:**
-   ```bash
-   kubectl exec -it pvc-access-pod -n $NAMESPACE -- /bin/bash
-   cd /profiling_results
-   ls -la
-   ```
+# Download without any of the auto-created config.yaml files used in profiling
+python3 deploy/utils/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --folder /profiling_results --no-config
+```

-> **Note**: The same `pvc-access-pod` is used for both injecting disagg configs and accessing results. If you used the `inject_disagg_config.py` script earlier, the pod may already be running. The pod auto-deletes after 5 minutes of activity.
+The script will:
+* Deploy a temporary access pod
+* Download all files maintaining directory structure
+* Clean the pod up automatically

 #### File Structure

@@ -231,62 +204,6 @@ The profiling results directory contains the following structure:
    └── decode_tp{best_tp}.png                 # 3D ITL surface plot
 ```

-#### Downloading Results Locally
-
-You can download the profiling results using the automated download script or manually:
-
-**Option 1: Automated Download (Recommended)**
-
-Use the provided download script to automatically fetch all relevant files:
-
-```bash
-# Download to ./results directory
-python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results
-
-# Download to specific directory
-python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir /path/to/my/results
-
-# Download without any of the auto-created config.yaml files used in profiling
-python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --no-config
-```
-
-The script will:
- Deploy a temporary access pod (auto-deletes after 5 minutes)
- Scan for relevant files (*.png, *.npz, *.yaml)
- Download all files maintaining directory structure
- Generate a README.md with file descriptions
- Clean up automatically
-
-**Option 2: Manual Download**
-
-To download the profiling results manually:
-
-1. **Download performance plots and data files:**
-   ```bash
-   # Create a local directory for results
-   mkdir -p ./profiling_results
-
-   # Copy main performance plots
-   kubectl cp pvc-access-pod:/profiling_results/prefill_performance.png ./profiling_results/ -n $NAMESPACE
-   kubectl cp pvc-access-pod:/profiling_results/decode_performance.png ./profiling_results/ -n $NAMESPACE
-
-   # Copy interpolation directories (includes additional plots and data)
-   kubectl cp pvc-access-pod:/profiling_results/selected_prefill_interpolation/ ./profiling_results/ -n $NAMESPACE -r
-   kubectl cp pvc-access-pod:/profiling_results/selected_decode_interpolation/ ./profiling_results/ -n $NAMESPACE -r
-   ```
-
-2. **Alternative: Tar and download entire results directory:**
-   ```bash
-   # Inside the access pod, create a tar archive
-   tar -czf /profiling_results/profiling_results.tar.gz -C /profiling_results .
-
-   # Download the archive to your local machine
-   kubectl cp pvc-access-pod:/profiling_results/profiling_results.tar.gz ./profiling_results.tar.gz -n $NAMESPACE
-
-   # Extract locally
-   tar -xzf profiling_results.tar.gz -C ./profiling_results/
-   ```
-
 #### Viewing Performance Plots

 The profiling generates several performance visualization files:
@@ -316,20 +233,6 @@ decode_data = np.load('selected_decode_interpolation/raw_data.npz')
 print("Decode data keys:", list(decode_data.keys()))
 ```

-#### Cleaning Up
-
-The access pod automatically deletes after 5 minutes of activity, but you can also clean it up manually:
-
-```bash
-# Exit the access pod (if still inside)
-exit
-
-# Delete the access pod immediately (optional - it will auto-delete)
-kubectl delete pod pvc-access-pod -n $NAMESPACE
-```
-
-> **Note**: The access pod has `activeDeadlineSeconds: 300` and will auto-delete after 5 minutes to prevent resource waste.
-
 ### Troubleshooting

 #### Image Pull Authentication Errors
@@ -343,7 +246,7 @@ If you see `ErrImagePull` or `ImagePullBackOff` errors with 401 unauthorized mes

 2. Verify the service account was created with the image pull secret:
   ```bash
-   kubectl get serviceaccount profile-sla-sa -n $NAMESPACE -o yaml
+   kubectl get serviceaccount dynamo-sa -n $NAMESPACE -o yaml
   ```

 3. The service account should show `imagePullSecrets` containing `nvcr-imagepullsecret`.
--- a/docs/guides/dynamo_deploy/sla_planner_deployment.md
+++ b/docs/guides/dynamo_deploy/sla_planner_deployment.md
@@ -16,7 +16,6 @@ Quick deployment guide for the disaggregated planner with automatic scaling.
 ```mermaid
 flowchart LR
  Frontend --"/metrics"--> Prometheus
-  Prometheus --"scrape"--> Prometheus
  Planner --"query API"--> Prometheus
  Planner --"scaling decisions"--> Workers["prefill<br/>backend"]
  Frontend -.->|"requests"| Workers
@@ -25,7 +24,7 @@ flowchart LR
 ## Prerequisites
 - Kubernetes cluster with GPU nodes
 - `hf-token-secret` created in target namespace
- [Pre-Deployment Profiling](../../architecture/pre_deployment_profiling.md) results saved to `profiling-pvc` PVC.
+- [Pre-Deployment Profiling](../../benchmarks/pre_deployment_profiling.md) results saved to `dynamo-pvc` PVC.
 - Prefill and decode worker uses the best parallelization mapping suggested by the pre-deployment profiling script.

 ```bash
@@ -62,7 +61,7 @@ vllm-disagg-planner-prefill-*             1/1 Running
 kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-frontend 8000:8000

 # Send a streaming request (required for full metrics)
-curl http://localhost:8000/v1/chat/completions \
+curl -N http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-0.6B",
@@ -101,8 +100,8 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
 **Connection Issues:**
 ```bash
 # Verify Prometheus is accessible (runs on port 8000)
-kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 8000:8000
-curl "http://localhost:8000/api/v1/query?query=up"
+kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 9090:8000
+curl "http://localhost:9090/api/v1/query?query=up"
 ```

 **Missing Metrics:**

--- a/tests/planner/README.md
+++ b/tests/planner/README.md
@@ -21,7 +21,7 @@ Use the pre-configured test deployment with sample profiling data, we provide th

 ### Option B: Use Your Own Profiling Results

-1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/architecture/pre_deployment_profiling.md) for detailed instructions.
+1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/benchmarks/pre_deployment_profiling.md) for detailed instructions.

 ## Interpolator Testing