fix: correctly parse subcomponent in sla-planner + deploy/doc update (#3363)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

fix: correctly parse subcomponent in sla-planner + deploy/doc update (#3363)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
cfe74445 · Hongkuan Zhou · GitHub · 92b8fe07 · cfe74445 · cfe74445
Unverified Commit cfe74445 authored Oct 02, 2025 by Hongkuan Zhou Committed by GitHub Oct 02, 2025
13 changed files
--- a/benchmarks/profiler/deploy/profile_sla_aic_job.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_aic_job.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: profile-sla
+  namespace: ${NAMESPACE}
+spec:
+  template:
+    spec:
+      serviceAccountName: dynamo-sa
+      containers:
+      - name: profile-sla
+        image: ${DOCKER_IMAGE}
+        resources:
+          requests:
+            cpu: "16"
+            memory: "10Gi"
+        env:
+          - name: HUGGING_FACE_HUB_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: hf-token-secret
+                key: HF_TOKEN
+          - name: NATS_SERVER
+            value: nats://${NAMESPACE}-nats:4222
+          - name: ETCD_ENDPOINTS
+            value: ${NAMESPACE}-etcd:2379
+        command: ["python", "-m", "benchmarks.profiler.profile_sla"]
+        args:
+          - --config
+          - ${DGD_CONFIG_FILE}
+          - --output-dir
+          - /data/profiling_results
+          - --namespace
+          - ${NAMESPACE}
+          - --backend
+          - vllm
+          - --min-num-gpus-per-engine
+          - "1"
+          - --max-num-gpus-per-engine
+          - "8"
+          - --isl
+          - "3000"
+          - --osl
+          - "150"
+          - --ttft
+          - "500"
+          - --itl
+          - "30"
+          - --use-ai-configurator
+          - --aic-system
+          - h200_sxm
+          - --aic-model-name
+          - QWEN3_32B
+          - --backend-version
+          - 0.20.0
+        volumeMounts:
+          - name: output-volume
+            mountPath: /data
+      restartPolicy: Never
+      volumes:
+        - name: output-volume
+          persistentVolumeClaim:
+            claimName: dynamo-pvc
+  backoffLimit: 0
--- a/components/backends/sglang/deploy/disagg_planner.yaml
+++ b/components/backends/sglang/deploy/disagg_planner.yaml
@@ -25,25 +25,6 @@ spec:
      envFromSecret: hf-token-secret
      componentType: planner
      replicas: 1
-      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      readinessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 60
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
      volumeMounts:
        - name: dynamo-pvc
          mountPoint: /data

--- a/components/backends/trtllm/deploy/disagg_planner.yaml
+++ b/components/backends/trtllm/deploy/disagg_planner.yaml
@@ -42,25 +42,6 @@ spec:
      envFromSecret: hf-token-secret
      componentType: planner
      replicas: 1
-      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      readinessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 60
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
      volumeMounts:
        - name: dynamo-pvc # Must be pre-created before deployment and SLA profiler must have been run
          mountPoint: /data
@@ -107,12 +88,6 @@ spec:
      extraPodSpec:
        terminationGracePeriodSeconds: 600
        mainContainer:
-          startupProbe:
-            httpGet:
-              path: /health
-              port: 9090
-            periodSeconds: 10
-            failureThreshold: 60
          image: my-registry/trtllm-runtime:my-tag
          workingDir: /workspace/components/backends/trtllm
          command:
@@ -142,12 +117,6 @@ spec:
      extraPodSpec:
        terminationGracePeriodSeconds: 600
        mainContainer:
-          startupProbe:
-            httpGet:
-              path: /health
-              port: 9090
-            periodSeconds: 10
-            failureThreshold: 60
          image: my-registry/trtllm-runtime:my-tag
          workingDir: /workspace/components/backends/trtllm
          command:

--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -22,28 +22,8 @@ spec:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
    Planner:
      dynamoNamespace: vllm-disagg-planner
-      envFromSecret: hf-token-secret
      componentType: planner
      replicas: 1
-      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      readinessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 60
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
      volumeMounts:
        - name: dynamo-pvc
          mountPoint: /data
@@ -65,18 +45,12 @@ spec:
      envFromSecret: hf-token-secret
      componentType: worker
      subComponentType: decode
-      replicas: 2
+      replicas: 1
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          startupProbe:
-            httpGet:
-              path: /health
-              port: 9090
-            periodSeconds: 10
-            failureThreshold: 60
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command:
@@ -91,18 +65,12 @@ spec:
      envFromSecret: hf-token-secret
      componentType: worker
      subComponentType: prefill
-      replicas: 2
+      replicas: 1
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
-          startupProbe:
-            httpGet:
-              path: /health
-              port: 9090
-            periodSeconds: 10
-            failureThreshold: 60
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/components/backends/vllm
          command:

--- a/components/src/dynamo/planner/__init__.py
+++ b/components/src/dynamo/planner/__init__.py
@@ -7,12 +7,10 @@ __all__ = [
    "VirtualConnector",
    "LoadPlannerDefaults",
    "SLAPlannerDefaults",
-    "ServiceConfig",
    "TargetReplica",
    "SubComponentType",
 ]
 # Import the classes
-from dynamo.planner.config import ServiceConfig
 from dynamo.planner.defaults import (
    LoadPlannerDefaults,
    SLAPlannerDefaults,

--- a/components/src/dynamo/planner/config.py
+++ b/components/src/dynamo/planner/config.py
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import json
-import logging
-import os
-logger = logging.getLogger(__name__)
-class ServiceConfig(dict):
-    """Configuration store that inherits from dict for simpler access patterns"""
-    _instance = None
-    COMMON_CONFIG_SERVICE = "Common"
-    COMMON_CONFIG_KEY = "common-configs"
-    @classmethod
-    def get_instance(cls):
-        if cls._instance is None:
-            cls._instance = cls._load_from_env()
-        return cls._instance
-    @classmethod
-    def _load_from_env(cls):
-        """Load config from environment variable"""
-        configs = {}
-        env_config = os.environ.get("DYNAMO_SERVICE_CONFIG")
-        if env_config:
-            try:
-                configs = json.loads(env_config)
-            except json.JSONDecodeError:
-                print("Failed to parse DYNAMO_SERVICE_CONFIG")
-        return cls(configs)  # Initialize dict subclass with configs
-    def require(self, service_name, key):
-        """Require a config value, raising error if not found"""
-        if service_name not in self or key not in self[service_name]:
-            raise ValueError(f"{service_name}.{key} must be specified in configuration")
-        return self[service_name][key]
-    @classmethod
-    def get_parsed_config(cls, service_name):
-        """Get parsed config for a service with common configs applied, returned as dict"""
-        instance = cls.get_instance()
-        if service_name not in instance:
-            return {}
-        # Get service config excluding ServiceArgs if it exists
-        service_config = instance[service_name].copy()
-        if "ServiceArgs" in service_config:
-            del service_config["ServiceArgs"]
-        # Apply common configs if they exist
-        if (common := instance.get(cls.COMMON_CONFIG_SERVICE)) is not None and (
-            common_config_keys := service_config.get(cls.COMMON_CONFIG_KEY)
-        ) is not None:
-            for key in common_config_keys:
-                if key in common and key not in service_config:
-                    service_config[key] = common[key]
-        # Remove the common-configs key itself from the final config
-        if cls.COMMON_CONFIG_KEY in service_config:
-            del service_config[cls.COMMON_CONFIG_KEY]
-        return service_config
-    def as_args(self, service_name, prefix=""):
-        """Extract configs as CLI args for a service, with optional prefix filtering.
-        Every component will additionally have the args in the `Common` configs
-        applied if it has subscribed to that config key, i.e. the given key is provided in
-        the component's `common-configs` setting, and that key has not been overriden by the
-        component's config.
-        """
-        if service_name not in self:
-            return []
-        args: list[str] = []
-        def add_to_args(args: list[str], key: str, value):
-            if prefix and not key.startswith(prefix):
-                return
-            if key.endswith(self.COMMON_CONFIG_KEY):
-                return
-            # Strip prefix if needed
-            arg_key = key[len(prefix) :] if prefix and key.startswith(prefix) else key
-            # vLLM arguments that default to True and need explicit false handling
-            # Based on https://github.com/vllm-project/vllm/blob/v0.9.1/vllm/config.py
-            vllm_true_defaults = {
-                "enable_prefix_caching": "no-enable-prefix-caching",
-                "use_tqdm_on_load": "no-use-tqdm-on-load",
-                "multi_step_stream_outputs": "no-multi-step-stream-outputs",
-            }
-            # Normalize key for comparison (replace hyphens with underscores)
-            normalized_key = arg_key.replace("-", "_")
-            # Convert to CLI format
-            if isinstance(value, bool):
-                if value:
-                    # Always output true values as flags
-                    args.append(f"--{arg_key}")
-                else:
-                    # For false values, check if this is a vLLM arg that defaults to True
-                    if normalized_key in vllm_true_defaults:
-                        # Use negative flag if available
-                        args.append(f"--{vllm_true_defaults[normalized_key]}")
-                    # For other false values, omit entirely (standard action="store_true" behavior)
-            elif isinstance(value, dict):
-                args.extend([f"--{arg_key}", json.dumps(value)])
-            else:
-                args.extend([f"--{arg_key}", str(value)])
-        # Get service config excluding ServiceArgs if it exists
-        # We never want args to be generated from the ServiceArgs
-        service_config = self[service_name].copy()
-        if "ServiceArgs" in service_config:
-            del service_config["ServiceArgs"]
-        if (common := self.get(self.COMMON_CONFIG_SERVICE)) is not None and (
-            common_config_keys := service_config.get(self.COMMON_CONFIG_KEY)
-        ) is not None:
-            for key in common_config_keys:
-                if key in common and key not in service_config:
-                    add_to_args(args, key, common[key])
-        for key, value in service_config.items():
-            add_to_args(args, key, value)
-        logger.info(f"Running {service_name} with {args=}")
-        return args
--- a/components/src/dynamo/planner/defaults.py
+++ b/components/src/dynamo/planner/defaults.py
@@ -15,6 +15,7 @@
 import logging
 import os
+import shlex
 from enum import Enum
 from typing import Optional
@@ -143,6 +144,21 @@ class SubComponentType(str, Enum):
    DECODE = "decode"
+def break_arguments(args: list[str] | None) -> list[str]:
+    ans: list[str] = []
+    if args is None:
+        return ans
+    if isinstance(args, str):
+        # Use shlex.split to properly handle quoted arguments and JSON values
+        ans = shlex.split(args)
+    else:
+        for arg in args:
+            if arg is not None:
+                # Use shlex.split to properly handle quoted arguments
+                ans.extend(shlex.split(arg))
+    return ans
 class Service(BaseModel):
    name: str
    service: dict
@@ -150,6 +166,24 @@ class Service(BaseModel):
    def number_replicas(self) -> int:
        return self.service.get("replicas", 0)
+    def get_model_name(self) -> Optional[str]:
+        args = (
+            self.service.get("extraPodSpec", {})
+            .get("mainContainer", {})
+            .get("args", [])
+        )
+        args = break_arguments(args)
+        if (
+            "--served-model-name" in args
+            and len(args) > args.index("--served-model-name") + 1
+        ):
+            return args[args.index("--served-model-name") + 1]
+        if "--model" in args and len(args) > args.index("--model") + 1:
+            return args[args.index("--model") + 1]
+        return None
 # TODO: still supporting framework component names for backwards compatibility
 # Should be deprecated in favor of service subComponentType

--- a/components/src/dynamo/planner/kubernetes_connector.py
+++ b/components/src/dynamo/planner/kubernetes_connector.py
@@ -15,7 +15,6 @@
 import logging
 import os
-import shlex
 from typing import Optional
 from pydantic import BaseModel
@@ -40,47 +39,6 @@ configure_dynamo_logging()
 logger = logging.getLogger(__name__)
-class Service(BaseModel):
-    name: str
-    service: dict
-    def number_replicas(self) -> int:
-        return self.service.get("replicas", 0)
-    def get_model_name(self) -> Optional[str]:
-        args = (
-            self.service.get("extraPodSpec", {})
-            .get("mainContainer", {})
-            .get("args", [])
-        )
-        args = break_arguments(args)
-        if (
-            "--served-model-name" in args
-            and len(args) > args.index("--served-model-name") + 1
-        ):
-            return args[args.index("--served-model-name") + 1]
-        if "--model" in args and len(args) > args.index("--model") + 1:
-            return args[args.index("--model") + 1]
-        return None
-def break_arguments(args: list[str] | None) -> list[str]:
-    ans: list[str] = []
-    if args is None:
-        return ans
-    if isinstance(args, str):
-        # Use shlex.split to properly handle quoted arguments and JSON values
-        ans = shlex.split(args)
-    else:
-        for arg in args:
-            if arg is not None:
-                # Use shlex.split to properly handle quoted arguments
-                ans.extend(shlex.split(arg))
-    return ans
 class TargetReplica(BaseModel):
    sub_component_type: SubComponentType
    component_name: Optional[str] = None
@@ -205,11 +163,11 @@ class KubernetesConnector(PlannerConnector):
            # TODO: benchmarks/profiler/utils/config.py already contains DGD config parsing
            # and model name logic, should consolidate
-            prefill_service = self.get_service_from_sub_component_type_or_name(
+            prefill_service = get_service_from_sub_component_type_or_name(
                deployment,
                SubComponentType.PREFILL,
            )
-            decode_service = self.get_service_from_sub_component_type_or_name(
+            decode_service = get_service_from_sub_component_type_or_name(
                deployment,
                SubComponentType.DECODE,
            )

--- a/components/src/dynamo/planner/utils/prometheus.py
+++ b/components/src/dynamo/planner/utils/prometheus.py
@@ -67,6 +67,9 @@ class PrometheusAPIClient:
            result = self.prom.custom_query(query=query)
            if not result:
                 # No data available yet (no requests made) - log a warning and return 0
+                logger.warning(
+                    f"No prometheus metric data available for {full_metric_name}, use 0 instead"
+                )
                return 0
            metrics_containers = parse_frontend_metric_containers(result)
@@ -79,6 +82,9 @@ class PrometheusAPIClient:
                    values.append(container.value[1])
            if not values:
+                logger.warning(
+                    f"No prometheus metric data available for {full_metric_name} with model {model_name} and dynamo namespace {self.dynamo_namespace}, use 0 instead"
+                )
                return 0
            return sum(values) / len(values)

--- a/deploy/cloud/operator/internal/dynamo/graph_test.go
+++ b/deploy/cloud/operator/internal/dynamo/graph_test.go
@@ -1202,7 +1202,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 									},
 									VolumeMounts: []v1alpha1.VolumeMount{
 										{
-											Name:       "planner-pvc",
+											Name:       "dynamo-pvc",
 											MountPoint: "/planner",
 										},
 									},
@@ -1426,10 +1426,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 									PodSpec: corev1.PodSpec{
 										Volumes: []corev1.Volume{
 											{
-												Name: "planner-pvc",
+												Name: "dynamo-pvc",
 												VolumeSource: corev1.VolumeSource{
 													PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
-														ClaimName: "planner-pvc",
+														ClaimName: "dynamo-pvc",
 													},
 												},
 											},
@@ -1540,7 +1540,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 												},
 												VolumeMounts: []corev1.VolumeMount{
 													{
-														Name:      "planner-pvc",
+														Name:      "dynamo-pvc",
 														MountPath: "/planner",
 													},
 													{
@@ -1725,7 +1725,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 									},
 									VolumeMounts: []v1alpha1.VolumeMount{
 										{
-											Name:       "planner-pvc",
+											Name:       "dynamo-pvc",
 											MountPoint: "/planner",
 										},
 									},
@@ -2219,10 +2219,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 										RestartPolicy:                 corev1.RestartPolicyAlways,
 										Volumes: []corev1.Volume{
 											{
-												Name: "planner-pvc",
+												Name: "dynamo-pvc",
 												VolumeSource: corev1.VolumeSource{
 													PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
-														ClaimName: "planner-pvc",
+														ClaimName: "dynamo-pvc",
 													},
 												},
 											},
@@ -2321,7 +2321,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 												},
 												VolumeMounts: []corev1.VolumeMount{
 													{
-														Name:      "planner-pvc",
+														Name:      "dynamo-pvc",
 														MountPath: "/planner",
 													},
 													{
@@ -2529,7 +2529,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 									},
 									VolumeMounts: []v1alpha1.VolumeMount{
 										{
-											Name:       "planner-pvc",
+											Name:       "dynamo-pvc",
 											MountPoint: "/planner",
 										},
 									},
@@ -3008,10 +3008,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 										ServiceAccountName:            commonconsts.PlannerServiceAccountName,
 										Volumes: []corev1.Volume{
 											{
-												Name: "planner-pvc",
+												Name: "dynamo-pvc",
 												VolumeSource: corev1.VolumeSource{
 													PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
-														ClaimName: "planner-pvc",
+														ClaimName: "dynamo-pvc",
 													},
 												},
 											},
@@ -3118,7 +3118,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 												},
 												VolumeMounts: []corev1.VolumeMount{
 													{
-														Name:      "planner-pvc",
+														Name:      "dynamo-pvc",
 														MountPath: "/planner",
 													},
 													{

--- a/deploy/utils/manifests/pvc.yaml
+++ b/deploy/utils/manifests/pvc.yaml
@@ -7,7 +7,7 @@ metadata:
  namespace: ${NAMESPACE}
 spec:
  accessModes:
-    - ReadWriteOnce
+    - ReadWriteMany
  resources:
    requests:
      storage: 50Gi
--- a/docs/benchmarks/pre_deployment_profiling.md
+++ b/docs/benchmarks/pre_deployment_profiling.md
@@ -167,6 +167,9 @@ envsubst < benchmarks/profiler/deploy/profile_sla_job.yaml | kubectl apply -f -
 # for MoE models
 envsubst < benchmarks/profiler/deploy/profile_sla_moe_job.yaml | kubectl apply -f -
+# using aiconfigurator instead of real sweeping (see below for more details)
+envsubst < benchmarks/profiler/deploy/profile_sla_aic_job.yaml | kubectl apply -f -
 ```
 **Step 5: Wait for profiling to complete**

--- a/docs/kubernetes/sla_planner_deployment.md
+++ b/docs/kubernetes/sla_planner_deployment.md
@@ -9,7 +9,7 @@ Quick deployment guide for the disaggregated planner with automatic scaling.
 **Components:**
 - **Frontend**: Serves requests and exposes `/metrics`
- **Prometheus**: Scrapes frontend metrics every adjustment interval
+- **Prometheus**: Scrapes frontend metrics every 5s (by default, can be updated in the podmonitor manifest)
 - **Planner**: Queries Prometheus and adjusts worker scaling every adjustment interval
 - **Workers**: prefill and backend workers handle inference
@@ -19,7 +19,7 @@ The adjustment interval can be defined in the planner manifest as an argument. T
 flowchart LR
  Frontend --"/metrics"--> Prometheus
  Planner --"query API"--> Prometheus
-  Planner --"scaling decisions"--> Workers["prefill<br/>backend"]
+  Planner --"scaling decisions"--> Workers
  Frontend -.->|"requests"| Workers
 ```
@@ -27,7 +27,7 @@ flowchart LR
 - Kubernetes cluster with GPU nodes
 - [Pre-Deployment Profiling](/docs/benchmarks/pre_deployment_profiling.md) completed and its results saved to `dynamo-pvc` PVC.
 - Prefill and decode workers use the best parallelization mapping suggested by the pre-deployment profiling script.
- [kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running.
+- [kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running. By default, the Prometheus server is expected to be deployed in the `monitoring` namespace. If it is deployed to a different namespace, set `dynamo-operator.dynamo.metrics.prometheusEndpoint="http://prometheus-kube-prometheus-prometheus.<namespace>.svc.cluster.local:9090"`.
 > [!NOTE]
 > **Important**: The profiling that occurs before Planner deployment requires additional Kubernetes manifests (ServiceAccount, Role, RoleBinding, PVC) that are not included in standard Dynamo deployments. Apply these manifests in the same namespace as `$NAMESPACE`. For a complete setup, start with the [Quick Start guide](/deploy/utils/README.md#quick-start), which provides a fully encapsulated deployment including all required manifests.
@@ -42,8 +42,10 @@ We use vllm as the backend engine in this guide. SLA planner also supports SGLan
 ```bash
 # Apply the disaggregated planner deployment
 kubectl apply -f components/backends/vllm/deploy/disagg_planner.yaml -n $NAMESPACE # for vllm
-# kubectl apply -f components/backends/sglang/deploy/disagg_planner.yaml -n $NAMESPACE # for sglang
-# kubectl apply -f components/backends/trtllm/deploy/disagg_planner.yaml -n $NAMESPACE # for trtllm
+kubectl apply -f components/backends/sglang/deploy/disagg_planner.yaml -n $NAMESPACE # for sglang
+kubectl apply -f components/backends/trtllm/deploy/disagg_planner.yaml -n $NAMESPACE # for trtllm
 # Check deployment status
 kubectl get pods -n $NAMESPACE
@@ -60,13 +62,11 @@ vllm-disagg-planner-prefill-*             1/1 Running
 ## 2. Test the System
-**Important:** Streaming requests (`"stream": true`) are required for the planner to collect latency metrics and make scaling decisions. Non-streaming requests will produce successful inference outputs but won't provide the necessary telemetry for automatic scaling.
 ```bash
 # Port forward to frontend
 kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-frontend 8000:8000
-# Send a streaming request (required for full metrics)
+# Send a request
 curl -N http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
@@ -98,8 +98,8 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
 ### Metrics Requirements
 - **Basic metrics** (request count): Available with any request type
- **Latency metrics** (TTFT/ITL): Only available with `"stream": true` requests
+- **Latency metrics** (TTFT/ITL): Available for both streaming and non-streaming requests
- **Scaling decisions**: Require sufficient request volume and streaming requests
+- **Scaling decisions**: Require sufficient request volume
 ## 4. Troubleshooting