fix: bug fixes for planner tests (#3821)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

fix: bug fixes for planner tests (#3821)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
7b2f95e4 · hhzhang16 · GitHub · 21697e1c · 7b2f95e4 · 7b2f95e4
Unverified Commit 7b2f95e4 authored Oct 22, 2025 by hhzhang16 Committed by GitHub Oct 22, 2025
6 changed files
--- a/components/src/dynamo/planner/utils/prometheus.py
+++ b/components/src/dynamo/planner/utils/prometheus.py
@@ -63,6 +63,11 @@ class PrometheusAPIClient:
            Average metric value or 0 if no data/error
        """
        try:
+            # Prepend the frontend metric prefix if not already present
+            if not full_metric_name.startswith(prometheus_names.name_prefix.FRONTEND):
+                full_metric_name = (
+                    f"{prometheus_names.name_prefix.FRONTEND}_{full_metric_name}"
+                )
            query = f"increase({full_metric_name}_sum[{interval}])/increase({full_metric_name}_count[{interval}])"
            result = self.prom.custom_query(query=query)
            if not result:
@@ -75,8 +80,10 @@ class PrometheusAPIClient:

            values = []
            for container in metrics_containers:
+                # Frontend lowercases model names for Prometheus labels so we need to do case-insensitive comparison
                if (
-                    container.metric.model == model_name
+                    container.metric.model
+                    and container.metric.model.lower() == model_name.lower()
                    and container.metric.dynamo_namespace == self.dynamo_namespace
                ):
                    values.append(container.value[1])
@@ -120,14 +127,23 @@ class PrometheusAPIClient:
        # This function follows a different query pattern than the other metrics
        try:
            requests_total_metric = prometheus_names.frontend_service.REQUESTS_TOTAL
+            # Prepend the frontend metric prefix if not already present
+            if not requests_total_metric.startswith(
+                prometheus_names.name_prefix.FRONTEND
+            ):
+                requests_total_metric = (
+                    f"{prometheus_names.name_prefix.FRONTEND}_{requests_total_metric}"
+                )
            raw_res = self.prom.custom_query(
                query=f"increase({requests_total_metric}[{interval}])"
            )
            metrics_containers = parse_frontend_metric_containers(raw_res)
            total_count = 0.0
            for container in metrics_containers:
+                # Frontend lowercases model names for Prometheus labels so we need to do case-insensitive comparison
                if (
-                    container.metric.model == model_name
+                    container.metric.model
+                    and container.metric.model.lower() == model_name.lower()
                    and container.metric.dynamo_namespace == self.dynamo_namespace
                ):
                    total_count += container.value[1]

--- a/docs/planner/sla_planner_quickstart.md
+++ b/docs/planner/sla_planner_quickstart.md
@@ -38,7 +38,7 @@ flowchart TD

 Before deploying the SLA planner, ensure:
 - **Dynamo platform installed** (see [Installation Guide](/docs/kubernetes/installation_guide.md))
- **[kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running.** By default, the prometheus server is not deployed in the `monitoring` namespace. If it is deployed to a different namespace, set `dynamo-operator.dynamo.metrics.prometheusEndpoint="http://prometheus-kube-prometheus-prometheus.<namespace>.svc.cluster.local:9090"`.
+- **[kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running.** By default, the Prometheus server is deployed in the `monitoring` namespace. If it is deployed to a different namespace, set `dynamo-operator.dynamo.metrics.prometheusEndpoint="http://prometheus-kube-prometheus-prometheus.<namespace>.svc.cluster.local:9090"`.
 - **Benchmarking resources setup** (see [Kubernetes utilities for Dynamo Benchmarking and Profiling](../../deploy/utils/README.md)) The script will create a `dynamo-pvc` with `ReadWriteMany` access, if your cluster's default storageClassName does not allow `ReadWriteMany`, you need to specify a different storageClassName in `deploy/utils/manifests/pvc.yaml` which does support `ReadWriteMany`.



--- a/tests/planner/README.md
+++ b/tests/planner/README.md
@@ -160,20 +160,64 @@ PYTHONPATH=../../components/src python -m pytest test_replica_calculation.py -v
 **Note**: The unit tests automatically mock external dependencies (prometheus_client, runtime modules) to ensure they can run in isolation without requiring the full Dynamo environment.

 #### Run Full End-to-End Test
-Test complete scaling behavior including Kubernetes deployment and load generation:
+
+Test complete scaling behavior including Kubernetes deployment and load generation.
+
+**Prerequisites:**
+
+- **[kube-prometheus-stack](../../docs/kubernetes/metrics.md) installed and running.** The SLA planner requires Prometheus to observe metrics and make scaling decisions.
+- Ensure the Dynamo operator was installed with the Prometheus endpoint configured (see [SLA Planner Quickstart Guide](../../docs/planner/sla_planner_quickstart.md#prerequisites) for details).
+
+**Prepare the test deployment manifest:**
+
+The test uses a copy of `components/backends/vllm/deploy/disagg_planner.yaml`, modified with test-specific planner arguments:
+
+1. Copy the base deployment:

 ```bash
-./scaling/run_scaling_test.sh
+cp components/backends/vllm/deploy/disagg_planner.yaml tests/planner/scaling/disagg_planner.yaml
 ```

-With custom namespace:
+2. Edit `tests/planner/scaling/disagg_planner.yaml`. Ensure all services use the correct image. Modify the Planner service args:
+
+```yaml
+spec:
+  services:
+    Planner:
+      extraPodSpec:
+        mainContainer:
+          args:
+            - --environment=kubernetes
+            - --backend=vllm
+            - --adjustment-interval=60
+            - --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
+            - --ttft=100
+            - --itl=10
+            - --load-predictor=constant
+            - --no-correction
+```
+
+3. Update the model in VllmPrefillWorker and VllmDecodeWorker services:
+
+```yaml
+args:
+  - -m
+  - dynamo.vllm
+  - --model
+  - nvidia/Llama-3.1-8B-Instruct-FP8
+  - --migration-limit=3
+  - --max-model-len=8192
+```
+
+**Run the test:**
+
 ```bash
 ./scaling/run_scaling_test.sh --namespace <namespace>
 ```

 To save results to `tests/planner/e2e_scaling_results` instead of `/tmp`:
 ```bash
-./scaling/run_scaling_test.sh --save-results
+./scaling/run_scaling_test.sh --namespace <namespace> --save-results
 ```

 **E2E Test Deployment Management:**

--- a/tests/planner/scaling/disagg_planner.yaml
+++ b/tests/planner/scaling/disagg_planner.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
-  name: vllm-disagg-planner
-spec:
-  envs:
-    - name: DYNAMO_SERVICE_CONFIG
-      value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
-    - name: DYN_NAMESPACE
-      value: "vllm-disagg-planner"
-  services:
-    Frontend:
-      dynamoNamespace: vllm-disagg-planner
-      componentType: frontend
-      replicas: 1
-      extraPodSpec:
-        mainContainer:
-          image: my-registry/vllm-runtime:my-tag
-    Planner:
-      dynamoNamespace: vllm-disagg-planner
-      envFromSecret: hf-token-secret
-      componentType: planner
-      replicas: 1
-      livenessProbe:
-        httpGet:
-          path: /metrics
-          port: 9085
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      readinessProbe:
-        httpGet:
-          path: /metrics
-          port: 9085
-        initialDelaySeconds: 60
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      extraPodSpec:
-        mainContainer:
-          image: my-registry/vllm-runtime:my-tag
-          workingDir: /workspace/components/src/dynamo/planner
-          ports:
-            - name: metrics
-              containerPort: 9085
-          command:
-            - /bin/sh
-            - -c
-          args:
-            - >-
-              python3 -m planner_sla
-              --environment=kubernetes
-              --backend=vllm
-              --adjustment-interval=60
-              --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
-              --prometheus-port=9085
-              --ttft=100
-              --itl=10
-              --load-predictor=constant
-              --no-correction
-    VllmDecodeWorker:
-      dynamoNamespace: vllm-disagg-planner
-      envFromSecret: hf-token-secret
-      componentType: worker
-      subComponentType: decode
-      replicas: 1
-      resources:
-        limits:
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          startupProbe:
-            httpGet:
-              path: /health
-              port: 9090
-            periodSeconds: 30
-            failureThreshold: 60
-          image: my-registry/vllm-runtime:my-tag
-          workingDir: /workspace/components/backends/vllm
-          command:
-            - /bin/sh
-            - -c
-          args:
-            - "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --migration-limit=3 --max-model-len=8192"
-    VllmPrefillWorker:
-      dynamoNamespace: vllm-disagg-planner
-      envFromSecret: hf-token-secret
-      componentType: worker
-      subComponentType: prefill
-      replicas: 1
-      resources:
-        limits:
-          gpu: "1"
-      extraPodSpec:
-        mainContainer:
-          startupProbe:
-            httpGet:
-              path: /health
-              port: 9090
-            periodSeconds: 30
-            failureThreshold: 60
-          image: my-registry/vllm-runtime:my-tag
-          workingDir: /workspace/components/backends/vllm
-          command:
-            - /bin/sh
-            - -c
-          args:
-            - python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker --migration-limit=3 --max-model-len=8192
--- a/tests/planner/test_scaling_e2e.py
+++ b/tests/planner/test_scaling_e2e.py
@@ -97,13 +97,15 @@ class KubernetesMonitor:
            for pod in data.get("items", []):
                pod_phase = pod.get("status", {}).get("phase", "")
                pod_labels = pod.get("metadata", {}).get("labels", {})
-                component = pod_labels.get("nvidia.com/dynamo-component", "")
+                sub_component = pod_labels.get(
+                    "nvidia.com/dynamo-sub-component-type", ""
+                )

                # Only count Running pods
                if pod_phase == "Running":
-                    if component == "VllmPrefillWorker":
+                    if sub_component == "prefill":
                        prefill_pods += 1
-                    elif component == "VllmDecodeWorker":
+                    elif sub_component == "decode":
                        decode_pods += 1
                    else:
                        continue

--- a/tests/planner/utils/load_generator.py
+++ b/tests/planner/utils/load_generator.py
@@ -208,11 +208,16 @@ class LoadGenerator:
            logger.info(f"Parsing results from: {results_file}")

            with open(results_file, "r") as f:
-                metrics = json.load(f)
+                data = json.load(f)
+
+            # aiperf now wraps metrics under "records" key
+            metrics = data.get("records", data)

            results = {
                "throughput": metrics.get("output_token_throughput", {}).get("avg", 0),
-                "ttft_mean": metrics.get("time_to_first_token", {}).get("avg", 0),
+                "ttft_mean": metrics.get("ttft", {}).get(
+                    "avg", 0
+                ),  # Changed from "time_to_first_token"
                "itl_mean": metrics.get("inter_token_latency", {}).get("avg", 0),
                "end_to_end_latency_mean": metrics.get("request_latency", {}).get(
                    "avg", 0