chore: bug fixes in pre-deployment sweeping and vllm_v1 planner; expose...

chore: bug fixes in pre-deployment sweeping and vllm_v1 planner; expose num_d/p to k8s metrics (#2454)

chore: bug fixes in pre-deployment sweeping and vllm_v1 planner; expose...
chore: bug fixes in pre-deployment sweeping and vllm_v1 planner; expose num_d/p to k8s metrics (#2454)
922850ae · Hongkuan Zhou · GitHub · a7184bec · 922850ae · 922850ae
Unverified Commit 922850ae authored Aug 15, 2025 by Hongkuan Zhou Committed by GitHub Aug 15, 2025
12 changed files
--- a/benchmarks/profiler/inject_disagg_config.py
+++ b/benchmarks/profiler/inject_disagg_config.py
@@ -162,7 +162,7 @@ def main():

    print("\n✅ Disagg config injection completed!")
    print(f"📁 Config available at: {args.target_path}")
-    print(f"🔧 Set DGD_CONFIG_FILE={args.target_path} in your profiler job")
+    print(f"🔧 Set DGD_CONFIG_FILE=/workspace{args.target_path} in your profiler job")


 if __name__ == "__main__":

--- a/benchmarks/profiler/profile_endpoint.py
+++ b/benchmarks/profiler/profile_endpoint.py
@@ -5,10 +5,9 @@ import argparse
 import logging
 import os

+from utils.profile_decode import profile_decode
 from utils.profile_prefill import profile_prefill

-from benchmarks.profiler.utils.profile_decode import profile_decode
-
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 console_handler = logging.StreamHandler()

--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -35,10 +35,9 @@ from utils.profile_cache import (
    load_existing_decode_results,
    load_existing_prefill_results,
 )
+from utils.profile_decode import profile_decode
 from utils.profile_prefill import profile_prefill

-from benchmarks.profiler.utils.profile_decode import profile_decode
-
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 console_handler = logging.StreamHandler()

--- a/benchmarks/profiler/utils/kubernetes.py
+++ b/benchmarks/profiler/utils/kubernetes.py
@@ -78,7 +78,7 @@ def deploy_access_pod(namespace: str) -> str:
    print(f"Deploying access pod '{pod_name}' in namespace '{namespace}'...")

    # Get the directory where this script is located
-    script_dir = Path(__file__).parent
+    script_dir = Path(__file__).parent.parent
    pod_yaml_path = script_dir / "deploy" / "pvc-access-pod.yaml"

    if not pod_yaml_path.exists():

--- a/components/backends/sglang/deploy/disagg_planner.yaml
+++ b/components/backends/sglang/deploy/disagg_planner.yaml
@@ -6,7 +6,7 @@ kind: DynamoGraphDeployment
 metadata:
  name: sglang-disagg-planner
  annotations:
-    nvidia.com/enable-grove: "false"
+    nvidia.com/enable-grove: "false" # temporarily disable grove because current k8s connector does not work with grove
 spec:
  envs:
    - name: DYNAMO_SERVICE_CONFIG

--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -5,6 +5,8 @@ apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
  name: vllm-disagg-planner
+  annotations:
+    nvidia.com/enable-grove: "false" # temporarily disable grove because current k8s connector does not work with grove
 spec:
  envs:
    - name: DYNAMO_SERVICE_CONFIG
@@ -27,7 +29,7 @@ spec:
          memory: "10Gi"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
@@ -71,8 +73,11 @@ spec:
        mountPoint: /workspace/profiling_results
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/planner/src/dynamo/planner
+          ports:
+            - name: metrics
+              containerPort: 9085
          command:
            - /bin/sh
            - -c
@@ -83,6 +88,7 @@ spec:
              --backend=vllm
              --adjustment-interval=60
              --profile-results-dir=/workspace/profiling_results
+              --prometheus-port=9085
    Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
      dynamoNamespace: vllm-disagg-planner
      componentType: frontend
@@ -118,7 +124,7 @@ spec:
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
@@ -147,7 +153,7 @@ spec:
              port: 9090
            periodSeconds: 10
            failureThreshold: 60
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh
@@ -176,7 +182,7 @@ spec:
              port: 9090
            periodSeconds: 10
            failureThreshold: 60
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
          workingDir: /workspace/components/backends/vllm
          command:
            - /bin/sh

--- a/components/planner/src/dynamo/planner/defaults.py
+++ b/components/planner/src/dynamo/planner/defaults.py
@@ -23,6 +23,14 @@ configure_dynamo_logging()
 logger = logging.getLogger(__name__)


+def _get_prometheus_port_from_env():
+    """
+    Get the prometheus port from the PLANNER_PROMETHEUS_PORT environment variable.
+
+    Returns:
+        int: the configured port, or 0 when the variable is unset or empty.
+        0 means not reporting metrics using prometheus.
+
+    Raises:
+        ValueError: if the variable is set to a value that is not an integer.
+    """
+    # os.environ values are always strings; normalise to int so the result
+    # has the same type whether or not the variable is set and compares
+    # correctly against the integer 0 "disabled" sentinel.
+    raw_port = os.environ.get("PLANNER_PROMETHEUS_PORT", "").strip()
+    if not raw_port:
+        return 0
+    try:
+        return int(raw_port)
+    except ValueError as err:
+        raise ValueError(
+            f"PLANNER_PROMETHEUS_PORT must be an integer, got {raw_port!r}"
+        ) from err
+
+
 # Source of truth for planner defaults
 class BasePlannerDefaults:
    namespace = "dynamo"
@@ -35,6 +43,7 @@ class BasePlannerDefaults:
    min_endpoint = 1  # applies to both decode and prefill
    decode_engine_num_gpu = 1
    prefill_engine_num_gpu = 1
+    prometheus_port = _get_prometheus_port_from_env()


 class LoadPlannerDefaults(BasePlannerDefaults):

--- a/components/planner/src/dynamo/planner/planner_sla.py
+++ b/components/planner/src/dynamo/planner/planner_sla.py
@@ -135,6 +135,12 @@ if __name__ == "__main__":
        default=SLAPlannerDefaults.load_prediction_window_size,
        help="Load prediction window size",
    )
+    parser.add_argument(
+        "--prometheus-port",
+        type=int,
+        default=SLAPlannerDefaults.prometheus_port,
+        help="Prometheus port",
+    )

    args = parser.parse_args()
    asyncio.run(init_planner(args))
--- a/components/planner/src/dynamo/planner/utils/planner_core.py
+++ b/components/planner/src/dynamo/planner/utils/planner_core.py
@@ -9,6 +9,8 @@ import time
 from dataclasses import dataclass
 from typing import Optional

+from prometheus_client import Gauge, start_http_server
+
 from dynamo.planner import KubernetesConnector, __version__
 from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SLAPlannerDefaults
 from dynamo.planner.utils.load_predictor import LOAD_PREDICTORS
@@ -89,6 +91,23 @@ class Planner:
        self.p_correction_factor = 1.0
        self.d_correction_factor = 1.0

+        self.prometheus_port = args.prometheus_port
+
+        # Initialize Prometheus metrics
+        # TODO: use proper naming
+        self.num_p_workers_gauge = Gauge("num_p_workers", "Number of prefill workers")
+        self.num_d_workers_gauge = Gauge("num_d_workers", "Number of decode workers")
+
+        # Start Prometheus HTTP server if port is specified
+        if self.prometheus_port != 0:
+            try:
+                start_http_server(self.prometheus_port)
+                logger.info(
+                    f"Started Prometheus metrics server on port {self.prometheus_port}"
+                )
+            except Exception as e:
+                logger.error(f"Failed to start Prometheus metrics server: {e}")
+
    async def get_workers_info(self):
        try:
            if self.prefill_client is None:
@@ -137,7 +156,17 @@ class Planner:
            raise RuntimeError(f"Failed to get decode worker endpoints: {e}")
        return p_endpoints, d_endpoints

-    def observe_metrics(self):
+    async def observe_metrics(self):
+        self.p_endpoints, self.d_endpoints = await self.get_workers_info()
+        logger.debug(
+            f"Number of prefill workers: {len(self.p_endpoints)}, number of decode workers: {len(self.d_endpoints)}"
+        )
+
+        # Update Prometheus metrics if server is running
+        if self.prometheus_port != 0:
+            self.num_p_workers_gauge.set(len(self.p_endpoints))
+            self.num_d_workers_gauge.set(len(self.d_endpoints))
+
        self.last_metrics.ttft = self.prometheus_api_client.get_avg_time_to_first_token(
            f"{self.args.adjustment_interval}s"
        )
@@ -319,7 +348,7 @@ class Planner:
            ):
                self.last_adjustment_time = time.time()
                logger.info("New adjustment interval started!")
-                self.observe_metrics()
+                await self.observe_metrics()
                await self.make_adjustments()

            # sleep for a while to avoid busy-waiting but not too long to miss the next adjustment
@@ -434,5 +463,11 @@ if __name__ == "__main__":
        default=SLAPlannerDefaults.load_prediction_window_size,
        help="Window size for load prediction",
    )
+    parser.add_argument(
+        "--prometheus-port",
+        type=int,
+        default=SLAPlannerDefaults.prometheus_port,
+        help="Prometheus port for metrics server (0 to disable)",
+    )
    args = parser.parse_args()
    asyncio.run(dynamo_worker()(start_sla_planner)(args))
--- a/deploy/metrics/k8s/planner-podmonitor.yaml
+++ b/deploy/metrics/k8s/planner-podmonitor.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: dynamo-planner-metrics
+  namespace: $NAMESPACE
+spec:
+  selector:
+    matchLabels:
+      nvidia.com/metrics-enabled: "true"
+      nvidia.com/dynamo-component-type: "planner"
+  podMetricsEndpoints:
+    - port: metrics
+      path: /metrics
+      interval: 2s
+  namespaceSelector:
+    matchNames:
+      - $NAMESPACE
\ No newline at end of file
--- a/docs/architecture/pre_deployment_profiling.md
+++ b/docs/architecture/pre_deployment_profiling.md
@@ -113,7 +113,7 @@ Use the default pre-built image and inject custom configurations via PVC:

 3. **Set the config path for the profiling job:**
   ```bash
-   export DGD_CONFIG_FILE=/profiling_results/disagg.yaml # or your custom path
+   export DGD_CONFIG_FILE=/workspace/profiling_results/disagg.yaml # or your custom path
   ```

 This approach allows you to:

--- a/docs/guides/deploy/k8s_metrics.md
+++ b/docs/guides/deploy/k8s_metrics.md
@@ -93,7 +93,7 @@ apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: dynamo-frontend-metrics
-  namespace: dynamo
+  namespace: $NAMESPACE
 spec:
  selector:
    matchLabels:
@@ -105,7 +105,7 @@ spec:
      interval: 2s
  namespaceSelector:
    matchNames:
-      - dynamo
+      - $NAMESPACE
 ```

 Then, create the worker PodMonitor:
@@ -115,7 +115,7 @@ apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
  name: dynamo-worker-metrics
-  namespace: dynamo
+  namespace: $NAMESPACE
 spec:
  selector:
    matchLabels:
@@ -127,7 +127,28 @@ spec:
      interval: 2s
  namespaceSelector:
    matchNames:
-      - dynamo
+      - $NAMESPACE
+```
+
+If you are using the planner, you can also create a PodMonitor for it:
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: dynamo-planner-metrics
+  namespace: $NAMESPACE
+spec:
+  selector:
+    matchLabels:
+      nvidia.com/metrics-enabled: "true"
+      nvidia.com/dynamo-component-type: "planner"
+  podMetricsEndpoints:
+    - port: metrics
+      path: /metrics
+      interval: 2s
+  namespaceSelector:
+    matchNames:
+      - $NAMESPACE
 ```

 Apply the PodMonitors:
@@ -136,6 +157,7 @@ pushd deploy/metrics/k8s
 # envsubst replaces ${NAMESPACE} with the actual namespace value
 envsubst < frontend-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
 envsubst < worker-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
+envsubst < planner-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
 popd
 ```

@@ -146,7 +168,7 @@ This will cause Prometheus to be re-configured to scrape metrics from the pods o
 Apply the Dynamo dashboard configuration to populate Grafana with the Dynamo dashboard:
 ```bash
 pushd deploy/metrics/k8s
-kubectl apply -n monitoring -f resources/grafana-dynamo-dashboard-configmap.yaml
+kubectl apply -n monitoring -f grafana-dynamo-dashboard-configmap.yaml
 popd
 ```

@@ -162,7 +184,7 @@ The dashboard is embedded in the ConfigMap. Since it is labeled with `grafana_da

 ### In Prometheus
 ```bash
-kubectl port-forward svc/prometheus-operated 9090:9090
+kubectl port-forward svc/prometheus-operated 9090:9090 -n monitoring
 ```

 Visit http://localhost:9090 and try these example queries:
@@ -173,7 +195,7 @@ Visit http://localhost:9090 and try these example queries:

 ### In Grafana
 ```bash
-kubectl port-forward svc/grafana 3000:80
+kubectl port-forward svc/grafana 3000:80 -n monitoring
 ```

 Visit http://localhost:3000 and find the Dynamo dashboard under General.