Unverified Commit 922850ae authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

chore: bug fixes in pre-deployment sweeping and vllm_v1 planner; expose...

chore: bug fixes in pre-deployment sweeping and vllm_v1 planner; expose num_d/p to k8s metrics (#2454)
parent a7184bec
......@@ -162,7 +162,7 @@ def main():
print("\n✅ Disagg config injection completed!")
print(f"📁 Config available at: {args.target_path}")
print(f"🔧 Set DGD_CONFIG_FILE={args.target_path} in your profiler job")
print(f"🔧 Set DGD_CONFIG_FILE=/workspace{args.target_path} in your profiler job")
if __name__ == "__main__":
......
......@@ -5,10 +5,9 @@ import argparse
import logging
import os
from utils.profile_decode import profile_decode
from utils.profile_prefill import profile_prefill
from benchmarks.profiler.utils.profile_decode import profile_decode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
......
......@@ -35,10 +35,9 @@ from utils.profile_cache import (
load_existing_decode_results,
load_existing_prefill_results,
)
from utils.profile_decode import profile_decode
from utils.profile_prefill import profile_prefill
from benchmarks.profiler.utils.profile_decode import profile_decode
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
......
......@@ -78,7 +78,7 @@ def deploy_access_pod(namespace: str) -> str:
print(f"Deploying access pod '{pod_name}' in namespace '{namespace}'...")
# Get the directory where this script is located
script_dir = Path(__file__).parent
script_dir = Path(__file__).parent.parent
pod_yaml_path = script_dir / "deploy" / "pvc-access-pod.yaml"
if not pod_yaml_path.exists():
......
......@@ -6,7 +6,7 @@ kind: DynamoGraphDeployment
metadata:
name: sglang-disagg-planner
annotations:
nvidia.com/enable-grove: "false"
nvidia.com/enable-grove: "false" # temporarily disable grove because current k8s connector does not work with grove
spec:
envs:
- name: DYNAMO_SERVICE_CONFIG
......
......@@ -5,6 +5,8 @@ apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-disagg-planner
annotations:
nvidia.com/enable-grove: "false" # temporarily disable grove because current k8s connector does not work with grove
spec:
envs:
- name: DYNAMO_SERVICE_CONFIG
......@@ -27,7 +29,7 @@ spec:
memory: "10Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
......@@ -71,8 +73,11 @@ spec:
mountPoint: /workspace/profiling_results
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
workingDir: /workspace/components/planner/src/dynamo/planner
ports:
- name: metrics
containerPort: 9085
command:
- /bin/sh
- -c
......@@ -83,6 +88,7 @@ spec:
--backend=vllm
--adjustment-interval=60
--profile-results-dir=/workspace/profiling_results
--prometheus-port=9085
Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
dynamoNamespace: vllm-disagg-planner
componentType: frontend
......@@ -118,7 +124,7 @@ spec:
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
......@@ -147,7 +153,7 @@ spec:
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
......@@ -176,7 +182,7 @@ spec:
port: 9090
periodSeconds: 10
failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
......
......@@ -23,6 +23,14 @@ configure_dynamo_logging()
logger = logging.getLogger(__name__)
def _get_prometheus_port_from_env():
    """
    Get the Prometheus port from the PLANNER_PROMETHEUS_PORT environment variable.

    Returns:
        int: The port parsed from the environment variable. Returns 0 when the
        variable is unset, empty, or not a valid integer; 0 means metrics are
        not reported using Prometheus.
    """
    raw_port = os.environ.get("PLANNER_PROMETHEUS_PORT")
    if raw_port is None or not raw_port.strip():
        return 0
    try:
        # Environment values are always strings; convert here so callers that
        # compare the port against the integer 0 see a consistent type.
        return int(raw_port)
    except ValueError:
        # This runs while the defaults class is being defined (import time),
        # so fall back to "disabled" with a warning instead of raising.
        logging.getLogger(__name__).warning(
            "Ignoring invalid PLANNER_PROMETHEUS_PORT=%r; Prometheus metrics disabled",
            raw_port,
        )
        return 0
# Source of truth for planner defaults
class BasePlannerDefaults:
namespace = "dynamo"
......@@ -35,6 +43,7 @@ class BasePlannerDefaults:
min_endpoint = 1 # applies to both decode and prefill
decode_engine_num_gpu = 1
prefill_engine_num_gpu = 1
prometheus_port = _get_prometheus_port_from_env()
class LoadPlannerDefaults(BasePlannerDefaults):
......
......@@ -135,6 +135,12 @@ if __name__ == "__main__":
default=SLAPlannerDefaults.load_prediction_window_size,
help="Load prediction window size",
)
parser.add_argument(
"--prometheus-port",
type=int,
default=SLAPlannerDefaults.prometheus_port,
help="Prometheus port",
)
args = parser.parse_args()
asyncio.run(init_planner(args))
......@@ -9,6 +9,8 @@ import time
from dataclasses import dataclass
from typing import Optional
from prometheus_client import Gauge, start_http_server
from dynamo.planner import KubernetesConnector, __version__
from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SLAPlannerDefaults
from dynamo.planner.utils.load_predictor import LOAD_PREDICTORS
......@@ -89,6 +91,23 @@ class Planner:
self.p_correction_factor = 1.0
self.d_correction_factor = 1.0
self.prometheus_port = args.prometheus_port
# Initialize Prometheus metrics
# TODO: use proper naming
self.num_p_workers_gauge = Gauge("num_p_workers", "Number of prefill workers")
self.num_d_workers_gauge = Gauge("num_d_workers", "Number of decode workers")
# Start Prometheus HTTP server if port is specified
if self.prometheus_port != 0:
try:
start_http_server(self.prometheus_port)
logger.info(
f"Started Prometheus metrics server on port {self.prometheus_port}"
)
except Exception as e:
logger.error(f"Failed to start Prometheus metrics server: {e}")
async def get_workers_info(self):
try:
if self.prefill_client is None:
......@@ -137,7 +156,17 @@ class Planner:
raise RuntimeError(f"Failed to get decode worker endpoints: {e}")
return p_endpoints, d_endpoints
def observe_metrics(self):
async def observe_metrics(self):
self.p_endpoints, self.d_endpoints = await self.get_workers_info()
logger.debug(
f"Number of prefill workers: {len(self.p_endpoints)}, number of decode workers: {len(self.d_endpoints)}"
)
# Update Prometheus metrics if server is running
if self.prometheus_port != 0:
self.num_p_workers_gauge.set(len(self.p_endpoints))
self.num_d_workers_gauge.set(len(self.d_endpoints))
self.last_metrics.ttft = self.prometheus_api_client.get_avg_time_to_first_token(
f"{self.args.adjustment_interval}s"
)
......@@ -319,7 +348,7 @@ class Planner:
):
self.last_adjustment_time = time.time()
logger.info("New adjustment interval started!")
self.observe_metrics()
await self.observe_metrics()
await self.make_adjustments()
# sleep briefly to avoid busy-waiting, but not so long that the next adjustment is missed
......@@ -434,5 +463,11 @@ if __name__ == "__main__":
default=SLAPlannerDefaults.load_prediction_window_size,
help="Window size for load prediction",
)
parser.add_argument(
"--prometheus-port",
type=int,
default=SLAPlannerDefaults.prometheus_port,
help="Prometheus port for metrics server (0 to disable)",
)
args = parser.parse_args()
asyncio.run(dynamo_worker()(start_sla_planner)(args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-planner-metrics
namespace: $NAMESPACE
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "planner"
podMetricsEndpoints:
- port: metrics
path: /metrics
interval: 2s
namespaceSelector:
matchNames:
- $NAMESPACE
\ No newline at end of file
......@@ -113,7 +113,7 @@ Use the default pre-built image and inject custom configurations via PVC:
3. **Set the config path for the profiling job:**
```bash
export DGD_CONFIG_FILE=/profiling_results/disagg.yaml # or your custom path
export DGD_CONFIG_FILE=/workspace/profiling_results/disagg.yaml # or your custom path
```
This approach allows you to:
......
......@@ -93,7 +93,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-frontend-metrics
namespace: dynamo
namespace: $NAMESPACE
spec:
selector:
matchLabels:
......@@ -105,7 +105,7 @@ spec:
interval: 2s
namespaceSelector:
matchNames:
- dynamo
- $NAMESPACE
```
Then, create the worker PodMonitor:
......@@ -115,7 +115,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-worker-metrics
namespace: dynamo
namespace: $NAMESPACE
spec:
selector:
matchLabels:
......@@ -127,7 +127,28 @@ spec:
interval: 2s
namespaceSelector:
matchNames:
- dynamo
- $NAMESPACE
```
If you are using the planner, you can also create a PodMonitor for it:
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-planner-metrics
namespace: $NAMESPACE
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "planner"
podMetricsEndpoints:
- port: metrics
path: /metrics
interval: 2s
namespaceSelector:
matchNames:
- $NAMESPACE
```
Apply the PodMonitors:
......@@ -136,6 +157,7 @@ pushd deploy/metrics/k8s
# envsubst replaces ${NAMESPACE} with the actual namespace value
envsubst < frontend-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
envsubst < worker-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
envsubst < planner-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
popd
```
......@@ -146,7 +168,7 @@ This will cause Prometheus to be re-configured to scrape metrics from the pods o
Apply the Dynamo dashboard configuration to populate Grafana with the Dynamo dashboard:
```bash
pushd deploy/metrics/k8s
kubectl apply -n monitoring -f resources/grafana-dynamo-dashboard-configmap.yaml
kubectl apply -n monitoring -f grafana-dynamo-dashboard-configmap.yaml
popd
```
......@@ -162,7 +184,7 @@ The dashboard is embedded in the ConfigMap. Since it is labeled with `grafana_da
### In Prometheus
```bash
kubectl port-forward svc/prometheus-operated 9090:9090
kubectl port-forward svc/prometheus-operated 9090:9090 -n monitoring
```
Visit http://localhost:9090 and try these example queries:
......@@ -173,7 +195,7 @@ Visit http://localhost:9090 and try these example queries:
### In Grafana
```bash
kubectl port-forward svc/grafana 3000:80
kubectl port-forward svc/grafana 3000:80 -n monitoring
```
Visit http://localhost:3000 and find the Dynamo dashboard under General.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment