Unverified Commit 922850ae authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

chore: bug fixes in pre-deployment sweeping and vllm_v1 planner; expose...

chore: bug fixes in pre-deployment sweeping and vllm_v1 planner; expose num_d/p to k8s metrics (#2454)
parent a7184bec
...@@ -162,7 +162,7 @@ def main(): ...@@ -162,7 +162,7 @@ def main():
print("\n✅ Disagg config injection completed!") print("\n✅ Disagg config injection completed!")
print(f"📁 Config available at: {args.target_path}") print(f"📁 Config available at: {args.target_path}")
print(f"🔧 Set DGD_CONFIG_FILE={args.target_path} in your profiler job") print(f"🔧 Set DGD_CONFIG_FILE=/workspace{args.target_path} in your profiler job")
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -5,10 +5,9 @@ import argparse ...@@ -5,10 +5,9 @@ import argparse
import logging import logging
import os import os
from utils.profile_decode import profile_decode
from utils.profile_prefill import profile_prefill from utils.profile_prefill import profile_prefill
from benchmarks.profiler.utils.profile_decode import profile_decode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
......
...@@ -35,10 +35,9 @@ from utils.profile_cache import ( ...@@ -35,10 +35,9 @@ from utils.profile_cache import (
load_existing_decode_results, load_existing_decode_results,
load_existing_prefill_results, load_existing_prefill_results,
) )
from utils.profile_decode import profile_decode
from utils.profile_prefill import profile_prefill from utils.profile_prefill import profile_prefill
from benchmarks.profiler.utils.profile_decode import profile_decode
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
......
...@@ -78,7 +78,7 @@ def deploy_access_pod(namespace: str) -> str: ...@@ -78,7 +78,7 @@ def deploy_access_pod(namespace: str) -> str:
print(f"Deploying access pod '{pod_name}' in namespace '{namespace}'...") print(f"Deploying access pod '{pod_name}' in namespace '{namespace}'...")
# Get the directory where this script is located # Get the directory where this script is located
script_dir = Path(__file__).parent script_dir = Path(__file__).parent.parent
pod_yaml_path = script_dir / "deploy" / "pvc-access-pod.yaml" pod_yaml_path = script_dir / "deploy" / "pvc-access-pod.yaml"
if not pod_yaml_path.exists(): if not pod_yaml_path.exists():
......
...@@ -6,7 +6,7 @@ kind: DynamoGraphDeployment ...@@ -6,7 +6,7 @@ kind: DynamoGraphDeployment
metadata: metadata:
name: sglang-disagg-planner name: sglang-disagg-planner
annotations: annotations:
nvidia.com/enable-grove: "false" nvidia.com/enable-grove: "false" # temporarily disable grove because current k8s connector does not work with grove
spec: spec:
envs: envs:
- name: DYNAMO_SERVICE_CONFIG - name: DYNAMO_SERVICE_CONFIG
......
...@@ -5,6 +5,8 @@ apiVersion: nvidia.com/v1alpha1 ...@@ -5,6 +5,8 @@ apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment kind: DynamoGraphDeployment
metadata: metadata:
name: vllm-disagg-planner name: vllm-disagg-planner
annotations:
nvidia.com/enable-grove: "false" # temporarily disable grove because current k8s connector does not work with grove
spec: spec:
envs: envs:
- name: DYNAMO_SERVICE_CONFIG - name: DYNAMO_SERVICE_CONFIG
...@@ -27,7 +29,7 @@ spec: ...@@ -27,7 +29,7 @@ spec:
memory: "10Gi" memory: "10Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
- /bin/sh - /bin/sh
...@@ -71,8 +73,11 @@ spec: ...@@ -71,8 +73,11 @@ spec:
mountPoint: /workspace/profiling_results mountPoint: /workspace/profiling_results
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
workingDir: /workspace/components/planner/src/dynamo/planner workingDir: /workspace/components/planner/src/dynamo/planner
ports:
- name: metrics
containerPort: 9085
command: command:
- /bin/sh - /bin/sh
- -c - -c
...@@ -83,6 +88,7 @@ spec: ...@@ -83,6 +88,7 @@ spec:
--backend=vllm --backend=vllm
--adjustment-interval=60 --adjustment-interval=60
--profile-results-dir=/workspace/profiling_results --profile-results-dir=/workspace/profiling_results
--prometheus-port=9085
Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently. Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
dynamoNamespace: vllm-disagg-planner dynamoNamespace: vllm-disagg-planner
componentType: frontend componentType: frontend
...@@ -118,7 +124,7 @@ spec: ...@@ -118,7 +124,7 @@ spec:
memory: "2Gi" memory: "2Gi"
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
- /bin/sh - /bin/sh
...@@ -147,7 +153,7 @@ spec: ...@@ -147,7 +153,7 @@ spec:
port: 9090 port: 9090
periodSeconds: 10 periodSeconds: 10
failureThreshold: 60 failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
- /bin/sh - /bin/sh
...@@ -176,7 +182,7 @@ spec: ...@@ -176,7 +182,7 @@ spec:
port: 9090 port: 9090
periodSeconds: 10 periodSeconds: 10
failureThreshold: 60 failureThreshold: 60
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17 image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:hzhou-0814-02
workingDir: /workspace/components/backends/vllm workingDir: /workspace/components/backends/vllm
command: command:
- /bin/sh - /bin/sh
......
...@@ -23,6 +23,14 @@ configure_dynamo_logging() ...@@ -23,6 +23,14 @@ configure_dynamo_logging()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _get_prometheus_port_from_env():
    """
    Get the planner's Prometheus metrics port from the environment.

    Reads ``PLANNER_PROMETHEUS_PORT``. Environment values are always strings,
    so the value is converted to ``int`` here; callers compare the result
    against the integer ``0`` and pass it on as a port number.

    Returns:
        int: the configured port, or 0 when the variable is unset or does not
        hold an integer. 0 means "do not report metrics using prometheus".
    """
    raw_port = os.environ.get("PLANNER_PROMETHEUS_PORT")
    if raw_port is None:
        return 0
    try:
        return int(raw_port)
    except ValueError:
        # Metrics are auxiliary: a malformed value disables them rather than
        # breaking every import of the planner defaults.
        logging.getLogger(__name__).warning(
            "Ignoring invalid PLANNER_PROMETHEUS_PORT=%r; "
            "Prometheus metrics reporting is disabled",
            raw_port,
        )
        return 0
# Source of truth for planner defaults # Source of truth for planner defaults
class BasePlannerDefaults: class BasePlannerDefaults:
namespace = "dynamo" namespace = "dynamo"
...@@ -35,6 +43,7 @@ class BasePlannerDefaults: ...@@ -35,6 +43,7 @@ class BasePlannerDefaults:
min_endpoint = 1 # applies to both decode and prefill min_endpoint = 1 # applies to both decode and prefill
decode_engine_num_gpu = 1 decode_engine_num_gpu = 1
prefill_engine_num_gpu = 1 prefill_engine_num_gpu = 1
prometheus_port = _get_prometheus_port_from_env()
class LoadPlannerDefaults(BasePlannerDefaults): class LoadPlannerDefaults(BasePlannerDefaults):
......
...@@ -135,6 +135,12 @@ if __name__ == "__main__": ...@@ -135,6 +135,12 @@ if __name__ == "__main__":
default=SLAPlannerDefaults.load_prediction_window_size, default=SLAPlannerDefaults.load_prediction_window_size,
help="Load prediction window size", help="Load prediction window size",
) )
parser.add_argument(
"--prometheus-port",
type=int,
default=SLAPlannerDefaults.prometheus_port,
help="Prometheus port",
)
args = parser.parse_args() args = parser.parse_args()
asyncio.run(init_planner(args)) asyncio.run(init_planner(args))
...@@ -9,6 +9,8 @@ import time ...@@ -9,6 +9,8 @@ import time
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional from typing import Optional
from prometheus_client import Gauge, start_http_server
from dynamo.planner import KubernetesConnector, __version__ from dynamo.planner import KubernetesConnector, __version__
from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SLAPlannerDefaults from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SLAPlannerDefaults
from dynamo.planner.utils.load_predictor import LOAD_PREDICTORS from dynamo.planner.utils.load_predictor import LOAD_PREDICTORS
...@@ -89,6 +91,23 @@ class Planner: ...@@ -89,6 +91,23 @@ class Planner:
self.p_correction_factor = 1.0 self.p_correction_factor = 1.0
self.d_correction_factor = 1.0 self.d_correction_factor = 1.0
self.prometheus_port = args.prometheus_port
# Initialize Prometheus metrics
# TODO: use proper naming
self.num_p_workers_gauge = Gauge("num_p_workers", "Number of prefill workers")
self.num_d_workers_gauge = Gauge("num_d_workers", "Number of decode workers")
# Start Prometheus HTTP server if port is specified
if self.prometheus_port != 0:
try:
start_http_server(self.prometheus_port)
logger.info(
f"Started Prometheus metrics server on port {self.prometheus_port}"
)
except Exception as e:
logger.error(f"Failed to start Prometheus metrics server: {e}")
async def get_workers_info(self): async def get_workers_info(self):
try: try:
if self.prefill_client is None: if self.prefill_client is None:
...@@ -137,7 +156,17 @@ class Planner: ...@@ -137,7 +156,17 @@ class Planner:
raise RuntimeError(f"Failed to get decode worker endpoints: {e}") raise RuntimeError(f"Failed to get decode worker endpoints: {e}")
return p_endpoints, d_endpoints return p_endpoints, d_endpoints
def observe_metrics(self): async def observe_metrics(self):
self.p_endpoints, self.d_endpoints = await self.get_workers_info()
logger.debug(
f"Number of prefill workers: {len(self.p_endpoints)}, number of decode workers: {len(self.d_endpoints)}"
)
# Update Prometheus metrics if server is running
if self.prometheus_port != 0:
self.num_p_workers_gauge.set(len(self.p_endpoints))
self.num_d_workers_gauge.set(len(self.d_endpoints))
self.last_metrics.ttft = self.prometheus_api_client.get_avg_time_to_first_token( self.last_metrics.ttft = self.prometheus_api_client.get_avg_time_to_first_token(
f"{self.args.adjustment_interval}s" f"{self.args.adjustment_interval}s"
) )
...@@ -319,7 +348,7 @@ class Planner: ...@@ -319,7 +348,7 @@ class Planner:
): ):
self.last_adjustment_time = time.time() self.last_adjustment_time = time.time()
logger.info("New adjustment interval started!") logger.info("New adjustment interval started!")
self.observe_metrics() await self.observe_metrics()
await self.make_adjustments() await self.make_adjustments()
# sleep for a while to avoid busy-waiting but not too long to miss the next adjustment # sleep for a while to avoid busy-waiting but not too long to miss the next adjustment
...@@ -434,5 +463,11 @@ if __name__ == "__main__": ...@@ -434,5 +463,11 @@ if __name__ == "__main__":
default=SLAPlannerDefaults.load_prediction_window_size, default=SLAPlannerDefaults.load_prediction_window_size,
help="Window size for load prediction", help="Window size for load prediction",
) )
parser.add_argument(
"--prometheus-port",
type=int,
default=SLAPlannerDefaults.prometheus_port,
help="Prometheus port for metrics server (0 to disable)",
)
args = parser.parse_args() args = parser.parse_args()
asyncio.run(dynamo_worker()(start_sla_planner)(args)) asyncio.run(dynamo_worker()(start_sla_planner)(args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# PodMonitor that tells the Prometheus Operator to scrape planner pods.
# $NAMESPACE is a placeholder: render this file with `envsubst` (with
# NAMESPACE exported) before piping it to `kubectl apply`.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: dynamo-planner-metrics
  namespace: $NAMESPACE
spec:
  # Only pods carrying BOTH labels below are selected.
  selector:
    matchLabels:
      nvidia.com/metrics-enabled: "true"
      nvidia.com/dynamo-component-type: "planner"
  podMetricsEndpoints:
    # `port` is the NAME of a container port, not a number; the planner
    # container must declare a port named "metrics" for scraping to work.
    - port: metrics
      path: /metrics
      interval: 2s
  # Restrict pod discovery to the same namespace the monitor is deployed in.
  namespaceSelector:
    matchNames:
      - $NAMESPACE
\ No newline at end of file
...@@ -113,7 +113,7 @@ Use the default pre-built image and inject custom configurations via PVC: ...@@ -113,7 +113,7 @@ Use the default pre-built image and inject custom configurations via PVC:
3. **Set the config path for the profiling job:** 3. **Set the config path for the profiling job:**
```bash ```bash
export DGD_CONFIG_FILE=/profiling_results/disagg.yaml # or your custom path export DGD_CONFIG_FILE=/workspace/profiling_results/disagg.yaml # or your custom path
``` ```
This approach allows you to: This approach allows you to:
......
...@@ -93,7 +93,7 @@ apiVersion: monitoring.coreos.com/v1 ...@@ -93,7 +93,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor kind: PodMonitor
metadata: metadata:
name: dynamo-frontend-metrics name: dynamo-frontend-metrics
namespace: dynamo namespace: $NAMESPACE
spec: spec:
selector: selector:
matchLabels: matchLabels:
...@@ -105,7 +105,7 @@ spec: ...@@ -105,7 +105,7 @@ spec:
interval: 2s interval: 2s
namespaceSelector: namespaceSelector:
matchNames: matchNames:
- dynamo - $NAMESPACE
``` ```
Then, create the worker PodMonitor: Then, create the worker PodMonitor:
...@@ -115,7 +115,7 @@ apiVersion: monitoring.coreos.com/v1 ...@@ -115,7 +115,7 @@ apiVersion: monitoring.coreos.com/v1
kind: PodMonitor kind: PodMonitor
metadata: metadata:
name: dynamo-worker-metrics name: dynamo-worker-metrics
namespace: dynamo namespace: $NAMESPACE
spec: spec:
selector: selector:
matchLabels: matchLabels:
...@@ -127,7 +127,28 @@ spec: ...@@ -127,7 +127,28 @@ spec:
interval: 2s interval: 2s
namespaceSelector: namespaceSelector:
matchNames: matchNames:
- dynamo - $NAMESPACE
```
If you are using the planner, you can also create a PodMonitor for the planner:
```yaml
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: dynamo-planner-metrics
namespace: $NAMESPACE
spec:
selector:
matchLabels:
nvidia.com/metrics-enabled: "true"
nvidia.com/dynamo-component-type: "planner"
podMetricsEndpoints:
- port: metrics
path: /metrics
interval: 2s
namespaceSelector:
matchNames:
- $NAMESPACE
``` ```
Apply the PodMonitors: Apply the PodMonitors:
...@@ -136,6 +157,7 @@ pushd deploy/metrics/k8s ...@@ -136,6 +157,7 @@ pushd deploy/metrics/k8s
# envsubst replaces ${NAMESPACE} with the actual namespace value # envsubst replaces ${NAMESPACE} with the actual namespace value
envsubst < frontend-podmonitor.yaml | kubectl apply -n $NAMESPACE -f - envsubst < frontend-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
envsubst < worker-podmonitor.yaml | kubectl apply -n $NAMESPACE -f - envsubst < worker-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
envsubst < planner-podmonitor.yaml | kubectl apply -n $NAMESPACE -f -
popd popd
``` ```
...@@ -146,7 +168,7 @@ This will cause Prometheus to be re-configured to scrape metrics from the pods o ...@@ -146,7 +168,7 @@ This will cause Prometheus to be re-configured to scrape metrics from the pods o
Apply the Dynamo dashboard configuration to populate Grafana with the Dynamo dashboard: Apply the Dynamo dashboard configuration to populate Grafana with the Dynamo dashboard:
```bash ```bash
pushd deploy/metrics/k8s pushd deploy/metrics/k8s
kubectl apply -n monitoring -f resources/grafana-dynamo-dashboard-configmap.yaml kubectl apply -n monitoring -f grafana-dynamo-dashboard-configmap.yaml
popd popd
``` ```
...@@ -162,7 +184,7 @@ The dashboard is embedded in the ConfigMap. Since it is labeled with `grafana_da ...@@ -162,7 +184,7 @@ The dashboard is embedded in the ConfigMap. Since it is labeled with `grafana_da
### In Prometheus ### In Prometheus
```bash ```bash
kubectl port-forward svc/prometheus-operated 9090:9090 kubectl port-forward svc/prometheus-operated 9090:9090 -n monitoring
``` ```
Visit http://localhost:9090 and try these example queries: Visit http://localhost:9090 and try these example queries:
...@@ -173,7 +195,7 @@ Visit http://localhost:9090 and try these example queries: ...@@ -173,7 +195,7 @@ Visit http://localhost:9090 and try these example queries:
### In Grafana ### In Grafana
```bash ```bash
kubectl port-forward svc/grafana 3000:80 kubectl port-forward svc/grafana 3000:80 -n monitoring
``` ```
Visit http://localhost:3000 and find the Dynamo dashboard under General. Visit http://localhost:3000 and find the Dynamo dashboard under General.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment