Unverified commit 7b2f95e4, authored by hhzhang16 and committed by GitHub
Browse files

fix: bug fixes for planner tests (#3821)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 21697e1c
......@@ -63,6 +63,11 @@ class PrometheusAPIClient:
Average metric value or 0 if no data/error
"""
try:
# Prepend the frontend metric prefix if not already present
if not full_metric_name.startswith(prometheus_names.name_prefix.FRONTEND):
full_metric_name = (
f"{prometheus_names.name_prefix.FRONTEND}_{full_metric_name}"
)
query = f"increase({full_metric_name}_sum[{interval}])/increase({full_metric_name}_count[{interval}])"
result = self.prom.custom_query(query=query)
if not result:
......@@ -75,8 +80,10 @@ class PrometheusAPIClient:
values = []
for container in metrics_containers:
# Frontend lowercases model names for Prometheus labels so we need to do case-insensitive comparison
if (
container.metric.model == model_name
container.metric.model
and container.metric.model.lower() == model_name.lower()
and container.metric.dynamo_namespace == self.dynamo_namespace
):
values.append(container.value[1])
......@@ -120,14 +127,23 @@ class PrometheusAPIClient:
# This function follows a different query pattern than the other metrics
try:
requests_total_metric = prometheus_names.frontend_service.REQUESTS_TOTAL
# Prepend the frontend metric prefix if not already present
if not requests_total_metric.startswith(
prometheus_names.name_prefix.FRONTEND
):
requests_total_metric = (
f"{prometheus_names.name_prefix.FRONTEND}_{requests_total_metric}"
)
raw_res = self.prom.custom_query(
query=f"increase({requests_total_metric}[{interval}])"
)
metrics_containers = parse_frontend_metric_containers(raw_res)
total_count = 0.0
for container in metrics_containers:
# Frontend lowercases model names for Prometheus labels so we need to do case-insensitive comparison
if (
container.metric.model == model_name
container.metric.model
and container.metric.model.lower() == model_name.lower()
and container.metric.dynamo_namespace == self.dynamo_namespace
):
total_count += container.value[1]
......
......@@ -38,7 +38,7 @@ flowchart TD
Before deploying the SLA planner, ensure:
- **Dynamo platform installed** (see [Installation Guide](/docs/kubernetes/installation_guide.md))
- **[kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running.** By default, the prometheus server is not deployed in the `monitoring` namespace. If it is deployed to a different namespace, set `dynamo-operator.dynamo.metrics.prometheusEndpoint="http://prometheus-kube-prometheus-prometheus.<namespace>.svc.cluster.local:9090"`.
- **[kube-prometheus-stack](/docs/kubernetes/metrics.md) installed and running.** By default, the prometheus server is deployed in the `monitoring` namespace. If it is deployed to a different namespace, set `dynamo-operator.dynamo.metrics.prometheusEndpoint="http://prometheus-kube-prometheus-prometheus.<namespace>.svc.cluster.local:9090"`.
- **Benchmarking resources setup** (see [Kubernetes utilities for Dynamo Benchmarking and Profiling](../../deploy/utils/README.md)). The script will create a `dynamo-pvc` with `ReadWriteMany` access. If your cluster's default storageClassName does not allow `ReadWriteMany`, you need to specify a different storageClassName that does support `ReadWriteMany` in `deploy/utils/manifests/pvc.yaml`.
......
......@@ -160,20 +160,64 @@ PYTHONPATH=../../components/src python -m pytest test_replica_calculation.py -v
**Note**: The unit tests automatically mock external dependencies (prometheus_client, runtime modules) to ensure they can run in isolation without requiring the full Dynamo environment.
#### Run Full End-to-End Test
Test complete scaling behavior including Kubernetes deployment and load generation:
Test complete scaling behavior including Kubernetes deployment and load generation.
**Prerequisites:**
- **[kube-prometheus-stack](../../docs/kubernetes/metrics.md) installed and running.** The SLA planner requires Prometheus to observe metrics and make scaling decisions.
- Ensure the Dynamo operator was installed with the Prometheus endpoint configured (see [SLA Planner Quickstart Guide](../../docs/planner/sla_planner_quickstart.md#prerequisites) for details).
**Prepare the test deployment manifest:**
The test runs against a copy of `components/backends/vllm/deploy/disagg_planner.yaml` that has been modified with test-specific planner arguments:
1. Copy the base deployment:
```bash
./scaling/run_scaling_test.sh
cp components/backends/vllm/deploy/disagg_planner.yaml tests/planner/scaling/disagg_planner.yaml
```
With custom namespace:
2. Edit `tests/planner/scaling/disagg_planner.yaml`. Ensure all services use the correct image. Modify the Planner service args:
```yaml
spec:
services:
Planner:
extraPodSpec:
mainContainer:
args:
- --environment=kubernetes
- --backend=vllm
- --adjustment-interval=60
- --profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
- --ttft=100
- --itl=10
- --load-predictor=constant
- --no-correction
```
3. Update the model in VllmPrefillWorker and VllmDecodeWorker services:
```yaml
args:
- -m
- dynamo.vllm
- --model
- nvidia/Llama-3.1-8B-Instruct-FP8
- --migration-limit=3
- --max-model-len=8192
```
**Run the test:**
```bash
./scaling/run_scaling_test.sh --namespace <namespace>
```
To save results to `tests/planner/e2e_scaling_results` instead of `/tmp`:
```bash
./scaling/run_scaling_test.sh --save-results
./scaling/run_scaling_test.sh --namespace <namespace> --save-results
```
**E2E Test Deployment Management:**
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-disagg-planner
spec:
envs:
- name: DYNAMO_SERVICE_CONFIG
value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
- name: DYN_NAMESPACE
value: "vllm-disagg-planner"
services:
Frontend:
dynamoNamespace: vllm-disagg-planner
componentType: frontend
replicas: 1
extraPodSpec:
mainContainer:
image: my-registry/vllm-runtime:my-tag
Planner:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
componentType: planner
replicas: 1
livenessProbe:
httpGet:
path: /metrics
port: 9085
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
readinessProbe:
httpGet:
path: /metrics
port: 9085
initialDelaySeconds: 60
periodSeconds: 60
timeoutSeconds: 30
failureThreshold: 10
extraPodSpec:
mainContainer:
image: my-registry/vllm-runtime:my-tag
workingDir: /workspace/components/src/dynamo/planner
ports:
- name: metrics
containerPort: 9085
command:
- /bin/sh
- -c
args:
- >-
python3 -m planner_sla
--environment=kubernetes
--backend=vllm
--adjustment-interval=60
--profile-results-dir=/workspace/tests/planner/profiling_results/H200_TP1P_TP1D
--prometheus-port=9085
--ttft=100
--itl=10
--load-predictor=constant
--no-correction
VllmDecodeWorker:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
componentType: worker
subComponentType: decode
replicas: 1
resources:
limits:
gpu: "1"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 30
failureThreshold: 60
image: my-registry/vllm-runtime:my-tag
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- "python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --migration-limit=3 --max-model-len=8192"
VllmPrefillWorker:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
componentType: worker
subComponentType: prefill
replicas: 1
resources:
limits:
gpu: "1"
extraPodSpec:
mainContainer:
startupProbe:
httpGet:
path: /health
port: 9090
periodSeconds: 30
failureThreshold: 60
image: my-registry/vllm-runtime:my-tag
workingDir: /workspace/components/backends/vllm
command:
- /bin/sh
- -c
args:
- python3 -m dynamo.vllm --model nvidia/Llama-3.1-8B-Instruct-FP8 --is-prefill-worker --migration-limit=3 --max-model-len=8192
......@@ -97,13 +97,15 @@ class KubernetesMonitor:
for pod in data.get("items", []):
pod_phase = pod.get("status", {}).get("phase", "")
pod_labels = pod.get("metadata", {}).get("labels", {})
component = pod_labels.get("nvidia.com/dynamo-component", "")
sub_component = pod_labels.get(
"nvidia.com/dynamo-sub-component-type", ""
)
# Only count Running pods
if pod_phase == "Running":
if component == "VllmPrefillWorker":
if sub_component == "prefill":
prefill_pods += 1
elif component == "VllmDecodeWorker":
elif sub_component == "decode":
decode_pods += 1
else:
continue
......
......@@ -208,11 +208,16 @@ class LoadGenerator:
logger.info(f"Parsing results from: {results_file}")
with open(results_file, "r") as f:
metrics = json.load(f)
data = json.load(f)
# aiperf now wraps metrics under "records" key
metrics = data.get("records", data)
results = {
"throughput": metrics.get("output_token_throughput", {}).get("avg", 0),
"ttft_mean": metrics.get("time_to_first_token", {}).get("avg", 0),
"ttft_mean": metrics.get("ttft", {}).get(
"avg", 0
), # Changed from "time_to_first_token"
"itl_mean": metrics.get("inter_token_latency", {}).get("avg", 0),
"end_to_end_latency_mean": metrics.get("request_latency", {}).get(
"avg", 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment