Unverified Commit ce36d9f4 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files
parent c4334471
../../docs/benchmarks/benchmarking.md#server-side-benchmarking-in-cluster
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: batch/v1
kind: Job
metadata:
name: dynamo-benchmark
spec:
template:
spec:
serviceAccountName: dynamo-sa
imagePullSecrets:
- name: docker-imagepullsecret
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 1000
containers:
- name: benchmark-runner
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
resources:
requests:
cpu: "4"
memory: "8Gi"
limits:
cpu: "8"
memory: "16Gi"
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: HF_TOKEN
command: ["python3", "-m", "benchmarks.utils.benchmark"]
args:
- --model
- "Qwen/Qwen3-0.6B"
- --isl
- "2000"
- --std
- "10"
- --osl
- "256"
- --output-dir
- /data/results
- --input
- "qwen-vllm-agg=vllm-agg-frontend:8000"
# add more copies of lines 58-59 for each additional service if you want to benchmark multiple services
# - --input
# - "name=service-url:port"
volumeMounts:
- name: data-volume
mountPath: /data
restartPolicy: Never
volumes:
- name: data-volume
persistentVolumeClaim:
claimName: dynamo-pvc
backoffLimit: 0
ttlSecondsAfterFinished: 3600 # Clean up job after 1 hour
...@@ -7,53 +7,59 @@ import argparse ...@@ -7,53 +7,59 @@ import argparse
import re import re
import sys import sys
from typing import Dict, Tuple from typing import Dict, Tuple
from urllib.parse import urlsplit
from benchmarks.utils.workflow import run_benchmark_workflow from benchmarks.utils.workflow import has_http_scheme, run_benchmark_workflow
from deploy.utils.kubernetes import is_running_in_cluster
def validate_inputs(inputs: Dict[str, str]) -> None: def validate_inputs(inputs: Dict[str, str]) -> None:
"""Validate that all inputs are HTTP endpoints""" """Validate that all inputs are HTTP endpoints or internal service URLs when running in cluster"""
for label, value in inputs.items(): for label, value in inputs.items():
if not value.lower().startswith(("http://", "https://")): v = value.strip()
raise ValueError( if is_running_in_cluster():
f"Input '{label}' must be an HTTP endpoint (starting with http:// or https://). Got: {value}" # Allow HTTP(S) or internal service URLs like host[:port][/path]
) if has_http_scheme(v):
pass
else:
parts = urlsplit(f"//{v}")
host_ok = bool(parts.hostname)
port_ok = parts.port is None or (1 <= parts.port <= 65535)
if not (host_ok and port_ok):
raise ValueError(
f"Input '{label}' must be HTTP(S) or internal service URL. Got: {value}"
)
else:
if not has_http_scheme(v):
raise ValueError(f"Input '{label}' must be HTTP endpoint. Got: {value}")
# Validate reserved labels # Validate reserved labels
if label.lower() == "plots": if label.lower() == "plots":
raise ValueError( raise ValueError("Label 'plots' is reserved")
"Label 'plots' is reserved and cannot be used. Please choose a different label."
)
def parse_input(input_str: str) -> Tuple[str, str]: def parse_input(input_str: str) -> Tuple[str, str]:
"""Parse input string in format key=value with additional validation""" """Parse input string in format key=value with additional validation"""
if "=" not in input_str: if "=" not in input_str:
raise ValueError( raise ValueError(f"Invalid input format: {input_str}")
f"Invalid input format. Expected: <label>=<endpoint>, got: {input_str}"
)
parts = input_str.split("=", 1) # Split on first '=' only parts = input_str.split("=", 1) # Split on first '=' only
if len(parts) != 2: if len(parts) != 2:
raise ValueError( raise ValueError(f"Invalid input format: {input_str}")
f"Invalid input format. Expected: <label>=<endpoint>, got: {input_str}"
)
label, value = parts label, value = parts
if not label.strip(): if not label.strip():
raise ValueError("Label cannot be empty") raise ValueError("Empty label")
if not value.strip(): if not value.strip():
raise ValueError("Value cannot be empty") raise ValueError("Empty value")
label = label.strip() label = label.strip()
value = value.strip() value = value.strip()
# Validate label characters # Validate label characters
if not re.match(r"^[a-zA-Z0-9_-]+$", label): if not re.match(r"^[a-zA-Z0-9_-]+$", label):
raise ValueError( raise ValueError(f"Invalid label: {label}")
f"Label must contain only letters, numbers, hyphens, and underscores. Invalid label: {label}"
)
return label, value return label, value
...@@ -114,7 +120,7 @@ def main() -> int: ...@@ -114,7 +120,7 @@ def main() -> int:
) )
print() print()
# Validate that all inputs are HTTP endpoints # Validate that inputs are HTTP endpoints or in-cluster service URLs
validate_inputs(parsed_inputs) validate_inputs(parsed_inputs)
except ValueError as e: except ValueError as e:
......
...@@ -6,6 +6,21 @@ from typing import Dict, List ...@@ -6,6 +6,21 @@ from typing import Dict, List
from benchmarks.utils.genai import run_concurrency_sweep from benchmarks.utils.genai import run_concurrency_sweep
from benchmarks.utils.plot import generate_plots from benchmarks.utils.plot import generate_plots
from deploy.utils.kubernetes import is_running_in_cluster
def has_http_scheme(url: str) -> bool:
"""Check if URL has HTTP or HTTPS scheme."""
return url.lower().startswith(("http://", "https://"))
def normalize_service_url(endpoint: str) -> str:
e = endpoint.strip()
if has_http_scheme(e):
return e
if is_running_in_cluster():
return f"http://{e}"
return e # Outside cluster, validation will have ensured scheme is present
def print_concurrency_start( def print_concurrency_start(
...@@ -30,7 +45,10 @@ def run_endpoint_benchmark( ...@@ -30,7 +45,10 @@ def run_endpoint_benchmark(
output_dir: Path, output_dir: Path,
) -> None: ) -> None:
"""Run benchmark for an existing endpoint with custom label""" """Run benchmark for an existing endpoint with custom label"""
print(f"🚀 Starting benchmark of endpoint '{label}': {endpoint}") # Normalize endpoint to a usable URL (handles in-cluster scheme-less inputs)
service_url = normalize_service_url(endpoint)
print(f"🚀 Starting benchmark of endpoint '{label}': {service_url}")
print(f"📁 Results will be saved to: {output_dir / label}") print(f"📁 Results will be saved to: {output_dir / label}")
print_concurrency_start(label, model, isl, osl, std) print_concurrency_start(label, model, isl, osl, std)
...@@ -38,7 +56,7 @@ def run_endpoint_benchmark( ...@@ -38,7 +56,7 @@ def run_endpoint_benchmark(
(output_dir / label).mkdir(parents=True, exist_ok=True) (output_dir / label).mkdir(parents=True, exist_ok=True)
run_concurrency_sweep( run_concurrency_sweep(
service_url=endpoint, service_url=service_url,
model_name=model, model_name=model,
isl=isl, isl=isl,
osl=osl, osl=osl,
...@@ -73,7 +91,7 @@ def run_benchmark_workflow( ...@@ -73,7 +91,7 @@ def run_benchmark_workflow(
model: str = "Qwen/Qwen3-0.6B", model: str = "Qwen/Qwen3-0.6B",
output_dir: str = "benchmarks/results", output_dir: str = "benchmarks/results",
) -> None: ) -> None:
"""Main benchmark workflow orchestrator for HTTP endpoints only""" """Main benchmark workflow orchestrator for HTTP endpoints (and in-cluster internal service URLs)"""
output_dir_path = Path(output_dir) output_dir_path = Path(output_dir)
output_dir_path.mkdir(parents=True, exist_ok=True) output_dir_path.mkdir(parents=True, exist_ok=True)
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import subprocess import subprocess
import sys import sys
from pathlib import Path from pathlib import Path
...@@ -20,6 +21,14 @@ from typing import List ...@@ -20,6 +21,14 @@ from typing import List
PVC_ACCESS_POD_NAME = "pvc-access-pod" PVC_ACCESS_POD_NAME = "pvc-access-pod"
K8S_SA_TOKEN = Path("/var/run/secrets/kubernetes.io/serviceaccount/token")
def is_running_in_cluster() -> bool:
"""Return True if running inside a Kubernetes cluster."""
# Prefer well-known env var; fall back to SA token presence
return bool(os.environ.get("KUBERNETES_SERVICE_HOST")) or K8S_SA_TOKEN.exists()
def run_command( def run_command(
cmd: List[str], capture_output: bool = True, exit_on_error: bool = True cmd: List[str], capture_output: bool = True, exit_on_error: bool = True
......
...@@ -19,6 +19,46 @@ This benchmarking framework lets you compare performance across any combination ...@@ -19,6 +19,46 @@ This benchmarking framework lets you compare performance across any combination
- **DynamoGraphDeployments** - **DynamoGraphDeployments**
- **External HTTP endpoints** (existing services deployed following standard documentation from vLLM, llm-d, AIBrix, etc.) - **External HTTP endpoints** (existing services deployed following standard documentation from vLLM, llm-d, AIBrix, etc.)
## Choosing Your Benchmarking Approach
Dynamo provides two benchmarking approaches to suit different use cases: **client-side** and **server-side**. Client-side refers to running benchmarks on your local machine and connecting to Kubernetes deployments via port-forwarding, while server-side refers to running benchmarks directly within the Kubernetes cluster using internal service URLs. Which method to use depends on your use case.
**TLDR:**
Need high performance/load testing? Server-side.
Just quick testing/comparison? Client-side.
### Use Client-Side Benchmarking When:
- You want to quickly test deployments
- You want immediate access to results on your local machine
- You're comparing external services or deployments (not necessarily just Dynamo deployments)
- You need to run benchmarks from your laptop/workstation
**[Go to Client-Side Benchmarking (Local)](#client-side-benchmarking-local)**
### Use Server-Side Benchmarking When:
- You have a development environment with kubectl access
- You're doing performance validation with high load/speed requirements
- You're experiencing timeouts or performance issues with client-side benchmarking
- You want optimal network performance (no port-forwarding overhead)
- You're running automated CI/CD pipelines
- You need isolated execution environments
- You're doing resource-intensive benchmarking
- You want persistent result storage in the cluster
**[Go to Server-Side Benchmarking (In-Cluster)](#server-side-benchmarking-in-cluster)**
### Quick Comparison
| Feature | Client-Side | Server-Side |
|---------|-------------|-------------|
| **Location** | Your local machine | Kubernetes cluster |
| **Network** | Port-forwarding required | Direct service DNS |
| **Setup** | Quick and simple | Requires cluster resources |
| **Performance** | Limited by local resources, may timeout under high load | Optimal cluster performance, handles high load |
| **Isolation** | Shared environment | Isolated job execution |
| **Results** | Local filesystem | Persistent volumes |
| **Best for** | Light load | High load |
## What This Tool Does ## What This Tool Does
The framework is a Python-based wrapper around `genai-perf` that: The framework is a Python-based wrapper around `genai-perf` that:
...@@ -26,34 +66,35 @@ The framework is a Python-based wrapper around `genai-perf` that: ...@@ -26,34 +66,35 @@ The framework is a Python-based wrapper around `genai-perf` that:
- Runs concurrency sweeps across configurable load levels - Runs concurrency sweeps across configurable load levels
- Generates comparison plots with your custom labels - Generates comparison plots with your custom labels
- Works with any HuggingFace-compatible model on NVIDIA GPUs (H200, H100, A100, etc.) - Works with any HuggingFace-compatible model on NVIDIA GPUs (H200, H100, A100, etc.)
- Runs locally and connects to your Kubernetes deployments/endpoints
- Provides direct Python script execution for maximum flexibility - Provides direct Python script execution for maximum flexibility
**Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`) **Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`)
**Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s). **Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The default `--model` value in the benchmarking script is `Qwen/Qwen3-0.6B`, but it must match the model deployed at the endpoint(s).
## Prerequisites ---
1. **Dynamo container environment** - You must be running inside a Dynamo container with the benchmarking tools pre-installed. # Client-Side Benchmarking (Local)
2. **Ubuntu 24.04** - GenAI-Perf requires Ubuntu 24.04 or higher to work properly. If you are on Ubuntu 22.04 or lower, you will need to build perf_analyzer [from source](https://github.com/triton-inference-server/perf_analyzer/blob/main/docs/install.md#build-from-source). Client-side benchmarking runs on your local machine and connects to Kubernetes deployments via port-forwarding.
3. **kubectl access** - You need `kubectl` installed and configured to access your Kubernetes cluster. ## Prerequisites
1. **Dynamo container environment** - You must be running inside a Dynamo container with the benchmarking tools pre-installed.
4. **HTTP endpoints** - Ensure you have HTTP endpoints available for benchmarking. These can be: 2. **HTTP endpoints** - Ensure you have HTTP endpoints available for benchmarking. These can be:
- DynamoGraphDeployments exposed via HTTP endpoints - DynamoGraphDeployments exposed via HTTP endpoints
- External services (vLLM, llm-d, AIBrix, etc.) - External services (vLLM, llm-d, AIBrix, etc.)
- Any HTTP endpoint serving HuggingFace-compatible models - Any HTTP endpoint serving HuggingFace-compatible models
5. **Benchmark dependencies** - Since benchmarks run locally, you need to install the required Python dependencies. Install them using: 3. **Benchmark dependencies** - Since benchmarks run locally, you need to install the required Python dependencies. Install them using:
```bash ```bash
pip install -r deploy/utils/requirements.txt pip install -r deploy/utils/requirements.txt
``` ```
## User Workflow ## User Workflow
Follow these steps to benchmark Dynamo deployments: Follow these steps to benchmark Dynamo deployments using client-side benchmarking:
### Step 1: Establish Kubernetes Cluster and Install Dynamo ### Step 1: Establish Kubernetes Cluster and Install Dynamo
Set up your Kubernetes cluster with NVIDIA GPUs and install the Dynamo Cloud platform. First follow the [installation guide](/docs/guides/dynamo_deploy/installation_guide.md) to install Dynamo Cloud, then use [deploy/utils/README](../../deploy/utils/README.md) to set up benchmarking resources. Set up your Kubernetes cluster with NVIDIA GPUs and install the Dynamo Cloud platform. First follow the [installation guide](/docs/guides/dynamo_deploy/installation_guide.md) to install Dynamo Cloud, then use [deploy/utils/README](../../deploy/utils/README.md) to set up benchmarking resources.
...@@ -206,9 +247,9 @@ Raw data is organized by deployment/benchmark type and concurrency level: ...@@ -206,9 +247,9 @@ Raw data is organized by deployment/benchmark type and concurrency level:
**For Any Benchmarking (uses your custom labels):** **For Any Benchmarking (uses your custom labels):**
```text ```text
benchmarks/results/ results/ # Client-side: ./benchmarks/results/ or custom dir
├── plots/ # Performance visualization plots ├── plots/ # Server-side: /data/results/
│ ├── SUMMARY.txt # Human-readable benchmark summary │ ├── SUMMARY.txt # Performance visualization plots
│ ├── p50_inter_token_latency_vs_concurrency.png │ ├── p50_inter_token_latency_vs_concurrency.png
│ ├── avg_inter_token_latency_vs_concurrency.png │ ├── avg_inter_token_latency_vs_concurrency.png
│ ├── request_throughput_vs_concurrency.png │ ├── request_throughput_vs_concurrency.png
...@@ -228,7 +269,7 @@ benchmarks/results/ ...@@ -228,7 +269,7 @@ benchmarks/results/
**Example with actual labels:** **Example with actual labels:**
```text ```text
benchmarks/results/ results/
├── plots/ ├── plots/
├── experiment-a/ # --input experiment-a=http://localhost:8000 ├── experiment-a/ # --input experiment-a=http://localhost:8000
├── experiment-b/ # --input experiment-b=http://localhost:8001 ├── experiment-b/ # --input experiment-b=http://localhost:8001
...@@ -240,6 +281,205 @@ Each concurrency directory contains: ...@@ -240,6 +281,205 @@ Each concurrency directory contains:
- **`profile_export.json`** - Raw GenAI-Perf results - **`profile_export.json`** - Raw GenAI-Perf results
- **`inputs.json`** - Generated test inputs - **`inputs.json`** - Generated test inputs
---
# Server-Side Benchmarking (In-Cluster)
Server-side benchmarking runs directly within the Kubernetes cluster, eliminating the need for port forwarding and providing better resource utilization.
## What Server-Side Benchmarking Does
The server-side benchmarking solution:
- Runs benchmarks directly within the Kubernetes cluster using internal service URLs
- Uses Kubernetes service DNS for direct communication (no port forwarding required)
- Leverages the existing benchmarking infrastructure (`benchmarks.utils.benchmark`)
- Stores results persistently using `dynamo-pvc`
- Provides isolated execution environment with configurable resources
- Handles high load/speed requirements without timeout issues
- **Note**: Each benchmark job runs within a single Kubernetes namespace, but can benchmark services across multiple namespaces using the full DNS format `svc_name.namespace.svc.cluster.local`
## Prerequisites
1. **Kubernetes cluster** with NVIDIA GPUs and Dynamo namespace setup (see [Dynamo Cloud/Platform docs](../guides/dynamo_deploy/README.md))
2. **Storage and service account** PersistentVolumeClaim and service account configured with appropriate permissions (see [deploy/utils README](../../deploy/utils/README.md))
3. **Docker image** containing the Dynamo benchmarking tools
## Quick Start
### Step 1: Deploy Your DynamoGraphDeployment
Deploy your DynamoGraphDeployment using the [deployment documentation](../../components/backends/). Ensure it has a frontend service exposed.
### Step 2: Deploy and Run Benchmark Job
```bash
export NAMESPACE=benchmarking
# Deploy the benchmark job with default settings
kubectl apply -f benchmarks/incluster/benchmark_job.yaml -n $NAMESPACE
# Monitor the job, wait for it to complete
kubectl logs -f job/dynamo-benchmark -n $NAMESPACE
```
#### Customize the job configuration
To customize the benchmark parameters, edit the `benchmarks/incluster/benchmark_job.yaml` file and modify:
- **Model name**: Change `"Qwen/Qwen3-0.6B"` in the args section
- **Experiment name and service URL**: Change `"qwen-vllm-agg=vllm-agg-frontend:8000"` so the service URL matches your deployed service
- **Docker image**: Change the image field if needed
Then deploy:
```bash
kubectl apply -f benchmarks/incluster/benchmark_job.yaml -n $NAMESPACE
```
### Step 3: Retrieve Results
```bash
# Download results from PVC (recommended)
python3 -m deploy.utils.download_pvc_results \
--namespace $NAMESPACE \
--output-dir ./benchmarks/results/${INPUT_NAME} \
--folder /data/results/${INPUT_NAME} \
--no-config
```
## Cross-Namespace Service Access
Server-side benchmarking can benchmark services across multiple namespaces from a single job using Kubernetes DNS. When referencing services in other namespaces, use the full DNS format:
```bash
# Access service in same namespace
SERVICE_URL=vllm-agg-frontend:8000
# Access service in different namespace
SERVICE_URL=vllm-agg-frontend.production.svc.cluster.local:8000
```
**DNS Format**: `<service-name>.<namespace>.svc.cluster.local:port`
This allows you to:
- Benchmark multiple services across different namespaces in a single job
- Compare services running in different environments (dev, staging, production)
- Test cross-namespace integrations without port-forwarding
- Run comprehensive cross-namespace performance comparisons
## Configuration
The benchmark job is configured directly in the YAML file.
### Default Configuration
- **Model**: `Qwen/Qwen3-0.6B`
- **Service**: `qwen-vllm-agg=vllm-agg-frontend:8000`
- **Docker Image**: `nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.5.0`
### Customizing the Job
To customize the benchmark, edit `benchmarks/incluster/benchmark_job.yaml`:
1. **Change the model**: Update the `--model` argument
2. **Change the experiment name and/or service URL**: Update the `--input` argument (use `svc_name.namespace.svc.cluster.local:port` for cross-namespace access)
3. **Add multiple services**: Uncomment and add more `--input` lines
4. **Change Docker image**: Update the image field if needed
### Example: Multi-Namespace Benchmarking
To benchmark services across multiple namespaces, modify the `--input` arguments:
```yaml
args:
- --model
- "Qwen/Qwen3-0.6B"
- --isl
- "2000"
- --std
- "10"
- --osl
- "256"
- --output-dir
- /data/results
- --input
- "prod-vllm=vllm-agg-frontend.production.svc.cluster.local:8000"
- --input
- "staging-vllm=vllm-agg-frontend.staging.svc.cluster.local:8000"
- --input
- "dev-vllm=vllm-agg-frontend.development.svc.cluster.local:8000"
```
## Understanding Your Results
Results are stored in `/data/results` and follow the same structure as client-side benchmarking:
```text
/data/results/
├── plots/ # Performance visualization plots
│ ├── SUMMARY.txt # Human-readable benchmark summary
│ ├── p50_inter_token_latency_vs_concurrency.png
│ ├── avg_inter_token_latency_vs_concurrency.png
│ ├── request_throughput_vs_concurrency.png
│ ├── efficiency_tok_s_gpu_vs_user.png
│ └── avg_time_to_first_token_vs_concurrency.png
└── dsr1/ # Results for dsr1 input
├── c1/ # Concurrency level 1
│ └── profile_export_genai_perf.json
├── c2/ # Concurrency level 2
└── ... # Other concurrency levels
```
## Monitoring and Debugging
### Check Job Status
```bash
kubectl describe job dynamo-benchmark -n $NAMESPACE
```
### View Logs
```bash
# Follow logs in real-time
kubectl logs -f job/dynamo-benchmark -n $NAMESPACE
```
### Debug Failed Jobs
```bash
# Check pod status
kubectl get pods -n $NAMESPACE -l job-name=dynamo-benchmark
# Describe failed pod
kubectl describe pod <pod-name> -n $NAMESPACE
```
## Troubleshooting
### Common Issues
1. **Service not found**: Ensure your DynamoGraphDeployment frontend service is running
2. **Service account permissions**: Verify `dynamo-sa` has necessary RBAC permissions
3. **PVC access**: Check that `dynamo-pvc` is properly configured and accessible
4. **Image pull issues**: Ensure the Docker image is accessible from the cluster
5. **Resource constraints**: Adjust resource limits if the job is being evicted
### Debug Commands
```bash
# Check PVC status
kubectl get pvc dynamo-pvc -n $NAMESPACE
# Verify service account
kubectl get sa dynamo-sa -n $NAMESPACE
# Check service endpoints
kubectl get svc -n $NAMESPACE
# Verify your service exists and has endpoints
SVC_NAME="${SERVICE_URL%%:*}"
kubectl get svc "$SVC_NAME" -n "$NAMESPACE"
kubectl get endpoints "$SVC_NAME" -n "$NAMESPACE"
```
---
## Customize Benchmarking Behavior ## Customize Benchmarking Behavior
The built-in Python workflow connects to endpoints, benchmarks with genai-perf, and generates plots. If you want to modify the behavior: The built-in Python workflow connects to endpoints, benchmarks with genai-perf, and generates plots. If you want to modify the behavior:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment