Unverified Commit 699996e4 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: add benchmarking guide (#2620)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 3c4adde5
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Package marker for deploy.utils
......@@ -16,42 +16,51 @@
# limitations under the License.
"""
PVC Results Download Script
PVC Results Download Script (generic)
This script downloads all relevant profiling results from the profiling PVC to a local directory.
It creates the necessary access pod, downloads the files, and cleans up automatically.
Downloads files from a specified folder path inside a Kubernetes PVC into a local directory.
Creates an access pod, copies files, and exits. You can optionally exclude YAML configs.
Usage:
python3 download_pvc_results.py --namespace <namespace> --output-dir <local_directory> [--no-config]
Examples:
# Download to ./results directory
python3 download_pvc_results.py --namespace <namespace> --output-dir ./results
# Download to specific directory
python3 download_pvc_results.py --namespace <namespace> --output-dir /home/user/profiling_data
# Download without configuration files
python3 download_pvc_results.py --namespace <namespace> --output-dir ./results --no-config
python3 download_pvc_results.py --namespace <namespace> --output-dir <local_directory> \
--folder </absolute/folder/in/pvc> [--no-config]
"""
import argparse
import subprocess
import sys
import time
from pathlib import Path
from typing import List
from utils.kubernetes import check_kubectl_access, deploy_access_pod, run_command
try:
from deploy.utils.kubernetes import (
check_kubectl_access,
cleanup_access_pod,
deploy_access_pod,
run_command,
)
except ModuleNotFoundError:
# Allow running as a script: add repo root to sys.path
repo_root = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(repo_root))
from deploy.utils.kubernetes import (
check_kubectl_access,
cleanup_access_pod,
deploy_access_pod,
run_command,
)
def list_pvc_contents(
namespace: str, pod_name: str, skip_config: bool = False
namespace: str, pod_name: str, base_folder: str, skip_config: bool = False
) -> List[str]:
"""List contents of the PVC to identify relevant files."""
"""List contents of the PVC to identify files.
Downloads all files under base_folder. If skip_config is True, excludes *.yaml and *.yml.
"""
print("Scanning PVC contents...")
# Build find command with optional config file exclusion
# Build find command: all files
find_cmd = [
"kubectl",
"exec",
......@@ -60,44 +69,28 @@ def list_pvc_contents(
namespace,
"--",
"find",
"/profiling_results",
base_folder,
"-type",
"f",
"-name",
"*.png",
"-o",
"-name",
"*.npz",
]
# Add config file patterns if not skipping them
if not skip_config:
find_cmd.extend(
[
"-o",
"-name",
"*.yaml",
"-o",
"-name",
"*.yml",
]
)
# Exclude YAML files when requested
if skip_config:
find_cmd.extend(["-not", "-name", "*.yaml", "-not", "-name", "*.yml"])
try:
result = run_command(find_cmd, capture_output=True)
files = [f.strip() for f in result.stdout.split("\n") if f.strip()]
config_note = " (excluding config files)" if skip_config else ""
print(f"Found {len(files)} relevant files to download{config_note}")
print(f"Found {len(files)} files to download{config_note}")
return files
except subprocess.CalledProcessError:
print("ERROR: Failed to list PVC contents")
sys.exit(1)
def download_files(
namespace: str, pod_name: str, files: List[str], output_dir: Path
namespace: str, pod_name: str, files: List[str], output_dir: Path, base_folder: str
) -> None:
"""Download relevant files from PVC to local directory."""
if not files:
......@@ -113,8 +106,13 @@ def download_files(
for file_path in files:
try:
# Determine relative path and create local structure
rel_path = file_path.replace("/profiling_results/", "")
# Determine relative path and create local structure based on base_folder
prefix = base_folder.rstrip("/") + "/"
rel_path = (
file_path[len(prefix) :]
if file_path.startswith(prefix)
else file_path.lstrip("/")
)
# Validate relative path
if ".." in rel_path or rel_path.startswith("/"):
......@@ -154,143 +152,6 @@ def download_files(
print(f"✓ Download completed: {downloaded} successful, {failed} failed")
def download_summary_files(
    namespace: str, pod_name: str, output_dir: Path, skip_config: bool = False
) -> None:
    """Fetch the fixed set of top-level summary artifacts from the PVC.

    Each candidate is probed with `test -f` inside the access pod; candidates
    that are absent are skipped without a message. Present files are copied
    with `kubectl cp` into output_dir, mirroring their path relative to
    /profiling_results/.

    Args:
        namespace: Kubernetes namespace that hosts the access pod.
        pod_name: Name of the access pod used for exec/cp.
        output_dir: Local directory that receives the files.
        skip_config: When True, the injected disagg.yaml is not requested.
    """
    candidates = [
        "/profiling_results/prefill_performance.png",
        "/profiling_results/decode_performance.png",
    ]
    if not skip_config:
        # In case it was injected
        candidates.append("/profiling_results/disagg.yaml")

    print("Downloading summary files...")

    for remote_path in candidates:
        short_name = remote_path.split("/")[-1]
        try:
            # Probe for existence directly with subprocess.run so that a
            # non-zero exit status is reported via returncode, not raised.
            probe = subprocess.run(
                ["kubectl", "exec", pod_name, "-n", namespace, "--", "test", "-f", remote_path],
                capture_output=True,
                text=True,
                check=False,
            )
            if probe.returncode != 0:
                # Not present in the PVC: nothing to report.
                continue

            rel_path = remote_path.replace("/profiling_results/", "")

            # Reject anything that could climb out of the destination tree.
            if ".." in rel_path or rel_path.startswith("/"):
                print(f" ⚠️ Skipped {short_name}: potentially unsafe path")
                continue

            destination = output_dir / rel_path
            if not destination.resolve().is_relative_to(output_dir.resolve()):
                print(f" ⚠️ Skipped {short_name}: outside output directory")
                continue

            destination.parent.mkdir(parents=True, exist_ok=True)
            copy_cmd = [
                "kubectl",
                "cp",
                f"{namespace}/{pod_name}:{remote_path}",
                str(destination),
            ]
            run_command(copy_cmd, capture_output=True)
            print(f" ✓ {rel_path}")
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f" ⚠️ Skipped {short_name}: {e}")
def cleanup_access_pod(namespace: str, pod_name: str) -> None:
    """Tell the user how the access pod gets removed.

    Nothing is deleted here; the function only prints a notice that the pod
    will auto-delete, plus the kubectl command for deleting it right away.

    Args:
        namespace: Namespace the access pod lives in.
        pod_name: Name of the access pod.
    """
    notice = f"ℹ️ Access pod '{pod_name}' will auto-delete in 5 minutes"
    hint = f" To delete immediately: kubectl delete pod {pod_name} -n {namespace}"
    print(notice, hint, sep="\n")
def generate_readme(output_dir: Path, file_count: int) -> None:
    """Write README.md into output_dir describing the downloaded artifacts.

    The README is a fixed template; only the file count and the generation
    timestamp (local time) are filled in.

    Args:
        output_dir: Existing directory that receives README.md.
        file_count: Number of downloaded files to report in the README.
    """
    generated_at = time.strftime("%Y-%m-%d %H:%M:%S")
    readme_text = f"""# Profiling Results
Downloaded {file_count} files from profiling PVC.
## File Structure
### Performance Plots
- `prefill_performance.png` - Main prefill performance across TP sizes
- `decode_performance.png` - Main decode performance across TP sizes
### Interpolation Data
- `selected_prefill_interpolation/raw_data.npz` - Prefill performance data
- `selected_prefill_interpolation/*.png` - Prefill interpolation plots
- `selected_decode_interpolation/raw_data.npz` - Decode performance data
- `selected_decode_interpolation/*.png` - Decode interpolation plots
### Configuration Files
- `disagg.yaml` - DynamoGraphDeployment configuration used for profiling
### Individual TP Results
- `prefill_tp*/` - Individual tensor parallelism profiling results
- `decode_tp*/` - Individual tensor parallelism profiling results
## Loading Data
To load the .npz data files in Python:
```python
import numpy as np
# Load prefill data
prefill_data = np.load('selected_prefill_interpolation/raw_data.npz')
print("Prefill data keys:", list(prefill_data.keys()))
# Load decode data
decode_data = np.load('selected_decode_interpolation/raw_data.npz')
print("Decode data keys:", list(decode_data.keys()))
```
Generated on: {generated_at}
"""
    (output_dir / "README.md").write_text(readme_text)
    print("📝 Generated README.md with download summary")
def main():
parser = argparse.ArgumentParser(
description="Download profiling results from PVC to local directory",
......@@ -318,6 +179,11 @@ def main():
action="store_true",
help="Skip downloading configuration files (*.yaml, *.yml)",
)
parser.add_argument(
"--folder",
required=True,
help="Absolute folder path in the PVC to download, e.g. /profiling_results or /benchmarking_results",
)
args = parser.parse_args()
......@@ -329,19 +195,13 @@ def main():
# Deploy access pod
pod_name = deploy_access_pod(args.namespace)
try:
# List and download files
files = list_pvc_contents(args.namespace, pod_name, args.no_config)
download_files(args.namespace, pod_name, files, args.output_dir)
# Download additional summary files
download_summary_files(args.namespace, pod_name, args.output_dir, args.no_config)
# Generate README
generate_readme(args.output_dir, len(files))
# Cleanup info
cleanup_access_pod(args.namespace, pod_name)
files = list_pvc_contents(args.namespace, pod_name, args.folder, args.no_config)
download_files(args.namespace, pod_name, files, args.output_dir, args.folder)
finally:
# Cleanup
cleanup_access_pod(args.namespace)
print("\n✅ Download completed!")
print(f"📁 Results available at: {args.output_dir.absolute()}")
......
......@@ -15,6 +15,10 @@
import argparse
import asyncio
import os
import re
import subprocess
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
......@@ -39,6 +43,38 @@ EXAMPLE_CHAT_REQUEST = {
}
class ProgressDisplay:
    """Helper class for cleaner progress display during deployment waiting.

    In verbose mode every message is printed on its own line. In concise
    mode messages overwrite a single terminal line (carriage return plus the
    ANSI erase-line sequence) unless the caller asks for a newline.
    """

    def __init__(self, verbose: bool = False):
        self.verbose = verbose
        # Text currently shown on the in-place line; empty when none is active.
        self.last_message = ""
        self.spinner_chars = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
        self.spinner_idx = 0

    def update(self, message: str, newline: bool = False):
        """Update progress display.

        Args:
            message: Text to show.
            newline: Force the message onto its own line even in concise mode.
        """
        if self.verbose or newline:
            if not self.verbose and self.last_message:
                # An in-place line is still on screen; erase it first so the
                # full-line message is not appended to the spinner text.
                sys.stdout.write("\r\033[K")
                self.last_message = ""
            print(message)
        else:
            # Clear previous line and write new message
            sys.stdout.write(f"\r\033[K{message}")
            sys.stdout.flush()
            self.last_message = message

    def spinner(self) -> str:
        """Return the next spinner character, cycling through the frame list."""
        char = self.spinner_chars[self.spinner_idx]
        self.spinner_idx = (self.spinner_idx + 1) % len(self.spinner_chars)
        return char

    def finish(self, message: str):
        """Finish with a final message printed on its own line."""
        if not self.verbose and self.last_message:
            sys.stdout.write("\r\033[K")  # Clear the line
            self.last_message = ""
        print(message)
class DynamoDeploymentClient:
def __init__(
self,
......@@ -68,20 +104,84 @@ class DynamoDeploymentClient:
] = None # Will store the full deployment spec
self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs")
self.frontend_port = frontend_port
self.port_forward_process: Optional[subprocess.Popen[bytes]] = None
def _init_kubernetes(self):
async def _init_kubernetes(self):
"""Initialize kubernetes client"""
try:
# Try in-cluster config first (for pods with service accounts)
config.load_incluster_config()
except Exception:
# Fallback to kube config file (for local development)
config.load_kube_config()
await config.load_kube_config()
self.k8s_client = client.ApiClient()
self.custom_api = client.CustomObjectsApi(self.k8s_client)
self.core_api = client.CoreV1Api(self.k8s_client)
def port_forward_frontend(self, local_port: int = 8000, quiet: bool = False) -> str:
    """
    Port forward the frontend service to a local port.

    Starts `kubectl port-forward` as a background process, waits a fixed
    three seconds for it to come up, and verifies that it is still running
    before handing back the local URL.

    Args:
        local_port: Local port to forward to (default: 8000)
        quiet: If True, suppress kubectl port-forward output messages (default: False)

    Returns:
        Base URL of the forwarded frontend, http://localhost:<local_port>.

    Raises:
        RuntimeError: If kubectl is not found in PATH, or if the port-forward
            process exits during the startup wait.
    """
    # Only one forward is tracked per client. Stop a previous one so that
    # overwriting the handle below does not orphan a running kubectl process.
    if self.port_forward_process is not None:
        self.stop_port_forward()

    cmd = [
        "kubectl",
        "port-forward",
        f"svc/{self.service_name}",
        f"{local_port}:{self.frontend_port}",
        "-n",
        self.namespace,
    ]

    print(f"Starting port forward: {' '.join(cmd)}")

    # With quiet set, discard kubectl's "Handling connection for..." chatter;
    # otherwise the child inherits this process's stdout/stderr.
    sink = subprocess.DEVNULL if quiet else None

    # Start port forward in background
    try:
        self.port_forward_process = subprocess.Popen(cmd, stdout=sink, stderr=sink)
    except FileNotFoundError as e:
        raise RuntimeError(
            "kubectl not found in PATH; required for port-forwarding"
        ) from e

    # Wait a moment for port forward to establish
    print("Waiting for port forward to establish...")
    time.sleep(3)

    # A port-forward that has already exited will never serve requests;
    # fail here instead of returning a URL nothing is listening on.
    exit_code = self.port_forward_process.poll()
    if exit_code is not None:
        self.port_forward_process = None
        raise RuntimeError(
            f"kubectl port-forward exited during startup with code {exit_code} "
            f"(is local port {local_port} already in use, or the service missing?)"
        )

    print(f"Port forward started with PID: {self.port_forward_process.pid}")
    return f"http://localhost:{local_port}"
def stop_port_forward(self):
    """
    Stop the port forward process, if one is being tracked.

    Sends terminate first and allows five seconds for the process to exit;
    if it is still alive after that, it is killed and waited on. The tracked
    handle is cleared afterwards. Does nothing when no process is tracked.
    """
    proc = self.port_forward_process
    if not proc:
        return

    print(f"Stopping port forward process (PID: {proc.pid})")
    proc.terminate()
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # Graceful shutdown did not finish in time; force it.
        print("Port forward process did not terminate, killing it")
        proc.kill()
        proc.wait()
    else:
        print("Port forward stopped")
    self.port_forward_process = None
def get_service_url(self) -> str:
"""
Get the service URL using Kubernetes service DNS.
......@@ -97,7 +197,7 @@ class DynamoDeploymentClient:
Args:
deployment: Either a dict containing the deployment spec or a path to a yaml file
"""
self._init_kubernetes()
await self._init_kubernetes()
if isinstance(deployment, str):
# Load from yaml file
......@@ -107,6 +207,11 @@ class DynamoDeploymentClient:
else:
self.deployment_spec = deployment
# Ensure deployment_spec is properly loaded
assert (
self.deployment_spec is not None
), "Failed to load deployment specification"
# Extract component names
self.components = [
svc.lower() for svc in self.deployment_spec["spec"]["services"].keys()
......@@ -139,15 +244,30 @@ class DynamoDeploymentClient:
print(f"Failed to create deployment {self.deployment_name}: {e}")
raise
async def wait_for_deployment_ready(self, timeout: int = 1800):
async def wait_for_deployment_ready(
self, timeout: int = 1800, verbose: Optional[bool] = None
):
"""
Wait for the custom resource to be ready.
Wait for the custom resource to be ready with improved progress display.
Args:
timeout: Maximum time to wait in seconds, default to 30 mins (image pulling can take a while)
verbose: If True, show detailed status updates. If None, uses DYNAMO_VERBOSE env var.
"""
# Allow environment variable to control verbosity
if verbose is None:
verbose = os.environ.get("DYNAMO_VERBOSE", "false").lower() == "true"
progress = ProgressDisplay(verbose=verbose)
start_time = time.time()
# TODO: A little brittle, also should output intermediate status every so often.
last_status = None
last_conditions_str = ""
check_interval = 20 if not verbose else 10
# Initial message
if not verbose:
print(f"⏳ Waiting for deployment '{self.deployment_name}'...")
while (time.time() - start_time) < timeout:
try:
status = await self.custom_api.get_namespaced_custom_object(
......@@ -157,57 +277,129 @@ class DynamoDeploymentClient:
plural="dynamographdeployments",
name=self.deployment_name,
)
# Check both conditions:
# 1. Ready condition is True
# 2. State is successful
status_obj = status.get("status", {})
conditions = status_obj.get("conditions", [])
current_state = status_obj.get("state", "unknown")
elapsed = time.time() - start_time
print(f"Current deployment state: {current_state}")
print(f"Current conditions: {conditions}")
print(f"Elapsed time: {time.time() - start_time:.1f}s / {timeout}s")
# Check readiness
ready_condition = False
ready_message = ""
for condition in conditions:
if (
condition.get("type") == "Ready"
and condition.get("status") == "True"
):
ready_condition = True
if condition.get("type") == "Ready":
ready_condition = condition.get("status") == "True"
ready_message = condition.get("message", "")
break
state_successful = status_obj.get("state") == "successful"
state_successful = current_state == "successful"
# Extract not ready components from message
not_ready_components = []
if re.search(r"resources not ready:", ready_message, re.IGNORECASE):
match = re.search(r"\[(.*?)\]", ready_message)
if match:
items = match.group(1)
not_ready_components = [
s.strip() for s in re.split(r"[,\s]+", items) if s.strip()
]
# Format progress message based on mode
if not verbose:
# Concise single-line progress with spinner
spinner = progress.spinner()
# Create status string
if not_ready_components:
# Show first 2 components, abbreviate if more
components_str = ", ".join(not_ready_components[:2])
if len(not_ready_components) > 2:
components_str += f" +{len(not_ready_components)-2} more"
status_str = f"Waiting for: {components_str}"
else:
status_str = f"State: {current_state}"
# Format time
time_str = f"[{elapsed:.0f}s]"
message = f"{spinner} {time_str} {status_str}"
progress.update(message)
else:
# Verbose mode - show details when status changes
conditions_str = str(conditions)
if (
current_state != last_status
or conditions_str != last_conditions_str
):
progress.update(f"Current deployment state: {current_state}")
progress.update(f"Current conditions: {conditions}")
progress.update(f"Elapsed time: {elapsed:.1f}s / {timeout}s")
progress.update(
f"Deployment not ready yet - Ready: {ready_condition}, "
f"State successful: {state_successful}"
)
last_status = current_state
last_conditions_str = conditions_str
# Check if deployment is ready
if ready_condition and state_successful:
print(
"Deployment is ready: Ready condition is True and state is successful"
progress.finish(
f"✅ Deployment '{self.deployment_name}' ready after {elapsed:.1f}s"
)
return True
else:
print(
f"Deployment not ready yet - Ready condition: {ready_condition}, State successful: {state_successful}"
)
except kubernetes.client.rest.ApiException as e:
print(f"API Exception while checking deployment status: {e}")
print(f"Status code: {e.status}, Reason: {e.reason}")
if verbose:
progress.update(
f"API Exception while checking deployment status: {e}",
newline=True,
)
progress.update(
f"Status code: {e.status}, Reason: {e.reason}", newline=True
)
except Exception as e:
print(f"Unexpected exception while checking deployment status: {e}")
await asyncio.sleep(20)
raise TimeoutError("Deployment failed to become ready within timeout")
if verbose:
progress.update(
f"Unexpected exception while checking deployment status: {e}",
newline=True,
)
await asyncio.sleep(check_interval)
# Timeout reached
progress.finish(
f"❌ Deployment '{self.deployment_name}' failed to become ready within {timeout}s"
)
raise TimeoutError(f"Deployment failed to become ready within {timeout}s")
async def check_chat_completion(self):
async def check_chat_completion(
self,
use_port_forward: bool = False,
local_port: int = 8000,
quiet: bool = True,
timeout_s: float = 30.0,
):
"""
Test the deployment with a chat completion request using httpx.
"""
EXAMPLE_CHAT_REQUEST["model"] = self.model_name
# Use cluster DNS in-cluster; otherwise optionally port-forward
inside_cluster = bool(os.environ.get("KUBERNETES_SERVICE_HOST"))
base_url = self.get_service_url()
if use_port_forward or not inside_cluster:
base_url = self.port_forward_frontend(local_port=local_port, quiet=quiet)
url = f"{base_url}/v1/chat/completions"
async with httpx.AsyncClient() as client:
try:
async with httpx.AsyncClient(timeout=timeout_s) as client:
response = await client.post(url, json=EXAMPLE_CHAT_REQUEST)
response.raise_for_status()
return response.text
finally:
if use_port_forward or not inside_cluster:
self.stop_port_forward()
async def get_deployment_logs(self):
"""
......@@ -257,6 +449,10 @@ class DynamoDeploymentClient:
except kubernetes.client.rest.ApiException as e:
if e.status != 404: # Ignore if already deleted
raise
finally:
# Close the kubernetes client session to avoid warnings
if hasattr(self, "k8s_client"):
await self.k8s_client.close()
async def cleanup_remaining_deployments(deployment_clients, namespace):
......@@ -339,7 +535,7 @@ async def main():
# Test chat completion
print("Testing chat completion...")
response = await client.check_chat_completion()
response = await client.check_chat_completion(use_port_forward=True)
print(f"Chat completion response: {response}")
# Get logs
......
......@@ -16,51 +16,55 @@
# limitations under the License.
"""
Disagg Config Injection Script
Manifest Injection Script
This script copies a DynamoGraphDeployment disagg configuration file into the profiling PVC
so it can be used by the SLA profiler job. The profiler can then reference this config
using the DGD_CONFIG_FILE environment variable.
Copies any Kubernetes manifest file into the PVC for later use by jobs.
Both the source manifest path and destination path in the PVC are required.
Usage:
python3 inject_disagg_config.py --namespace <namespace> [--disagg-config <path>] [--target-path <path>]
python3 inject_manifest.py --namespace <namespace> --src <local_manifest.yaml> --dest <absolute_path_in_pvc>
Examples:
# Use default disagg.yaml from components/backends/vllm/deploy/
python3 inject_disagg_config.py --namespace <namespace>
# Use custom disagg config
python3 inject_disagg_config.py --namespace <namespace> --disagg-config ./my-custom-disagg.yaml
# Use custom target path in PVC
python3 inject_disagg_config.py --namespace <namespace> --target-path /profiling_results/custom-disagg.yaml
python3 inject_manifest.py --namespace <ns> --src ./my-disagg.yaml --dest /configs/disagg.yaml
python3 inject_manifest.py --namespace <ns> --src ./my-agg.yaml --dest /configs/agg.yaml
"""
import argparse
import sys
from pathlib import Path
from utils.kubernetes import check_kubectl_access, deploy_access_pod, run_command
from deploy.utils.kubernetes import (
PVC_ACCESS_POD_NAME,
check_kubectl_access,
cleanup_access_pod,
deploy_access_pod,
run_command,
)
def copy_disagg_config(
namespace: str, disagg_config_path: Path, target_path: str
) -> None:
"""Copy the disagg config file into the PVC via the access pod."""
pod_name = "pvc-access-pod"
def copy_manifest(namespace: str, manifest_path: Path, target_path: str) -> None:
"""Copy a manifest file into the PVC via the access pod."""
pod_name = PVC_ACCESS_POD_NAME
if not disagg_config_path.exists():
print(f"ERROR: Disagg config file not found: {disagg_config_path}")
if not manifest_path.exists():
print(f"ERROR: Manifest file not found: {manifest_path}")
sys.exit(1)
print(f"Copying {disagg_config_path} to {target_path} in PVC...")
print(f"Copying {manifest_path} to {target_path} in PVC...")
# Ensure destination directory exists
target_dir = str(Path(target_path).parent)
run_command(
["kubectl", "exec", pod_name, "-n", namespace, "--", "mkdir", "-p", target_dir],
capture_output=False,
)
# Copy file to pod
run_command(
[
"kubectl",
"cp",
str(disagg_config_path),
str(manifest_path),
f"{namespace}/{pod_name}:{target_path}",
],
capture_output=False,
......@@ -72,38 +76,13 @@ def copy_disagg_config(
capture_output=True,
)
print("✓ Disagg config successfully copied to PVC")
print("✓ Manifest successfully copied to PVC")
print(f"File details: {result.stdout.strip()}")
def cleanup_access_pod(namespace: str, keep_pod: bool = True) -> None:
    """Delete the access pod, or leave it running and explain how to use it.

    Args:
        namespace: Namespace the access pod lives in.
        keep_pod: When True (the default) nothing is deleted and usage hints
            are printed; when False the pod is deleted via kubectl.
    """
    if not keep_pod:
        print("Cleaning up access pod...")
        delete_cmd = [
            "kubectl",
            "delete",
            "pod",
            "pvc-access-pod",
            "-n",
            namespace,
            "--ignore-not-found",
        ]
        run_command(delete_cmd, capture_output=False)
        print("✓ Access pod deleted")
        return

    # Pod is kept: show how to reach the PVC through it and how to remove it.
    print("ℹ️ Access pod 'pvc-access-pod' left running for future use")
    exec_hint = f" To access PVC: kubectl exec -it pvc-access-pod -n {namespace} -- /bin/bash"
    print(exec_hint)
    print(f" To delete pod: kubectl delete pod pvc-access-pod -n {namespace}")
def main():
parser = argparse.ArgumentParser(
description="Inject disagg config into profiling PVC",
description="Inject a Kubernetes manifest into the PVC",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
......@@ -116,36 +95,28 @@ def main():
)
parser.add_argument(
"--disagg-config",
type=Path,
default=Path("components/backends/vllm/deploy/disagg.yaml"),
help="Path to disagg config file (default: components/backends/vllm/deploy/disagg.yaml)",
"--src", required=True, type=Path, help="Path to manifest file to copy"
)
parser.add_argument(
"--target-path",
default="/profiling_results/disagg.yaml",
help="Target path in PVC (default: /profiling_results/disagg.yaml)",
)
parser.add_argument(
"--cleanup",
action="store_true",
help="Delete the access pod after copying (default: keep running)",
"--dest",
required=True,
help="Absolute target path in PVC (e.g., /profiling_results/agg.yaml)",
)
args = parser.parse_args()
# Validate target_path to prevent directory traversal
if not args.target_path.startswith("/profiling_results/"):
print("ERROR: Target path must be within /profiling_results/")
if not args.dest.startswith("/"):
print(
"ERROR: Target path must be an absolute path inside the PVC (start with '/')."
)
sys.exit(1)
if ".." in args.target_path:
if ".." in args.dest:
print("ERROR: Target path cannot contain '..'")
sys.exit(1)
print("🚀 Disagg Config Injection")
print("🚀 Manifest Injection")
print("=" * 40)
# Validate inputs
......@@ -153,16 +124,14 @@ def main():
# Deploy access pod
deploy_access_pod(args.namespace)
# Copy disagg config
copy_disagg_config(args.namespace, args.disagg_config, args.target_path)
# Cleanup
cleanup_access_pod(args.namespace, keep_pod=not args.cleanup)
print("\n✅ Disagg config injection completed!")
print(f"📁 Config available at: {args.target_path}")
print(f"🔧 Set DGD_CONFIG_FILE=/workspace{args.target_path} in your profiler job")
try:
# Copy manifest
copy_manifest(args.namespace, args.src, args.dest)
print("\n✅ Manifest injection completed!")
print(f"📁 File available at: {args.dest}")
finally:
# Cleanup even on failure
cleanup_access_pod(args.namespace)
if __name__ == "__main__":
......
......@@ -15,10 +15,11 @@
import subprocess
import sys
import time
from pathlib import Path
from typing import List
PVC_ACCESS_POD_NAME = "pvc-access-pod"
def run_command(
cmd: List[str], capture_output: bool = True
......@@ -48,7 +49,6 @@ def check_kubectl_access(namespace: str) -> None:
def deploy_access_pod(namespace: str) -> str:
"""Deploy the PVC access pod and return pod name."""
pod_name = "pvc-access-pod"
# Check if pod already exists and is running
try:
......@@ -57,7 +57,7 @@ def deploy_access_pod(namespace: str) -> str:
"kubectl",
"get",
"pod",
pod_name,
PVC_ACCESS_POD_NAME,
"-n",
namespace,
"-o",
......@@ -69,17 +69,17 @@ def deploy_access_pod(namespace: str) -> str:
)
if result.returncode == 0 and result.stdout.strip() == "Running":
print(f"✓ Access pod '{pod_name}' already running")
return pod_name
print(f"✓ Access pod '{PVC_ACCESS_POD_NAME}' already running")
return PVC_ACCESS_POD_NAME
except Exception:
# Pod doesn't exist or isn't running
pass
print(f"Deploying access pod '{pod_name}' in namespace '{namespace}'...")
print(f"Deploying access pod '{PVC_ACCESS_POD_NAME}' in namespace '{namespace}'...")
# Get the directory where this script is located
script_dir = Path(__file__).parent.parent
pod_yaml_path = script_dir / "deploy" / "pvc-access-pod.yaml"
script_dir = Path(__file__).parent
pod_yaml_path = script_dir / "manifests" / "pvc-access-pod.yaml"
if not pod_yaml_path.exists():
print(f"ERROR: Pod YAML not found at {pod_yaml_path}")
......@@ -92,36 +92,34 @@ def deploy_access_pod(namespace: str) -> str:
)
print("Waiting for pod to be ready...")
# Wait for pod to be ready (up to 60 seconds)
for i in range(60):
try:
result = subprocess.run(
run_command(
[
"kubectl",
"get",
"pod",
pod_name,
"wait",
f"pod/{PVC_ACCESS_POD_NAME}",
"-n",
namespace,
"-o",
"jsonpath={.status.phase}",
"--for=condition=Ready",
"--timeout=60s",
],
capture_output=True,
text=True,
check=False,
capture_output=False,
)
if result.returncode == 0 and result.stdout.strip() == "Running":
print("✓ Access pod is ready")
return pod_name
except Exception:
pass
return PVC_ACCESS_POD_NAME
time.sleep(1)
if i % 10 == 0:
print(f" Still waiting... ({i+1}s)")
print("ERROR: Access pod failed to become ready within 60 seconds")
sys.exit(1)
def cleanup_access_pod(namespace: str) -> None:
    """Delete the PVC access pod from the given namespace.

    Args:
        namespace: Kubernetes namespace that the access pod lives in.

    The delete is issued with ``--ignore-not-found`` so that kubectl does not
    report an error when the pod is already gone.
    """
    print("Cleaning up access pod...")
    delete_cmd = [
        "kubectl",
        "delete",
        "pod",
        PVC_ACCESS_POD_NAME,
        "-n",
        namespace,
        "--ignore-not-found",
    ]
    # NOTE(review): capture_output=False presumably lets kubectl write straight
    # to the terminal; confirm against run_command's implementation.
    run_command(delete_cmd, capture_output=False)
    print("✓ Access pod deleted")
......@@ -37,5 +37,5 @@ spec:
volumes:
- name: profiling-storage
persistentVolumeClaim:
claimName: profiling-pvc
claimName: dynamo-pvc
restartPolicy: Never
......@@ -3,7 +3,7 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: profiling-pvc
name: dynamo-pvc
namespace: ${NAMESPACE}
spec:
accessModes:
......
......@@ -3,7 +3,7 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: profile-sla-role
name: dynamo-role
namespace: ${NAMESPACE}
rules:
# DynamoGraphDeployment custom resources - needed for create/get/delete operations
......@@ -17,3 +17,10 @@ rules:
- apiGroups: [""]
resources: ["pods/log"]
verbs: ["get"]
# Services and Deployments - needed for vLLM deployments
- apiGroups: [""]
resources: ["services"]
verbs: ["get", "create", "delete"]
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "create", "delete"]
......@@ -3,13 +3,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: profile-sla-binding
name: dynamo-binding
namespace: ${NAMESPACE}
subjects:
- kind: ServiceAccount
name: profile-sla-sa
name: dynamo-sa
namespace: ${NAMESPACE}
roleRef:
kind: Role
name: profile-sla-role
name: dynamo-role
apiGroup: rbac.authorization.k8s.io
......@@ -3,7 +3,7 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: profile-sla-sa
name: dynamo-sa
namespace: ${NAMESPACE}
imagePullSecrets:
- name: nvcr-imagepullsecret
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Kubernetes and async dependencies
aiofiles>=0.8.0
# Benchmarking dependencies for Dynamo
genai-perf==0.0.15
httpx>=0.24.0
kubernetes-asyncio>=24.0.0
# Plotting and visualization
matplotlib>=3.5.0
numpy>=1.21.0
pandas>=1.3.0
plotly>=5.0.0
# YAML processing
pyyaml>=6.0.0
scipy>=1.7.0
seaborn>=0.11.0
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Abort on command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail

# Resolve repo root relative to this script (the script lives two levels below it).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Inputs: each one may be supplied through the environment. NAMESPACE falls
# back to "default"; every other input falls back to the empty string.
: "${NAMESPACE:=default}"
: "${DOCKER_SERVER:=}"
: "${IMAGE_TAG:=}"
: "${DOCKER_USERNAME:=}"
: "${DOCKER_PASSWORD:=}"
: "${HF_TOKEN:=}"

# Name of the image pull secret created by this run; stays empty if none is created.
PULL_SECRET_NAME=""

# ANSI color escape sequences, kept as literal backslash text and expanded by `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# Print "<color>[<tag>]<reset> <message>" on stdout.
#   $1 = color escape sequence, $2 = tag text, remaining args = message words.
_emit_tagged() {
  local color="$1" tag="$2"
  shift 2
  echo -e "${color}[${tag}]${NC} $*"
}

# Level-specific loggers; each forwards all of its arguments as the message.
log() {
  _emit_tagged "$BLUE" "INFO" "$@"
}

ok() {
  _emit_tagged "$GREEN" "OK" "$@"
}

warn() {
  _emit_tagged "$YELLOW" "WARN" "$@"
}

err() {
  _emit_tagged "$RED" "ERROR" "$@"
}
# Create or update the "docker-imagepullsecret" registry secret in $NAMESPACE.
#   $1 = registry server, $2 = registry username, $3 = registry password/token
# Does nothing (and returns success) unless all three arguments are non-empty.
# On success, records the secret name in the global PULL_SECRET_NAME.
create_or_update_pull_secret() {
  local registry="$1"
  local username="$2"
  local password="$3"

  if [[ -z "$registry" || -z "$username" || -z "$password" ]]; then
    return 0
  fi

  log "Creating/updating docker-imagepullsecret for $registry in namespace $NAMESPACE"
  # Render the secret client-side and pipe it to `kubectl apply`, so an
  # existing secret is updated instead of causing an "already exists" error.
  kubectl create secret docker-registry docker-imagepullsecret \
    --docker-server="$registry" \
    --docker-username="$username" \
    --docker-password="$password" \
    --namespace="$NAMESPACE" \
    --dry-run=client -o yaml | kubectl apply -f -
  ok "docker-imagepullsecret configured"
  PULL_SECRET_NAME="docker-imagepullsecret"
}
# Print the help text for this script on stdout.
# The heredoc delimiter is quoted so the text is emitted verbatim: with an
# unquoted delimiter the trailing backslash in the second usage example is
# treated as a line continuation and silently removed.
usage() {
cat << 'EOF'
Usage:
NAMESPACE=<ns> deploy/utils/setup_k8s_namespace.sh
NAMESPACE=<ns> DOCKER_SERVER=<registry> IMAGE_TAG=<tag> [DOCKER_USERNAME=<user>] [DOCKER_PASSWORD=<token>] \
deploy/utils/setup_k8s_namespace.sh
Sets up Kubernetes namespace for Dynamo (one-time per namespace):
- Creates namespace if absent
- Applies common manifests (ServiceAccount, Role, RoleBinding, PVC)
- Installs CRDs once per cluster (if not already installed)
- If DOCKER_SERVER/IMAGE_TAG are provided:
* Builds/pushes a custom operator image with Earthly
* Installs/updates the operator Helm release using that image
* If credentials (DOCKER_USERNAME/DOCKER_PASSWORD) are provided, creates/updates docker-imagepullsecret
* If credentials are not provided, prompts interactively to create the pull secret
- Otherwise installs the operator using default image: nvcr.io/nvidia/ai-dynamo/kubernetes-operator:0.4.0
Environment variables:
NAMESPACE Target Kubernetes namespace (default: default)
DOCKER_SERVER Registry server for operator image (optional)
IMAGE_TAG Image tag for operator (optional)
DOCKER_USERNAME Registry username (optional; if provided with DOCKER_PASSWORD, secret is created)
DOCKER_PASSWORD Registry password/token (optional)
HF_TOKEN Hugging Face token; if set, a secret named hf-token-secret is created in the namespace (optional)
EOF
}

# Make the help text reachable: show it and exit before any cluster access
# when the first argument is -h or --help. Any other invocation is unaffected.
case "${1:-}" in
  -h|--help)
    usage
    exit 0
    ;;
esac
# kubectl is required for every step below; stop immediately when it is missing.
if ! command -v kubectl &>/dev/null; then
  err "kubectl not found"
  exit 1
fi

# 1) Ensure namespace exists
if kubectl get namespace "$NAMESPACE" &>/dev/null; then
  log "Namespace $NAMESPACE exists"
else
  log "Creating namespace $NAMESPACE"
  kubectl create namespace "$NAMESPACE"
fi
# 2) Apply common manifests
# Every *.yaml file in the manifests/ directory next to this script is passed
# through envsubst (to fill in ${NAMESPACE}) and applied with kubectl.
log "Applying common manifests to namespace $NAMESPACE"
manifest_dir="$(dirname "$0")/manifests"
manifest_files=("$manifest_dir"/*.yaml)
# An unmatched glob is left as the literal pattern; report that clearly
# instead of failing later with a confusing redirect error.
if [[ ! -e "${manifest_files[0]}" ]]; then
  err "No manifests found in $manifest_dir"
  exit 1
fi
for mf in "${manifest_files[@]}"; do
  # envsubst only sees *environment* variables. NAMESPACE is a plain shell
  # variable when it was defaulted inside this script, so pass it explicitly;
  # otherwise ${NAMESPACE} in the manifests would be replaced by an empty string.
  NAMESPACE="$NAMESPACE" envsubst < "$mf" | kubectl apply -f -
done
ok "Common manifests applied"
# 3) Install CRDs once per cluster (only if not already installed)
# Skipped silently when helm is unavailable or when a Helm release named
# dynamo-crds already exists in $NAMESPACE.
if command -v helm &>/dev/null && ! helm status dynamo-crds -n "$NAMESPACE" &>/dev/null; then
  log "Installing CRDs via Helm release dynamo-crds in namespace $NAMESPACE"
  # The chart path ./crds/ is relative to the helm directory.
  pushd "$REPO_ROOT/deploy/cloud/helm" >/dev/null
  helm upgrade --install dynamo-crds ./crds/ \
    --namespace "$NAMESPACE" \
    --wait \
    --atomic
  popd >/dev/null
  ok "CRDs installed"
fi
# 4) Optional: Create Hugging Face token secret if HF_TOKEN provided
if [[ -n "$HF_TOKEN" ]]; then
  # Render the secret client-side, then apply it so that an existing
  # hf-token-secret is updated rather than rejected as a duplicate.
  hf_secret_render_cmd=(
    kubectl create secret generic hf-token-secret
    --from-literal=HF_TOKEN="$HF_TOKEN"
    -n "$NAMESPACE"
    --dry-run=client -o yaml
  )
  "${hf_secret_render_cmd[@]}" | kubectl apply -f -
  ok "hf-token-secret created/updated"
fi
# 5) Optional: Create imagePullSecret for private registry if credentials provided or requested
# - Credentials supplied via DOCKER_USERNAME/DOCKER_PASSWORD: create the secret directly.
# - No credentials but a custom IMAGE_TAG: ask interactively whether a secret is needed.
if [[ -n "$DOCKER_SERVER" ]]; then
  if [[ -n "$DOCKER_USERNAME" && -n "$DOCKER_PASSWORD" ]]; then
    create_or_update_pull_secret "$DOCKER_SERVER" "$DOCKER_USERNAME" "$DOCKER_PASSWORD"
  elif [[ -n "$IMAGE_TAG" ]]; then
    echo
    # `read` fails at end-of-input (e.g. stdin is not a terminal); treat that
    # as the default answer "no" instead of letting `set -e` abort the script.
    read -r -p "Do you need image pull credentials for $DOCKER_SERVER (private registry)? [y/N]: " ans || ans=""
    if [[ "$ans" =~ ^[Yy]$ ]]; then
      # The dollar sign is escaped so the prompt shows the literal text
      # $oauthtoken; unescaped, it is expanded as a variable and aborts the
      # script under `set -u` because no such variable exists.
      read -r -p "Docker username (often '\$oauthtoken' for NGC): " DOCKER_USERNAME || DOCKER_USERNAME=""
      read -r -s -p "Docker password/token: " DOCKER_PASSWORD || DOCKER_PASSWORD=""
      # -s suppresses the echo of the typed newline, so emit one manually.
      echo
      if [[ -n "$DOCKER_USERNAME" && -n "$DOCKER_PASSWORD" ]]; then
        create_or_update_pull_secret "$DOCKER_SERVER" "$DOCKER_USERNAME" "$DOCKER_PASSWORD"
      else
        warn "Username or password empty; skipping secret creation"
      fi
    fi
  fi
fi
# 6) Operator: Build/push custom image if both vars provided, else use default NGC image
# First decide which operator image to deploy (building/pushing the custom one
# when requested), then run a single shared Helm install/upgrade sequence.
if [[ -n "$DOCKER_SERVER" && -n "$IMAGE_TAG" ]]; then
  use_custom_operator=true
  operator_repo="${DOCKER_SERVER}/dynamo-operator"
  operator_tag="${IMAGE_TAG}"
  helm_done_msg="Helm chart installed/updated"
  if command -v earthly &>/dev/null; then
    log "Building and pushing operator images via earthly"
    earthly --push +all-docker --DOCKER_SERVER="$DOCKER_SERVER" --IMAGE_TAG="$IMAGE_TAG"
  else
    warn "earthly not found; skipping operator build/push"
  fi
else
  # Use default published image when custom not provided
  use_custom_operator=false
  DEFAULT_OPERATOR_IMAGE="nvcr.io/nvidia/ai-dynamo/kubernetes-operator:0.4.0"
  # Split "repository:tag" at the last colon.
  operator_repo="${DEFAULT_OPERATOR_IMAGE%:*}"
  operator_tag="${DEFAULT_OPERATOR_IMAGE##*:}"
  helm_done_msg="Helm chart installed/updated with default operator image"
fi

if command -v helm &>/dev/null; then
  pushd "$REPO_ROOT/deploy/cloud/helm/platform" >/dev/null
  helm dep build
  popd >/dev/null

  pushd "$REPO_ROOT/deploy/cloud/helm" >/dev/null
  # Build Helm args
  HELM_ARGS=(
    upgrade dynamo-platform ./platform/ --install --namespace "$NAMESPACE"
    --set "dynamo-operator.controllerManager.manager.image.repository=${operator_repo}"
    --set "dynamo-operator.controllerManager.manager.image.tag=${operator_tag}"
  )
  # Custom image: use the pull secret created earlier in this run, if any.
  # Default image: only set imagePullSecrets if the referenced secret exists;
  # otherwise rely on SA.
  operator_pull_secret=""
  if [[ "$use_custom_operator" == true ]]; then
    operator_pull_secret="$PULL_SECRET_NAME"
  elif kubectl get secret nvcr-imagepullsecret -n "$NAMESPACE" &>/dev/null; then
    operator_pull_secret="nvcr-imagepullsecret"
  fi
  if [[ -n "$operator_pull_secret" ]]; then
    HELM_ARGS+=(--set "dynamo-operator.imagePullSecrets[0].name=${operator_pull_secret}")
  fi
  helm "${HELM_ARGS[@]}"
  popd >/dev/null
  ok "$helm_done_msg"
else
  warn "helm not found; skipping helm install"
fi
# 7) Install benchmark dependencies if requirements.txt exists
# Installer preference order: uv, then pip3, then pip.
REQUIREMENTS_FILE="$SCRIPT_DIR/requirements.txt"
if [[ -f "$REQUIREMENTS_FILE" ]]; then
  log "Installing benchmark dependencies..."
  deps_installed=true
  if command -v uv >/dev/null 2>&1; then
    uv pip install -r "$REQUIREMENTS_FILE"
  elif command -v pip3 >/dev/null 2>&1; then
    pip3 install -r "$REQUIREMENTS_FILE"
  elif command -v pip >/dev/null 2>&1; then
    pip install -r "$REQUIREMENTS_FILE"
  else
    deps_installed=false
    warn "No pip/pip3/uv found; skipping benchmark dependency installation"
    warn "To run benchmarks, manually install: pip install -r $REQUIREMENTS_FILE"
  fi
  # Report success only when an installer actually ran; previously the
  # success message was printed even after the "skipping" warnings.
  if [[ "$deps_installed" == true ]]; then
    ok "Benchmark dependencies installed"
  fi
fi
ok "Kubernetes namespace setup complete"
......@@ -24,7 +24,7 @@ There are two additional rules set by planner to prevent over-compensation:
## SLA-based Scaling Up/Down Prefill/Decode Workers
See [Pre-Deployment Profiling](pre_deployment_profiling.md) for more details.
See [Pre-Deployment Profiling](../benchmarks/pre_deployment_profiling.md) for more details.
## Usage
......
......@@ -71,6 +71,6 @@ Key features include:
:hidden:
Overview <self>
Pre-Deployment Profiling <pre_deployment_profiling.md>
Pre-Deployment Profiling <../benchmarks/pre_deployment_profiling.md>
Load-based Planner <load_planner.md>
SLA-based Planner <sla_planner.md>
Planner Benchmark <../guides/planner_benchmark/README.md>
\ No newline at end of file
......@@ -28,7 +28,7 @@ The SLA planner consists of several key components:
## Pre-Deployment Profiling
SLA-based planner requires pre-deployment profiling to operate. See [Pre-Deployment Profiling](pre_deployment_profiling.md) for more details.
SLA-based planner requires pre-deployment profiling to operate. See [Pre-Deployment Profiling](../benchmarks/pre_deployment_profiling.md) for more details.
## Load Prediction
......
<!-- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License. -->
# Dynamo Benchmarking Guide
This benchmarking framework lets you compare performance across any combination of:
- **DynamoGraphDeployments** (automatically deployed from your manifests)
- **External HTTP endpoints** (existing services, vLLM, TensorRT-LLM, etc.)
You can mix and match these in a single benchmark run using custom labels. Configure your DynamoGraphDeployment manifests for your specific models, hardware, and parallelization needs.
## What This Tool Does
The framework is a wrapper around `genai-perf` that:
- Deploys user-specified `DynamoGraphDeployments` automatically
- Benchmarks any HTTP endpoints (no deployment needed)
- Runs concurrency sweeps across configurable load levels
- Generates comparison plots with your custom labels
- Works with any HuggingFace-compatible model on NVIDIA GPUs (H200, H100, A100, etc.)
- Runs locally and connects to your Kubernetes deployments/endpoints
**Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`)
**Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The actual model loaded is determined by your deployment manifests. Only one model can be benchmarked at a time across all inputs to ensure fair comparison. The default `--model` value in the benchmarking script is `deepseek-ai/DeepSeek-R1-Distill-Llama-8B`, but it must match the model in the manifest(s) and the model deployed at the endpoint(s).
## Prerequisites
1. **Kubernetes cluster with NVIDIA GPUs and Dynamo namespace setup** - You need a Kubernetes cluster with eligible NVIDIA GPUs and a properly configured namespace for Dynamo benchmarking. See the [deploy/utils/README](../../deploy/utils/README.md) for complete setup instructions.
2. **kubectl access** - You need `kubectl` installed and configured to access your Kubernetes cluster. All other required tools (GenAI-Perf, Python, etc.) are included in the Dynamo containers. If you are not working within a Dynamo container, you can install the necessary requirements using `deploy/utils/requirements.txt`. *Note: if you are on Ubuntu 22.04 or earlier, you will also need to build perf_analyzer [from source](https://github.com/triton-inference-server/perf_analyzer/blob/main/docs/install.md#build-from-source).*
## Quick Start Examples
The tool can deploy, benchmark, and compare Dynamo deployments (DynamoGraphDeployments) on a Kubernetes cluster, and it can also benchmark and compare separately deployed servers given their URLs. In the examples below, Dynamo deployments are specified by a YAML manifest path, and separately deployed servers are specified by URL.
```bash
export NAMESPACE=benchmarking
# Compare multiple DynamoGraphDeployments of a single backend
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input agg=components/backends/vllm/deploy/agg.yaml \
--input disagg=components/backends/vllm/deploy/disagg.yaml
# Compare different backend types (vLLM vs TensorRT-LLM)
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input vllm-disagg=components/backends/vllm/deploy/disagg.yaml \
--input trtllm-disagg=components/backends/trtllm/deploy/disagg.yaml
# Compare Dynamo deployment vs existing deployment (external endpoint)
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input dynamo=components/backends/vllm/deploy/disagg.yaml \
--input vllm-baseline=http://localhost:8000
# Compare three different configurations
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input dynamo-agg=components/backends/vllm/deploy/agg.yaml \
--input dynamo-disagg=components/backends/vllm/deploy/disagg.yaml \
--input external-vllm=http://localhost:8000
# Benchmark single external endpoint
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input production-api=http://your-api:8000
# Custom model and sequence lengths
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input my-setup=my-custom-manifest.yaml \
--model "meta-llama/Meta-Llama-3-8B" --isl 512 --osl 256
```
**Key**: Configure your manifests for your specific models, hardware, and parallelization strategy before benchmarking.
### Important: Image Accessibility
Ensure container images in your DynamoGraphDeployment manifests are accessible:
- **Public images**: Use [Dynamo NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts) public releases
- **Custom registries**: Configure proper credentials in your Kubernetes namespace
## Configuration and Usage
### Command Line Options
```bash
./benchmarks/benchmark.sh --namespace NAMESPACE --input <label>=<manifest_path_or_endpoint> [--input <label>=<manifest_path_or_endpoint>]... [OPTIONS]
REQUIRED:
-n, --namespace NAMESPACE Kubernetes namespace
--input <label>=<manifest_path_or_endpoint> Benchmark input with custom label
- <label>: becomes the name/label in plots
- <manifest_path_or_endpoint>: either a DynamoGraphDeployment manifest or HTTP endpoint URL
Can be specified multiple times for comparisons
OPTIONS:
-h, --help Show help message and examples
-m, --model MODEL Model name for GenAI-Perf configuration and logging (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)
NOTE: This must match the model configured in your deployment manifests and endpoints
-i, --isl LENGTH Input sequence length (default: 2000)
-s, --std STDDEV Input sequence standard deviation (default: 10)
-o, --osl LENGTH Output sequence length (default: 256)
-d, --output-dir DIR Output directory (default: ./benchmarks/results)
--verbose Enable verbose output
```
### Important Notes
- **Custom Labels**: Each input must have a unique label that becomes the name in plots and results
- **Label Restrictions**: Labels can only contain letters, numbers, hyphens, and underscores. The label `plots` is reserved.
- **Input Types**: Supports DynamoGraphDeployment manifests for automatic deployment, or HTTP endpoints for existing services
- **Model Parameter**: The `--model` parameter configures GenAI-Perf for testing and logging, not deployment (deployment model is determined by the manifest files)
- **Standalone Deployments**: For non-Dynamo backends (vLLM, TensorRT-LLM, SGLang, etc.), you must deploy them manually following their respective Kubernetes deployment guides. The benchmarking framework only supports automatic deployment of DynamoGraphDeployments.
- **Single Model Requirement**: Only one model can be benchmarked at a time across all inputs to ensure fair comparison.
### What Happens During Benchmarking
The script automatically:
1. **Deploys** each DynamoGraphDeployment configuration to Kubernetes if manifests are passed in
2. **Benchmarks** using GenAI-Perf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
3. **Measures** key metrics: latency, throughput, time-to-first-token
4. **Generates** comparison plots using your custom labels in `./benchmarks/results/plots/`
5. **Cleans up** deployments when complete
### Results Clearing Behavior
**Important**: The benchmark script automatically clears the output directory before each run to ensure clean, reproducible results. This means:
- Previous benchmark results in the same output directory will be completely removed
- Each benchmark run starts with a clean slate
- Results from different runs are not mixed or accumulated
If you want to preserve results from previous runs, use different output directories with the `--output-dir` flag.
### Using Your Own Models and Configuration
The benchmarking framework supports any HuggingFace-compatible LLM model. To benchmark your own custom deployment:
1. **Edit your deployment YAML files** to specify your model in the `--model` argument of the container command
2. **Use the corresponding model name** in the benchmark script's `--model` parameter
**Note**: You can override the default sequence lengths (2000/256 tokens) with `--isl` and `--osl` flags if needed for your specific workload.
### Direct Python Execution
For direct control over the benchmark workflow:
```bash
# Endpoint benchmarking
python3 -u -m benchmarks.utils.benchmark \
--endpoint "http://your-endpoint:8000" \
--namespace $NAMESPACE \
--isl 2000 \
--std 10 \
--osl 256 \
--output-dir $OUTPUT_DIR
# Deployment benchmarking (any combination)
python3 -u -m benchmarks.utils.benchmark \
--input agg=$AGG_CONFIG \
--input disagg=$DISAGG_CONFIG \
--namespace $NAMESPACE \
--isl 2000 \
--std 10 \
--osl 256 \
--output-dir $OUTPUT_DIR
# Generate plots separately
python3 -m benchmarks.utils.plot --data-dir $OUTPUT_DIR
```
### Comparison Limitations
The plotting system supports up to 12 different inputs in a single comparison. If you need to compare more than 12 different deployments/endpoints, consider running separate benchmark sessions or grouping related comparisons together.
### Concurrency Configuration
You can customize the concurrency levels using the CONCURRENCIES environment variable:
```bash
# Custom concurrency levels
CONCURRENCIES="1,5,20,50" ./benchmarks/benchmark.sh --namespace $NAMESPACE --input my-test=components/backends/vllm/deploy/disagg.yaml
# Or set permanently
export CONCURRENCIES="1,2,5,10,25,50,100"
./benchmarks/benchmark.sh --namespace $NAMESPACE --input test=disagg.yaml
```
## Understanding Your Results
After benchmarking completes, check `./benchmarks/results/` (or your custom output directory):
### Summary and Plots
```text
benchmarks/results/
├── SUMMARY.txt # Quick overview of all results
└── plots/ # Visual comparisons (these are what you want!)
├── p50_inter_token_latency_vs_concurrency.png # Token generation speed
├── avg_time_to_first_token_vs_concurrency.png # Response time
├── request_throughput_vs_concurrency.png # Requests per second
├── efficiency_tok_s_gpu_vs_user.png # GPU efficiency
└── avg_inter_token_latency_vs_concurrency.png # Average latency
```
### Data Files
Raw data is organized by deployment/benchmark type and concurrency level:
**For any benchmark run (directory names come from your custom labels):**
```text
benchmarks/results/
├── plots/ # Performance visualization plots
│ ├── SUMMARY.txt # Human-readable benchmark summary
│ ├── p50_inter_token_latency_vs_concurrency.png
│ ├── avg_inter_token_latency_vs_concurrency.png
│ ├── request_throughput_vs_concurrency.png
│ ├── efficiency_tok_s_gpu_vs_user.png
│ └── avg_time_to_first_token_vs_concurrency.png
├── <your-label-1>/ # Results for first input (uses your custom label)
│ ├── c1/ # Concurrency level 1
│ │ └── profile_export_genai_perf.json
│ ├── c2/ # Concurrency level 2
│ ├── c5/ # Concurrency level 5
│ └── ... # Other concurrency levels (10, 50, 100, 250)
├── <your-label-2>/ # Results for second input (if provided)
│ └── c*/ # Same structure as above
└── <your-label-N>/ # Results for additional inputs
└── c*/ # Same structure as above
```
**Example with actual labels:**
```text
benchmarks/results/
├── plots/
├── dynamo-agg/ # --input dynamo-agg=agg.yaml
├── dynamo-disagg/ # --input dynamo-disagg=disagg.yaml
└── external-vllm/ # --input external-vllm=http://localhost:8000
```
Each concurrency directory contains:
- **`profile_export_genai_perf.json`** - Structured metrics from GenAI-Perf
- **`profile_export.json`** - Raw GenAI-Perf results
- **`inputs.json`** - Generated test inputs
## Customize Benchmarking Behavior
The built-in workflow handles DynamoGraphDeployment deployment, benchmarking with genai-perf, and plot generation automatically. If you want to modify the behavior:
1. **Extend the workflow**: Modify `benchmarks/utils/workflow.py` to add custom deployment types or metrics collection
2. **Generate different plots**: Modify `benchmarks/utils/plot.py` to generate a different set of plots for whatever you wish to visualize.
The `benchmark.sh` script provides a complete end-to-end benchmarking experience. For more granular control, use the Python modules directly.
......@@ -71,22 +71,29 @@ SLA planner can work with any interpolation data that follows the above format.
## Running the Profiling Script in Kubernetes
Set your environment variables:
Set up your Kubernetes namespace (one-time per namespace). Follow the instructions [here](../../deploy/utils/README.md#kubernetes-setup-one-time-per-namespace). If your namespace is already set up, skip this step.
**Prerequisites**: Ensure all dependencies are installed. If you ran the setup script above, dependencies are already installed. Otherwise, install them manually:
```bash
export NAMESPACE=your-namespace
pip install -r deploy/utils/requirements.txt
```
**Optional Step 0: add a kubernetes secret**
### Step 1: Inject your DGD configuration
Use the injector utility to place your DGD manifest into the PVC. The profiling job will read the path you specify.
```bash
kubectl create secret docker-registry nvcr-imagepullsecret \
--docker-server=nvcr.io \
--docker-username='$oauthtoken' \
--docker-password=<nvapi key> \
-n $NAMESPACE
# Inject your disagg manifest
python3 deploy/utils/inject_manifest.py \
--namespace $NAMESPACE \
--src components/backends/vllm/deploy/disagg.yaml \
--dest /configs/disagg.yaml
# Set the Docker image for the profiling job; any image that contains your script will work.
export DOCKER_IMAGE=nvcr.io/nvidia/dynamo:latest-vllm
```
**Step 1: Configure container image**
### Configure container image (optional)
You have two options for configuring your profiling setup:
......@@ -102,13 +109,13 @@ Use the default pre-built image and inject custom configurations via PVC:
2. **Inject your custom disagg configuration:**
```bash
# Use default disagg.yaml config
python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE
python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src components/backends/vllm/deploy/disagg.yaml --dest /configs/disagg.yaml
# Or use a custom disagg config file
python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE --disagg-config my-custom-disagg.yaml
python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src my-custom-disagg.yaml --dest /configs/disagg.yaml
# Or specify a custom target path in the PVC
python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE --target-path /profiling_results/my-disagg.yaml
python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src my-custom-disagg.yaml --dest /profiling_results/my-disagg.yaml
```
3. **Set the config path for the profiling job:**
......@@ -123,19 +130,6 @@ This approach allows you to:
> **Important**: For profiling, disagg configs should be run with Grove disabled by adding the annotation `nvidia.com/enable-grove: "false"` to avoid alpha Grove status issues.
> **Note**: The default location in the PVC is `/profiling_results/disagg.yaml`. If you don't inject a config, the profiler will fall back to the built-in config at `/workspace/components/backends/vllm/deploy/disagg.yaml`.
**Option B: Build custom image (only if you need code changes)**
Only needed if you require custom code modifications beyond configuration changes:
```bash
# in the project's root folder
./container/build.sh --framework <VLLM/sglang>
# Tag and push to your container registry
export DOCKER_IMAGE=<your docker tag>
export DGD_CONFIG_FILE=<disagg config path> # path to your disagg.yaml file within the DOCKER_IMAGE
```
**Step 2: Set SLA target**
Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL. Also, set the backend type to `vllm` or `sglang`. The backend type must match the dynamo deployment in the `DGD_CONFIG_FILE`.
......@@ -162,12 +156,7 @@ spec:
**Step 3: Run profiling (required)**
```bash
cd $DYNAMO_HOME/benchmarks/profiler/deploy
envsubst < profiling_pvc.yaml | kubectl apply -f -
envsubst < profile_sla_sa.yaml | kubectl apply -f -
envsubst < profile_sla_rbac.yaml | kubectl apply -f -
envsubst < profile_sla_binding.yaml | kubectl apply -f -
envsubst < profile_sla_job.yaml | kubectl apply -f -
envsubst < benchmarks/profiler/deploy/profile_sla_job.yaml | kubectl apply -f -
```
**Step 4: Wait for profiling to complete**
......@@ -176,40 +165,24 @@ kubectl get jobs -n $NAMESPACE
kubectl logs job/profile-sla -n $NAMESPACE
```
### RBAC Configuration
The SLA profiling job requires specific Kubernetes permissions to manage DynamoGraphDeployment resources and access namespace information. The RBAC setup consists of:
- **`profile_sla_sa.yaml`** - Service account with image pull secret for NVIDIA Container Registry access
- **`profile_sla_rbac.yaml`** - Role defining required permissions for managing deployments and accessing namespace resources
- **`profile_sla_binding.yaml`** - RoleBinding that associates the Role with the service account
All three files are necessary:
1. The service account provides identity and image pull credentials
2. The Role defines what operations are allowed
3. The RoleBinding connects the permissions to the service account
### Viewing Profiling Results
After the profiling job completes successfully, the results are stored in the persistent volume claim (PVC) created during Step 2. Here's how to access and view your profiling results:
#### Accessing the Profiling Results PVC
After the profiling job completes successfully, the results are stored in the persistent volume claim (PVC) created during Step 2.
The profiling results are stored in a PVC named `profiling-pvc`. To access the results:
To download the results:
1. **Deploy the PVC access pod (if not already running):**
```bash
kubectl apply -f benchmarks/profiler/deploy/pvc-access-pod.yaml -n $NAMESPACE
```
```bash
# Download to the ./results directory
python3 deploy/utils/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --folder /profiling_results
2. **Access the PVC through the pod:**
```bash
kubectl exec -it pvc-access-pod -n $NAMESPACE -- /bin/bash
cd /profiling_results
ls -la
```
# Download without any of the auto-created config.yaml files used in profiling
python3 deploy/utils/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --folder /profiling_results --no-config
```
> **Note**: The same `pvc-access-pod` is used for both injecting disagg configs and accessing results. If you used the `inject_disagg_config.py` script earlier, the pod may already be running. The pod auto-deletes 5 minutes after it starts.
The script will:
* Deploy a temporary access pod
* Download all files maintaining directory structure
* Clean the pod up automatically
#### File Structure
......@@ -231,62 +204,6 @@ The profiling results directory contains the following structure:
└── decode_tp{best_tp}.png # 3D ITL surface plot
```
#### Downloading Results Locally
You can download the profiling results using the automated download script or manually:
**Option 1: Automated Download (Recommended)**
Use the provided download script to automatically fetch all relevant files:
```bash
# Download to ./results directory
python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results
# Download to specific directory
python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir /path/to/my/results
# Download without any of the auto-created config.yaml files used in profiling
python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --no-config
```
The script will:
- Deploy a temporary access pod (auto-deletes after 5 minutes)
- Scan for relevant files (*.png, *.npz, *.yaml)
- Download all files maintaining directory structure
- Generate a README.md with file descriptions
- Clean up automatically
**Option 2: Manual Download**
To download the profiling results manually:
1. **Download performance plots and data files:**
```bash
# Create a local directory for results
mkdir -p ./profiling_results
# Copy main performance plots
kubectl cp pvc-access-pod:/profiling_results/prefill_performance.png ./profiling_results/ -n $NAMESPACE
kubectl cp pvc-access-pod:/profiling_results/decode_performance.png ./profiling_results/ -n $NAMESPACE
# Copy interpolation directories (includes additional plots and data)
kubectl cp pvc-access-pod:/profiling_results/selected_prefill_interpolation/ ./profiling_results/ -n $NAMESPACE -r
kubectl cp pvc-access-pod:/profiling_results/selected_decode_interpolation/ ./profiling_results/ -n $NAMESPACE -r
```
2. **Alternative: Tar and download entire results directory:**
```bash
# Inside the access pod, create a tar archive
tar -czf /profiling_results/profiling_results.tar.gz -C /profiling_results .
# Download the archive to your local machine
kubectl cp pvc-access-pod:/profiling_results/profiling_results.tar.gz ./profiling_results.tar.gz -n $NAMESPACE
# Extract locally
tar -xzf profiling_results.tar.gz -C ./profiling_results/
```
#### Viewing Performance Plots
The profiling generates several performance visualization files:
......@@ -316,20 +233,6 @@ decode_data = np.load('selected_decode_interpolation/raw_data.npz')
print("Decode data keys:", list(decode_data.keys()))
```
#### Cleaning Up
The access pod automatically deletes 5 minutes after it starts, but you can also clean it up manually:
```bash
# Exit the access pod (if still inside)
exit
# Delete the access pod immediately (optional - it will auto-delete)
kubectl delete pod pvc-access-pod -n $NAMESPACE
```
> **Note**: The access pod has `activeDeadlineSeconds: 300` and will auto-delete after 5 minutes to prevent resource waste.
### Troubleshooting
#### Image Pull Authentication Errors
......@@ -343,7 +246,7 @@ If you see `ErrImagePull` or `ImagePullBackOff` errors with 401 unauthorized mes
2. Verify the service account was created with the image pull secret:
```bash
kubectl get serviceaccount profile-sla-sa -n $NAMESPACE -o yaml
kubectl get serviceaccount dynamo-sa -n $NAMESPACE -o yaml
```
3. The service account should show `imagePullSecrets` containing `nvcr-imagepullsecret`.
......@@ -16,7 +16,6 @@ Quick deployment guide for the disaggregated planner with automatic scaling.
```mermaid
flowchart LR
Frontend --"/metrics"--> Prometheus
Prometheus --"scrape"--> Prometheus
Planner --"query API"--> Prometheus
Planner --"scaling decisions"--> Workers["prefill<br/>backend"]
Frontend -.->|"requests"| Workers
......@@ -25,7 +24,7 @@ flowchart LR
## Prerequisites
- Kubernetes cluster with GPU nodes
- `hf-token-secret` created in target namespace
- [Pre-Deployment Profiling](../../architecture/pre_deployment_profiling.md) results saved to `profiling-pvc` PVC.
- [Pre-Deployment Profiling](../../benchmarks/pre_deployment_profiling.md) results saved to `dynamo-pvc` PVC.
- Prefill and decode workers use the best parallelization mapping suggested by the pre-deployment profiling script.
```bash
......@@ -62,7 +61,7 @@ vllm-disagg-planner-prefill-* 1/1 Running
kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-frontend 8000:8000
# Send a streaming request (required for full metrics)
curl http://localhost:8000/v1/chat/completions \
curl -N http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-0.6B",
......@@ -101,8 +100,8 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
**Connection Issues:**
```bash
# Verify Prometheus is accessible (runs on port 8000)
kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 8000:8000
curl "http://localhost:8000/api/v1/query?query=up"
kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 9090:8000
curl "http://localhost:9090/api/v1/query?query=up"
```
**Missing Metrics:**
......
......@@ -21,7 +21,7 @@ Use the pre-configured test deployment with sample profiling data, we provide th
### Option B: Use Your Own Profiling Results
1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/architecture/pre_deployment_profiling.md) for detailed instructions.
1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/benchmarks/pre_deployment_profiling.md) for detailed instructions.
## Interpolator Testing
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment