Unverified commit 699996e4, authored by hhzhang16, committed by GitHub
Browse files

feat: add benchmarking guide (#2620)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 3c4adde5
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Package marker for deploy.utils
...@@ -16,42 +16,51 @@ ...@@ -16,42 +16,51 @@
# limitations under the License. # limitations under the License.
""" """
PVC Results Download Script PVC Results Download Script (generic)
This script downloads all relevant profiling results from the profiling PVC to a local directory. Downloads files from a specified folder path inside a Kubernetes PVC into a local directory.
It creates the necessary access pod, downloads the files, and cleans up automatically. Creates an access pod, copies files, and exits. You can optionally exclude YAML configs.
Usage: Usage:
python3 download_pvc_results.py --namespace <namespace> --output-dir <local_directory> [--no-config] python3 download_pvc_results.py --namespace <namespace> --output-dir <local_directory> \
--folder </absolute/folder/in/pvc> [--no-config]
Examples:
# Download to ./results directory
python3 download_pvc_results.py --namespace <namespace> --output-dir ./results
# Download to specific directory
python3 download_pvc_results.py --namespace <namespace> --output-dir /home/user/profiling_data
# Download without configuration files
python3 download_pvc_results.py --namespace <namespace> --output-dir ./results --no-config
""" """
import argparse import argparse
import subprocess import subprocess
import sys import sys
import time
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from utils.kubernetes import check_kubectl_access, deploy_access_pod, run_command try:
from deploy.utils.kubernetes import (
check_kubectl_access,
cleanup_access_pod,
deploy_access_pod,
run_command,
)
except ModuleNotFoundError:
# Allow running as a script: add repo root to sys.path
repo_root = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(repo_root))
from deploy.utils.kubernetes import (
check_kubectl_access,
cleanup_access_pod,
deploy_access_pod,
run_command,
)
def list_pvc_contents( def list_pvc_contents(
namespace: str, pod_name: str, skip_config: bool = False namespace: str, pod_name: str, base_folder: str, skip_config: bool = False
) -> List[str]: ) -> List[str]:
"""List contents of the PVC to identify relevant files.""" """List contents of the PVC to identify files.
Downloads all files under base_folder. If skip_config is True, excludes *.yaml and *.yml.
"""
print("Scanning PVC contents...") print("Scanning PVC contents...")
# Build find command with optional config file exclusion # Build find command: all files
find_cmd = [ find_cmd = [
"kubectl", "kubectl",
"exec", "exec",
...@@ -60,44 +69,28 @@ def list_pvc_contents( ...@@ -60,44 +69,28 @@ def list_pvc_contents(
namespace, namespace,
"--", "--",
"find", "find",
"/profiling_results", base_folder,
"-type", "-type",
"f", "f",
"-name",
"*.png",
"-o",
"-name",
"*.npz",
] ]
# Add config file patterns if not skipping them # Exclude YAML files when requested
if not skip_config: if skip_config:
find_cmd.extend( find_cmd.extend(["-not", "-name", "*.yaml", "-not", "-name", "*.yml"])
[
"-o",
"-name",
"*.yaml",
"-o",
"-name",
"*.yml",
]
)
try: try:
result = run_command(find_cmd, capture_output=True) result = run_command(find_cmd, capture_output=True)
files = [f.strip() for f in result.stdout.split("\n") if f.strip()] files = [f.strip() for f in result.stdout.split("\n") if f.strip()]
config_note = " (excluding config files)" if skip_config else "" config_note = " (excluding config files)" if skip_config else ""
print(f"Found {len(files)} relevant files to download{config_note}") print(f"Found {len(files)} files to download{config_note}")
return files return files
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
print("ERROR: Failed to list PVC contents") print("ERROR: Failed to list PVC contents")
sys.exit(1) sys.exit(1)
def download_files( def download_files(
namespace: str, pod_name: str, files: List[str], output_dir: Path namespace: str, pod_name: str, files: List[str], output_dir: Path, base_folder: str
) -> None: ) -> None:
"""Download relevant files from PVC to local directory.""" """Download relevant files from PVC to local directory."""
if not files: if not files:
...@@ -113,8 +106,13 @@ def download_files( ...@@ -113,8 +106,13 @@ def download_files(
for file_path in files: for file_path in files:
try: try:
# Determine relative path and create local structure # Determine relative path and create local structure based on base_folder
rel_path = file_path.replace("/profiling_results/", "") prefix = base_folder.rstrip("/") + "/"
rel_path = (
file_path[len(prefix) :]
if file_path.startswith(prefix)
else file_path.lstrip("/")
)
# Validate relative path # Validate relative path
if ".." in rel_path or rel_path.startswith("/"): if ".." in rel_path or rel_path.startswith("/"):
...@@ -154,143 +152,6 @@ def download_files( ...@@ -154,143 +152,6 @@ def download_files(
print(f"✓ Download completed: {downloaded} successful, {failed} failed") print(f"✓ Download completed: {downloaded} successful, {failed} failed")
def download_summary_files(
    namespace: str, pod_name: str, output_dir: Path, skip_config: bool = False
) -> None:
    """Fetch the top-level summary artifacts from the PVC, if they exist.

    Each candidate file is probed with ``kubectl exec ... test -f`` and, when
    present, copied with ``kubectl cp`` into ``output_dir``. Missing files are
    skipped without a message; any exception is reported as a skip.

    Args:
        namespace: Kubernetes namespace of the access pod.
        pod_name: Name of the access pod that mounts the PVC.
        output_dir: Local directory that receives the files.
        skip_config: When True, the ``disagg.yaml`` config is not requested.
    """
    remote_root = "/profiling_results/"
    candidates = [
        "/profiling_results/prefill_performance.png",
        "/profiling_results/decode_performance.png",
    ]
    if not skip_config:
        # The config may have been injected into the PVC separately
        candidates.append("/profiling_results/disagg.yaml")

    print("Downloading summary files...")

    for file_path in candidates:
        base_name = file_path.split("/")[-1]
        try:
            # Probe for existence; a non-zero exit simply means "not there"
            probe = subprocess.run(
                ["kubectl", "exec", pod_name, "-n", namespace, "--", "test", "-f", file_path],
                capture_output=True,
                text=True,
                check=False,
            )
            if probe.returncode != 0:
                continue

            rel_path = file_path.replace(remote_root, "")

            # Reject anything that could escape the output directory
            if rel_path.startswith("/") or ".." in rel_path:
                print(f" ⚠️ Skipped {base_name}: potentially unsafe path")
                continue

            local_file = output_dir / rel_path
            inside_output = local_file.resolve().is_relative_to(output_dir.resolve())
            if not inside_output:
                print(f" ⚠️ Skipped {base_name}: outside output directory")
                continue

            local_file.parent.mkdir(parents=True, exist_ok=True)
            copy_cmd = [
                "kubectl",
                "cp",
                f"{namespace}/{pod_name}:{file_path}",
                str(local_file),
            ]
            run_command(copy_cmd, capture_output=True)
            print(f" ✓ {rel_path}")
        except Exception as e:
            # Best effort: report the failure and move on to the next file
            print(f" ⚠️ Skipped {base_name}: {e}")
def cleanup_access_pod(namespace: str, pod_name: str) -> None:
    """Tell the user how the access pod goes away; nothing is deleted here.

    Args:
        namespace: Kubernetes namespace of the access pod.
        pod_name: Name of the access pod.
    """
    notices = (
        f"ℹ️ Access pod '{pod_name}' will auto-delete in 5 minutes",
        f" To delete immediately: kubectl delete pod {pod_name} -n {namespace}",
    )
    for notice in notices:
        print(notice)
def generate_readme(output_dir: Path, file_count: int) -> None:
    """Write ``README.md`` into ``output_dir`` describing the downloaded files.

    Args:
        output_dir: Directory that receives the README.
        file_count: Number of downloaded files, quoted in the README text.
    """
    generated_at = time.strftime('%Y-%m-%d %H:%M:%S')
    readme_content = f"""# Profiling Results
Downloaded {file_count} files from profiling PVC.
## File Structure
### Performance Plots
- `prefill_performance.png` - Main prefill performance across TP sizes
- `decode_performance.png` - Main decode performance across TP sizes
### Interpolation Data
- `selected_prefill_interpolation/raw_data.npz` - Prefill performance data
- `selected_prefill_interpolation/*.png` - Prefill interpolation plots
- `selected_decode_interpolation/raw_data.npz` - Decode performance data
- `selected_decode_interpolation/*.png` - Decode interpolation plots
### Configuration Files
- `disagg.yaml` - DynamoGraphDeployment configuration used for profiling
### Individual TP Results
- `prefill_tp*/` - Individual tensor parallelism profiling results
- `decode_tp*/` - Individual tensor parallelism profiling results
## Loading Data
To load the .npz data files in Python:
```python
import numpy as np
# Load prefill data
prefill_data = np.load('selected_prefill_interpolation/raw_data.npz')
print("Prefill data keys:", list(prefill_data.keys()))
# Load decode data
decode_data = np.load('selected_decode_interpolation/raw_data.npz')
print("Decode data keys:", list(decode_data.keys()))
```
Generated on: {generated_at}
"""
    # Text-mode write with the platform default encoding, same as open(..., "w")
    (output_dir / "README.md").write_text(readme_content)
    print("📝 Generated README.md with download summary")
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Download profiling results from PVC to local directory", description="Download profiling results from PVC to local directory",
...@@ -318,6 +179,11 @@ def main(): ...@@ -318,6 +179,11 @@ def main():
action="store_true", action="store_true",
help="Skip downloading configuration files (*.yaml, *.yml)", help="Skip downloading configuration files (*.yaml, *.yml)",
) )
parser.add_argument(
"--folder",
required=True,
help="Absolute folder path in the PVC to download, e.g. /profiling_results or /benchmarking_results",
)
args = parser.parse_args() args = parser.parse_args()
...@@ -329,19 +195,13 @@ def main(): ...@@ -329,19 +195,13 @@ def main():
# Deploy access pod # Deploy access pod
pod_name = deploy_access_pod(args.namespace) pod_name = deploy_access_pod(args.namespace)
try:
# List and download files # List and download files
files = list_pvc_contents(args.namespace, pod_name, args.no_config) files = list_pvc_contents(args.namespace, pod_name, args.folder, args.no_config)
download_files(args.namespace, pod_name, files, args.output_dir) download_files(args.namespace, pod_name, files, args.output_dir, args.folder)
finally:
# Download additional summary files # Cleanup
download_summary_files(args.namespace, pod_name, args.output_dir, args.no_config) cleanup_access_pod(args.namespace)
# Generate README
generate_readme(args.output_dir, len(files))
# Cleanup info
cleanup_access_pod(args.namespace, pod_name)
print("\n✅ Download completed!") print("\n✅ Download completed!")
print(f"📁 Results available at: {args.output_dir.absolute()}") print(f"📁 Results available at: {args.output_dir.absolute()}")
......
...@@ -15,6 +15,10 @@ ...@@ -15,6 +15,10 @@
import argparse import argparse
import asyncio import asyncio
import os
import re
import subprocess
import sys
import time import time
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Union
...@@ -39,6 +43,38 @@ EXAMPLE_CHAT_REQUEST = { ...@@ -39,6 +43,38 @@ EXAMPLE_CHAT_REQUEST = {
} }
class ProgressDisplay:
    """Render progress while waiting on a deployment.

    In non-verbose mode, regular updates overwrite a single terminal line
    (carriage return followed by the ANSI erase-line sequence). In verbose
    mode, or when an update asks for ``newline=True``, each message is printed
    on its own line.
    """

    def __init__(self, verbose: bool = False):
        """Create a display.

        Args:
            verbose: If True, every update is printed on its own line.
        """
        self.verbose = verbose
        self.last_message = ""
        self.spinner_chars = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
        self.spinner_idx = 0
        # True while an in-place message (written without a trailing newline)
        # is still sitting on the current terminal line.
        self._line_pending = False

    def _clear_pending_line(self) -> None:
        """Erase the in-place line, if one is currently displayed."""
        if self._line_pending:
            sys.stdout.write("\r\033[K")
            self._line_pending = False

    def update(self, message: str, newline: bool = False):
        """Show ``message``.

        Args:
            message: Text to display.
            newline: Force the message onto its own line even in non-verbose mode.
        """
        if self.verbose or newline:
            # Erase any spinner line first so the message does not get
            # appended to the end of it.
            self._clear_pending_line()
            print(message)
        else:
            # Overwrite the current line in place
            sys.stdout.write(f"\r\033[K{message}")
            sys.stdout.flush()
            self._line_pending = True
        self.last_message = message

    def spinner(self) -> str:
        """Return the next spinner character, cycling through ``spinner_chars``."""
        char = self.spinner_chars[self.spinner_idx]
        self.spinner_idx = (self.spinner_idx + 1) % len(self.spinner_chars)
        return char

    def finish(self, message: str):
        """Erase any in-place line and print ``message`` as the final line."""
        self._clear_pending_line()
        print(message)
class DynamoDeploymentClient: class DynamoDeploymentClient:
def __init__( def __init__(
self, self,
...@@ -68,20 +104,84 @@ class DynamoDeploymentClient: ...@@ -68,20 +104,84 @@ class DynamoDeploymentClient:
] = None # Will store the full deployment spec ] = None # Will store the full deployment spec
self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs") self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs")
self.frontend_port = frontend_port self.frontend_port = frontend_port
self.port_forward_process: Optional[subprocess.Popen[bytes]] = None
def _init_kubernetes(self): async def _init_kubernetes(self):
"""Initialize kubernetes client""" """Initialize kubernetes client"""
try: try:
# Try in-cluster config first (for pods with service accounts) # Try in-cluster config first (for pods with service accounts)
config.load_incluster_config() config.load_incluster_config()
except Exception: except Exception:
# Fallback to kube config file (for local development) # Fallback to kube config file (for local development)
config.load_kube_config() await config.load_kube_config()
self.k8s_client = client.ApiClient() self.k8s_client = client.ApiClient()
self.custom_api = client.CustomObjectsApi(self.k8s_client) self.custom_api = client.CustomObjectsApi(self.k8s_client)
self.core_api = client.CoreV1Api(self.k8s_client) self.core_api = client.CoreV1Api(self.k8s_client)
def port_forward_frontend(self, local_port: int = 8000, quiet: bool = False) -> str:
    """
    Port forward the frontend service to a local port.

    Starts ``kubectl port-forward`` as a background process and stores its
    handle in ``self.port_forward_process``. The call blocks for about three
    seconds (``time.sleep``) to give the forward time to come up.

    Args:
        local_port: Local port to forward to (default: 8000)
        quiet: If True, suppress kubectl port-forward output messages (default: False)

    Returns:
        The base URL of the forwarded service, ``http://localhost:<local_port>``.

    Raises:
        RuntimeError: If kubectl is not on PATH, or if the port-forward
            process exits before the wait completes.
    """
    # Do not leak (or collide with) a forward started by an earlier call
    if getattr(self, "port_forward_process", None) is not None:
        self.stop_port_forward()

    cmd = [
        "kubectl",
        "port-forward",
        f"svc/{self.service_name}",
        f"{local_port}:{self.frontend_port}",
        "-n",
        self.namespace,
    ]

    print(f"Starting port forward: {' '.join(cmd)}")

    # quiet suppresses kubectl's "Handling connection for..." messages;
    # otherwise the child inherits this process's stdout/stderr.
    sink = subprocess.DEVNULL if quiet else None

    # Start port forward in background
    try:
        self.port_forward_process = subprocess.Popen(cmd, stdout=sink, stderr=sink)
    except FileNotFoundError as e:
        raise RuntimeError(
            "kubectl not found in PATH; required for port-forwarding"
        ) from e

    # Wait a moment for port forward to establish
    print("Waiting for port forward to establish...")
    time.sleep(3)

    # A port-forward that already exited cannot serve traffic; fail here
    # instead of handing back a URL nothing is listening on.
    exit_code = self.port_forward_process.poll()
    if exit_code is not None:
        self.port_forward_process = None
        raise RuntimeError(
            f"kubectl port-forward exited early with code {exit_code}"
        )

    print(f"Port forward started with PID: {self.port_forward_process.pid}")
    return f"http://localhost:{local_port}"
def stop_port_forward(self):
    """
    Stop the port forward process, if one is recorded.

    Sends terminate, waits up to five seconds, and falls back to kill when the
    process has not exited by then. The stored handle is reset to None.
    """
    proc = self.port_forward_process
    if not proc:
        return

    print(f"Stopping port forward process (PID: {proc.pid})")
    proc.terminate()
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # Graceful shutdown timed out; force it and reap the process
        print("Port forward process did not terminate, killing it")
        proc.kill()
        proc.wait()
    else:
        print("Port forward stopped")
    self.port_forward_process = None
def get_service_url(self) -> str: def get_service_url(self) -> str:
""" """
Get the service URL using Kubernetes service DNS. Get the service URL using Kubernetes service DNS.
...@@ -97,7 +197,7 @@ class DynamoDeploymentClient: ...@@ -97,7 +197,7 @@ class DynamoDeploymentClient:
Args: Args:
deployment: Either a dict containing the deployment spec or a path to a yaml file deployment: Either a dict containing the deployment spec or a path to a yaml file
""" """
self._init_kubernetes() await self._init_kubernetes()
if isinstance(deployment, str): if isinstance(deployment, str):
# Load from yaml file # Load from yaml file
...@@ -107,6 +207,11 @@ class DynamoDeploymentClient: ...@@ -107,6 +207,11 @@ class DynamoDeploymentClient:
else: else:
self.deployment_spec = deployment self.deployment_spec = deployment
# Ensure deployment_spec is properly loaded
assert (
self.deployment_spec is not None
), "Failed to load deployment specification"
# Extract component names # Extract component names
self.components = [ self.components = [
svc.lower() for svc in self.deployment_spec["spec"]["services"].keys() svc.lower() for svc in self.deployment_spec["spec"]["services"].keys()
...@@ -139,15 +244,30 @@ class DynamoDeploymentClient: ...@@ -139,15 +244,30 @@ class DynamoDeploymentClient:
print(f"Failed to create deployment {self.deployment_name}: {e}") print(f"Failed to create deployment {self.deployment_name}: {e}")
raise raise
async def wait_for_deployment_ready(self, timeout: int = 1800): async def wait_for_deployment_ready(
self, timeout: int = 1800, verbose: Optional[bool] = None
):
""" """
Wait for the custom resource to be ready. Wait for the custom resource to be ready with improved progress display.
Args: Args:
timeout: Maximum time to wait in seconds, default to 30 mins (image pulling can take a while) timeout: Maximum time to wait in seconds, default to 30 mins (image pulling can take a while)
verbose: If True, show detailed status updates. If None, uses DYNAMO_VERBOSE env var.
""" """
# Allow environment variable to control verbosity
if verbose is None:
verbose = os.environ.get("DYNAMO_VERBOSE", "false").lower() == "true"
progress = ProgressDisplay(verbose=verbose)
start_time = time.time() start_time = time.time()
# TODO: A little brittle, also should output intermediate status every so often. last_status = None
last_conditions_str = ""
check_interval = 20 if not verbose else 10
# Initial message
if not verbose:
print(f"⏳ Waiting for deployment '{self.deployment_name}'...")
while (time.time() - start_time) < timeout: while (time.time() - start_time) < timeout:
try: try:
status = await self.custom_api.get_namespaced_custom_object( status = await self.custom_api.get_namespaced_custom_object(
...@@ -157,57 +277,129 @@ class DynamoDeploymentClient: ...@@ -157,57 +277,129 @@ class DynamoDeploymentClient:
plural="dynamographdeployments", plural="dynamographdeployments",
name=self.deployment_name, name=self.deployment_name,
) )
# Check both conditions:
# 1. Ready condition is True
# 2. State is successful
status_obj = status.get("status", {}) status_obj = status.get("status", {})
conditions = status_obj.get("conditions", []) conditions = status_obj.get("conditions", [])
current_state = status_obj.get("state", "unknown") current_state = status_obj.get("state", "unknown")
elapsed = time.time() - start_time
print(f"Current deployment state: {current_state}") # Check readiness
print(f"Current conditions: {conditions}")
print(f"Elapsed time: {time.time() - start_time:.1f}s / {timeout}s")
ready_condition = False ready_condition = False
ready_message = ""
for condition in conditions: for condition in conditions:
if ( if condition.get("type") == "Ready":
condition.get("type") == "Ready" ready_condition = condition.get("status") == "True"
and condition.get("status") == "True" ready_message = condition.get("message", "")
):
ready_condition = True
break break
state_successful = status_obj.get("state") == "successful" state_successful = current_state == "successful"
# Extract not ready components from message
not_ready_components = []
if re.search(r"resources not ready:", ready_message, re.IGNORECASE):
match = re.search(r"\[(.*?)\]", ready_message)
if match:
items = match.group(1)
not_ready_components = [
s.strip() for s in re.split(r"[,\s]+", items) if s.strip()
]
# Format progress message based on mode
if not verbose:
# Concise single-line progress with spinner
spinner = progress.spinner()
# Create status string
if not_ready_components:
# Show first 2 components, abbreviate if more
components_str = ", ".join(not_ready_components[:2])
if len(not_ready_components) > 2:
components_str += f" +{len(not_ready_components)-2} more"
status_str = f"Waiting for: {components_str}"
else:
status_str = f"State: {current_state}"
# Format time
time_str = f"[{elapsed:.0f}s]"
message = f"{spinner} {time_str} {status_str}"
progress.update(message)
else:
# Verbose mode - show details when status changes
conditions_str = str(conditions)
if (
current_state != last_status
or conditions_str != last_conditions_str
):
progress.update(f"Current deployment state: {current_state}")
progress.update(f"Current conditions: {conditions}")
progress.update(f"Elapsed time: {elapsed:.1f}s / {timeout}s")
progress.update(
f"Deployment not ready yet - Ready: {ready_condition}, "
f"State successful: {state_successful}"
)
last_status = current_state
last_conditions_str = conditions_str
# Check if deployment is ready
if ready_condition and state_successful: if ready_condition and state_successful:
print( progress.finish(
"Deployment is ready: Ready condition is True and state is successful" f"✅ Deployment '{self.deployment_name}' ready after {elapsed:.1f}s"
) )
return True return True
else:
print(
f"Deployment not ready yet - Ready condition: {ready_condition}, State successful: {state_successful}"
)
except kubernetes.client.rest.ApiException as e: except kubernetes.client.rest.ApiException as e:
print(f"API Exception while checking deployment status: {e}") if verbose:
print(f"Status code: {e.status}, Reason: {e.reason}") progress.update(
f"API Exception while checking deployment status: {e}",
newline=True,
)
progress.update(
f"Status code: {e.status}, Reason: {e.reason}", newline=True
)
except Exception as e: except Exception as e:
print(f"Unexpected exception while checking deployment status: {e}") if verbose:
await asyncio.sleep(20) progress.update(
raise TimeoutError("Deployment failed to become ready within timeout") f"Unexpected exception while checking deployment status: {e}",
newline=True,
)
await asyncio.sleep(check_interval)
# Timeout reached
progress.finish(
f"❌ Deployment '{self.deployment_name}' failed to become ready within {timeout}s"
)
raise TimeoutError(f"Deployment failed to become ready within {timeout}s")
async def check_chat_completion(self): async def check_chat_completion(
self,
use_port_forward: bool = False,
local_port: int = 8000,
quiet: bool = True,
timeout_s: float = 30.0,
):
""" """
Test the deployment with a chat completion request using httpx. Test the deployment with a chat completion request using httpx.
""" """
EXAMPLE_CHAT_REQUEST["model"] = self.model_name EXAMPLE_CHAT_REQUEST["model"] = self.model_name
# Use cluster DNS in-cluster; otherwise optionally port-forward
inside_cluster = bool(os.environ.get("KUBERNETES_SERVICE_HOST"))
base_url = self.get_service_url() base_url = self.get_service_url()
if use_port_forward or not inside_cluster:
base_url = self.port_forward_frontend(local_port=local_port, quiet=quiet)
url = f"{base_url}/v1/chat/completions" url = f"{base_url}/v1/chat/completions"
async with httpx.AsyncClient() as client: try:
response = await client.post(url, json=EXAMPLE_CHAT_REQUEST) async with httpx.AsyncClient(timeout=timeout_s) as client:
response.raise_for_status() response = await client.post(url, json=EXAMPLE_CHAT_REQUEST)
return response.text response.raise_for_status()
return response.text
finally:
if use_port_forward or not inside_cluster:
self.stop_port_forward()
async def get_deployment_logs(self): async def get_deployment_logs(self):
""" """
...@@ -257,6 +449,10 @@ class DynamoDeploymentClient: ...@@ -257,6 +449,10 @@ class DynamoDeploymentClient:
except kubernetes.client.rest.ApiException as e: except kubernetes.client.rest.ApiException as e:
if e.status != 404: # Ignore if already deleted if e.status != 404: # Ignore if already deleted
raise raise
finally:
# Close the kubernetes client session to avoid warnings
if hasattr(self, "k8s_client"):
await self.k8s_client.close()
async def cleanup_remaining_deployments(deployment_clients, namespace): async def cleanup_remaining_deployments(deployment_clients, namespace):
...@@ -339,7 +535,7 @@ async def main(): ...@@ -339,7 +535,7 @@ async def main():
# Test chat completion # Test chat completion
print("Testing chat completion...") print("Testing chat completion...")
response = await client.check_chat_completion() response = await client.check_chat_completion(use_port_forward=True)
print(f"Chat completion response: {response}") print(f"Chat completion response: {response}")
# Get logs # Get logs
......
...@@ -16,51 +16,55 @@ ...@@ -16,51 +16,55 @@
# limitations under the License. # limitations under the License.
""" """
Disagg Config Injection Script Manifest Injection Script
This script copies a DynamoGraphDeployment disagg configuration file into the profiling PVC Copies any Kubernetes manifest file into the PVC for later use by jobs.
so it can be used by the SLA profiler job. The profiler can then reference this config Both the source manifest path and destination path in the PVC are required.
using the DGD_CONFIG_FILE environment variable.
Usage: Usage:
python3 inject_disagg_config.py --namespace <namespace> [--disagg-config <path>] [--target-path <path>] python3 inject_manifest.py --namespace <namespace> --src <local_manifest.yaml> --dest <absolute_path_in_pvc>
Examples: Examples:
# Use default disagg.yaml from components/backends/vllm/deploy/ python3 inject_manifest.py --namespace <ns> --src ./my-disagg.yaml --dest /configs/disagg.yaml
python3 inject_disagg_config.py --namespace <namespace> python3 inject_manifest.py --namespace <ns> --src ./my-agg.yaml --dest /configs/agg.yaml
# Use custom disagg config
python3 inject_disagg_config.py --namespace <namespace> --disagg-config ./my-custom-disagg.yaml
# Use custom target path in PVC
python3 inject_disagg_config.py --namespace <namespace> --target-path /profiling_results/custom-disagg.yaml
""" """
import argparse import argparse
import sys import sys
from pathlib import Path from pathlib import Path
from utils.kubernetes import check_kubectl_access, deploy_access_pod, run_command from deploy.utils.kubernetes import (
PVC_ACCESS_POD_NAME,
check_kubectl_access,
cleanup_access_pod,
deploy_access_pod,
run_command,
)
def copy_disagg_config( def copy_manifest(namespace: str, manifest_path: Path, target_path: str) -> None:
namespace: str, disagg_config_path: Path, target_path: str """Copy a manifest file into the PVC via the access pod."""
) -> None: pod_name = PVC_ACCESS_POD_NAME
"""Copy the disagg config file into the PVC via the access pod."""
pod_name = "pvc-access-pod"
if not disagg_config_path.exists(): if not manifest_path.exists():
print(f"ERROR: Disagg config file not found: {disagg_config_path}") print(f"ERROR: Manifest file not found: {manifest_path}")
sys.exit(1) sys.exit(1)
print(f"Copying {disagg_config_path} to {target_path} in PVC...") print(f"Copying {manifest_path} to {target_path} in PVC...")
# Ensure destination directory exists
target_dir = str(Path(target_path).parent)
run_command(
["kubectl", "exec", pod_name, "-n", namespace, "--", "mkdir", "-p", target_dir],
capture_output=False,
)
# Copy file to pod # Copy file to pod
run_command( run_command(
[ [
"kubectl", "kubectl",
"cp", "cp",
str(disagg_config_path), str(manifest_path),
f"{namespace}/{pod_name}:{target_path}", f"{namespace}/{pod_name}:{target_path}",
], ],
capture_output=False, capture_output=False,
...@@ -72,38 +76,13 @@ def copy_disagg_config( ...@@ -72,38 +76,13 @@ def copy_disagg_config(
capture_output=True, capture_output=True,
) )
print("✓ Disagg config successfully copied to PVC") print("✓ Manifest successfully copied to PVC")
print(f"File details: {result.stdout.strip()}") print(f"File details: {result.stdout.strip()}")
def cleanup_access_pod(namespace: str, keep_pod: bool = True) -> None:
    """Delete the access pod, or leave it running and print usage hints.

    Args:
        namespace: Kubernetes namespace of the access pod.
        keep_pod: When True (default) the pod is left alone and only hints
            are printed; when False the pod is deleted via kubectl.
    """
    if not keep_pod:
        print("Cleaning up access pod...")
        delete_cmd = [
            "kubectl",
            "delete",
            "pod",
            "pvc-access-pod",
            "-n",
            namespace,
            "--ignore-not-found",
        ]
        run_command(delete_cmd, capture_output=False)
        print("✓ Access pod deleted")
        return

    print("ℹ️ Access pod 'pvc-access-pod' left running for future use")
    print(
        f" To access PVC: kubectl exec -it pvc-access-pod -n {namespace} -- /bin/bash"
    )
    print(f" To delete pod: kubectl delete pod pvc-access-pod -n {namespace}")
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Inject disagg config into profiling PVC", description="Inject a Kubernetes manifest into the PVC",
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__, epilog=__doc__,
) )
...@@ -116,36 +95,28 @@ def main(): ...@@ -116,36 +95,28 @@ def main():
) )
parser.add_argument( parser.add_argument(
"--disagg-config", "--src", required=True, type=Path, help="Path to manifest file to copy"
type=Path,
default=Path("components/backends/vllm/deploy/disagg.yaml"),
help="Path to disagg config file (default: components/backends/vllm/deploy/disagg.yaml)",
) )
parser.add_argument( parser.add_argument(
"--target-path", "--dest",
default="/profiling_results/disagg.yaml", required=True,
help="Target path in PVC (default: /profiling_results/disagg.yaml)", help="Absolute target path in PVC (e.g., /profiling_results/agg.yaml)",
)
parser.add_argument(
"--cleanup",
action="store_true",
help="Delete the access pod after copying (default: keep running)",
) )
args = parser.parse_args() args = parser.parse_args()
# Validate target_path to prevent directory traversal # Validate target_path to prevent directory traversal
if not args.target_path.startswith("/profiling_results/"): if not args.dest.startswith("/"):
print("ERROR: Target path must be within /profiling_results/") print(
"ERROR: Target path must be an absolute path inside the PVC (start with '/')."
)
sys.exit(1) sys.exit(1)
if ".." in args.target_path: if ".." in args.dest:
print("ERROR: Target path cannot contain '..'") print("ERROR: Target path cannot contain '..'")
sys.exit(1) sys.exit(1)
print("🚀 Disagg Config Injection") print("🚀 Manifest Injection")
print("=" * 40) print("=" * 40)
# Validate inputs # Validate inputs
...@@ -153,16 +124,14 @@ def main(): ...@@ -153,16 +124,14 @@ def main():
# Deploy access pod # Deploy access pod
deploy_access_pod(args.namespace) deploy_access_pod(args.namespace)
try:
# Copy disagg config # Copy manifest
copy_disagg_config(args.namespace, args.disagg_config, args.target_path) copy_manifest(args.namespace, args.src, args.dest)
print("\n✅ Manifest injection completed!")
# Cleanup print(f"📁 File available at: {args.dest}")
cleanup_access_pod(args.namespace, keep_pod=not args.cleanup) finally:
# Cleanup even on failure
print("\n✅ Disagg config injection completed!") cleanup_access_pod(args.namespace)
print(f"📁 Config available at: {args.target_path}")
print(f"🔧 Set DGD_CONFIG_FILE=/workspace{args.target_path} in your profiler job")
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -15,10 +15,11 @@ ...@@ -15,10 +15,11 @@
import subprocess import subprocess
import sys import sys
import time
from pathlib import Path from pathlib import Path
from typing import List from typing import List
PVC_ACCESS_POD_NAME = "pvc-access-pod"
def run_command( def run_command(
cmd: List[str], capture_output: bool = True cmd: List[str], capture_output: bool = True
...@@ -48,7 +49,6 @@ def check_kubectl_access(namespace: str) -> None: ...@@ -48,7 +49,6 @@ def check_kubectl_access(namespace: str) -> None:
def deploy_access_pod(namespace: str) -> str: def deploy_access_pod(namespace: str) -> str:
"""Deploy the PVC access pod and return pod name.""" """Deploy the PVC access pod and return pod name."""
pod_name = "pvc-access-pod"
# Check if pod already exists and is running # Check if pod already exists and is running
try: try:
...@@ -57,7 +57,7 @@ def deploy_access_pod(namespace: str) -> str: ...@@ -57,7 +57,7 @@ def deploy_access_pod(namespace: str) -> str:
"kubectl", "kubectl",
"get", "get",
"pod", "pod",
pod_name, PVC_ACCESS_POD_NAME,
"-n", "-n",
namespace, namespace,
"-o", "-o",
...@@ -69,17 +69,17 @@ def deploy_access_pod(namespace: str) -> str: ...@@ -69,17 +69,17 @@ def deploy_access_pod(namespace: str) -> str:
) )
if result.returncode == 0 and result.stdout.strip() == "Running": if result.returncode == 0 and result.stdout.strip() == "Running":
print(f"✓ Access pod '{pod_name}' already running") print(f"✓ Access pod '{PVC_ACCESS_POD_NAME}' already running")
return pod_name return PVC_ACCESS_POD_NAME
except Exception: except Exception:
# Pod doesn't exist or isn't running # Pod doesn't exist or isn't running
pass pass
print(f"Deploying access pod '{pod_name}' in namespace '{namespace}'...") print(f"Deploying access pod '{PVC_ACCESS_POD_NAME}' in namespace '{namespace}'...")
# Get the directory where this script is located # Get the directory where this script is located
script_dir = Path(__file__).parent.parent script_dir = Path(__file__).parent
pod_yaml_path = script_dir / "deploy" / "pvc-access-pod.yaml" pod_yaml_path = script_dir / "manifests" / "pvc-access-pod.yaml"
if not pod_yaml_path.exists(): if not pod_yaml_path.exists():
print(f"ERROR: Pod YAML not found at {pod_yaml_path}") print(f"ERROR: Pod YAML not found at {pod_yaml_path}")
...@@ -92,36 +92,34 @@ def deploy_access_pod(namespace: str) -> str: ...@@ -92,36 +92,34 @@ def deploy_access_pod(namespace: str) -> str:
) )
print("Waiting for pod to be ready...") print("Waiting for pod to be ready...")
run_command(
[
"kubectl",
"wait",
f"pod/{PVC_ACCESS_POD_NAME}",
"-n",
namespace,
"--for=condition=Ready",
"--timeout=60s",
],
capture_output=False,
)
print("✓ Access pod is ready")
return PVC_ACCESS_POD_NAME
# Wait for pod to be ready (up to 60 seconds)
for i in range(60): def cleanup_access_pod(namespace: str) -> None:
try: print("Cleaning up access pod...")
result = subprocess.run( run_command(
[ [
"kubectl", "kubectl",
"get", "delete",
"pod", "pod",
pod_name, PVC_ACCESS_POD_NAME,
"-n", "-n",
namespace, namespace,
"-o", "--ignore-not-found",
"jsonpath={.status.phase}", ],
], capture_output=False,
capture_output=True, )
text=True, print("✓ Access pod deleted")
check=False,
)
if result.returncode == 0 and result.stdout.strip() == "Running":
print("✓ Access pod is ready")
return pod_name
except Exception:
pass
time.sleep(1)
if i % 10 == 0:
print(f" Still waiting... ({i+1}s)")
print("ERROR: Access pod failed to become ready within 60 seconds")
sys.exit(1)
...@@ -37,5 +37,5 @@ spec: ...@@ -37,5 +37,5 @@ spec:
volumes: volumes:
- name: profiling-storage - name: profiling-storage
persistentVolumeClaim: persistentVolumeClaim:
claimName: profiling-pvc claimName: dynamo-pvc
restartPolicy: Never restartPolicy: Never
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
apiVersion: v1 apiVersion: v1
kind: PersistentVolumeClaim kind: PersistentVolumeClaim
metadata: metadata:
name: profiling-pvc name: dynamo-pvc
namespace: ${NAMESPACE} namespace: ${NAMESPACE}
spec: spec:
accessModes: accessModes:
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: Role kind: Role
metadata: metadata:
name: profile-sla-role name: dynamo-role
namespace: ${NAMESPACE} namespace: ${NAMESPACE}
rules: rules:
# DynamoGraphDeployment custom resources - needed for create/get/delete operations # DynamoGraphDeployment custom resources - needed for create/get/delete operations
...@@ -17,3 +17,10 @@ rules: ...@@ -17,3 +17,10 @@ rules:
- apiGroups: [""] - apiGroups: [""]
resources: ["pods/log"] resources: ["pods/log"]
verbs: ["get"] verbs: ["get"]
# Services and Deployments - needed for vLLM deployments
- apiGroups: [""]
resources: ["services"]
verbs: ["get", "create", "delete"]
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "create", "delete"]
...@@ -3,13 +3,13 @@ ...@@ -3,13 +3,13 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding kind: RoleBinding
metadata: metadata:
name: profile-sla-binding name: dynamo-binding
namespace: ${NAMESPACE} namespace: ${NAMESPACE}
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: profile-sla-sa name: dynamo-sa
namespace: ${NAMESPACE} namespace: ${NAMESPACE}
roleRef: roleRef:
kind: Role kind: Role
name: profile-sla-role name: dynamo-role
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
apiVersion: v1 apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
metadata: metadata:
name: profile-sla-sa name: dynamo-sa
namespace: ${NAMESPACE} namespace: ${NAMESPACE}
imagePullSecrets: imagePullSecrets:
- name: nvcr-imagepullsecret - name: nvcr-imagepullsecret
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Kubernetes and async dependencies
aiofiles>=0.8.0
# Benchmarking dependencies for Dynamo
genai-perf==0.0.15
httpx>=0.24.0
kubernetes-asyncio>=24.0.0
# Plotting and visualization
matplotlib>=3.5.0
numpy>=1.21.0
pandas>=1.3.0
plotly>=5.0.0
# YAML processing
pyyaml>=6.0.0
scipy>=1.7.0
seaborn>=0.11.0
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Strict mode: abort on any failing command, on expansion of an unset
# variable, and when any stage of a pipeline fails.
set -euo pipefail

# Directory that holds this script, and the repository root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Inputs are taken from the environment. Each one is given a value here
# (empty unless noted) so that later references are safe under `set -u`.
: "${NAMESPACE:=default}"   # target namespace; falls back to "default"
: "${DOCKER_SERVER:=}"
: "${IMAGE_TAG:=}"
: "${DOCKER_USERNAME:=}"
: "${DOCKER_PASSWORD:=}"
: "${HF_TOKEN:=}"

# Set by create_or_update_pull_secret once a pull secret has been applied.
PULL_SECRET_NAME=""
# ANSI escape sequences used to colorize the log prefixes; NC resets the color.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

# _emit COLOR TAG [MESSAGE...]
# Prints "[TAG] MESSAGE" to stdout with the tag wrapped in COLOR. `echo -e`
# is what turns the escape sequences above into real control characters.
_emit() {
  local color="$1" tag="$2"
  shift 2
  echo -e "${color}[${tag}]${NC} $*"
}

# Leveled logging helpers; every one of them writes to stdout.
log()  { _emit "$BLUE" "INFO" "$@"; }
ok()   { _emit "$GREEN" "OK" "$@"; }
warn() { _emit "$YELLOW" "WARN" "$@"; }
err()  { _emit "$RED" "ERROR" "$@"; }
# create_or_update_pull_secret SERVER USER PASS
# Applies a docker-registry secret named "docker-imagepullsecret" in
# $NAMESPACE and records its name in the global PULL_SECRET_NAME.
# Does nothing (and returns success) when any of the three arguments is empty.
create_or_update_pull_secret() {
  local server="$1" user="$2" pass="$3"

  # All three values are required to build the secret.
  if [[ -z "$server" || -z "$user" || -z "$pass" ]]; then
    return 0
  fi

  log "Creating/updating docker-imagepullsecret for $server in namespace $NAMESPACE"
  # Render the secret client-side and pipe it to `kubectl apply` so the call
  # succeeds whether or not the secret already exists.
  kubectl create secret docker-registry docker-imagepullsecret \
    --docker-server="$server" \
    --docker-username="$user" \
    --docker-password="$pass" \
    --namespace="$NAMESPACE" \
    --dry-run=client -o yaml | kubectl apply -f -
  ok "docker-imagepullsecret configured"
  PULL_SECRET_NAME="docker-imagepullsecret"
}
# usage
# Prints the help text for this script to stdout.
usage() {
cat << EOF
Usage:
NAMESPACE=<ns> deploy/utils/setup_k8s_namespace.sh
NAMESPACE=<ns> DOCKER_SERVER=<registry> IMAGE_TAG=<tag> [DOCKER_USERNAME=<user>] [DOCKER_PASSWORD=<token>] \
deploy/utils/setup_k8s_namespace.sh
Sets up Kubernetes namespace for Dynamo (one-time per namespace):
- Creates namespace if absent
- Applies common manifests (ServiceAccount, Role, RoleBinding, PVC)
- Installs CRDs once per cluster (if not already installed)
- If DOCKER_SERVER/IMAGE_TAG are provided:
* Builds/pushes a custom operator image with Earthly
* Installs/updates the operator Helm release using that image
* If credentials (DOCKER_USERNAME/DOCKER_PASSWORD) are provided, creates/updates docker-imagepullsecret
* If credentials are not provided, prompts interactively to create the pull secret
- Otherwise installs the operator using default image: nvcr.io/nvidia/ai-dynamo/kubernetes-operator:0.4.0
Environment variables:
NAMESPACE Target Kubernetes namespace (default: default)
DOCKER_SERVER Registry server for operator image (optional)
IMAGE_TAG Image tag for operator (optional)
DOCKER_USERNAME Registry username (optional; if provided with DOCKER_PASSWORD, secret is created)
DOCKER_PASSWORD Registry password/token (optional)
HF_TOKEN Hugging Face token; if set, a secret named hf-token-secret is created in the namespace (optional)
EOF
}

# Honor -h/--help before anything touches the cluster. Without this check the
# help text above is unreachable and asking for help would run the full setup.
case "${1:-}" in
  -h|--help)
    usage
    exit 0
    ;;
esac
# kubectl is a hard requirement: every step below talks to the cluster through it.
command -v kubectl &>/dev/null || { err "kubectl not found"; exit 1; }

# 1) Make sure the target namespace exists, creating it only when it is missing.
if kubectl get namespace "$NAMESPACE" &>/dev/null; then
  log "Namespace $NAMESPACE exists"
else
  log "Creating namespace $NAMESPACE"
  kubectl create namespace "$NAMESPACE"
fi
# 2) Apply common manifests
# Every *.yaml under <script dir>/manifests is rendered with envsubst and
# applied to the cluster.
log "Applying common manifests to namespace $NAMESPACE"
manifests_applied=0
for mf in "$SCRIPT_DIR/manifests"/*.yaml; do
  # An unmatched glob is left as a literal pattern; skip it instead of
  # handing a non-existent path to envsubst.
  [[ -e "$mf" ]] || continue
  # envsubst substitutes from the *environment* only. NAMESPACE is a plain
  # shell variable when it was defaulted by this script, so pass it explicitly;
  # otherwise ${NAMESPACE} in the manifests would render as an empty string.
  NAMESPACE="$NAMESPACE" envsubst < "$mf" | kubectl apply -f -
  manifests_applied=$((manifests_applied + 1))
done
if [[ "$manifests_applied" -eq 0 ]]; then
  err "No manifests found in $SCRIPT_DIR/manifests"
  exit 1
fi
ok "Common manifests applied"
# 3) Install the CRD chart when helm is available and no "dynamo-crds"
# release is found in $NAMESPACE. When helm is missing this step is skipped
# without any message.
# NOTE(review): the release lookup is scoped to $NAMESPACE, so "already
# installed" is decided per namespace rather than per cluster - confirm intent.
if command -v helm &>/dev/null && ! helm status dynamo-crds -n "$NAMESPACE" &>/dev/null; then
  log "Installing CRDs via Helm release dynamo-crds in namespace $NAMESPACE"
  # The chart path is relative, so run helm from the helm directory.
  pushd "$REPO_ROOT/deploy/cloud/helm" >/dev/null
  helm upgrade --install dynamo-crds ./crds/ \
    --namespace "$NAMESPACE" \
    --wait \
    --atomic
  popd >/dev/null
  ok "CRDs installed"
fi
# 4) Optional: when HF_TOKEN is non-empty, create or update the
# "hf-token-secret" secret that carries it.
if [[ "$HF_TOKEN" != "" ]]; then
  # Render client-side, then apply, so an existing secret is updated in place.
  kubectl create secret generic hf-token-secret \
    --namespace "$NAMESPACE" \
    --from-literal=HF_TOKEN="$HF_TOKEN" \
    --dry-run=client -o yaml | kubectl apply -f -
  ok "hf-token-secret created/updated"
fi
# 5) Optional: create an imagePullSecret for a private registry.
# - Credentials supplied via the environment: create the secret directly.
# - No credentials but a custom image was requested (IMAGE_TAG set): ask
#   interactively whether credentials are needed and read them from stdin.
if [[ -n "$DOCKER_SERVER" ]]; then
  if [[ -n "$DOCKER_USERNAME" && -n "$DOCKER_PASSWORD" ]]; then
    create_or_update_pull_secret "$DOCKER_SERVER" "$DOCKER_USERNAME" "$DOCKER_PASSWORD"
  elif [[ -n "$IMAGE_TAG" ]]; then
    echo
    read -r -p "Do you need image pull credentials for $DOCKER_SERVER (private registry)? [y/N]: " ans
    if [[ "$ans" =~ ^[Yy]$ ]]; then
      # The dollar sign must be escaped: inside double quotes $oauthtoken is
      # expanded, and under `set -u` the unset variable aborts the script.
      # -r keeps backslashes in the typed credentials intact.
      read -r -p "Docker username (often '\$oauthtoken' for NGC): " DOCKER_USERNAME
      # -s suppresses echo of the password; the trailing echo ends the prompt line.
      read -r -s -p "Docker password/token: " DOCKER_PASSWORD; echo
      if [[ -n "$DOCKER_USERNAME" && -n "$DOCKER_PASSWORD" ]]; then
        create_or_update_pull_secret "$DOCKER_SERVER" "$DOCKER_USERNAME" "$DOCKER_PASSWORD"
      else
        warn "Username or password empty; skipping secret creation"
      fi
    fi
  fi
fi
# 6) Operator install. With both DOCKER_SERVER and IMAGE_TAG set, a custom
# image is built/pushed (when earthly is available) and the Helm release is
# pointed at it; otherwise the release uses the default published image.
# In either case a missing helm binary only produces a warning.
if [[ -n "$DOCKER_SERVER" && -n "$IMAGE_TAG" ]]; then
  # --- Custom operator image ---
  if command -v earthly &>/dev/null; then
    log "Building and pushing operator images via earthly"
    earthly --push +all-docker --DOCKER_SERVER="$DOCKER_SERVER" --IMAGE_TAG="$IMAGE_TAG"
  else
    warn "earthly not found; skipping operator build/push"
  fi

  if command -v helm &>/dev/null; then
    # Resolve chart dependencies from inside the platform chart directory.
    pushd "$REPO_ROOT/deploy/cloud/helm/platform" >/dev/null
    helm dep build
    popd >/dev/null

    pushd "$REPO_ROOT/deploy/cloud/helm" >/dev/null
    # Build Helm args
    HELM_ARGS=(
      upgrade dynamo-platform ./platform/ --install --namespace "$NAMESPACE"
      --set "dynamo-operator.controllerManager.manager.image.repository=${DOCKER_SERVER}/dynamo-operator"
      --set "dynamo-operator.controllerManager.manager.image.tag=${IMAGE_TAG}"
    )
    # Reference the pull secret only if one was configured earlier in this run.
    if [[ -n "$PULL_SECRET_NAME" ]]; then
      HELM_ARGS+=(--set "dynamo-operator.imagePullSecrets[0].name=${PULL_SECRET_NAME}")
    fi
    helm "${HELM_ARGS[@]}"
    popd >/dev/null
    ok "Helm chart installed/updated"
  else
    warn "helm not found; skipping helm install"
  fi
else
  # --- Default published operator image ---
  DEFAULT_OPERATOR_IMAGE="nvcr.io/nvidia/ai-dynamo/kubernetes-operator:0.4.0"
  if command -v helm &>/dev/null; then
    pushd "$REPO_ROOT/deploy/cloud/helm/platform" >/dev/null
    helm dep build
    popd >/dev/null

    pushd "$REPO_ROOT/deploy/cloud/helm" >/dev/null
    # ${VAR%:*} keeps everything before the last ':' (the repository);
    # ${VAR##*:} keeps everything after it (the tag).
    HELM_ARGS=(
      upgrade dynamo-platform ./platform/ --install --namespace "$NAMESPACE"
      --set "dynamo-operator.controllerManager.manager.image.repository=${DEFAULT_OPERATOR_IMAGE%:*}"
      --set "dynamo-operator.controllerManager.manager.image.tag=${DEFAULT_OPERATOR_IMAGE##*:}"
    )
    # Only set imagePullSecrets if the referenced secret exists; otherwise rely on SA
    if kubectl get secret nvcr-imagepullsecret -n "$NAMESPACE" &>/dev/null; then
      HELM_ARGS+=(--set "dynamo-operator.imagePullSecrets[0].name=nvcr-imagepullsecret")
    fi
    helm "${HELM_ARGS[@]}"
    popd >/dev/null
    ok "Helm chart installed/updated with default operator image"
  else
    warn "helm not found; skipping helm install"
  fi
fi
# 7) Install benchmark dependencies if requirements.txt exists next to this
# script. Installers are tried in order of preference: uv, pip3, pip.
REQUIREMENTS_FILE="$SCRIPT_DIR/requirements.txt"
if [[ -f "$REQUIREMENTS_FILE" ]]; then
  log "Installing benchmark dependencies..."
  deps_installed=true
  if command -v uv >/dev/null 2>&1; then
    uv pip install -r "$REQUIREMENTS_FILE"
  elif command -v pip3 >/dev/null 2>&1; then
    pip3 install -r "$REQUIREMENTS_FILE"
  elif command -v pip >/dev/null 2>&1; then
    pip install -r "$REQUIREMENTS_FILE"
  else
    deps_installed=false
    warn "No pip/pip3/uv found; skipping benchmark dependency installation"
    warn "To run benchmarks, manually install: pip install -r $REQUIREMENTS_FILE"
  fi
  # Report success only when an installer actually ran; a skipped install
  # must not be announced as "installed".
  if [[ "$deps_installed" == true ]]; then
    ok "Benchmark dependencies installed"
  fi
fi

ok "Kubernetes namespace setup complete"
...@@ -24,7 +24,7 @@ There are two additional rules set by planner to prevent over-compensation: ...@@ -24,7 +24,7 @@ There are two additional rules set by planner to prevent over-compensation:
## SLA-based Scaling Up/Down Prefill/Decode Workers ## SLA-based Scaling Up/Down Prefill/Decode Workers
See [Pre-Deployment Profiling](pre_deployment_profiling.md) for more details. See [Pre-Deployment Profiling](../benchmarks/pre_deployment_profiling.md) for more details.
## Usage ## Usage
......
...@@ -71,6 +71,6 @@ Key features include: ...@@ -71,6 +71,6 @@ Key features include:
:hidden: :hidden:
Overview <self> Overview <self>
Pre-Deployment Profiling <pre_deployment_profiling.md> Pre-Deployment Profiling <../benchmarks/pre_deployment_profiling.md>
Load-based Planner <load_planner.md>
SLA-based Planner <sla_planner.md> SLA-based Planner <sla_planner.md>
Planner Benchmark <../guides/planner_benchmark/README.md>
\ No newline at end of file
...@@ -28,7 +28,7 @@ The SLA planner consists of several key components: ...@@ -28,7 +28,7 @@ The SLA planner consists of several key components:
## Pre-Deployment Profiling ## Pre-Deployment Profiling
SLA-based planner requires pre-deployment profiling to operate. See [Pre-Deployment Profiling](pre_deployment_profiling.md) for more details. SLA-based planner requires pre-deployment profiling to operate. See [Pre-Deployment Profiling](../benchmarks/pre_deployment_profiling.md) for more details.
## Load Prediction ## Load Prediction
......
<!-- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License. -->
# Dynamo Benchmarking Guide
This benchmarking framework lets you compare performance across any combination of:
- **DynamoGraphDeployments** (automatically deployed from your manifests)
- **External HTTP endpoints** (existing services, vLLM, TensorRT-LLM, etc.)
You can mix and match these in a single benchmark run using custom labels. Configure your DynamoGraphDeployment manifests for your specific models, hardware, and parallelization needs.
## What This Tool Does
The framework is a wrapper around `genai-perf` that:
- Deploys user-specified `DynamoGraphDeployments` automatically
- Benchmarks any HTTP endpoints (no deployment needed)
- Runs concurrency sweeps across configurable load levels
- Generates comparison plots with your custom labels
- Works with any HuggingFace-compatible model on NVIDIA GPUs (H200, H100, A100, etc.)
- Runs locally and connects to your Kubernetes deployments/endpoints
**Default sequence lengths**: Input: 2000 tokens, Output: 256 tokens (configurable with `--isl` and `--osl`)
**Important**: The `--model` parameter configures GenAI-Perf for benchmarking and provides logging context. The actual model loaded is determined by your deployment manifests. Only one model can be benchmarked at a time across all inputs to ensure fair comparison. The default `--model` value in the benchmarking script is `deepseek-ai/DeepSeek-R1-Distill-Llama-8B`, but it must match the model in the manifest(s) and the model deployed at the endpoint(s).
## Prerequisites
1. **Kubernetes cluster with NVIDIA GPUs and Dynamo namespace setup** - You need a Kubernetes cluster with eligible NVIDIA GPUs and a properly configured namespace for Dynamo benchmarking. See the [deploy/utils/README](../../deploy/utils/README.md) for complete setup instructions.
2. **kubectl access** - You need `kubectl` installed and configured to access your Kubernetes cluster. All other required tools (GenAI-Perf, Python, etc.) are included in the Dynamo containers. If you are not working within a Dynamo container, you can install the necessary requirements using `deploy/utils/requirements.txt`. *Note: if you are on Ubuntu 22.04 or earlier, you will also need to build perf_analyzer [from source](https://github.com/triton-inference-server/perf_analyzer/blob/main/docs/install.md#build-from-source).*
## Quick Start Examples
The tool can deploy, benchmark, and compare Dynamo deployments (DynamoGraphDeployments) on a Kubernetes cluster, and it can also benchmark and compare servers that were deployed separately and are reachable by URL. In the examples below, Dynamo deployments are specified by the path to a YAML manifest, and separately deployed servers are specified by their URL.
```bash
export NAMESPACE=benchmarking
# Compare multiple DynamoGraphDeployments of a single backend
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input agg=components/backends/vllm/deploy/agg.yaml \
--input disagg=components/backends/vllm/deploy/disagg.yaml
# Compare different backend types (vLLM vs TensorRT-LLM)
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input vllm-disagg=components/backends/vllm/deploy/disagg.yaml \
--input trtllm-disagg=components/backends/trtllm/deploy/disagg.yaml
# Compare Dynamo deployment vs existing deployment (external endpoint)
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input dynamo=components/backends/vllm/deploy/disagg.yaml \
--input vllm-baseline=http://localhost:8000
# Compare three different configurations
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input dynamo-agg=components/backends/vllm/deploy/agg.yaml \
--input dynamo-disagg=components/backends/vllm/deploy/disagg.yaml \
--input external-vllm=http://localhost:8000
# Benchmark single external endpoint
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input production-api=http://your-api:8000
# Custom model and sequence lengths
./benchmarks/benchmark.sh --namespace $NAMESPACE \
--input my-setup=my-custom-manifest.yaml \
--model "meta-llama/Meta-Llama-3-8B" --isl 512 --osl 256
```
**Key**: Configure your manifests for your specific models, hardware, and parallelization strategy before benchmarking.
### Important: Image Accessibility
Ensure container images in your DynamoGraphDeployment manifests are accessible:
- **Public images**: Use [Dynamo NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamo/artifacts) public releases
- **Custom registries**: Configure proper credentials in your Kubernetes namespace
## Configuration and Usage
### Command Line Options
```bash
./benchmarks/benchmark.sh --namespace NAMESPACE --input <label>=<manifest_path_or_endpoint> [--input <label>=<manifest_path_or_endpoint>]... [OPTIONS]
REQUIRED:
-n, --namespace NAMESPACE Kubernetes namespace
--input <label>=<manifest_path_or_endpoint> Benchmark input with custom label
- <label>: becomes the name/label in plots
- <manifest_path_or_endpoint>: either a DynamoGraphDeployment manifest or HTTP endpoint URL
Can be specified multiple times for comparisons
OPTIONS:
-h, --help Show help message and examples
-m, --model MODEL Model name for GenAI-Perf configuration and logging (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)
NOTE: This must match the model configured in your deployment manifests and endpoints
-i, --isl LENGTH Input sequence length (default: 2000)
-s, --std STDDEV Input sequence standard deviation (default: 10)
-o, --osl LENGTH Output sequence length (default: 256)
-d, --output-dir DIR Output directory (default: ./benchmarks/results)
--verbose Enable verbose output
```
### Important Notes
- **Custom Labels**: Each input must have a unique label that becomes the name in plots and results
- **Label Restrictions**: Labels can only contain letters, numbers, hyphens, and underscores. The label `plots` is reserved.
- **Input Types**: Supports DynamoGraphDeployment manifests for automatic deployment, or HTTP endpoints for existing services
- **Model Parameter**: The `--model` parameter configures GenAI-Perf for testing and logging, not deployment (deployment model is determined by the manifest files)
- **Standalone Deployments**: For non-Dynamo backends (vLLM, TensorRT-LLM, SGLang, etc.), you must deploy them manually following their respective Kubernetes deployment guides. The benchmarking framework only supports automatic deployment of DynamoGraphDeployments.
- **Single Model Requirement**: Only one model can be benchmarked at a time across all inputs to ensure fair comparison.
### What Happens During Benchmarking
The script automatically:
1. **Deploys** each DynamoGraphDeployment configuration to Kubernetes if manifests are passed in
2. **Benchmarks** using GenAI-Perf at various concurrency levels (default: 1, 2, 5, 10, 50, 100, 250)
3. **Measures** key metrics: latency, throughput, time-to-first-token
4. **Generates** comparison plots using your custom labels in `./benchmarks/results/plots/`
5. **Cleans up** deployments when complete
### Results Clearing Behavior
**Important**: The benchmark script automatically clears the output directory before each run to ensure clean, reproducible results. This means:
- Previous benchmark results in the same output directory will be completely removed
- Each benchmark run starts with a clean slate
- Results from different runs are not mixed or accumulated
If you want to preserve results from previous runs, use different output directories with the `--output-dir` flag.
### Using Your Own Models and Configuration
The benchmarking framework supports any HuggingFace-compatible LLM model. To benchmark your own custom deployment:
1. **Edit your deployment YAML files** to specify your model in the `--model` argument of the container command
2. **Use the corresponding model name** in the benchmark script's `--model` parameter
**Note**: You can override the default sequence lengths (2000/256 tokens) with `--isl` and `--osl` flags if needed for your specific workload.
### Direct Python Execution
For direct control over the benchmark workflow:
```bash
# Endpoint benchmarking
python3 -u -m benchmarks.utils.benchmark \
--endpoint "http://your-endpoint:8000" \
--namespace $NAMESPACE \
--isl 2000 \
--std 10 \
--osl 256 \
--output-dir $OUTPUT_DIR
# Deployment benchmarking (any combination)
python3 -u -m benchmarks.utils.benchmark \
--input agg=$AGG_CONFIG \
--input disagg=$DISAGG_CONFIG \
--namespace $NAMESPACE \
--isl 2000 \
--std 10 \
--osl 256 \
--output-dir $OUTPUT_DIR
# Generate plots separately
python3 -m benchmarks.utils.plot --data-dir $OUTPUT_DIR
```
### Comparison Limitations
The plotting system supports up to 12 different inputs in a single comparison. If you need to compare more than 12 different deployments/endpoints, consider running separate benchmark sessions or grouping related comparisons together.
### Concurrency Configuration
You can customize the concurrency levels using the CONCURRENCIES environment variable:
```bash
# Custom concurrency levels
CONCURRENCIES="1,5,20,50" ./benchmarks/benchmark.sh --namespace $NAMESPACE --input my-test=components/backends/vllm/deploy/disagg.yaml
# Or set permanently
export CONCURRENCIES="1,2,5,10,25,50,100"
./benchmarks/benchmark.sh --namespace $NAMESPACE --input test=disagg.yaml
```
## Understanding Your Results
After benchmarking completes, check `./benchmarks/results/` (or your custom output directory):
### Summary and Plots
```text
benchmarks/results/
├── SUMMARY.txt # Quick overview of all results
└── plots/ # Visual comparisons (these are what you want!)
├── p50_inter_token_latency_vs_concurrency.png # Token generation speed
├── avg_time_to_first_token_vs_concurrency.png # Response time
├── request_throughput_vs_concurrency.png # Requests per second
├── efficiency_tok_s_gpu_vs_user.png # GPU efficiency
└── avg_inter_token_latency_vs_concurrency.png # Average latency
```
### Data Files
Raw data is organized by deployment/benchmark type and concurrency level:
**For any benchmark run (result directories are named after your custom labels):**
```text
benchmarks/results/
├── plots/ # Performance visualization plots
│ ├── SUMMARY.txt # Human-readable benchmark summary
│ ├── p50_inter_token_latency_vs_concurrency.png
│ ├── avg_inter_token_latency_vs_concurrency.png
│ ├── request_throughput_vs_concurrency.png
│ ├── efficiency_tok_s_gpu_vs_user.png
│ └── avg_time_to_first_token_vs_concurrency.png
├── <your-label-1>/ # Results for first input (uses your custom label)
│ ├── c1/ # Concurrency level 1
│ │ └── profile_export_genai_perf.json
│ ├── c2/ # Concurrency level 2
│ ├── c5/ # Concurrency level 5
│ └── ... # Other concurrency levels (10, 50, 100, 250)
├── <your-label-2>/ # Results for second input (if provided)
│ └── c*/ # Same structure as above
└── <your-label-N>/ # Results for additional inputs
└── c*/ # Same structure as above
```
**Example with actual labels:**
```text
benchmarks/results/
├── plots/
├── dynamo-agg/ # --input dynamo-agg=agg.yaml
├── dynamo-disagg/ # --input dynamo-disagg=disagg.yaml
└── external-vllm/ # --input external-vllm=http://localhost:8000
```
Each concurrency directory contains:
- **`profile_export_genai_perf.json`** - Structured metrics from GenAI-Perf
- **`profile_export.json`** - Raw GenAI-Perf results
- **`inputs.json`** - Generated test inputs
## Customize Benchmarking Behavior
The built-in workflow handles DynamoGraphDeployment deployment, benchmarking with genai-perf, and plot generation automatically. If you want to modify the behavior:
1. **Extend the workflow**: Modify `benchmarks/utils/workflow.py` to add custom deployment types or metrics collection
2. **Generate different plots**: Modify `benchmarks/utils/plot.py` to generate a different set of plots for whatever you wish to visualize.
The `benchmark.sh` script provides a complete end-to-end benchmarking experience. For more granular control, use the Python modules directly.
...@@ -71,22 +71,29 @@ SLA planner can work with any interpolation data that follows the above format. ...@@ -71,22 +71,29 @@ SLA planner can work with any interpolation data that follows the above format.
## Running the Profiling Script in Kubernetes ## Running the Profiling Script in Kubernetes
Set your environment variables: Set up your Kubernetes namespace (one-time per namespace). Follow the instructions [here](../../deploy/utils/README.md#kubernetes-setup-one-time-per-namespace). If your namespace is already set up, skip this step.
**Prerequisites**: Ensure all dependencies are installed. If you ran the setup script above, dependencies are already installed. Otherwise, install them manually:
```bash ```bash
export NAMESPACE=your-namespace pip install -r deploy/utils/requirements.txt
``` ```
**Optional Step 0: add a kubernetes secret** ### Step 1: Inject your DGD configuration
Use the injector utility to place your DGD manifest into the PVC. The profiling job will read the path you specify.
```bash ```bash
kubectl create secret docker-registry nvcr-imagepullsecret \ # Inject your disagg manifest
--docker-server=nvcr.io \ python3 deploy/utils/inject_manifest.py \
--docker-username='$oauthtoken' \ --namespace $NAMESPACE \
--docker-password=<nvapi key> \ --src components/backends/vllm/deploy/disagg.yaml \
-n $NAMESPACE --dest /configs/disagg.yaml
# Set the docker image for the profiling job; any docker image that contains your script.
export DOCKER_IMAGE=nvcr.io/nvidia/dynamo:latest-vllm
``` ```
**Step 1: Configure container image** ### Configure container image (optional)
You have two options for configuring your profiling setup: You have two options for configuring your profiling setup:
...@@ -102,13 +109,13 @@ Use the default pre-built image and inject custom configurations via PVC: ...@@ -102,13 +109,13 @@ Use the default pre-built image and inject custom configurations via PVC:
2. **Inject your custom disagg configuration:** 2. **Inject your custom disagg configuration:**
```bash ```bash
# Use default disagg.yaml config # Use default disagg.yaml config
python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src components/backends/vllm/deploy/disagg.yaml --dest /configs/disagg.yaml
# Or use a custom disagg config file # Or use a custom disagg config file
python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE --disagg-config my-custom-disagg.yaml python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src my-custom-disagg.yaml --dest /configs/disagg.yaml
# Or specify a custom target path in the PVC # Or specify a custom target path in the PVC
python3 benchmarks/profiler/inject_disagg_config.py --namespace $NAMESPACE --target-path /profiling_results/my-disagg.yaml python3 deploy/utils/inject_manifest.py --namespace $NAMESPACE --src my-custom-disagg.yaml --dest /profiling_results/my-disagg.yaml
``` ```
3. **Set the config path for the profiling job:** 3. **Set the config path for the profiling job:**
...@@ -123,19 +130,6 @@ This approach allows you to: ...@@ -123,19 +130,6 @@ This approach allows you to:
> **Important**: For profiling, disagg configs should be run with Grove disabled by adding the annotation `nvidia.com/enable-grove: "false"` to avoid alpha Grove status issues. > **Important**: For profiling, disagg configs should be run with Grove disabled by adding the annotation `nvidia.com/enable-grove: "false"` to avoid alpha Grove status issues.
> **Note**: The default location in the PVC is `/profiling_results/disagg.yaml`. If you don't inject a config, the profiler will fall back to the built-in config at `/workspace/components/backends/vllm/deploy/disagg.yaml`.
**Option B: Build custom image (only if you need code changes)**
Only needed if you require custom code modifications beyond configuration changes:
```bash
# in the project's root folder
./container/build.sh --framework <VLLM/sglang>
# Tag and push to your container registry
export DOCKER_IMAGE=<your docker tag>
export DGD_CONFIG_FILE=<disagg config path> # path to your disagg.yaml file within the DOCKER_IMAGE
```
**Step 2: Set SLA target** **Step 2: Set SLA target**
Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL. Also, set the backend type to `vllm` or `sglang`. The backend type must match the dynamo deployment in the `DGD_CONFIG_FILE`. Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL. Also, set the backend type to `vllm` or `sglang`. The backend type must match the dynamo deployment in the `DGD_CONFIG_FILE`.
...@@ -162,12 +156,7 @@ spec: ...@@ -162,12 +156,7 @@ spec:
**Step 3: Run profiling (required)** **Step 3: Run profiling (required)**
```bash ```bash
cd $DYNAMO_HOME/benchmarks/profiler/deploy envsubst < benchmarks/profiler/deploy/profile_sla_job.yaml | kubectl apply -f -
envsubst < profiling_pvc.yaml | kubectl apply -f -
envsubst < profile_sla_sa.yaml | kubectl apply -f -
envsubst < profile_sla_rbac.yaml | kubectl apply -f -
envsubst < profile_sla_binding.yaml | kubectl apply -f -
envsubst < profile_sla_job.yaml | kubectl apply -f -
``` ```
**Step 4: Wait for profiling to complete** **Step 4: Wait for profiling to complete**
...@@ -176,40 +165,24 @@ kubectl get jobs -n $NAMESPACE ...@@ -176,40 +165,24 @@ kubectl get jobs -n $NAMESPACE
kubectl logs job/profile-sla -n $NAMESPACE kubectl logs job/profile-sla -n $NAMESPACE
``` ```
### RBAC Configuration
The SLA profiling job requires specific Kubernetes permissions to manage DynamoGraphDeployment resources and access namespace information. The RBAC setup consists of:
- **`profile_sla_sa.yaml`** - Service account with image pull secret for NVIDIA Container Registry access
- **`profile_sla_rbac.yaml`** - Role defining required permissions for managing deployments and accessing namespace resources
- **`profile_sla_binding.yaml`** - RoleBinding that associates the Role with the service account
All three files are necessary:
1. The service account provides identity and image pull credentials
2. The Role defines what operations are allowed
3. The RoleBinding connects the permissions to the service account
### Viewing Profiling Results ### Viewing Profiling Results
After the profiling job completes successfully, the results are stored in the persistent volume claim (PVC) created during Step 2. Here's how to access and view your profiling results: After the profiling job completes successfully, the results are stored in the persistent volume claim (PVC) created during Step 2.
#### Accessing the Profiling Results PVC
The profiling results are stored in a PVC named `profiling-pvc`. To access the results: To download the results:
1. **Deploy the PVC access pod (if not already running):** ```bash
```bash # Download to directory
kubectl apply -f benchmarks/profiler/deploy/pvc-access-pod.yaml -n $NAMESPACE python3 deploy/utils/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --folder /profiling_results
```
2. **Access the PVC through the pod:** # Download without any of the auto-created config.yaml files used in profiling
```bash python3 deploy/utils/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --folder /profiling_results --no-config
kubectl exec -it pvc-access-pod -n $NAMESPACE -- /bin/bash ```
cd /profiling_results
ls -la
```
> **Note**: The same `pvc-access-pod` is used for both injecting disagg configs and accessing results. If you used the `inject_disagg_config.py` script earlier, the pod may already be running. The pod auto-deletes after 5 minutes. The script will:
* Deploy a temporary access pod
* Download all files maintaining directory structure
* Clean the pod up automatically
#### File Structure #### File Structure
...@@ -231,62 +204,6 @@ The profiling results directory contains the following structure: ...@@ -231,62 +204,6 @@ The profiling results directory contains the following structure:
└── decode_tp{best_tp}.png # 3D ITL surface plot └── decode_tp{best_tp}.png # 3D ITL surface plot
``` ```
#### Downloading Results Locally
You can download the profiling results using the automated download script or manually:
**Option 1: Automated Download (Recommended)**
Use the provided download script to automatically fetch all relevant files:
```bash
# Download to ./results directory
python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results
# Download to specific directory
python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir /path/to/my/results
# Download without any of the auto-created config.yaml files used in profiling
python3 benchmarks/profiler/download_pvc_results.py --namespace $NAMESPACE --output-dir ./results --no-config
```
The script will:
- Deploy a temporary access pod (auto-deletes after 5 minutes)
- Scan for relevant files (*.png, *.npz, *.yaml)
- Download all files maintaining directory structure
- Generate a README.md with file descriptions
- Clean up automatically
**Option 2: Manual Download**
To download the profiling results manually:
1. **Download performance plots and data files:**
```bash
# Create a local directory for results
mkdir -p ./profiling_results
# Copy main performance plots
kubectl cp pvc-access-pod:/profiling_results/prefill_performance.png ./profiling_results/ -n $NAMESPACE
kubectl cp pvc-access-pod:/profiling_results/decode_performance.png ./profiling_results/ -n $NAMESPACE
# Copy interpolation directories (includes additional plots and data)
kubectl cp pvc-access-pod:/profiling_results/selected_prefill_interpolation/ ./profiling_results/ -n $NAMESPACE -r
kubectl cp pvc-access-pod:/profiling_results/selected_decode_interpolation/ ./profiling_results/ -n $NAMESPACE -r
```
2. **Alternative: Tar and download entire results directory:**
```bash
# Inside the access pod, create a tar archive
tar -czf /profiling_results/profiling_results.tar.gz -C /profiling_results .
# Download the archive to your local machine
kubectl cp pvc-access-pod:/profiling_results/profiling_results.tar.gz ./profiling_results.tar.gz -n $NAMESPACE
# Extract locally
tar -xzf profiling_results.tar.gz -C ./profiling_results/
```
#### Viewing Performance Plots #### Viewing Performance Plots
The profiling generates several performance visualization files: The profiling generates several performance visualization files:
...@@ -316,20 +233,6 @@ decode_data = np.load('selected_decode_interpolation/raw_data.npz') ...@@ -316,20 +233,6 @@ decode_data = np.load('selected_decode_interpolation/raw_data.npz')
print("Decode data keys:", list(decode_data.keys())) print("Decode data keys:", list(decode_data.keys()))
``` ```
#### Cleaning Up
The access pod is automatically deleted after 5 minutes, but you can also clean it up manually:
```bash
# Exit the access pod (if still inside)
exit
# Delete the access pod immediately (optional - it will auto-delete)
kubectl delete pod pvc-access-pod -n $NAMESPACE
```
> **Note**: The access pod has `activeDeadlineSeconds: 300` and will auto-delete after 5 minutes to prevent resource waste.
### Troubleshooting ### Troubleshooting
#### Image Pull Authentication Errors #### Image Pull Authentication Errors
...@@ -343,7 +246,7 @@ If you see `ErrImagePull` or `ImagePullBackOff` errors with 401 unauthorized mes ...@@ -343,7 +246,7 @@ If you see `ErrImagePull` or `ImagePullBackOff` errors with 401 unauthorized mes
2. Verify the service account was created with the image pull secret: 2. Verify the service account was created with the image pull secret:
```bash ```bash
kubectl get serviceaccount profile-sla-sa -n $NAMESPACE -o yaml kubectl get serviceaccount dynamo-sa -n $NAMESPACE -o yaml
``` ```
3. The service account should show `imagePullSecrets` containing `nvcr-imagepullsecret`. 3. The service account should show `imagePullSecrets` containing `nvcr-imagepullsecret`.
...@@ -16,7 +16,6 @@ Quick deployment guide for the disaggregated planner with automatic scaling. ...@@ -16,7 +16,6 @@ Quick deployment guide for the disaggregated planner with automatic scaling.
```mermaid ```mermaid
flowchart LR flowchart LR
Frontend --"/metrics"--> Prometheus Frontend --"/metrics"--> Prometheus
Prometheus --"scrape"--> Prometheus
Planner --"query API"--> Prometheus Planner --"query API"--> Prometheus
Planner --"scaling decisions"--> Workers["prefill<br/>backend"] Planner --"scaling decisions"--> Workers["prefill<br/>backend"]
Frontend -.->|"requests"| Workers Frontend -.->|"requests"| Workers
...@@ -25,7 +24,7 @@ flowchart LR ...@@ -25,7 +24,7 @@ flowchart LR
## Prerequisites ## Prerequisites
- Kubernetes cluster with GPU nodes - Kubernetes cluster with GPU nodes
- `hf-token-secret` created in target namespace - `hf-token-secret` created in target namespace
- [Pre-Deployment Profiling](../../architecture/pre_deployment_profiling.md) results saved to `profiling-pvc` PVC. - [Pre-Deployment Profiling](../../benchmarks/pre_deployment_profiling.md) results saved to `dynamo-pvc` PVC.
- Prefill and decode workers use the best parallelization mapping suggested by the pre-deployment profiling script. - Prefill and decode workers use the best parallelization mapping suggested by the pre-deployment profiling script.
```bash ```bash
...@@ -62,7 +61,7 @@ vllm-disagg-planner-prefill-* 1/1 Running ...@@ -62,7 +61,7 @@ vllm-disagg-planner-prefill-* 1/1 Running
kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-frontend 8000:8000 kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-frontend 8000:8000
# Send a streaming request (required for full metrics) # Send a streaming request (required for full metrics)
curl http://localhost:8000/v1/chat/completions \ curl -N http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
"model": "Qwen/Qwen3-0.6B", "model": "Qwen/Qwen3-0.6B",
...@@ -101,8 +100,8 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10 ...@@ -101,8 +100,8 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
**Connection Issues:** **Connection Issues:**
```bash ```bash
# Verify Prometheus is accessible (runs on port 8000) # Verify Prometheus is accessible (runs on port 8000)
kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 8000:8000 kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 9090:8000
curl "http://localhost:8000/api/v1/query?query=up" curl "http://localhost:9090/api/v1/query?query=up"
``` ```
**Missing Metrics:** **Missing Metrics:**
......
...@@ -21,7 +21,7 @@ Use the pre-configured test deployment with sample profiling data, we provide th ...@@ -21,7 +21,7 @@ Use the pre-configured test deployment with sample profiling data, we provide th
### Option B: Use Your Own Profiling Results ### Option B: Use Your Own Profiling Results
1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/architecture/pre_deployment_profiling.md) for detailed instructions. 1. Run pre-deployment profiling for your specific setup. See the [pre-deployment profiling documentation](../../docs/benchmarks/pre_deployment_profiling.md) for detailed instructions.
## Interpolator Testing ## Interpolator Testing
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment