Commit 400dceae (unverified), authored by Hongkuan Zhou, committed by GitHub
Browse files

feat: support yaml config input for pre-deployment sweep script (#3622)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent c6e3db5e
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import argparse
import asyncio import asyncio
import logging import logging
import math import math
...@@ -26,7 +25,6 @@ from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill ...@@ -26,7 +25,6 @@ from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill
from benchmarks.profiler.utils.config import generate_dgd_config_with_planner from benchmarks.profiler.utils.config import generate_dgd_config_with_planner
from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
from benchmarks.profiler.utils.plot import ( from benchmarks.profiler.utils.plot import (
plot_decode_performance, plot_decode_performance,
plot_prefill_performance, plot_prefill_performance,
...@@ -46,6 +44,7 @@ from benchmarks.profiler.utils.profile_prefill import ( ...@@ -46,6 +44,7 @@ from benchmarks.profiler.utils.profile_prefill import (
profile_prefill, profile_prefill,
profile_prefill_aiconfigurator, profile_prefill_aiconfigurator,
) )
from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
from deploy.utils.dynamo_deployment import ( from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient, DynamoDeploymentClient,
cleanup_remaining_deployments, cleanup_remaining_deployments,
...@@ -741,166 +740,9 @@ async def run_profile(args): ...@@ -741,166 +740,9 @@ async def run_profile(args):
await cleanup_remaining_deployments(deployment_clients, args.namespace) await cleanup_remaining_deployments(deployment_clients, args.namespace)
logger.info("Final cleanup completed.") logger.info("Final cleanup completed.")
# deploy the optimized DGD with planner
if args.deploy_after_profile and not args.dry_run:
logger.info("Deploying the optimized DGD with planner...")
# TODO: check conflicts for dynamo namespace and DGD name
# TODO: handle deployment errors and propagate proper error messages to users
client = DynamoDeploymentClient(
namespace=args.namespace,
base_log_dir=f"{args.output_dir}/final_deployment",
model_name=model_name,
service_name=args.service_name,
frontend_port=frontend_port,
deployment_name=config["metadata"]["name"],
)
await client.create_deployment(f"{args.output_dir}/config_with_planner.yaml")
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( args = create_profiler_parser()
description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill."
)
parser.add_argument(
"--namespace",
type=str,
default="dynamo-sla-profiler",
help="Kubernetes namespace to deploy the DynamoGraphDeployment",
)
parser.add_argument(
"--backend",
type=str,
default="vllm",
choices=["vllm", "sglang", "trtllm"],
help="backend type, currently support [vllm, sglang, trtllm]",
)
parser.add_argument(
"--config",
type=str,
required=True,
help="Path to the DynamoGraphDeployment config file",
)
parser.add_argument(
"--output-dir",
type=str,
default="profiling_results",
help="Path to the output results directory",
)
parser.add_argument(
"--min-num-gpus-per-engine",
type=int,
default=1,
help="minimum number of GPUs per engine",
)
parser.add_argument(
"--max-num-gpus-per-engine",
type=int,
default=8,
help="maximum number of GPUs per engine",
)
parser.add_argument(
"--skip-existing-results",
action="store_true",
help="Skip TP sizes that already have results in the output directory",
)
parser.add_argument(
"--force-rerun",
action="store_true",
help="Force re-running all tests even if results already exist (overrides --skip-existing-results)",
)
parser.add_argument(
"--isl", type=int, default=3000, help="target input sequence length"
)
parser.add_argument(
"--osl", type=int, default=500, help="target output sequence length"
)
parser.add_argument(
"--ttft", type=int, default=50, help="target Time To First Token in ms"
)
parser.add_argument(
"--itl", type=int, default=10, help="target Inter Token Latency in ms"
)
# arguments used for interpolating TTFT and ITL under different ISL/OSL
parser.add_argument(
"--max-context-length",
type=int,
default=16384,
help="maximum context length supported by the served model",
)
parser.add_argument(
"--prefill-interpolation-granularity",
type=int,
default=16,
help="how many samples to benchmark to interpolate TTFT under different ISL",
)
parser.add_argument(
"--decode-interpolation-granularity",
type=int,
default=6,
help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
)
parser.add_argument(
"--service-name",
type=str,
default="",
help="Service name for port forwarding (default: {deployment_name}-frontend)",
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Dry run the profile job",
)
parser.add_argument(
"--is-moe-model",
action="store_true",
dest="is_moe_model",
help="Enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode",
)
parser.add_argument(
"--num-gpus-per-node",
type=int,
default=8,
help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
)
# arguments for dgd config generation and deployment
parser.add_argument(
"--deploy-after-profile",
action="store_true",
help="deploy the optimized DGD with planner",
)
# Dynamically add all planner arguments from planner_argparse.py
add_planner_arguments_to_parser(parser, prefix="planner-")
# arguments if using aiconfigurator
parser.add_argument(
"--use-ai-configurator",
action="store_true",
help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
)
parser.add_argument(
"--aic-system",
type=str,
help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
)
parser.add_argument(
"--aic-model-name",
type=str,
help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
)
parser.add_argument(
"--aic-backend",
type=str,
default="",
help="aiconfigurator backend of the target model, if not provided, will use args.backend",
)
parser.add_argument(
"--aic-backend-version",
type=str,
help="Specify backend version when using aiconfigurator to estimate perf.",
)
args = parser.parse_args()
# setup file logging # setup file logging
os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.output_dir, exist_ok=True)
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
import ast
from typing import Any, Dict
import yaml
from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
def parse_config_string(config_str: str) -> Dict[str, Any]:
    """Parse a configuration string given as a Python dict literal, YAML, or JSON.

    Supports multiple input formats:
    1. Python dict literal: "{'engine': {'backend': 'vllm'}, 'sla': {'isl': 3000}}"
    2. YAML string: "engine:\\n backend: vllm\\nsla:\\n isl: 3000"
    3. JSON string: '{"engine": {"backend": "vllm"}, "sla": {"isl": 3000}}'

    The formats are tried in that order; the first attempt that yields a
    ``dict`` wins. A value that parses successfully but is not a ``dict``
    (for example a list or a bare scalar) is treated as a parse failure.

    Args:
        config_str: Configuration string in one of the supported formats

    Returns:
        Dictionary containing the configuration

    Raises:
        ValueError: If config cannot be parsed or is not a dictionary
    """
    # Try 1: Parse as Python dict literal (most direct for CLI).
    # ast.literal_eval is documented to raise any of these on malformed input
    # (e.g. "{[1]: 2}" raises TypeError for the unhashable key); none of them
    # should escape, since the contract of this function is ValueError only.
    try:
        literal_value = ast.literal_eval(config_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        literal_value = None
    if isinstance(literal_value, dict):
        return literal_value

    # Try 2: Parse as YAML/JSON (for K8s ConfigMaps and files).
    # JSON input is handled by the same safe YAML loader.
    try:
        yaml_value = yaml.safe_load(config_str)
    except yaml.YAMLError:
        yaml_value = None
    if isinstance(yaml_value, dict):
        return yaml_value

    # If we got here, parsing failed.
    # NOTE: these are deliberately plain (non-f) strings. The examples contain
    # literal braces, which an f-string would try to evaluate as replacement
    # fields and fail with "Invalid format specifier" before the message is built.
    raise ValueError(
        "Failed to parse config string. Expected Python dict literal, YAML, or JSON format. "
        "Examples:\n"
        " Python dict: \"{'engine': {'backend': 'vllm'}}\"\n"
        ' YAML: "engine:\\n backend: vllm"\n'
        ' JSON: \'{"engine": {"backend": "vllm"}}\''
    )
def create_profiler_parser() -> argparse.Namespace:
    """
    Build the profiler CLI with support for a YAML config string and parse ``sys.argv``.

    Despite the name, this function returns the *parsed* arguments, not the
    parser. Values are resolved with the precedence
    ``explicit CLI flag > --profile-config value > built-in default``: the
    config is read in a pre-parse pass and its values are installed as the
    argparse defaults of the main parser.

    Config structure:
        output_dir: String (path to the output results directory, default: profiling_results)
        deployment:
            namespace: String (kubernetes namespace, default: dynamo-sla-profiler)
            service_name: String (service name, default: "")
        engine:
            backend: String (backend type, currently support [vllm, sglang, trtllm], default: vllm)
            config: String (path to the DynamoGraphDeployment config file)
            max_context_length: Int (maximum context length supported by the served model, default: 16384)
            is_moe_model: Boolean (enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode, default: False)
        hardware:
            min_num_gpus_per_engine: Int (minimum number of GPUs per engine, default: 1)
            max_num_gpus_per_engine: Int (maximum number of GPUs per engine, default: 8)
            num_gpus_per_node: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 8)
        sweep:
            skip_existing_results: Boolean (skip TP sizes that already have results in the output directory, default: False)
            force_rerun: Boolean (force re-running all tests even if results already exist (overrides --skip-existing-results), default: False)
            prefill_interpolation_granularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
            decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
            use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
            aic_system: String (target system for use with aiconfigurator, default: None)
            aic_model_name: String (aiconfigurator name of the target model, default: None)
            aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
            aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
            dry_run: Boolean (dry run the profile job, default: False)
        sla:
            isl: Int (target input sequence length, default: 3000)
            osl: Int (target output sequence length, default: 500)
            ttft: Int (target Time To First Token in ms, default: 50)
            itl: Int (target Inter Token Latency in ms, default: 10)
        planner: (planner-bypass arguments, use hyphens or underscores)
            i.e., planner-min-endpoint: 2  # or planner_min_endpoint: 2 (both work)

    Returns:
        The parsed arguments, with the ``profile_config`` attribute removed.

    Raises:
        ValueError: Propagated from ``parse_config_string`` when the
            ``--profile-config`` value cannot be parsed into a dictionary.
        SystemExit: Via ``parser.error`` when a config section is not a
            mapping, when the backend is not one of the supported choices,
            or when no DynamoGraphDeployment config path was provided.
    """
    backend_choices = ["vllm", "sglang", "trtllm"]

    # Step 1: Pre-parse to check if --profile-config is provided.
    # parse_known_args lets every other flag pass through untouched.
    pre_parser = argparse.ArgumentParser(add_help=False)
    pre_parser.add_argument("--profile-config", type=str)
    pre_args, _ = pre_parser.parse_known_args()

    # Step 2: Parse config if provided
    config = {}
    if pre_args.profile_config:
        config = parse_config_string(pre_args.profile_config)

    # Step 3: Create main parser with config-aware defaults
    parser = argparse.ArgumentParser(
        description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill."
    )

    def _section(name: str) -> dict:
        """Return the named config section as a dict, treating absent/null as empty."""
        value = config.get(name)
        if value is None:
            # A section declared without a body (e.g. YAML "sweep:") loads as
            # None; fall back to the built-in defaults instead of crashing.
            return {}
        if not isinstance(value, dict):
            parser.error(
                f"profile-config section '{name}' must be a mapping, "
                f"got {type(value).__name__}"
            )
        return value

    deployment_cfg = _section("deployment")
    engine_cfg = _section("engine")
    hardware_cfg = _section("hardware")
    sweep_cfg = _section("sweep")
    sla_cfg = _section("sla")
    planner_cfg = _section("planner")

    parser.add_argument(
        "--profile-config",
        type=str,
        help="Configuration as Python dict literal, YAML, or JSON string. CLI args override config values. "
        "Example: \"{'engine': {'backend': 'vllm', 'config': '/path'}, 'sla': {'isl': 3000}}\"",
    )

    # CLI arguments with config-aware defaults
    parser.add_argument(
        "--namespace",
        type=str,
        default=deployment_cfg.get("namespace", "dynamo-sla-profiler"),
        help="Kubernetes namespace to deploy the DynamoGraphDeployment",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default=engine_cfg.get("backend", "vllm"),
        choices=backend_choices,
        help="backend type, currently support [vllm, sglang, trtllm]",
    )
    parser.add_argument(
        "--config",
        type=str,
        default=engine_cfg.get("config", ""),
        required=False,
        help="Path to the DynamoGraphDeployment config file (required, can be provided via CLI or config)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=config.get("output_dir", "profiling_results"),
        help="Path to the output results directory",
    )
    parser.add_argument(
        "--min-num-gpus-per-engine",
        type=int,
        default=hardware_cfg.get("min_num_gpus_per_engine", 1),
        help="minimum number of GPUs per engine",
    )
    parser.add_argument(
        "--max-num-gpus-per-engine",
        type=int,
        default=hardware_cfg.get("max_num_gpus_per_engine", 8),
        help="maximum number of GPUs per engine",
    )
    parser.add_argument(
        "--skip-existing-results",
        action="store_true",
        default=sweep_cfg.get("skip_existing_results", False),
        help="Skip TP sizes that already have results in the output directory",
    )
    parser.add_argument(
        "--force-rerun",
        action="store_true",
        default=sweep_cfg.get("force_rerun", False),
        help="Force re-running all tests even if results already exist (overrides --skip-existing-results)",
    )
    parser.add_argument(
        "--isl",
        type=int,
        default=sla_cfg.get("isl", 3000),
        help="target input sequence length",
    )
    parser.add_argument(
        "--osl",
        type=int,
        default=sla_cfg.get("osl", 500),
        help="target output sequence length",
    )
    parser.add_argument(
        "--ttft",
        type=int,
        default=sla_cfg.get("ttft", 50),
        help="target Time To First Token in ms",
    )
    parser.add_argument(
        "--itl",
        type=int,
        default=sla_cfg.get("itl", 10),
        help="target Inter Token Latency in ms",
    )

    # arguments used for interpolating TTFT and ITL under different ISL/OSL
    parser.add_argument(
        "--max-context-length",
        type=int,
        default=engine_cfg.get("max_context_length", 16384),
        help="maximum context length supported by the served model",
    )
    parser.add_argument(
        "--prefill-interpolation-granularity",
        type=int,
        default=sweep_cfg.get("prefill_interpolation_granularity", 16),
        help="how many samples to benchmark to interpolate TTFT under different ISL",
    )
    parser.add_argument(
        "--decode-interpolation-granularity",
        type=int,
        default=sweep_cfg.get("decode_interpolation_granularity", 6),
        help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
    )
    parser.add_argument(
        "--service-name",
        type=str,
        default=deployment_cfg.get("service_name", ""),
        help="Service name for port forwarding (default: {deployment_name}-frontend)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=sweep_cfg.get("dry_run", False),
        help="Dry run the profile job",
    )
    parser.add_argument(
        "--is-moe-model",
        action="store_true",
        dest="is_moe_model",
        default=engine_cfg.get("is_moe_model", False),
        help="Enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode",
    )
    parser.add_argument(
        "--num-gpus-per-node",
        type=int,
        default=hardware_cfg.get("num_gpus_per_node", 8),
        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
    )

    # Dynamically add all planner arguments from planner_argparse.py
    add_planner_arguments_to_parser(parser, prefix="planner-")

    # Set defaults for any planner arguments found in config.planner
    # Note: argparse converts hyphens to underscores, so we need to normalize keys
    if planner_cfg:
        normalized_planner_config = {
            key.replace("-", "_"): value for key, value in planner_cfg.items()
        }
        parser.set_defaults(**normalized_planner_config)

    # arguments if using aiconfigurator
    parser.add_argument(
        "--use-ai-configurator",
        action="store_true",
        default=sweep_cfg.get("use_ai_configurator", False),
        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
    )
    parser.add_argument(
        "--aic-system",
        type=str,
        default=sweep_cfg.get("aic_system"),
        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
    )
    parser.add_argument(
        "--aic-model-name",
        type=str,
        default=sweep_cfg.get("aic_model_name"),
        help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
    )
    parser.add_argument(
        "--aic-backend",
        type=str,
        default=sweep_cfg.get("aic_backend", ""),
        help="aiconfigurator backend of the target model, if not provided, will use args.backend",
    )
    parser.add_argument(
        "--aic-backend-version",
        type=str,
        default=sweep_cfg.get("aic_backend_version"),
        help="Specify backend version when using aiconfigurator to estimate perf.",
    )

    # Parse arguments
    args = parser.parse_args()

    # remove --profile-config from args
    if hasattr(args, "profile_config"):
        delattr(args, "profile_config")

    # argparse only checks `choices` for values given on the command line, not
    # for defaults, so a backend coming from the profile config must be
    # validated explicitly.
    if args.backend not in backend_choices:
        parser.error(
            f"invalid backend {args.backend!r} (choose from {', '.join(backend_choices)})"
        )

    # Validate required arguments
    if not args.config:
        parser.error("--config is required (either via CLI or profile-config)")

    return args
...@@ -128,8 +128,6 @@ spec: ...@@ -128,8 +128,6 @@ spec:
**For MoE models**, use `profile_sla_moe_job.yaml` with TEP/DEP configuration instead. **For MoE models**, use `profile_sla_moe_job.yaml` with TEP/DEP configuration instead.
If you want to automatically deploy the optimized DGD with planner after profiling, add `--deploy-after-profile` to the profiling job. It will deploy the DGD with the engine of the optimized parallelization mapping found for the SLA targets.
### Advanced Configuration ### Advanced Configuration
- **Model caching**: For large models, create a multi-attach PVC to cache the model. See [recipes](../../recipes/README.md) for details. - **Model caching**: For large models, create a multi-attach PVC to cache the model. See [recipes](../../recipes/README.md) for details.
......
...@@ -97,13 +97,10 @@ spec: ...@@ -97,13 +97,10 @@ spec:
- "20" # target ITL is 20ms - "20" # target ITL is 20ms
- --backend - --backend
- <vllm/sglang> - <vllm/sglang>
- --deploy-after-profile
``` ```
For MoE models, edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_moe_job.yaml` instead. For MoE models, edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_moe_job.yaml` instead.
To automatically deploy the optimized DGD with planner after profiling, add `--deploy-after-profile` to the profiling job. It will deploy the DGD with the engine of the optimized parallelization mapping found for the SLA targets.
### Step 1.4: Run Profiling ### Step 1.4: Run Profiling
Set the container image and config path: Set the container image and config path:
...@@ -136,7 +133,7 @@ kubectl logs job/profile-sla -n $NAMESPACE ...@@ -136,7 +133,7 @@ kubectl logs job/profile-sla -n $NAMESPACE
> [!NOTE] > [!NOTE]
> **Time Investment**: This profiling process is comprehensive and typically takes **2-4 hours** to complete. The script systematically tests multiple tensor parallelism configurations and load conditions to find optimal performance settings. > **Time Investment**: This profiling process is comprehensive and typically takes **2-4 hours** to complete. The script systematically tests multiple tensor parallelism configurations and load conditions to find optimal performance settings.
### Step 1.6: Download Profiling Results (Optional) ### Step 1.6: Download Profiling Results
If you want to view the profiling results and performance plots: If you want to view the profiling results and performance plots:
...@@ -156,7 +153,13 @@ Final DGD config with planner: {...} ...@@ -156,7 +153,13 @@ Final DGD config with planner: {...}
Deploying the optimized DGD with planner... Deploying the optimized DGD with planner...
``` ```
### Step 1.7: Wait for Deployment to be Ready ### Step 1.7: Deploy the DGD with Planner
```bash
kubectl apply -f ./results/config_with_planner.yaml
```
### Step 1.8: Wait for Deployment to be Ready
```bash ```bash
kubectl get pods -n $NAMESPACE kubectl get pods -n $NAMESPACE
...@@ -170,7 +173,7 @@ vllm-disagg-planner-backend-* 1/1 Running ...@@ -170,7 +173,7 @@ vllm-disagg-planner-backend-* 1/1 Running
vllm-disagg-planner-prefill-* 1/1 Running vllm-disagg-planner-prefill-* 1/1 Running
``` ```
### Step 1.8: Test the System ### Step 1.9: Test the System
```bash ```bash
# Port forward to frontend # Port forward to frontend
...@@ -192,7 +195,7 @@ curl -N http://localhost:8000/v1/chat/completions \ ...@@ -192,7 +195,7 @@ curl -N http://localhost:8000/v1/chat/completions \
}' }'
``` ```
### Step 1.9: Monitor Scaling ### Step 1.10: Monitor Scaling
```bash ```bash
# Check planner logs for scaling decisions # Check planner logs for scaling decisions
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment