feat: support yaml config input for pre-deployment sweep script (#3622)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

feat: support yaml config input for pre-deployment sweep script (#3622)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
400dceae · Hongkuan Zhou · GitHub · c6e3db5e · 400dceae · 400dceae
Unverified Commit 400dceae authored Oct 14, 2025 by Hongkuan Zhou Committed by GitHub Oct 14, 2025
4 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import argparse
 import asyncio
 import logging
 import math
@@ -26,7 +25,6 @@ from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill
 from benchmarks.profiler.utils.config import generate_dgd_config_with_planner
 from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS
 from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
-from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
 from benchmarks.profiler.utils.plot import (
    plot_decode_performance,
    plot_prefill_performance,
@@ -46,6 +44,7 @@ from benchmarks.profiler.utils.profile_prefill import (
    profile_prefill,
    profile_prefill_aiconfigurator,
 )
+from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
 from deploy.utils.dynamo_deployment import (
    DynamoDeploymentClient,
    cleanup_remaining_deployments,
@@ -741,166 +740,9 @@ async def run_profile(args):
        await cleanup_remaining_deployments(deployment_clients, args.namespace)
        logger.info("Final cleanup completed.")
-    # deploy the optimized DGD with planner
-    if args.deploy_after_profile and not args.dry_run:
-        logger.info("Deploying the optimized DGD with planner...")
-        # TODO: check conflicts for dynamo namespace and DGD name
-        # TODO: handle deployment errors and propagate proper error messages to users
-        client = DynamoDeploymentClient(
-            namespace=args.namespace,
-            base_log_dir=f"{args.output_dir}/final_deployment",
-            model_name=model_name,
-            service_name=args.service_name,
-            frontend_port=frontend_port,
-            deployment_name=config["metadata"]["name"],
-        )
-        await client.create_deployment(f"{args.output_dir}/config_with_planner.yaml")
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    args = create_profiler_parser()
-        description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill."
-    )
-    parser.add_argument(
-        "--namespace",
-        type=str,
-        default="dynamo-sla-profiler",
-        help="Kubernetes namespace to deploy the DynamoGraphDeployment",
-    )
-    parser.add_argument(
-        "--backend",
-        type=str,
-        default="vllm",
-        choices=["vllm", "sglang", "trtllm"],
-        help="backend type, currently support [vllm, sglang, trtllm]",
-    )
-    parser.add_argument(
-        "--config",
-        type=str,
-        required=True,
-        help="Path to the DynamoGraphDeployment config file",
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=str,
-        default="profiling_results",
-        help="Path to the output results directory",
-    )
-    parser.add_argument(
-        "--min-num-gpus-per-engine",
-        type=int,
-        default=1,
-        help="minimum number of GPUs per engine",
-    )
-    parser.add_argument(
-        "--max-num-gpus-per-engine",
-        type=int,
-        default=8,
-        help="maximum number of GPUs per engine",
-    )
-    parser.add_argument(
-        "--skip-existing-results",
-        action="store_true",
-        help="Skip TP sizes that already have results in the output directory",
-    )
-    parser.add_argument(
-        "--force-rerun",
-        action="store_true",
-        help="Force re-running all tests even if results already exist (overrides --skip-existing-results)",
-    )
-    parser.add_argument(
-        "--isl", type=int, default=3000, help="target input sequence length"
-    )
-    parser.add_argument(
-        "--osl", type=int, default=500, help="target output sequence length"
-    )
-    parser.add_argument(
-        "--ttft", type=int, default=50, help="target Time To First Token in ms"
-    )
-    parser.add_argument(
-        "--itl", type=int, default=10, help="target Inter Token Latency in ms"
-    )
-    # arguments used for interpolating TTFT and ITL under different ISL/OSL
-    parser.add_argument(
-        "--max-context-length",
-        type=int,
-        default=16384,
-        help="maximum context length supported by the served model",
-    )
-    parser.add_argument(
-        "--prefill-interpolation-granularity",
-        type=int,
-        default=16,
-        help="how many samples to benchmark to interpolate TTFT under different ISL",
-    )
-    parser.add_argument(
-        "--decode-interpolation-granularity",
-        type=int,
-        default=6,
-        help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
-    )
-    parser.add_argument(
-        "--service-name",
-        type=str,
-        default="",
-        help="Service name for port forwarding (default: {deployment_name}-frontend)",
-    )
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="Dry run the profile job",
-    )
-    parser.add_argument(
-        "--is-moe-model",
-        action="store_true",
-        dest="is_moe_model",
-        help="Enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode",
-    )
-    parser.add_argument(
-        "--num-gpus-per-node",
-        type=int,
-        default=8,
-        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
-    )
-    # arguments for dgd config generation and deployment
-    parser.add_argument(
-        "--deploy-after-profile",
-        action="store_true",
-        help="deploy the optimized DGD with planner",
-    )
-    # Dynamically add all planner arguments from planner_argparse.py
-    add_planner_arguments_to_parser(parser, prefix="planner-")
-    # arguments if using aiconfigurator
-    parser.add_argument(
-        "--use-ai-configurator",
-        action="store_true",
-        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
-    )
-    parser.add_argument(
-        "--aic-system",
-        type=str,
-        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
-    )
-    parser.add_argument(
-        "--aic-model-name",
-        type=str,
-        help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
-    )
-    parser.add_argument(
-        "--aic-backend",
-        type=str,
-        default="",
-        help="aiconfigurator backend of the target model, if not provided, will use args.backend",
-    )
-    parser.add_argument(
-        "--aic-backend-version",
-        type=str,
-        help="Specify backend version when using aiconfigurator to estimate perf.",
-    )
-    args = parser.parse_args()
    # setup file logging
    os.makedirs(args.output_dir, exist_ok=True)

--- a/benchmarks/profiler/utils/profiler_argparse.py
+++ b/benchmarks/profiler/utils/profiler_argparse.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+import ast
+from typing import Any, Dict
+import yaml
+from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
+def parse_config_string(config_str: str) -> Dict[str, Any]:
+    """Parse a configuration string given as a Python dict literal, YAML, or JSON.
+
+    Supported input formats, tried in this order:
+    1. Python dict literal: "{'engine': {'backend': 'vllm'}, 'sla': {'isl': 3000}}"
+    2. YAML string: "engine:\n  backend: vllm\nsla:\n  isl: 3000"
+    3. JSON string: '{"engine": {"backend": "vllm"}, "sla": {"isl": 3000}}'
+       (handled by the same ``yaml.safe_load`` call as YAML)
+
+    Args:
+        config_str: Configuration string in one of the supported formats.
+
+    Returns:
+        Dictionary containing the configuration.
+
+    Raises:
+        ValueError: If the string cannot be parsed by either parser, or if the
+            parsed value is not a dictionary.
+    """
+    # Try 1: Python dict literal (most direct for CLI use). literal_eval only
+    # evaluates literals, so it never executes code from the string.
+    try:
+        config = ast.literal_eval(config_str)
+    except (ValueError, SyntaxError, TypeError):
+        # TypeError covers literals such as "{[1]: 2}" (unhashable key); fall
+        # through to the YAML parser instead of crashing.
+        config = None
+    if isinstance(config, dict):
+        return config
+    # Try 2: YAML/JSON (for K8s ConfigMaps and files)
+    try:
+        config = yaml.safe_load(config_str)
+    except yaml.YAMLError:
+        config = None
+    if isinstance(config, dict):
+        return config
+    # Neither parser produced a dict. The examples below are plain (non-f)
+    # strings on purpose: the literal braces must not be interpreted as
+    # f-string replacement fields.
+    raise ValueError(
+        "Failed to parse config string. Expected Python dict literal, YAML, or JSON format. "
+        "Examples:\n"
+        "  Python dict: \"{'engine': {'backend': 'vllm'}}\"\n"
+        '  YAML: "engine:\\n  backend: vllm"\n'
+        '  JSON: \'{"engine": {"backend": "vllm"}}\''
+    )
+def create_profiler_parser() -> argparse.Namespace:
+    """Parse profiler arguments from the CLI, optionally seeded by --profile-config.
+
+    The value of ``--profile-config`` (Python dict literal, YAML, or JSON string)
+    is parsed first and used to fill the *defaults* of the regular CLI options,
+    so an option given explicitly on the command line overrides the config value.
+
+    Config structure:
+        output_dir: String (path to the output results directory, default: profiling_results)
+        deployment:
+            namespace: String (kubernetes namespace, default: dynamo-sla-profiler)
+            service_name: String (service name, default: "")
+        engine:
+            backend: String (backend type, currently support [vllm, sglang, trtllm], default: vllm)
+            config: String (path to the DynamoGraphDeployment config file)
+            max_context_length: Int (maximum context length supported by the served model, default: 16384)
+            is_moe_model: Boolean (enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode, default: False)
+        hardware:
+            min_num_gpus_per_engine: Int (minimum number of GPUs per engine, default: 1)
+            max_num_gpus_per_engine: Int (maximum number of GPUs per engine, default: 8)
+            num_gpus_per_node: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 8)
+        sweep:
+            skip_existing_results: Boolean (skip TP sizes that already have results in the output directory, default: False)
+            force_rerun: Boolean (force re-running all tests even if results already exist (overrides --skip-existing-results), default: False)
+            prefill_interpolation_granularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
+            decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
+            use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
+            aic_system: String (target system for use with aiconfigurator, default: None)
+            aic_model_name: String (aiconfigurator name of the target model, default: None)
+            aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
+            aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
+            dry_run: Boolean (dry run the profile job, default: False)
+        sla:
+            isl: Int (target input sequence length, default: 3000)
+            osl: Int (target output sequence length, default: 500)
+            ttft: Int (target Time To First Token in ms, default: 50)
+            itl: Int (target Inter Token Latency in ms, default: 10)
+        planner: (planner-bypass arguments, use hyphens or underscores)
+            e.g., planner-min-endpoint: 2  # or planner_min_endpoint: 2 (both work)
+
+    Returns:
+        The parsed ``argparse.Namespace``, with the ``profile_config`` attribute removed.
+
+    Raises:
+        ValueError: If ``--profile-config`` cannot be parsed, or one of its
+            sections is present but is not a mapping.
+        SystemExit: Via ``parser.error`` when no DGD config path is available
+            from either ``--config`` or the profile config, and on ordinary
+            argparse usage errors.
+    """
+    # Step 1: Pre-parse to check if --profile-config is provided
+    pre_parser = argparse.ArgumentParser(add_help=False)
+    pre_parser.add_argument("--profile-config", type=str)
+    pre_args, _ = pre_parser.parse_known_args()
+    # Step 2: Parse config if provided
+    config: Dict[str, Any] = {}
+    if pre_args.profile_config:
+        config = parse_config_string(pre_args.profile_config)
+
+    def _section(name: str) -> Dict[str, Any]:
+        """Return config[name] as a dict; a missing or empty (None) section is {}."""
+        value = config.get(name)
+        if value is None:
+            # YAML such as "sweep:" with no children parses to None.
+            return {}
+        if not isinstance(value, dict):
+            raise ValueError(
+                f"profile-config section '{name}' must be a mapping, "
+                f"got {type(value).__name__}"
+            )
+        return value
+
+    deployment_cfg = _section("deployment")
+    engine_cfg = _section("engine")
+    hardware_cfg = _section("hardware")
+    sweep_cfg = _section("sweep")
+    sla_cfg = _section("sla")
+    planner_cfg = _section("planner")
+    # Step 3: Create main parser with config-aware defaults
+    parser = argparse.ArgumentParser(
+        description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill."
+    )
+    parser.add_argument(
+        "--profile-config",
+        type=str,
+        help="Configuration as Python dict literal, YAML, or JSON string. CLI args override config values. "
+        "Example: \"{'engine': {'backend': 'vllm', 'config': '/path'}, 'sla': {'isl': 3000}}\"",
+    )
+    # CLI arguments with config-aware defaults
+    parser.add_argument(
+        "--namespace",
+        type=str,
+        default=deployment_cfg.get("namespace", "dynamo-sla-profiler"),
+        help="Kubernetes namespace to deploy the DynamoGraphDeployment",
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default=engine_cfg.get("backend", "vllm"),
+        choices=["vllm", "sglang", "trtllm"],
+        help="backend type, currently support [vllm, sglang, trtllm]",
+    )
+    # Not marked required here because the value may come from the profile
+    # config; presence is validated after parsing instead.
+    parser.add_argument(
+        "--config",
+        type=str,
+        default=engine_cfg.get("config", ""),
+        required=False,
+        help="Path to the DynamoGraphDeployment config file (required, can be provided via CLI or config)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=config.get("output_dir", "profiling_results"),
+        help="Path to the output results directory",
+    )
+    parser.add_argument(
+        "--min-num-gpus-per-engine",
+        type=int,
+        default=hardware_cfg.get("min_num_gpus_per_engine", 1),
+        help="minimum number of GPUs per engine",
+    )
+    parser.add_argument(
+        "--max-num-gpus-per-engine",
+        type=int,
+        default=hardware_cfg.get("max_num_gpus_per_engine", 8),
+        help="maximum number of GPUs per engine",
+    )
+    parser.add_argument(
+        "--skip-existing-results",
+        action="store_true",
+        default=sweep_cfg.get("skip_existing_results", False),
+        help="Skip TP sizes that already have results in the output directory",
+    )
+    parser.add_argument(
+        "--force-rerun",
+        action="store_true",
+        default=sweep_cfg.get("force_rerun", False),
+        help="Force re-running all tests even if results already exist (overrides --skip-existing-results)",
+    )
+    parser.add_argument(
+        "--isl",
+        type=int,
+        default=sla_cfg.get("isl", 3000),
+        help="target input sequence length",
+    )
+    parser.add_argument(
+        "--osl",
+        type=int,
+        default=sla_cfg.get("osl", 500),
+        help="target output sequence length",
+    )
+    parser.add_argument(
+        "--ttft",
+        type=int,
+        default=sla_cfg.get("ttft", 50),
+        help="target Time To First Token in ms",
+    )
+    parser.add_argument(
+        "--itl",
+        type=int,
+        default=sla_cfg.get("itl", 10),
+        help="target Inter Token Latency in ms",
+    )
+    # arguments used for interpolating TTFT and ITL under different ISL/OSL
+    parser.add_argument(
+        "--max-context-length",
+        type=int,
+        default=engine_cfg.get("max_context_length", 16384),
+        help="maximum context length supported by the served model",
+    )
+    parser.add_argument(
+        "--prefill-interpolation-granularity",
+        type=int,
+        default=sweep_cfg.get("prefill_interpolation_granularity", 16),
+        help="how many samples to benchmark to interpolate TTFT under different ISL",
+    )
+    parser.add_argument(
+        "--decode-interpolation-granularity",
+        type=int,
+        default=sweep_cfg.get("decode_interpolation_granularity", 6),
+        help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
+    )
+    parser.add_argument(
+        "--service-name",
+        type=str,
+        default=deployment_cfg.get("service_name", ""),
+        help="Service name for port forwarding (default: {deployment_name}-frontend)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        default=sweep_cfg.get("dry_run", False),
+        help="Dry run the profile job",
+    )
+    parser.add_argument(
+        "--is-moe-model",
+        action="store_true",
+        dest="is_moe_model",
+        default=engine_cfg.get("is_moe_model", False),
+        help="Enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode",
+    )
+    parser.add_argument(
+        "--num-gpus-per-node",
+        type=int,
+        default=hardware_cfg.get("num_gpus_per_node", 8),
+        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
+    )
+    # Dynamically add all planner arguments from planner_argparse.py
+    add_planner_arguments_to_parser(parser, prefix="planner-")
+    # Set defaults for any planner arguments found in config.planner
+    # Note: argparse converts hyphens to underscores, so we need to normalize keys
+    if planner_cfg:
+        normalized_planner_config = {
+            key.replace("-", "_"): value for key, value in planner_cfg.items()
+        }
+        parser.set_defaults(**normalized_planner_config)
+    # arguments if using aiconfigurator
+    parser.add_argument(
+        "--use-ai-configurator",
+        action="store_true",
+        default=sweep_cfg.get("use_ai_configurator", False),
+        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
+    )
+    parser.add_argument(
+        "--aic-system",
+        type=str,
+        default=sweep_cfg.get("aic_system"),
+        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
+    )
+    parser.add_argument(
+        "--aic-model-name",
+        type=str,
+        default=sweep_cfg.get("aic_model_name"),
+        help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
+    )
+    parser.add_argument(
+        "--aic-backend",
+        type=str,
+        default=sweep_cfg.get("aic_backend", ""),
+        help="aiconfigurator backend of the target model, if not provided, will use args.backend",
+    )
+    parser.add_argument(
+        "--aic-backend-version",
+        type=str,
+        default=sweep_cfg.get("aic_backend_version"),
+        help="Specify backend version when using aiconfigurator to estimate perf.",
+    )
+    # Parse arguments
+    args = parser.parse_args()
+    # remove --profile-config from args
+    if hasattr(args, "profile_config"):
+        delattr(args, "profile_config")
+    # Validate required arguments
+    if not args.config:
+        parser.error("--config is required (either via CLI or profile-config)")
+    return args
--- a/docs/benchmarks/pre_deployment_profiling.md
+++ b/docs/benchmarks/pre_deployment_profiling.md
@@ -128,8 +128,6 @@ spec:
 **For MoE models**, use `profile_sla_moe_job.yaml` with TEP/DEP configuration instead.
-If you want to automatically deploy the optimized DGD with planner after profiling, add `--deploy-after-profile` to the profiling job. It will deploy the DGD with the engine of the optimized parallelization mapping found for the SLA targets.
 ### Advanced Configuration
 - **Model caching**: For large models, create a multi-attach PVC to cache the model. See [recipes](../../recipes/README.md) for details.

--- a/docs/kubernetes/sla_planner_quickstart.md
+++ b/docs/kubernetes/sla_planner_quickstart.md
@@ -97,13 +97,10 @@ spec:
            - "20" # target ITL is 20ms
            - --backend
            - <vllm/sglang>
-            - --deploy-after-profile
 ```
 For MoE models, edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_moe_job.yaml` instead.
-To automatically deploy the optimized DGD with planner after profiling, add `--deploy-after-profile` to the profiling job. It will deploy the DGD with the engine of the optimized parallelization mapping found for the SLA targets.
 ### Step 1.4: Run Profiling
 Set the container image and config path:
@@ -136,7 +133,7 @@ kubectl logs job/profile-sla -n $NAMESPACE
 > [!NOTE]
 > **Time Investment**: This profiling process is comprehensive and typically takes **2-4 hours** to complete. The script systematically tests multiple tensor parallelism configurations and load conditions to find optimal performance settings.
-### Step 1.6: Download Profiling Results (Optional)
+### Step 1.6: Download Profiling Results
 If you want to view the profiling results and performance plots:
@@ -156,7 +153,13 @@ Final DGD config with planner: {...}
 Deploying the optimized DGD with planner...
 ```
-### Step 1.7: Wait for Deployment to be Ready
+### Step 1.7: Deploy the DGD with Planner
+```bash
+kubectl apply -f ./results/config_with_planner.yaml
+```
+### Step 1.8: Wait for Deployment to be Ready
 ```bash
 kubectl get pods -n $NAMESPACE
@@ -170,7 +173,7 @@ vllm-disagg-planner-backend-*             1/1 Running
 vllm-disagg-planner-prefill-*             1/1 Running
 ```
-### Step 1.8: Test the System
+### Step 1.9: Test the System
 ```bash
 # Port forward to frontend
@@ -192,7 +195,7 @@ curl -N http://localhost:8000/v1/chat/completions \
  }'
 ```
-### Step 1.9: Monitor Scaling
+### Step 1.10: Monitor Scaling
 ```bash
 # Check planner logs for scaling decisions