# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import argparse
import ast
import os
import re
from typing import Any, Dict

import yaml

from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
from benchmarks.profiler.utils.search_space_autogen import auto_generate_search_space


def _get(cfg: Dict[str, Any], camel: str, snake: str, default: Any = None) -> Any:
    """Return ``cfg[camel]`` if present, else ``cfg[snake]``, else ``default``.

    The camelCase key wins whenever it exists in ``cfg`` (even if its value is
    falsy or ``None``); the snake_case key is only a backwards-compat fallback.
    """
    if camel in cfg:
        return cfg[camel]
    return cfg.get(snake, default)


def _section(cfg: Dict[str, Any], camel: str, snake: str = "") -> Dict[str, Any]:
    """Return a nested config section as a dict, tolerating empty sections.

    A YAML section written with no body (``sla:``) parses to ``None``; chaining
    ``.get()`` on that would raise ``AttributeError``, so ``None`` is mapped to
    an empty dict here.

    Args:
        cfg: Parent mapping to look the section up in.
        camel: Preferred (camelCase) section key.
        snake: Optional snake_case fallback key; defaults to ``camel``.

    Returns:
        The section mapping, or ``{}`` when the section is missing or empty.

    Raises:
        ValueError: If the section exists but is not a mapping.
    """
    value = _get(cfg, camel, snake or camel, None)
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(
            f"Config section '{camel}' must be a mapping, got {type(value).__name__}"
        )
    return value


def _camel_to_snake(name: str) -> str:
    """Convert a camelCase identifier to snake_case.

    Names that are already snake_case pass through unchanged (apart from
    lowercasing).
    """
    # First pass: split before a capitalised word ("fooBar" -> "foo_Bar").
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    # Second pass: split a lowercase/digit followed directly by an uppercase
    # letter, which the first pass does not separate (e.g. "aicHfId").
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()


def parse_config_string(config_str: str) -> Dict[str, Any]:
    """Parse a configuration string given as a Python dict literal, YAML, or JSON.

    Supported input formats:
    1. Python dict literal: "{'engine': {'backend': 'vllm'}, 'sla': {'isl': 3000}}"
    2. YAML string: "engine:\\n backend: vllm\\nsla:\\n isl: 3000"
    3. JSON string: '{"engine": {"backend": "vllm"}, "sla": {"isl": 3000}}'

    Args:
        config_str: Configuration string in one of the supported formats.

    Returns:
        Dictionary containing the configuration.

    Raises:
        ValueError: If the string cannot be parsed by either parser, or parses
            to something other than a dictionary.
    """
    # Try 1: Python dict literal (most direct for CLI usage).
    # literal_eval raises TypeError for e.g. unhashable dict keys, so it is
    # caught alongside ValueError/SyntaxError to let the YAML attempt run.
    try:
        config = ast.literal_eval(config_str)
    except (ValueError, TypeError, SyntaxError):
        config = None
    if isinstance(config, dict):
        return config

    # Try 2: YAML (a superset of JSON; used for K8s ConfigMaps and files).
    try:
        config = yaml.safe_load(config_str)
    except yaml.YAMLError:
        config = None
    if isinstance(config, dict):
        return config

    # Neither parser produced a dict. These are deliberately plain (non-f)
    # strings: the examples contain literal braces, which an f-string would
    # try to evaluate as replacement fields.
    raise ValueError(
        "Failed to parse config string. Expected Python dict literal, YAML, or JSON format. "
        "Examples:\n"
        " Python dict: \"{'engine': {'backend': 'vllm'}}\"\n"
        ' YAML: "engine:\\n backend: vllm"\n'
        ' JSON: \'{"engine": {"backend": "vllm"}}\''
    )


def create_profiler_parser() -> argparse.Namespace:
    """
    Build the profiler argument parser, parse the command line, and return the result.

    Despite the name, this returns the parsed ``argparse.Namespace`` (not the
    parser). If ``--profile-config`` is given, it is parsed first and its values
    become the defaults of the CLI arguments, so explicit CLI arguments override
    config values. After parsing, ``profile_config`` is removed from the
    namespace and ``auto_generate_search_space(args)`` is called on it.

    Config structure (camelCase preferred, snake_case supported for backwards compat):
        outputDir: String (path to the output results directory, default: profiling_results)
        deployment:
            namespace: String (kubernetes namespace, default: dynamo-sla-profiler)
            serviceName: String (service name, default: "")
            model: String (served model name)
            dgdImage: String (container image to use for DGD components (frontend, planner, workers), overrides images in config file)
            modelCache:
                pvcName: String (name of the PVC to mount the model cache, if not provided, model must be HF name and will download from HF, default: "")
                pvcPath: String (path to the model cache in the PVC, default: "")
                mountPath: String (path to the model cache in the container, note that the PVC must be mounted to the same path for the profiling job, default: "/opt/model-cache")
        engine:
            backend: String (backend type, currently support [vllm, sglang, trtllm], default: vllm)
            config: String (path to the DynamoGraphDeployment config file, default: "")
            maxContextLength: Int (maximum context length supported by the served model, default: 0)
            isMoeModel: Boolean (enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode, default: False)
        hardware:
            minNumGpusPerEngine: Int (minimum number of GPUs per engine, default: 0)
            maxNumGpusPerEngine: Int (maximum number of GPUs per engine, default: 0)
            numGpusPerNode: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0)
            enableGpuDiscovery: Boolean (enable automatic GPU discovery from Kubernetes cluster nodes, when enabled overrides any manually specified hardware configuration, requires cluster-wide node access permissions, default: False)
        sweep:
            prefillInterpolationGranularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
            decodeInterpolationGranularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
            useAiConfigurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
            aicSystem: String (target system for use with aiconfigurator, default: None)
            aicHfId: String (aiconfigurator huggingface id of the target model, default: None)
            aicBackend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
            aicBackendVersion: String (specify backend version when using aiconfigurator to estimate perf, default: None)
            dryRun: Boolean (dry run the profile job, default: False)
            pickWithWebui: Boolean (pick the best parallelization mapping using webUI, default: False)
            webuiPort: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
        sla:
            isl: Int (target input sequence length, default: 3000)
            osl: Int (target output sequence length, default: 500)
            ttft: Float (target Time To First Token in milliseconds, default: 50)
            itl: Float (target Inter Token Latency in milliseconds, default: 10)
        planner: (planner arguments)
            e.g., plannerMinEndpoint: 2

    NOTE(review): ``engine.isMoeModel`` is listed above, but this function does
    not register a CLI argument for it or read it from the config -- confirm
    where (or whether) it is consumed.

    Returns:
        The parsed arguments, after ``auto_generate_search_space`` has run on them.

    Raises:
        ValueError: If ``--profile-config`` cannot be parsed into a dict, or a
            config section is present but is not a mapping.
        SystemExit: Via ``parser.error`` when neither ``--model`` nor
            ``--config`` is provided (or on any other argparse failure).
    """
    # Step 1: Pre-parse to check if --profile-config is provided
    pre_parser = argparse.ArgumentParser(add_help=False)
    pre_parser.add_argument("--profile-config", type=str)
    pre_args, _ = pre_parser.parse_known_args()

    # Step 2: Parse config if provided
    config: Dict[str, Any] = {}
    if pre_args.profile_config:
        config = parse_config_string(pre_args.profile_config)

    # Resolve every section once. _section() maps an empty section (None) to
    # {} so the .get()/_get() lookups below cannot hit AttributeError.
    deployment_cfg = _section(config, "deployment")
    model_cache_config = _section(deployment_cfg, "modelCache", "model_cache")
    engine_cfg = _section(config, "engine")
    hardware_cfg = _section(config, "hardware")
    sweep_cfg = _section(config, "sweep")
    sla_cfg = _section(config, "sla")
    planner_config = _section(config, "planner")

    # Step 3: Create main parser with config-aware defaults
    parser = argparse.ArgumentParser(
        description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill."
    )

    parser.add_argument(
        "--profile-config",
        type=str,
        help="Configuration as Python dict literal, YAML, or JSON string. CLI args override config values. "
        "Example: \"{'engine': {'backend': 'vllm', 'config': '/path'}, 'sla': {'isl': 3000}}\"",
    )

    # CLI arguments with config-aware defaults
    parser.add_argument(
        "--model",
        type=str,
        default=deployment_cfg.get("model", ""),
        help="Served model name",
    )
    parser.add_argument(
        "--model-cache-pvc-name",
        type=str,
        default=_get(model_cache_config, "pvcName", "pvc_name", ""),
        help="Name of the PVC that contains the model weights. If not provided, args.model must be a HF model name and will download from HF",
    )
    parser.add_argument(
        "--model-cache-pvc-path",
        type=str,
        default=_get(model_cache_config, "pvcPath", "pvc_path", ""),
        help="Path to the model cache in the PVC",
    )
    parser.add_argument(
        "--model-cache-pvc-mount-path",
        type=str,
        default=_get(model_cache_config, "mountPath", "mount_path", "/opt/model-cache"),
        help="Path to the model cache in the container, note that the PVC must be mounted to the same path for the profiling job",
    )
    parser.add_argument(
        "--dgd-image",
        type=str,
        default=_get(deployment_cfg, "dgdImage", "dgd_image", ""),
        help="Container image to use for DGD components (frontend, planner, workers). Overrides images in config file.",
    )
    parser.add_argument(
        "--namespace",
        type=str,
        default=deployment_cfg.get("namespace", "dynamo-sla-profiler"),
        help="Kubernetes namespace to deploy the DynamoGraphDeployment",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default=engine_cfg.get("backend", "vllm"),
        choices=["vllm", "sglang", "trtllm"],
        help="backend type, currently support [vllm, sglang, trtllm]",
    )
    parser.add_argument(
        "--config",
        type=str,
        default=engine_cfg.get("config", ""),
        required=False,
        help="Path to the DynamoGraphDeployment config file (required, can be provided via CLI or config)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=_get(config, "outputDir", "output_dir", "profiling_results"),
        help="Path to the output results directory",
    )
    parser.add_argument(
        "--min-num-gpus-per-engine",
        type=int,
        default=_get(hardware_cfg, "minNumGpusPerEngine", "min_num_gpus_per_engine", 0),
        help="minimum number of GPUs per engine",
    )
    parser.add_argument(
        "--max-num-gpus-per-engine",
        type=int,
        default=_get(hardware_cfg, "maxNumGpusPerEngine", "max_num_gpus_per_engine", 0),
        help="maximum number of GPUs per engine",
    )
    parser.add_argument(
        "--num-gpus-per-node",
        type=int,
        default=_get(hardware_cfg, "numGpusPerNode", "num_gpus_per_node", 0),
        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
    )
    parser.add_argument(
        "--isl",
        type=int,
        default=sla_cfg.get("isl", 3000),
        help="target input sequence length",
    )
    parser.add_argument(
        "--osl",
        type=int,
        default=sla_cfg.get("osl", 500),
        help="target output sequence length",
    )
    parser.add_argument(
        "--ttft",
        type=float,
        default=sla_cfg.get("ttft", 50.0),
        help="target Time To First Token (float, in milliseconds)",
    )
    parser.add_argument(
        "--itl",
        type=float,
        default=sla_cfg.get("itl", 10.0),
        help="target Inter Token Latency (float, in milliseconds)",
    )

    # arguments used for interpolating TTFT and ITL under different ISL/OSL
    parser.add_argument(
        "--max-context-length",
        type=int,
        default=_get(engine_cfg, "maxContextLength", "max_context_length", 0),
        help="maximum context length supported by the served model",
    )
    parser.add_argument(
        "--prefill-interpolation-granularity",
        type=int,
        default=_get(
            sweep_cfg,
            "prefillInterpolationGranularity",
            "prefill_interpolation_granularity",
            16,
        ),
        help="how many samples to benchmark to interpolate TTFT under different ISL",
    )
    parser.add_argument(
        "--decode-interpolation-granularity",
        type=int,
        default=_get(
            sweep_cfg,
            "decodeInterpolationGranularity",
            "decode_interpolation_granularity",
            6,
        ),
        help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
    )
    parser.add_argument(
        "--service-name",
        type=str,
        default=_get(deployment_cfg, "serviceName", "service_name", ""),
        help="Service name for port forwarding (default: {deployment_name}-frontend)",
    )
    # NOTE: the store_true flags below take their default from the config, so a
    # flag set to true in the config cannot be switched off from the CLI.
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=_get(sweep_cfg, "dryRun", "dry_run", False),
        help="Dry run the profile job",
    )
    parser.add_argument(
        "--enable-gpu-discovery",
        action="store_true",
        default=_get(hardware_cfg, "enableGpuDiscovery", "enable_gpu_discovery", False),
        help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
    )
    parser.add_argument(
        "--pick-with-webui",
        action="store_true",
        default=_get(sweep_cfg, "pickWithWebui", "pick_with_webui", False),
        help="Pick the best parallelization mapping using webUI",
    )

    # Port precedence: CLI flag > config value > $PROFILER_WEBUI_PORT > 8000.
    default_webui_port = 8000
    webui_port_env = os.environ.get("PROFILER_WEBUI_PORT")
    if webui_port_env:
        default_webui_port = int(webui_port_env)
    parser.add_argument(
        "--webui-port",
        type=int,
        default=_get(sweep_cfg, "webuiPort", "webui_port", default_webui_port),
        help="WebUI port",
    )

    # Dynamically add all planner arguments from planner_argparse.py
    add_planner_arguments_to_parser(parser, prefix="planner-")

    # Set defaults for any planner arguments found in config.planner
    # Normalize keys: camelCase -> snake_case, hyphens -> underscores
    if planner_config:
        normalized_planner_config = {
            _camel_to_snake(key).replace("-", "_"): value
            for key, value in planner_config.items()
        }
        parser.set_defaults(**normalized_planner_config)

    # arguments if using aiconfigurator
    parser.add_argument(
        "--use-ai-configurator",
        action="store_true",
        default=_get(sweep_cfg, "useAiConfigurator", "use_ai_configurator", False),
        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
    )
    parser.add_argument(
        "--aic-system",
        type=str,
        default=_get(sweep_cfg, "aicSystem", "aic_system", None),
        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
    )
    parser.add_argument(
        "--aic-hf-id",
        type=str,
        default=_get(sweep_cfg, "aicHfId", "aic_hf_id", None),
        help="aiconfigurator name of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)",
    )
    parser.add_argument(
        "--aic-backend",
        type=str,
        default=_get(sweep_cfg, "aicBackend", "aic_backend", ""),
        help="aiconfigurator backend of the target model, if not provided, will use args.backend",
    )
    parser.add_argument(
        "--aic-backend-version",
        type=str,
        default=_get(sweep_cfg, "aicBackendVersion", "aic_backend_version", None),
        help="Specify backend version when using aiconfigurator to estimate perf.",
    )

    # Parse arguments
    args = parser.parse_args()

    # remove --profile-config from args
    if hasattr(args, "profile_config"):
        delattr(args, "profile_config")

    # Validate required arguments
    # Either --model or --config (or both) must be provided
    if not args.model and not args.config:
        parser.error("--model or --config is required (provide at least one)")

    auto_generate_search_space(args)

    return args