profiler_argparse.py 15.3 KB
Newer Older
1
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
4
5
# SPDX-License-Identifier: Apache-2.0

import argparse
import ast
6
import os
7
8
9
10
11
from typing import Any, Dict

import yaml

from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
12
from benchmarks.profiler.utils.search_space_autogen import auto_generate_search_space
13
14


15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def _get(cfg: Dict[str, Any], camel: str, snake: str, default: Any = None) -> Any:
    """Get config value with camelCase preferred, snake_case fallback."""
    if camel in cfg:
        return cfg[camel]
    return cfg.get(snake, default)


def _camel_to_snake(name: str) -> str:
    """Convert camelCase to snake_case."""
    import re

    # Insert underscore before uppercase letters and lowercase
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()


31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def parse_config_string(config_str: str) -> Dict[str, Any]:
    """Parse configuration string as Python dict literal, YAML, or JSON.

    Supports multiple input formats:
    1. Python dict literal: "{'engine': {'backend': 'vllm'}, 'sla': {'isl': 3000}}"
    2. YAML string: "engine:\n  backend: vllm\nsla:\n  isl: 3000"
    3. JSON string: '{"engine": {"backend": "vllm"}, "sla": {"isl": 3000}}'

    Args:
        config_str: Configuration string in one of the supported formats

    Returns:
        Dictionary containing the configuration

    Raises:
        ValueError: If config cannot be parsed or is not a dictionary
    """
    config = None

    # Try 1: Parse as Python dict literal (most direct for CLI)
    try:
        config = ast.literal_eval(config_str)
        if isinstance(config, dict):
            return config
    except (ValueError, SyntaxError):
        pass

    # Try 2: Parse as YAML/JSON (for K8s ConfigMaps and files)
    try:
        config = yaml.safe_load(config_str)
        if config is not None and isinstance(config, dict):
            return config
    except yaml.YAMLError:
        pass

    # If we got here, parsing failed
    raise ValueError(
        "Failed to parse config string. Expected Python dict literal, YAML, or JSON format. "
        f"Examples:\n"
        f"  Python dict: \"{'engine': {'backend': 'vllm'}}\"\n"
        f'  YAML: "engine:\\n  backend: vllm"\n'
        f'  JSON: \'{{"engine": {{"backend": "vllm"}}}}\''
    )


def create_profiler_parser() -> argparse.Namespace:
    """Build the profiler CLI parser, parse the command line and return the result.

    Despite the name, this returns the parsed ``argparse.Namespace``, not the
    parser. Parsing is done in two passes: a minimal pre-parser extracts
    ``--profile-config`` so its contents can be used as the defaults of the
    real parser. Explicit CLI options therefore override config values.
    Boolean options are ``store_true`` flags, so a value of true in the
    config cannot be switched off from the command line.

    Config structure (camelCase preferred, snake_case supported for backwards compat):
        outputDir: String (path to the output results directory, default: profiling_results)
        deployment:
            namespace: String (kubernetes namespace, default: dynamo-sla-profiler)
            serviceName: String (service name, default: "")
            model: String (served model name)
            dgdImage: String (container image to use for DGD components (frontend, planner, workers), overrides images in config file)
            modelCache:
                pvcName: String (name of the PVC to mount the model cache,
                    if not provided, model must be HF name and will download from HF, default: "")
                pvcPath: String (path to the model cache in the PVC, default: "")
                mountPath: String (path to the model cache in the container,
                    note that the PVC must be mounted to the same path for the profiling job,
                    default: "/opt/model-cache")
        engine:
            backend: String (backend type, currently support [vllm, sglang, trtllm], default: vllm)
            config: String (path to the DynamoGraphDeployment config file, default: "")
            maxContextLength: Int (maximum context length supported by the served model, default: 0)
            isMoeModel: Boolean (enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode, default: False)
                NOTE(review): no matching CLI argument is registered in this
                function; confirm where this key is consumed.
        hardware:
            minNumGpusPerEngine: Int (minimum number of GPUs per engine, default: 0)
            maxNumGpusPerEngine: Int (maximum number of GPUs per engine, default: 0)
            numGpusPerNode: Int (number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size, default: 0)
            enableGpuDiscovery: Boolean (enable automatic GPU discovery from Kubernetes cluster nodes, when enabled overrides any manually specified hardware configuration, requires cluster-wide node access permissions, default: False)
        sweep:
            prefillInterpolationGranularity: Int (how many samples to benchmark to interpolate TTFT under different ISL, default: 16)
            decodeInterpolationGranularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
            useAiConfigurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
            aicSystem: String (target system for use with aiconfigurator, default: None)
            aicHfId: String (aiconfigurator huggingface id of the target model, default: None)
            aicBackend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
            aicBackendVersion: String (specify backend version when using aiconfigurator to estimate perf, default: None)
            dryRun: Boolean (dry run the profile job, default: False)
            pickWithWebui: Boolean (pick the best parallelization mapping using webUI, default: False)
            webuiPort: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
        sla:
            isl: Int (target input sequence length, default: 3000)
            osl: Int (target output sequence length, default: 500)
            ttft: Float (target Time To First Token in milliseconds, default: 50)
            itl: Float (target Inter Token Latency in milliseconds, default: 10)
        planner: (planner arguments)
            e.g., plannerMinEndpoint: 2

    Returns:
        The parsed arguments with ``profile_config`` removed, after being
        passed through ``auto_generate_search_space``.

    Raises:
        ValueError: If ``--profile-config`` cannot be parsed into a dict, or
            if ``PROFILER_WEBUI_PORT`` is set to a non-integer value.
        SystemExit: Raised by argparse on invalid arguments, or when neither
            ``--model`` nor ``--config`` ends up set.
    """
    # Step 1: Pre-parse to check if --profile-config is provided
    pre_parser = argparse.ArgumentParser(add_help=False)
    pre_parser.add_argument("--profile-config", type=str)
    pre_args, _ = pre_parser.parse_known_args()

    # Step 2: Parse config if provided
    config: Dict[str, Any] = {}
    if pre_args.profile_config:
        config = parse_config_string(pre_args.profile_config)

    def _section(name: str) -> Dict[str, Any]:
        # A section that is present but empty (YAML "sla:") parses to None;
        # treat it exactly like a missing section instead of crashing on .get.
        return config.get(name) or {}

    deployment_cfg = _section("deployment")
    engine_cfg = _section("engine")
    hardware_cfg = _section("hardware")
    sla_cfg = _section("sla")
    sweep_cfg = _section("sweep")
    planner_config = _section("planner")
    # modelCache follows the same camelCase-preferred / snake_case-fallback
    # rule as every other multi-word key.
    model_cache_config = (
        _get(deployment_cfg, "modelCache", "model_cache", None) or {}
    )

    # Step 3: Create main parser with config-aware defaults
    parser = argparse.ArgumentParser(
        description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill."
    )

    parser.add_argument(
        "--profile-config",
        type=str,
        help="Configuration as Python dict literal, YAML, or JSON string. CLI args override config values. "
        "Example: \"{'engine': {'backend': 'vllm', 'config': '/path'}, 'sla': {'isl': 3000}}\"",
    )

    # CLI arguments with config-aware defaults
    parser.add_argument(
        "--model",
        type=str,
        default=deployment_cfg.get("model", ""),
        help="Served model name",
    )
    parser.add_argument(
        "--model-cache-pvc-name",
        type=str,
        default=_get(model_cache_config, "pvcName", "pvc_name", ""),
        help="Name of the PVC that contains the model weights. If not provided, args.model must be a HF model name and will download from HF",
    )
    parser.add_argument(
        "--model-cache-pvc-path",
        type=str,
        default=_get(model_cache_config, "pvcPath", "pvc_path", ""),
        help="Path to the model cache in the PVC",
    )
    parser.add_argument(
        "--model-cache-pvc-mount-path",
        type=str,
        default=_get(
            model_cache_config, "mountPath", "mount_path", "/opt/model-cache"
        ),
        help="Path to the model cache in the container, note that the PVC must be mounted to the same path for the profiling job",
    )
    parser.add_argument(
        "--dgd-image",
        type=str,
        default=_get(deployment_cfg, "dgdImage", "dgd_image", ""),
        help="Container image to use for DGD components (frontend, planner, workers). Overrides images in config file.",
    )

    parser.add_argument(
        "--namespace",
        type=str,
        default=deployment_cfg.get("namespace", "dynamo-sla-profiler"),
        help="Kubernetes namespace to deploy the DynamoGraphDeployment",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default=engine_cfg.get("backend", "vllm"),
        choices=["vllm", "sglang", "trtllm"],
        help="backend type, currently support [vllm, sglang, trtllm]",
    )
    parser.add_argument(
        "--config",
        type=str,
        default=engine_cfg.get("config", ""),
        required=False,
        help="Path to the DynamoGraphDeployment config file (required, can be provided via CLI or config)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=_get(config, "outputDir", "output_dir", "profiling_results"),
        help="Path to the output results directory",
    )
    parser.add_argument(
        "--min-num-gpus-per-engine",
        type=int,
        default=_get(hardware_cfg, "minNumGpusPerEngine", "min_num_gpus_per_engine", 0),
        help="minimum number of GPUs per engine",
    )
    parser.add_argument(
        "--max-num-gpus-per-engine",
        type=int,
        default=_get(hardware_cfg, "maxNumGpusPerEngine", "max_num_gpus_per_engine", 0),
        help="maximum number of GPUs per engine",
    )
    parser.add_argument(
        "--num-gpus-per-node",
        type=int,
        default=_get(hardware_cfg, "numGpusPerNode", "num_gpus_per_node", 0),
        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
    )
    parser.add_argument(
        "--isl",
        type=int,
        default=sla_cfg.get("isl", 3000),
        help="target input sequence length",
    )
    parser.add_argument(
        "--osl",
        type=int,
        default=sla_cfg.get("osl", 500),
        help="target output sequence length",
    )
    parser.add_argument(
        "--ttft",
        type=float,
        default=sla_cfg.get("ttft", 50.0),
        help="target Time To First Token (float, in milliseconds)",
    )
    parser.add_argument(
        "--itl",
        type=float,
        default=sla_cfg.get("itl", 10.0),
        help="target Inter Token Latency (float, in milliseconds)",
    )

    # arguments used for interpolating TTFT and ITL under different ISL/OSL
    parser.add_argument(
        "--max-context-length",
        type=int,
        default=_get(engine_cfg, "maxContextLength", "max_context_length", 0),
        help="maximum context length supported by the served model",
    )
    parser.add_argument(
        "--prefill-interpolation-granularity",
        type=int,
        default=_get(
            sweep_cfg,
            "prefillInterpolationGranularity",
            "prefill_interpolation_granularity",
            16,
        ),
        help="how many samples to benchmark to interpolate TTFT under different ISL",
    )
    parser.add_argument(
        "--decode-interpolation-granularity",
        type=int,
        default=_get(
            sweep_cfg,
            "decodeInterpolationGranularity",
            "decode_interpolation_granularity",
            6,
        ),
        help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
    )
    parser.add_argument(
        "--service-name",
        type=str,
        default=_get(deployment_cfg, "serviceName", "service_name", ""),
        help="Service name for port forwarding (default: {deployment_name}-frontend)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        default=_get(sweep_cfg, "dryRun", "dry_run", False),
        help="Dry run the profile job",
    )
    parser.add_argument(
        "--enable-gpu-discovery",
        action="store_true",
        default=_get(hardware_cfg, "enableGpuDiscovery", "enable_gpu_discovery", False),
        help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
    )
    parser.add_argument(
        "--pick-with-webui",
        action="store_true",
        default=_get(sweep_cfg, "pickWithWebui", "pick_with_webui", False),
        help="Pick the best parallelization mapping using webUI",
    )

    # Precedence for the port: CLI flag > config value > $PROFILER_WEBUI_PORT > 8000
    default_webui_port = 8000
    webui_port_env = os.environ.get("PROFILER_WEBUI_PORT")
    if webui_port_env:
        default_webui_port = int(webui_port_env)
    parser.add_argument(
        "--webui-port",
        type=int,
        default=_get(sweep_cfg, "webuiPort", "webui_port", default_webui_port),
        help="WebUI port",
    )

    # Dynamically add all planner arguments from planner_argparse.py
    add_planner_arguments_to_parser(parser, prefix="planner-")
    # Set defaults for any planner arguments found in config.planner
    # Normalize keys: camelCase -> snake_case, hyphens -> underscores
    if planner_config:
        normalized_planner_config = {
            _camel_to_snake(key).replace("-", "_"): value
            for key, value in planner_config.items()
        }
        parser.set_defaults(**normalized_planner_config)

    # arguments if using aiconfigurator
    parser.add_argument(
        "--use-ai-configurator",
        action="store_true",
        default=_get(sweep_cfg, "useAiConfigurator", "use_ai_configurator", False),
        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
    )
    parser.add_argument(
        "--aic-system",
        type=str,
        default=_get(sweep_cfg, "aicSystem", "aic_system", None),
        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
    )
    parser.add_argument(
        "--aic-hf-id",
        type=str,
        default=_get(sweep_cfg, "aicHfId", "aic_hf_id", None),
        help="aiconfigurator name of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)",
    )
    parser.add_argument(
        "--aic-backend",
        type=str,
        default=_get(sweep_cfg, "aicBackend", "aic_backend", ""),
        help="aiconfigurator backend of the target model, if not provided, will use args.backend",
    )
    parser.add_argument(
        "--aic-backend-version",
        type=str,
        default=_get(sweep_cfg, "aicBackendVersion", "aic_backend_version", None),
        help="Specify backend version when using aiconfigurator to estimate perf.",
    )

    # Parse arguments
    args = parser.parse_args()

    # remove --profile-config from args; its contents already became defaults
    if hasattr(args, "profile_config"):
        delattr(args, "profile_config")

    # Validate required arguments
    # Either --model or --config (or both) must be provided
    if not args.model and not args.config:
        parser.error("--model or --config is required (provide at least one)")

    auto_generate_search_space(args)
    return args