profile_sla.py 37.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
17
import asyncio
18
19
20
21
22
23
import logging
import math
import os

import numpy as np
import yaml
24

25
from benchmarks.profiler.utils.aiperf import benchmark_decode, benchmark_prefill
26
27
from benchmarks.profiler.utils.config import generate_dgd_config_with_planner
from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS
28
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
29
from benchmarks.profiler.utils.planner_utils import add_planner_arguments_to_parser
30
31
32
33
34
from benchmarks.profiler.utils.plot import (
    plot_decode_performance,
    plot_prefill_performance,
)
from benchmarks.profiler.utils.profile_cache import (
35
36
37
38
    check_decode_results_exist,
    check_prefill_results_exist,
    load_existing_decode_results,
    load_existing_prefill_results,
39
)
40
from benchmarks.profiler.utils.profile_decode import (
41
    get_num_request_range,
42
43
44
45
46
47
48
    profile_decode,
    profile_decode_aiconfigurator,
)
from benchmarks.profiler.utils.profile_prefill import (
    profile_prefill,
    profile_prefill_aiconfigurator,
)
49
50
51
52
from deploy.utils.dynamo_deployment import (
    DynamoDeploymentClient,
    cleanup_remaining_deployments,
)
53
from dynamo.planner.defaults import WORKER_COMPONENT_NAMES
54

55
56
57
58
59
60
61
62
63
64
65
# Module-level logging setup: records at INFO and above are sent to a
# dedicated stream handler using a timestamped
# "time - logger name - level - message" layout.
formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)


66
67
68
async def run_profile(args):
    # List to track all created deployment clients for cleanup in case of failure
    deployment_clients = []
69

70
71
72
73
    # Inherit aic_backend from backend if not explicitly set
    if not args.aic_backend:
        args.aic_backend = args.backend

74
    try:
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
        # Log MoE model support
        if args.is_moe_model:
            logger.info(
                "MoE (Mixture of Experts) model profiling, sweeping TEP size for prefill and DEP size for decode"
            )
            assert args.backend in [
                "sglang"
            ], "MoE model support is only available for SGLang"
            assert (
                not args.use_ai_configurator
            ), "MoE model is not supported in ai-configurator"
        else:
            logger.info(
                "Standard dense model profiling, sweeping TP size for both prefill and decode"
            )

91
92
93
94
        config_modifier = CONFIG_MODIFIERS[args.backend]

        with open(args.config, "r") as f:
            config = yaml.safe_load(f)
95

96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
        if args.is_moe_model:
            # For MoE models, use range with stride of num_gpus_per_node
            profile_num_gpus = list(
                range(
                    args.min_num_gpus_per_engine,
                    args.max_num_gpus_per_engine + 1,
                    args.num_gpus_per_node,
                )
            )
            logger.info(f"Profiling MoE GPU counts (TEP/DEP): {profile_num_gpus}")
        else:
            # For dense models, use powers of 2
            profile_num_gpus = [
                2**i
                for i in range(int(math.log2(args.max_num_gpus_per_engine)) + 1)
                if args.min_num_gpus_per_engine
                <= 2**i
                <= args.max_num_gpus_per_engine
            ]
            logger.info(f"Profiling dense model GPU counts (TP): {profile_num_gpus}")
116

117
        os.makedirs(args.output_dir, exist_ok=True)
118

119
        model_name = config_modifier.get_model_name(config)
120

121
122
123
124
125
126
127
128
129
130
131
132
        # Log skip behavior
        if args.force_rerun:
            logger.info(
                "Force rerun enabled - will re-run all tests even if results exist"
            )
        elif args.skip_existing_results:
            logger.info(
                "Skip existing results enabled - will skip TP sizes with existing results"
            )
        else:
            logger.info("Skip existing results disabled - will re-run all tests")

133
134
135
136
137
138
139
140
141
        if args.use_ai_configurator:
            if not args.aic_system:
                raise ValueError(
                    "Must provide --aic-system when using --use-ai-configurator."
                )
            if not args.aic_model_name:
                raise ValueError(
                    "Must provide --aic-model-name when using --use-ai-configurator."
                )
142
            if not args.aic_backend_version:
143
                raise ValueError(
144
                    "Must provide --aic-backend-version when using --use-ai-configurator."
145
146
147
148
149
150
                )

            logger.info("Will use aiconfigurator to estimate perf.")
            ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
                args.aic_model_name,
                args.aic_system.lower(),
151
152
                args.aic_backend,
                args.aic_backend_version,
153
154
            )
        else:
155
            if args.aic_system or args.aic_model_name or args.aic_backend_version:
156
157
158
159
160
                logger.warning(
                    "Will ignore --aic-system, --aic-model-name, and/or --backend-version "
                    "when not using --use-ai-configurator."
                )

161
        # first profile prefill
162
        prefill_num_gpus = []
163
164
165
        prefill_ttft = []
        prefill_thpt_per_gpu = []
        logger.info("Profiling prefill...")
166
167
168
        prefill_config = config_modifier.convert_config(
            config, "prefill", is_moe_model=args.is_moe_model
        )
169
        frontend_port = config_modifier.get_port(config)
170
171
        itl: float | None = None
        thpt_per_gpu: float | None = None
172
173
        for num_gpus in profile_num_gpus:
            logger.info(f"Profiling prefill with {num_gpus} GPUs...")
174

175
            # Check if results already exist for this GPU count
176
177
178
            if (
                args.skip_existing_results
                and not args.force_rerun
179
                and check_prefill_results_exist(args.output_dir, num_gpus, args.isl)
180
            ):
181
182
183
                logger.info(
                    f"Skipping prefill {num_gpus} GPU(s) - results already exist"
                )
184
                ttft, thpt_per_gpu = load_existing_prefill_results(
185
                    args.output_dir, num_gpus, args.isl
186
187
                )
                if ttft is not None and thpt_per_gpu is not None:
188
                    prefill_num_gpus.append(num_gpus)
189
190
191
                    prefill_ttft.append(ttft)
                    prefill_thpt_per_gpu.append(thpt_per_gpu)
                    logger.info(
192
                        f"Loaded existing prefill results: {num_gpus} GPU TTFT={ttft:.2f}ms, throughput={thpt_per_gpu:.2f} tokens/s/GPU"
193
194
                    )
                continue
195

196
197
198
199
200
201
202
203
            if args.is_moe_model:
                prefill_config = config_modifier.set_config_tep_size(
                    prefill_config, num_gpus, args.num_gpus_per_node
                )
            else:
                prefill_config = config_modifier.set_config_tp_size(
                    prefill_config, num_gpus
                )
204
            logger.info(f"Dynamo config: {prefill_config}")
205

206
            work_dir = f"{args.output_dir}/prefill_{num_gpus}gpus"
207
            os.makedirs(work_dir, exist_ok=True)
208

209
210
211
            prefill_config_fn = f"{work_dir}/config.yaml"
            with open(prefill_config_fn, "w") as f:
                yaml.dump(prefill_config, f)
212

213
            ttft = None
214
215
            if args.dry_run:
                logger.info("Skipping deployment creation in dry run mode")
216
217
218
219
            elif args.use_ai_configurator:
                logger.info("Using ai-configurator to estimate prefill latency.")
                perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf(
                    args.isl,
220
                    tp_size=num_gpus,
221
222
223
                )
                ttft = perf_dict["context_latency"]
                logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms")
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
            else:
                client = DynamoDeploymentClient(
                    namespace=args.namespace,
                    base_log_dir=work_dir,
                    model_name=model_name,
                    service_name=args.service_name,
                    frontend_port=frontend_port,
                    deployment_name=prefill_config["metadata"]["name"],
                )
                logger.info(f"Created client with service_name: {client.service_name}")
                deployment_clients.append(client)  # Track for cleanup
                await client.create_deployment(prefill_config_fn)
                logger.info("Waiting for deployment to be ready...")
                await client.wait_for_deployment_ready()
                logger.info("Deployment is ready")

                logger.info("Getting deployment logs...")
                await client.get_deployment_logs()
                logger.info(
                    f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
                )
245

246
                # run ai-perf
247
                base_url = client.get_service_url()
248
249
                ai_perf_artifact_dir = f"{work_dir}/aiperf_isl{args.isl}"
                aiperf_result = benchmark_prefill(
250
                    args.isl,
251
                    ai_perf_artifact_dir,
252
253
254
255
                    model_name,
                    model_name,
                    base_url=base_url,
                )
256
257
                if aiperf_result is not None:
                    ttft = aiperf_result["records"]["ttft"]["avg"]
258

259
260
261
262
                logger.info("Cleaning up deployment...")
                await client.delete_deployment()
                deployment_clients.remove(client)
                logger.info("Deployment deleted")
263

264
            if ttft is not None:
265
                prefill_num_gpus.append(num_gpus)
266
                prefill_ttft.append(ttft)
267
                prefill_thpt_per_gpu.append(args.isl / ttft / num_gpus * 1000)
268

269
        # Plot the results as a 2D scatter plot
270
        if prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu:
271
            plot_prefill_performance(
272
                prefill_num_gpus,
273
274
275
276
277
                prefill_ttft,
                prefill_thpt_per_gpu,
                args.ttft,
                args.output_dir,
            )
278

279
        # then profile decode
280
        decode_num_gpus = []
281
282
283
284
285
286
        decode_itl = []
        decode_thpt_per_gpu = []
        decode_concurrency = []
        decode_kv_cache_size = []
        decode_results = []  # Store partial results for plotting later
        logger.info("Profiling decode...")
287
288
289
290
291
        decode_config = config_modifier.convert_config(
            config, "decode", is_moe_model=args.is_moe_model
        )
        for num_gpus in profile_num_gpus:
            logger.info(f"Profiling decode with {num_gpus} GPUs...")
292

293
            # Check if results already exist for this GPU count
294
295
296
297
            if (
                args.skip_existing_results
                and not args.force_rerun
                and check_decode_results_exist(
298
                    args.output_dir, num_gpus, args.isl, args.osl
299
300
                )
            ):
301
302
303
                logger.info(
                    f"Skipping decode {num_gpus} GPU(s) - results already exist"
                )
304
                existing_results = load_existing_decode_results(
305
                    args.output_dir, num_gpus, args.isl, args.osl
306
307
308
309
310
311
                )
                if existing_results:
                    # Add existing results to our arrays
                    engine_decode_itl = []
                    engine_decode_thpt_per_gpu = []
                    for itl, thpt_per_gpu, concurrency in existing_results:
312
                        decode_num_gpus.append(num_gpus)
313
314
315
316
317
318
319
320
321
322
323
324
325
                        decode_itl.append(itl)
                        decode_thpt_per_gpu.append(thpt_per_gpu)
                        decode_concurrency.append(concurrency)
                        # We need to get kv_cache_size from existing logs or estimate it
                        estimated_kv_cache = max(
                            100000, concurrency * (args.isl + args.osl) * 2
                        )  # Conservative estimate
                        decode_kv_cache_size.append(estimated_kv_cache)
                        engine_decode_itl.append(itl)
                        engine_decode_thpt_per_gpu.append(thpt_per_gpu)

                    # Store results for plotting
                    decode_results.append(
326
                        (num_gpus, engine_decode_itl, engine_decode_thpt_per_gpu)
327
328
                    )
                    logger.info(
329
                        f"Loaded {len(existing_results)} existing decode results for {num_gpus} GPU(s)"
330
331
                    )
                continue
332

333
334
335
336
337
338
339
340
            if args.is_moe_model:
                decode_config = config_modifier.set_config_dep_size(
                    decode_config, num_gpus, args.num_gpus_per_node
                )
            else:
                decode_config = config_modifier.set_config_tp_size(
                    decode_config, num_gpus
                )
341
            logger.info(f"Dynamo config: {decode_config}")
342

343
            work_dir = f"{args.output_dir}/decode_{num_gpus}gpus"
344
            os.makedirs(work_dir, exist_ok=True)
345

346
347
348
            decode_config_fn = f"{work_dir}/config.yaml"
            with open(decode_config_fn, "w") as f:
                yaml.dump(decode_config, f)
349

350
351
            if args.dry_run:
                logger.info("Skipping deployment creation in dry run mode")
352
353
354
355
356

            elif args.use_ai_configurator:
                # Compute max_concurrency and max_kv_tokens to know which
                # num_request to sweep over.
                max_concurrency = ai_configurator_perf_estimator.get_max_batch_size(
357
                    args.isl, args.osl, tp_size=num_gpus
358
359
360
                )
                max_kv_tokens = max_concurrency * (args.isl + args.osl)

361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
            else:
                client = DynamoDeploymentClient(
                    namespace=args.namespace,
                    base_log_dir=work_dir,
                    model_name=model_name,
                    service_name=args.service_name,
                    frontend_port=frontend_port,
                    deployment_name=decode_config["metadata"]["name"],
                )
                deployment_clients.append(client)  # Track for cleanup
                await client.create_deployment(decode_config_fn)
                logger.info("Waiting for deployment to be ready...")
                await client.wait_for_deployment_ready()
                logger.info("Deployment is ready")

                logger.info("Getting deployment logs...")
                await client.get_deployment_logs()
                logger.info(
                    f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
                )
381

382
383
                # Compute max_concurrency and max_kv_tokens to know which
                # num_request to sweep over.
384
385
                # For MoE models, attention_dp_size = DEP size (num_gpus), for dense models = 1
                attention_dp_size = num_gpus if args.is_moe_model else 1
386
                max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
387
388
                    f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log",
                    attention_dp_size=attention_dp_size,
389
                )
390
                max_concurrency = max_kv_tokens // (args.isl + args.osl)
391
392

            if not args.dry_run:
393
394
395
396
397
398
                attention_dp_size = num_gpus if args.is_moe_model else 1
                sweep_num_request = get_num_request_range(
                    attention_dp_size,
                    max_concurrency,
                    args.decode_interpolation_granularity,
                )
399
400
401
402
403
404
405
                logger.info(
                    f"Sweeping num_request range based on maximum number of kv tokens: {sweep_num_request}"
                )

                engine_decode_itl = []
                engine_decode_thpt_per_gpu = []
                for num_request in sweep_num_request:
406
407
408
409
410
411
412
413
                    itl = thpt_per_gpu = None
                    if args.use_ai_configurator:
                        logger.info("Using ai-configurator to estimate decode latency.")
                        perf_dict = ai_configurator_perf_estimator.estimate_perf(
                            args.isl,
                            args.osl,
                            num_request,
                            mode="decode",
414
                            tp_size=num_gpus,
415
416
417
418
419
420
421
422
423
424
                        )

                        itl = perf_dict["tpot"]
                        thpt_per_gpu = perf_dict["tokens/s/gpu"]
                        logger.info(f"Estimated decode ITL: {itl:.2f}ms")
                        logger.info(
                            f"Estimated decode throughput per GPU: {thpt_per_gpu:.2f} tokens/s/GPU"
                        )
                    else:
                        base_url = client.get_service_url()
425
426
                        ai_perf_artifact_dir = f"{work_dir}/aiperf_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}"
                        aiperf_result = benchmark_decode(
427
428
429
                            args.isl,
                            args.osl,
                            num_request,
430
                            ai_perf_artifact_dir,
431
432
433
                            model_name,
                            model_name,
                            base_url=base_url,
434
                        )
435
436
                        if aiperf_result is not None:
                            itl = aiperf_result["records"]["inter_token_latency"]["avg"]
437
                            thpt_per_gpu = (
438
439
440
441
                                aiperf_result["records"]["output_token_throughput"][
                                    "avg"
                                ]
                                / num_gpus
442
443
444
                            )

                    if itl is not None and thpt_per_gpu is not None:
445
446
                        engine_decode_itl.append(itl)
                        engine_decode_thpt_per_gpu.append(thpt_per_gpu)
447
                        decode_num_gpus.append(num_gpus)
448
449
450
451
                        decode_itl.append(itl)
                        decode_thpt_per_gpu.append(thpt_per_gpu)
                        decode_concurrency.append(num_request)
                        decode_kv_cache_size.append(max_kv_tokens)
452

453
454
                # Store partial results for plotting later
                decode_results.append(
455
                    (num_gpus, engine_decode_itl, engine_decode_thpt_per_gpu)
456
                )
457

458
459
460
461
462
463
            if not args.dry_run and not args.use_ai_configurator:
                logger.info("Cleaning up deployment...")
                await client.delete_deployment()
                deployment_clients.remove(client)
                logger.info("Deployment deleted")

464
465
466
467
        # Plot all decode results after profiling is complete
        if decode_results:
            plot_decode_performance(decode_results, args.itl, args.output_dir)

468
469
470
471
472
        if args.dry_run:
            logger.info("Skipping recommendations in dry run mode")
        else:
            logger.info("Analyzing results and generate recommendations...")
            # Safety guards: no results → exit early with a clear message
473
            if not (prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu):
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
                logger.error("No prefill results produced; skipping recommendations.")

            # select best tp size for prefill
            if min(prefill_ttft) > args.ttft:
                logger.info(
                    "No TP size satisfies the TTFT requirement, please try a smaller model or a more powerful GPU SKU"
                )
                selected_prefill_idx = int(np.argmin(np.array(prefill_ttft)))
            else:
                valid_indices = [
                    i for i, ttft in enumerate(prefill_ttft) if ttft <= args.ttft
                ]
                # Among valid TP sizes, select the one with highest throughput per GPU
                valid_thpts = [prefill_thpt_per_gpu[i] for i in valid_indices]
                max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
                selected_prefill_idx = max_thpt_idx
            logger.info(
491
                f"Suggested number of GPUs for prefill: {prefill_num_gpus[selected_prefill_idx]} (TTFT {prefill_ttft[selected_prefill_idx]:.2f} ms, throughput {prefill_thpt_per_gpu[selected_prefill_idx]:.2f} tokens/s/GPU)"
492
            )
493

494
495
496
497
498
499
500
501
            # scale up if estimated TTFT is 120% of target TTFT
            prefill_queue_size_upper_bound = max(
                0.1, args.ttft * 1.2 / prefill_ttft[selected_prefill_idx] - 1
            )
            # scale down if estimated TTFT is 80% of target TTFT
            prefill_queue_size_lower_bound = max(
                0.1, args.ttft * 0.8 / prefill_ttft[selected_prefill_idx] - 1
            )
502
            logger.info(
503
                f"Suggested planner upper/lower bound for prefill queue size: {prefill_queue_size_upper_bound:.2f}/{prefill_queue_size_lower_bound:.2f}"
504
            )
505

506
            # select best gpu count for decode
507
            if not (
508
                decode_num_gpus
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
                and decode_itl
                and decode_thpt_per_gpu
                and decode_concurrency
                and decode_kv_cache_size
            ):
                logger.error("No decode results produced; skipping recommendations.")
                return
            if min(decode_itl) > args.itl:
                logger.info(
                    "No TP size satisfies the ITL requirement, please try a smaller model or a more powerful GPU SKU"
                )
                selected_decode_idx = int(np.argmin(np.array(decode_itl)))
            else:
                valid_indices = [
                    i for i, itl in enumerate(decode_itl) if itl <= args.itl
                ]
                # Among valid TP sizes, select the one with highest throughput per GPU
                valid_thpts = [decode_thpt_per_gpu[i] for i in valid_indices]
                max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
                selected_decode_idx = max_thpt_idx
            logger.info(
530
                f"Suggested number of GPUs for decode: {decode_num_gpus[selected_decode_idx]} (ITL {decode_itl[selected_decode_idx]:.2f} ms, throughput {decode_thpt_per_gpu[selected_decode_idx]:.2f} tokens/s/GPU)"
531
            )
532

533
534
535
536
537
538
539
            # calculate kv cache utilization for the selected TP and concurrency
            selected_decode_kv_cache_utilization = (
                decode_concurrency[selected_decode_idx]
                * (args.isl + (args.osl / 2))
                / decode_kv_cache_size[selected_decode_idx]
            )
            # set a +- 20% range for the kv cache utilization
540
            logger.info(
541
                f"Suggested planner upper/lower bound for decode kv cache utilization: {min(1, selected_decode_kv_cache_utilization + 0.2):.2f}/{max(0.1, selected_decode_kv_cache_utilization - 0.2):.2f}"
542
543
            )

544
        if args.dry_run:
545
546
547
            # use min value for prefill and decode GPU counts
            prefill_num_gpus = [args.min_num_gpus_per_engine]
            decode_num_gpus = [args.min_num_gpus_per_engine]
548
549
            selected_prefill_idx = 0
            selected_decode_idx = 0
550

551
552
        # interpolate ISL - TTFT with best prefill GPU count
        best_prefill_gpus = prefill_num_gpus[selected_prefill_idx]
553
        logger.info(
554
            f"Profiling prefill under best {best_prefill_gpus} GPU(s) with different ISL..."
555
        )
556
557
        prefill_config = config_modifier.convert_config(
            config, "prefill", is_moe_model=args.is_moe_model
558
        )
559
560
561
562
563
564
565
566
        if args.is_moe_model:
            prefill_config = config_modifier.set_config_tep_size(
                prefill_config, best_prefill_gpus, args.num_gpus_per_node
            )
        else:
            prefill_config = config_modifier.set_config_tp_size(
                prefill_config, best_prefill_gpus
            )
567
        logger.info(f"Dynamo config: {prefill_config}")
568

569
570
        work_dir = f"{args.output_dir}/selected_prefill_interpolation"
        os.makedirs(work_dir, exist_ok=True)
571

572
573
574
575
        prefill_config_fn = f"{work_dir}/config.yaml"
        with open(prefill_config_fn, "w") as f:
            yaml.dump(prefill_config, f)

576
577
        if args.dry_run:
            logger.info("Skipping deployment creation in dry run mode")
578
579
580
        elif args.use_ai_configurator:
            profile_prefill_aiconfigurator(
                work_dir,
581
                best_prefill_gpus,  # num_gpus
582
583
584
                args.max_context_length,
                args.prefill_interpolation_granularity,
                ai_configurator_perf_estimator,
585
                tp_size=best_prefill_gpus,
586
            )
587
588
589
590
591
592
593
594
        else:
            client = DynamoDeploymentClient(
                namespace=args.namespace,
                base_log_dir=work_dir,
                model_name=model_name,
                service_name=args.service_name,
                frontend_port=frontend_port,
                deployment_name=prefill_config["metadata"]["name"],
595
            )
596
597
598
599
600
601
602
603
604
605
606
607
            deployment_clients.append(client)  # Track for cleanup
            await client.create_deployment(prefill_config_fn)
            logger.info("Waiting for deployment to be ready...")
            try:
                await client.wait_for_deployment_ready()
                logger.info("Deployment is ready")
                skip_profile = False
            except TimeoutError:
                logger.error(
                    "Deployment failed to become ready within timeout, skipping profiling"
                )
                skip_profile = True
608

609
610
611
612
613
614
            if not skip_profile:
                logger.info("Getting deployment logs...")
                await client.get_deployment_logs()
                logger.info(
                    f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
                )
615

616
            base_url = client.get_service_url()
617

618
619
620
621
622
            profile_prefill(
                work_dir,
                model_name,
                model_name,
                base_url,
623
                best_prefill_gpus,
624
625
626
                args.max_context_length,
                args.prefill_interpolation_granularity,
            )
627

628
629
630
631
            logger.info("Cleaning up deployment...")
            await client.delete_deployment()
            deployment_clients.remove(client)
            logger.info("Deployment deleted")
632

633
634
635
636
637
638
639
640
641
642
643
        # interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode GPU count
        best_decode_gpus = decode_num_gpus[selected_decode_idx]
        logger.info(f"Profiling decode with {best_decode_gpus} GPUs...")
        if args.is_moe_model:
            decode_config = config_modifier.set_config_dep_size(
                decode_config, best_decode_gpus, args.num_gpus_per_node
            )
        else:
            decode_config = config_modifier.set_config_tp_size(
                decode_config, best_decode_gpus
            )
644
        logger.info(f"Dynamo config: {decode_config}")
645

646
647
648
649
650
651
652
        work_dir = f"{args.output_dir}/selected_decode_interpolation"
        os.makedirs(work_dir, exist_ok=True)

        decode_config_fn = f"{work_dir}/config.yaml"
        with open(decode_config_fn, "w") as f:
            yaml.dump(decode_config, f)

653
654
        if args.dry_run:
            logger.info("Skipping deployment creation in dry run mode")
655
        elif args.use_ai_configurator:
656
657
            # For MoE models, attention_dp_size = DEP size (best_decode_gpus), for dense models = 1
            attention_dp_size = best_decode_gpus if args.is_moe_model else 1
658
            max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens(
659
                args.isl, args.osl, tp_size=best_decode_gpus
660
661
662
            )
            profile_decode_aiconfigurator(
                work_dir,
663
                best_decode_gpus,  # num_gpus
664
665
666
667
                max_kv_tokens,
                args.max_context_length,
                args.decode_interpolation_granularity,
                ai_configurator_perf_estimator,
668
669
                attention_dp_size,
                tp_size=best_decode_gpus,
670
            )
671
672
673
674
675
676
677
678
679
680
681
682
683
684
        else:
            client = DynamoDeploymentClient(
                namespace=args.namespace,
                base_log_dir=work_dir,
                model_name=model_name,
                service_name=args.service_name,
                frontend_port=frontend_port,
                deployment_name=decode_config["metadata"]["name"],
            )
            deployment_clients.append(client)  # Track for cleanup
            await client.create_deployment(decode_config_fn)
            logger.info("Waiting for deployment to be ready...")
            await client.wait_for_deployment_ready()
            logger.info("Deployment is ready")
685

686
687
688
689
690
            logger.info("Getting deployment logs...")
            await client.get_deployment_logs()
            logger.info(
                f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
            )
691

692
693
            # For MoE models, attention_dp_size = DEP size (best_decode_gpus), for dense models = 1
            attention_dp_size = best_decode_gpus if args.is_moe_model else 1
694
            max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
695
696
                f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log",
                attention_dp_size=attention_dp_size,
697
698
699
700
701
702
703
704
705
            )

            base_url = client.get_service_url()

            profile_decode(
                work_dir,
                model_name,
                model_name,
                base_url,
706
                best_decode_gpus,
707
708
709
                max_kv_tokens,
                args.max_context_length,
                args.decode_interpolation_granularity,
710
                attention_dp_size,
711
            )
712

713
714
715
716
            logger.info("Cleaning up deployment...")
            await client.delete_deployment()
            deployment_clients.remove(client)
            logger.info("Deployment deleted")
717

718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
        # generate DGD with planner based on profiling results
        config = generate_dgd_config_with_planner(
            config_path=args.config,
            config_modifier=config_modifier,
            best_prefill_gpus=best_prefill_gpus,
            best_decode_gpus=best_decode_gpus,
            output_dir=args.output_dir,
            args=args,
            is_moe_model=args.is_moe_model,
            num_gpus_per_node=args.num_gpus_per_node,
        )
        logger.info(f"Final DGD config with planner: {config}")

        # save DGD config with planner
        with open(f"{args.output_dir}/config_with_planner.yaml", "w") as f:
            yaml.dump(config, f)

735
736
737
738
739
740
741
742
743
    except Exception as e:
        logger.error(f"Profile job failed with error: {e}")
        raise
    finally:
        # Always clean up any remaining deployments, even if the job failed
        logger.info("Performing final cleanup of any remaining deployments...")
        await cleanup_remaining_deployments(deployment_clients, args.namespace)
        logger.info("Final cleanup completed.")

744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
    # deploy the optimized DGD with planner
    if args.deploy_after_profile and not args.dry_run:
        logger.info("Deploying the optimized DGD with planner...")
        # TODO: check conflicts for dynamo namespace and DGD name
        # TODO: handle deployment errors and propagate proper error messages to users
        client = DynamoDeploymentClient(
            namespace=args.namespace,
            base_log_dir=f"{args.output_dir}/final_deployment",
            model_name=model_name,
            service_name=args.service_name,
            frontend_port=frontend_port,
            deployment_name=config["metadata"]["name"],
        )
        await client.create_deployment(f"{args.output_dir}/config_with_planner.yaml")

759
760
761
762
763
764
765
766
767
768
769
770
771
772

if __name__ == "__main__":
    # Script entry point: build the CLI, parse arguments, attach a log file
    # handler under the output directory, then run the async profiling job.
    parser = argparse.ArgumentParser(
        description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill."
    )

    # --- deployment target and input/output locations ---
    parser.add_argument(
        "--namespace",
        type=str,
        default="dynamo-sla-profiler",
        help="Kubernetes namespace to deploy the DynamoGraphDeployment",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="vllm",
        choices=["vllm", "sglang", "trtllm"],
        help="backend type, currently support [vllm, sglang, trtllm]",
    )
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="Path to the DynamoGraphDeployment config file",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="profiling_results",
        help="Path to the output results directory",
    )

    # --- search space and result-reuse controls ---
    parser.add_argument(
        "--min-num-gpus-per-engine",
        type=int,
        default=1,
        help="minimum number of GPUs per engine",
    )
    parser.add_argument(
        "--max-num-gpus-per-engine",
        type=int,
        default=8,
        help="maximum number of GPUs per engine",
    )
    parser.add_argument(
        "--skip-existing-results",
        action="store_true",
        help="Skip TP sizes that already have results in the output directory",
    )
    parser.add_argument(
        "--force-rerun",
        action="store_true",
        help="Force re-running all tests even if results already exist (overrides --skip-existing-results)",
    )

    # --- target workload shape and SLA ---
    parser.add_argument(
        "--isl", type=int, default=3000, help="target input sequence length"
    )
    parser.add_argument(
        "--osl", type=int, default=500, help="target output sequence length"
    )
    parser.add_argument(
        "--ttft", type=int, default=50, help="target Time To First Token in ms"
    )
    parser.add_argument(
        "--itl", type=int, default=10, help="target Inter Token Latency in ms"
    )

    # arguments used for interpolating TTFT and ITL under different ISL/OSL
    parser.add_argument(
        "--max-context-length",
        type=int,
        default=16384,
        help="maximum context length supported by the served model",
    )
    parser.add_argument(
        "--prefill-interpolation-granularity",
        type=int,
        default=16,
        help="how many samples to benchmark to interpolate TTFT under different ISL",
    )
    parser.add_argument(
        "--decode-interpolation-granularity",
        type=int,
        default=6,
        help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
    )
    parser.add_argument(
        "--service-name",
        type=str,
        default="",
        help="Service name for port forwarding (default: {deployment_name}-frontend)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Dry run the profile job",
    )

    # --- MoE-specific options ---
    parser.add_argument(
        "--is-moe-model",
        action="store_true",
        dest="is_moe_model",
        help="Enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode",
    )
    parser.add_argument(
        "--num-gpus-per-node",
        type=int,
        default=8,
        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
    )

    # arguments for dgd config generation and deployment
    parser.add_argument(
        "--deploy-after-profile",
        action="store_true",
        help="deploy the optimized DGD with planner",
    )
    # Dynamically add all planner arguments from planner_argparse.py
    add_planner_arguments_to_parser(parser, prefix="planner-")

    # arguments if using aiconfigurator
    parser.add_argument(
        "--use-ai-configurator",
        action="store_true",
        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
    )
    parser.add_argument(
        "--aic-system",
        type=str,
        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
    )
    parser.add_argument(
        "--aic-model-name",
        type=str,
        help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
    )
    parser.add_argument(
        "--aic-backend",
        type=str,
        default="",
        help="aiconfigurator backend of the target model, if not provided, will use args.backend",
    )
    parser.add_argument(
        "--aic-backend-version",
        type=str,
        help="Specify backend version when using aiconfigurator to estimate perf.",
    )

    args = parser.parse_args()

    # setup file logging
    # The output directory is created first so the FileHandler can open
    # profile_sla.log inside it; the handler is added to the module logger.
    os.makedirs(args.output_dir, exist_ok=True)
    log_file_handler = logging.FileHandler(f"{args.output_dir}/profile_sla.log")
    log_file_handler.setLevel(logging.INFO)
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
    )
    log_file_handler.setFormatter(formatter)
    logger.addHandler(log_file_handler)

    # Drive the async profiling job to completion on a fresh event loop.
    asyncio.run(run_profile(args))