profile_sla.py 34.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
17
import asyncio
18
19
20
21
22
23
import logging
import math
import os

import numpy as np
import yaml
24
25

from benchmarks.profiler.utils.config import CONFIG_MODIFIERS, WORKER_COMPONENT_NAMES
26
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
27
28
29
30
31
32
from benchmarks.profiler.utils.genai_perf import benchmark_decode, benchmark_prefill
from benchmarks.profiler.utils.plot import (
    plot_decode_performance,
    plot_prefill_performance,
)
from benchmarks.profiler.utils.profile_cache import (
33
34
35
36
    check_decode_results_exist,
    check_prefill_results_exist,
    load_existing_decode_results,
    load_existing_prefill_results,
37
)
38
from benchmarks.profiler.utils.profile_decode import (
39
    get_num_request_range,
40
41
42
43
44
45
46
    profile_decode,
    profile_decode_aiconfigurator,
)
from benchmarks.profiler.utils.profile_prefill import (
    profile_prefill,
    profile_prefill_aiconfigurator,
)
47
48
49
50
51
from deploy.utils.dynamo_deployment import (
    DynamoDeploymentClient,
    cleanup_remaining_deployments,
)

52
53
54
55
56
57
58
59
60
61
62
# Module-level logging: INFO-and-above records from this module are written
# to a stream handler using a timestamped "time - name - level - message" layout.
formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)


63
64
65
async def run_profile(args):
    """Sweep prefill/decode GPU counts, log recommendations, then profile the picks.

    The job runs in five stages, all driven by attributes of ``args``:

    1. Build the list of GPU counts to sweep: a strided range for MoE models
       (stride ``num_gpus_per_node``) or powers of two for dense models, both
       bounded by ``min_num_gpus_per_engine`` / ``max_num_gpus_per_engine``.
    2. Sweep prefill over those GPU counts, collecting TTFT and per-GPU
       throughput, then plot them.
    3. Sweep decode over those GPU counts and a range of concurrent requests,
       collecting ITL and per-GPU throughput, then plot them.
    4. Unless ``dry_run`` is set, pick the GPU count for each phase (highest
       per-GPU throughput among the points that meet the ``ttft`` / ``itl``
       target, or the lowest-latency point if none does) and log suggested
       planner bounds.
    5. Re-profile the selected prefill and decode GPU counts in the
       ``selected_prefill_interpolation`` / ``selected_decode_interpolation``
       directories under ``output_dir``.

    Each measurement comes from one of three sources: nothing (``dry_run``),
    the ai-configurator estimator (``use_ai_configurator``), or a real
    deployment created through ``DynamoDeploymentClient``. Every deployment
    client is tracked in a list so the ``finally`` clause can clean up
    whatever is still registered when the job ends or fails.

    Args:
        args: Namespace carrying the options read by this function, including
            ``backend``, ``config``, ``output_dir``, ``namespace``,
            ``service_name``, ``isl``, ``osl``, ``ttft``, ``itl``,
            ``is_moe_model``, ``dry_run``, ``use_ai_configurator``,
            ``aic_system``, ``aic_model_name``, ``backend_version``,
            ``force_rerun``, ``skip_existing_results``,
            ``min_num_gpus_per_engine``, ``max_num_gpus_per_engine``,
            ``num_gpus_per_node``, ``max_context_length``,
            ``prefill_interpolation_granularity`` and
            ``decode_interpolation_granularity``.

    Returns:
        None. Returns early, after logging an error, when the prefill or the
        decode sweep produced no results outside dry-run mode.

    Raises:
        AssertionError: If ``is_moe_model`` is set with a backend other than
            ``"sglang"`` or together with ``use_ai_configurator``.
        ValueError: If ``use_ai_configurator`` is set without ``aic_system``,
            ``aic_model_name`` or ``backend_version``.
        Exception: Any other failure is logged and re-raised after cleanup.
    """
    # List to track all created deployment clients for cleanup in case of failure
    deployment_clients = []

    try:
        # Log MoE model support
        if args.is_moe_model:
            logger.info(
                "MoE (Mixture of Experts) model profiling, sweeping TEP size for prefill and DEP size for decode"
            )
            assert args.backend in [
                "sglang"
            ], "MoE model support is only available for SGLang"
            assert (
                not args.use_ai_configurator
            ), "MoE model is not supported in ai-configurator"
        else:
            logger.info(
                "Standard dense model profiling, sweeping TP size for both prefill and decode"
            )

        config_modifier = CONFIG_MODIFIERS[args.backend]

        with open(args.config, "r") as f:
            config = yaml.safe_load(f)

        if args.is_moe_model:
            # For MoE models, use range with stride of num_gpus_per_node
            profile_num_gpus = list(
                range(
                    args.min_num_gpus_per_engine,
                    args.max_num_gpus_per_engine + 1,
                    args.num_gpus_per_node,
                )
            )
            logger.info(f"Profiling MoE GPU counts (TEP/DEP): {profile_num_gpus}")
        else:
            # For dense models, use powers of 2
            profile_num_gpus = [
                2**i
                for i in range(int(math.log2(args.max_num_gpus_per_engine)) + 1)
                if args.min_num_gpus_per_engine
                <= 2**i
                <= args.max_num_gpus_per_engine
            ]
            logger.info(f"Profiling dense model GPU counts (TP): {profile_num_gpus}")

        os.makedirs(args.output_dir, exist_ok=True)

        model_name = config_modifier.get_model_name(config)

        # Log skip behavior
        if args.force_rerun:
            logger.info(
                "Force rerun enabled - will re-run all tests even if results exist"
            )
        elif args.skip_existing_results:
            logger.info(
                "Skip existing results enabled - will skip TP sizes with existing results"
            )
        else:
            logger.info("Skip existing results disabled - will re-run all tests")

        if args.use_ai_configurator:
            if not args.aic_system:
                raise ValueError(
                    "Must provide --aic-system when using --use-ai-configurator."
                )
            if not args.aic_model_name:
                raise ValueError(
                    "Must provide --aic-model-name when using --use-ai-configurator."
                )
            if not args.backend_version:
                raise ValueError(
                    "Must provide --backend-version when using --use-ai-configurator."
                )

            logger.info("Will use aiconfigurator to estimate perf.")
            ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
                args.aic_model_name,
                args.aic_system.lower(),
                args.backend,
                args.backend_version,
            )
        else:
            if args.aic_system or args.aic_model_name or args.backend_version:
                logger.warning(
                    "Will ignore --aic-system, --aic-model-name, and/or --backend-version "
                    "when not using --use-ai-configurator."
                )

        # first profile prefill
        prefill_num_gpus = []
        prefill_ttft = []
        prefill_thpt_per_gpu = []
        logger.info("Profiling prefill...")
        prefill_config = config_modifier.convert_config(
            config, "prefill", is_moe_model=args.is_moe_model
        )
        frontend_port = config_modifier.get_port(config)
        # Declared up front so the names carry an Optional type before the
        # sweeps below rebind them.
        itl: float | None = None
        thpt_per_gpu: float | None = None
        for num_gpus in profile_num_gpus:
            logger.info(f"Profiling prefill with {num_gpus} GPUs...")

            # Check if results already exist for this GPU count
            if (
                args.skip_existing_results
                and not args.force_rerun
                and check_prefill_results_exist(args.output_dir, num_gpus, args.isl)
            ):
                logger.info(
                    f"Skipping prefill {num_gpus} GPU(s) - results already exist"
                )
                ttft, thpt_per_gpu = load_existing_prefill_results(
                    args.output_dir, num_gpus, args.isl
                )
                if ttft is not None and thpt_per_gpu is not None:
                    prefill_num_gpus.append(num_gpus)
                    prefill_ttft.append(ttft)
                    prefill_thpt_per_gpu.append(thpt_per_gpu)
                    logger.info(
                        f"Loaded existing prefill results: {num_gpus} GPU TTFT={ttft:.2f}ms, throughput={thpt_per_gpu:.2f} tokens/s/GPU"
                    )
                continue

            if args.is_moe_model:
                prefill_config = config_modifier.set_config_tep_size(
                    prefill_config, num_gpus, args.num_gpus_per_node
                )
            else:
                prefill_config = config_modifier.set_config_tp_size(
                    prefill_config, num_gpus
                )
            logger.info(f"Dynamo config: {prefill_config}")

            work_dir = f"{args.output_dir}/prefill_{num_gpus}gpus"
            os.makedirs(work_dir, exist_ok=True)

            prefill_config_fn = f"{work_dir}/config.yaml"
            with open(prefill_config_fn, "w") as f:
                yaml.dump(prefill_config, f)

            ttft = None
            if args.dry_run:
                logger.info("Skipping deployment creation in dry run mode")
            elif args.use_ai_configurator:
                logger.info("Using ai-configurator to estimate prefill latency.")
                perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf(
                    args.isl,
                    tp_size=num_gpus,
                )
                ttft = perf_dict["context_latency"]
                logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms")
            else:
                client = DynamoDeploymentClient(
                    namespace=args.namespace,
                    base_log_dir=work_dir,
                    model_name=model_name,
                    service_name=args.service_name,
                    frontend_port=frontend_port,
                    deployment_name=prefill_config["metadata"]["name"],
                )
                logger.info(f"Created client with service_name: {client.service_name}")
                deployment_clients.append(client)  # Track for cleanup
                await client.create_deployment(prefill_config_fn)
                logger.info("Waiting for deployment to be ready...")
                await client.wait_for_deployment_ready()
                logger.info("Deployment is ready")

                logger.info("Getting deployment logs...")
                await client.get_deployment_logs()
                logger.info(
                    f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
                )

                # run genai-perf
                base_url = client.get_service_url()
                genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}"
                gap_result = benchmark_prefill(
                    args.isl,
                    genai_perf_artifact_dir,
                    model_name,
                    model_name,
                    base_url=base_url,
                )
                if gap_result is not None:
                    ttft = gap_result["time_to_first_token"]["avg"]

                logger.info("Cleaning up deployment...")
                await client.delete_deployment()
                deployment_clients.remove(client)
                logger.info("Deployment deleted")

            if ttft is not None:
                prefill_num_gpus.append(num_gpus)
                prefill_ttft.append(ttft)
                # The log messages report TTFT in ms and throughput in
                # tokens/s/GPU, hence the factor of 1000.
                prefill_thpt_per_gpu.append(args.isl / ttft / num_gpus * 1000)

        # Plot the results as a 2D scatter plot
        if prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu:
            plot_prefill_performance(
                prefill_num_gpus,
                prefill_ttft,
                prefill_thpt_per_gpu,
                args.ttft,
                args.output_dir,
            )

        # then profile decode
        decode_num_gpus = []
        decode_itl = []
        decode_thpt_per_gpu = []
        decode_concurrency = []
        decode_kv_cache_size = []
        decode_results = []  # Store partial results for plotting later
        logger.info("Profiling decode...")
        decode_config = config_modifier.convert_config(
            config, "decode", is_moe_model=args.is_moe_model
        )
        for num_gpus in profile_num_gpus:
            logger.info(f"Profiling decode with {num_gpus} GPUs...")

            # Check if results already exist for this GPU count
            if (
                args.skip_existing_results
                and not args.force_rerun
                and check_decode_results_exist(
                    args.output_dir, num_gpus, args.isl, args.osl
                )
            ):
                logger.info(
                    f"Skipping decode {num_gpus} GPU(s) - results already exist"
                )
                existing_results = load_existing_decode_results(
                    args.output_dir, num_gpus, args.isl, args.osl
                )
                if existing_results:
                    # Add existing results to our arrays
                    engine_decode_itl = []
                    engine_decode_thpt_per_gpu = []
                    for itl, thpt_per_gpu, concurrency in existing_results:
                        decode_num_gpus.append(num_gpus)
                        decode_itl.append(itl)
                        decode_thpt_per_gpu.append(thpt_per_gpu)
                        decode_concurrency.append(concurrency)
                        # We need to get kv_cache_size from existing logs or estimate it
                        estimated_kv_cache = max(
                            100000, concurrency * (args.isl + args.osl) * 2
                        )  # Conservative estimate
                        decode_kv_cache_size.append(estimated_kv_cache)
                        engine_decode_itl.append(itl)
                        engine_decode_thpt_per_gpu.append(thpt_per_gpu)

                    # Store results for plotting
                    decode_results.append(
                        (num_gpus, engine_decode_itl, engine_decode_thpt_per_gpu)
                    )
                    logger.info(
                        f"Loaded {len(existing_results)} existing decode results for {num_gpus} GPU(s)"
                    )
                continue

            if args.is_moe_model:
                decode_config = config_modifier.set_config_dep_size(
                    decode_config, num_gpus, args.num_gpus_per_node
                )
            else:
                decode_config = config_modifier.set_config_tp_size(
                    decode_config, num_gpus
                )
            logger.info(f"Dynamo config: {decode_config}")

            work_dir = f"{args.output_dir}/decode_{num_gpus}gpus"
            os.makedirs(work_dir, exist_ok=True)

            decode_config_fn = f"{work_dir}/config.yaml"
            with open(decode_config_fn, "w") as f:
                yaml.dump(decode_config, f)

            if args.dry_run:
                logger.info("Skipping deployment creation in dry run mode")

            elif args.use_ai_configurator:
                # Compute max_concurrency and max_kv_tokens to know which
                # num_request to sweep over.
                max_concurrency = ai_configurator_perf_estimator.get_max_batch_size(
                    args.isl, args.osl, tp_size=num_gpus
                )
                max_kv_tokens = max_concurrency * (args.isl + args.osl)

            else:
                client = DynamoDeploymentClient(
                    namespace=args.namespace,
                    base_log_dir=work_dir,
                    model_name=model_name,
                    service_name=args.service_name,
                    frontend_port=frontend_port,
                    deployment_name=decode_config["metadata"]["name"],
                )
                deployment_clients.append(client)  # Track for cleanup
                await client.create_deployment(decode_config_fn)
                logger.info("Waiting for deployment to be ready...")
                await client.wait_for_deployment_ready()
                logger.info("Deployment is ready")

                logger.info("Getting deployment logs...")
                await client.get_deployment_logs()
                logger.info(
                    f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
                )

                # Compute max_concurrency and max_kv_tokens to know which
                # num_request to sweep over.
                # For MoE models, attention_dp_size = DEP size (num_gpus), for dense models = 1
                attention_dp_size = num_gpus if args.is_moe_model else 1
                max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
                    f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log",
                    attention_dp_size=attention_dp_size,
                )
                max_concurrency = max_kv_tokens // (args.isl + args.osl)

            if not args.dry_run:
                attention_dp_size = num_gpus if args.is_moe_model else 1
                sweep_num_request = get_num_request_range(
                    attention_dp_size,
                    max_concurrency,
                    args.decode_interpolation_granularity,
                )
                logger.info(
                    f"Sweeping num_request range based on maximum number of kv tokens: {sweep_num_request}"
                )

                engine_decode_itl = []
                engine_decode_thpt_per_gpu = []
                for num_request in sweep_num_request:
                    itl = thpt_per_gpu = None
                    if args.use_ai_configurator:
                        logger.info("Using ai-configurator to estimate decode latency.")
                        perf_dict = ai_configurator_perf_estimator.estimate_perf(
                            args.isl,
                            args.osl,
                            num_request,
                            mode="decode",
                            tp_size=num_gpus,
                        )

                        itl = perf_dict["tpot"]
                        thpt_per_gpu = perf_dict["tokens/s/gpu"]
                        logger.info(f"Estimated decode ITL: {itl:.2f}ms")
                        logger.info(
                            f"Estimated decode throughput per GPU: {thpt_per_gpu:.2f} tokens/s/GPU"
                        )
                    else:
                        base_url = client.get_service_url()
                        genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}"
                        gap_result = benchmark_decode(
                            args.isl,
                            args.osl,
                            num_request,
                            genai_perf_artifact_dir,
                            model_name,
                            model_name,
                            base_url=base_url,
                        )
                        if gap_result is not None:
                            itl = gap_result["inter_token_latency"]["avg"]
                            thpt_per_gpu = (
                                gap_result["output_token_throughput"]["avg"] / num_gpus
                            )

                    if itl is not None and thpt_per_gpu is not None:
                        engine_decode_itl.append(itl)
                        engine_decode_thpt_per_gpu.append(thpt_per_gpu)
                        decode_num_gpus.append(num_gpus)
                        decode_itl.append(itl)
                        decode_thpt_per_gpu.append(thpt_per_gpu)
                        decode_concurrency.append(num_request)
                        decode_kv_cache_size.append(max_kv_tokens)

                # Store partial results for plotting later
                decode_results.append(
                    (num_gpus, engine_decode_itl, engine_decode_thpt_per_gpu)
                )

            if not args.dry_run and not args.use_ai_configurator:
                logger.info("Cleaning up deployment...")
                await client.delete_deployment()
                deployment_clients.remove(client)
                logger.info("Deployment deleted")

        # Plot all decode results after profiling is complete
        if decode_results:
            plot_decode_performance(decode_results, args.itl, args.output_dir)

        if args.dry_run:
            logger.info("Skipping recommendations in dry run mode")
        else:
            logger.info("Analyzing results and generate recommendations...")
            # Safety guards: no results → exit early with a clear message
            if not (prefill_num_gpus and prefill_ttft and prefill_thpt_per_gpu):
                logger.error("No prefill results produced; skipping recommendations.")
                # Without this return, min() below would raise ValueError on
                # the empty list; the decode guard further down exits the
                # same way.
                return

            # select best tp size for prefill
            if min(prefill_ttft) > args.ttft:
                logger.info(
                    "No TP size satisfies the TTFT requirement, please try a smaller model or a more powerful GPU SKU"
                )
                selected_prefill_idx = int(np.argmin(np.array(prefill_ttft)))
            else:
                valid_indices = [
                    i for i, ttft in enumerate(prefill_ttft) if ttft <= args.ttft
                ]
                # Among valid TP sizes, select the one with highest throughput per GPU
                valid_thpts = [prefill_thpt_per_gpu[i] for i in valid_indices]
                max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
                selected_prefill_idx = max_thpt_idx
            logger.info(
                f"Suggested number of GPUs for prefill: {prefill_num_gpus[selected_prefill_idx]} (TTFT {prefill_ttft[selected_prefill_idx]:.2f} ms, throughput {prefill_thpt_per_gpu[selected_prefill_idx]:.2f} tokens/s/GPU)"
            )

            # scale up if estimated TTFT is 120% of target TTFT
            prefill_queue_size_upper_bound = max(
                0.1, args.ttft * 1.2 / prefill_ttft[selected_prefill_idx] - 1
            )
            # scale down if estimated TTFT is 80% of target TTFT
            prefill_queue_size_lower_bound = max(
                0.1, args.ttft * 0.8 / prefill_ttft[selected_prefill_idx] - 1
            )
            logger.info(
                f"Suggested planner upper/lower bound for prefill queue size: {prefill_queue_size_upper_bound:.2f}/{prefill_queue_size_lower_bound:.2f}"
            )

            # select best gpu count for decode
            if not (
                decode_num_gpus
                and decode_itl
                and decode_thpt_per_gpu
                and decode_concurrency
                and decode_kv_cache_size
            ):
                logger.error("No decode results produced; skipping recommendations.")
                return
            if min(decode_itl) > args.itl:
                logger.info(
                    "No TP size satisfies the ITL requirement, please try a smaller model or a more powerful GPU SKU"
                )
                selected_decode_idx = int(np.argmin(np.array(decode_itl)))
            else:
                valid_indices = [
                    i for i, itl in enumerate(decode_itl) if itl <= args.itl
                ]
                # Among valid TP sizes, select the one with highest throughput per GPU
                valid_thpts = [decode_thpt_per_gpu[i] for i in valid_indices]
                max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
                selected_decode_idx = max_thpt_idx
            logger.info(
                f"Suggested number of GPUs for decode: {decode_num_gpus[selected_decode_idx]} (ITL {decode_itl[selected_decode_idx]:.2f} ms, throughput {decode_thpt_per_gpu[selected_decode_idx]:.2f} tokens/s/GPU)"
            )

            # calculate kv cache utilization for the selected TP and concurrency
            selected_decode_kv_cache_utilization = (
                decode_concurrency[selected_decode_idx]
                * (args.isl + (args.osl / 2))
                / decode_kv_cache_size[selected_decode_idx]
            )
            # set a +- 20% range for the kv cache utilization
            logger.info(
                f"Suggested planner upper/lower bound for decode kv cache utilization: {min(1, selected_decode_kv_cache_utilization + 0.2):.2f}/{max(0.1, selected_decode_kv_cache_utilization - 0.2):.2f}"
            )

        if args.dry_run:
            # use min value for prefill and decode GPU counts
            prefill_num_gpus = [args.min_num_gpus_per_engine]
            decode_num_gpus = [args.min_num_gpus_per_engine]
            selected_prefill_idx = 0
            selected_decode_idx = 0

        # interpolate ISL - TTFT with best prefill GPU count
        best_prefill_gpus = prefill_num_gpus[selected_prefill_idx]
        logger.info(
            f"Profiling prefill under best {best_prefill_gpus} GPU(s) with different ISL..."
        )
        prefill_config = config_modifier.convert_config(
            config, "prefill", is_moe_model=args.is_moe_model
        )
        if args.is_moe_model:
            prefill_config = config_modifier.set_config_tep_size(
                prefill_config, best_prefill_gpus, args.num_gpus_per_node
            )
        else:
            prefill_config = config_modifier.set_config_tp_size(
                prefill_config, best_prefill_gpus
            )
        logger.info(f"Dynamo config: {prefill_config}")

        work_dir = f"{args.output_dir}/selected_prefill_interpolation"
        os.makedirs(work_dir, exist_ok=True)

        prefill_config_fn = f"{work_dir}/config.yaml"
        with open(prefill_config_fn, "w") as f:
            yaml.dump(prefill_config, f)

        if args.dry_run:
            logger.info("Skipping deployment creation in dry run mode")
        elif args.use_ai_configurator:
            profile_prefill_aiconfigurator(
                work_dir,
                best_prefill_gpus,  # num_gpus
                args.max_context_length,
                args.prefill_interpolation_granularity,
                ai_configurator_perf_estimator,
                tp_size=best_prefill_gpus,
            )
        else:
            client = DynamoDeploymentClient(
                namespace=args.namespace,
                base_log_dir=work_dir,
                model_name=model_name,
                service_name=args.service_name,
                frontend_port=frontend_port,
                deployment_name=prefill_config["metadata"]["name"],
            )
            deployment_clients.append(client)  # Track for cleanup
            await client.create_deployment(prefill_config_fn)
            logger.info("Waiting for deployment to be ready...")
            try:
                await client.wait_for_deployment_ready()
                logger.info("Deployment is ready")
                skip_profile = False
            except TimeoutError:
                logger.error(
                    "Deployment failed to become ready within timeout, skipping profiling"
                )
                skip_profile = True

            # Only collect logs and benchmark when the deployment actually
            # came up; a timed-out deployment is just torn down below.
            if not skip_profile:
                logger.info("Getting deployment logs...")
                await client.get_deployment_logs()
                logger.info(
                    f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
                )

                base_url = client.get_service_url()

                profile_prefill(
                    work_dir,
                    model_name,
                    model_name,
                    base_url,
                    best_prefill_gpus,
                    args.max_context_length,
                    args.prefill_interpolation_granularity,
                )

            logger.info("Cleaning up deployment...")
            await client.delete_deployment()
            deployment_clients.remove(client)
            logger.info("Deployment deleted")

        # interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode GPU count
        best_decode_gpus = decode_num_gpus[selected_decode_idx]
        logger.info(f"Profiling decode with {best_decode_gpus} GPUs...")
        if args.is_moe_model:
            decode_config = config_modifier.set_config_dep_size(
                decode_config, best_decode_gpus, args.num_gpus_per_node
            )
        else:
            decode_config = config_modifier.set_config_tp_size(
                decode_config, best_decode_gpus
            )
        logger.info(f"Dynamo config: {decode_config}")

        work_dir = f"{args.output_dir}/selected_decode_interpolation"
        os.makedirs(work_dir, exist_ok=True)

        decode_config_fn = f"{work_dir}/config.yaml"
        with open(decode_config_fn, "w") as f:
            yaml.dump(decode_config, f)

        if args.dry_run:
            logger.info("Skipping deployment creation in dry run mode")
        elif args.use_ai_configurator:
            # For MoE models, attention_dp_size = DEP size (best_decode_gpus), for dense models = 1
            attention_dp_size = best_decode_gpus if args.is_moe_model else 1
            max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens(
                args.isl, args.osl, tp_size=best_decode_gpus
            )
            profile_decode_aiconfigurator(
                work_dir,
                best_decode_gpus,  # num_gpus
                max_kv_tokens,
                args.max_context_length,
                args.decode_interpolation_granularity,
                ai_configurator_perf_estimator,
                attention_dp_size,
                tp_size=best_decode_gpus,
            )
        else:
            client = DynamoDeploymentClient(
                namespace=args.namespace,
                base_log_dir=work_dir,
                model_name=model_name,
                service_name=args.service_name,
                frontend_port=frontend_port,
                deployment_name=decode_config["metadata"]["name"],
            )
            deployment_clients.append(client)  # Track for cleanup
            await client.create_deployment(decode_config_fn)
            logger.info("Waiting for deployment to be ready...")
            await client.wait_for_deployment_ready()
            logger.info("Deployment is ready")

            logger.info("Getting deployment logs...")
            await client.get_deployment_logs()
            logger.info(
                f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
            )

            # For MoE models, attention_dp_size = DEP size (best_decode_gpus), for dense models = 1
            attention_dp_size = best_decode_gpus if args.is_moe_model else 1
            max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
                f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log",
                attention_dp_size=attention_dp_size,
            )

            base_url = client.get_service_url()

            profile_decode(
                work_dir,
                model_name,
                model_name,
                base_url,
                best_decode_gpus,
                max_kv_tokens,
                args.max_context_length,
                args.decode_interpolation_granularity,
                attention_dp_size,
            )

            logger.info("Cleaning up deployment...")
            await client.delete_deployment()
            deployment_clients.remove(client)
            logger.info("Deployment deleted")

    except Exception as e:
        logger.error(f"Profile job failed with error: {e}")
        raise
    finally:
        # Always clean up any remaining deployments, even if the job failed
        logger.info("Performing final cleanup of any remaining deployments...")
        await cleanup_remaining_deployments(deployment_clients, args.namespace)
        logger.info("Final cleanup completed.")


def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the SLA profiling job.

    The parser only declares options and their defaults; it performs no
    cross-option validation. ``--config`` is the single required option.

    Returns:
        argparse.ArgumentParser: parser whose ``parse_args()`` result is the
        ``args`` namespace passed to ``run_profile``.
    """
    parser = argparse.ArgumentParser(
        description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill."
    )

    # --- deployment target -------------------------------------------------
    parser.add_argument(
        "--namespace",
        type=str,
        default="dynamo-sla-profiler",
        help="Kubernetes namespace to deploy the DynamoGraphDeployment",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="vllm",
        choices=["vllm", "sglang", "trtllm"],
        help="backend type, currently support [vllm, sglang, trtllm]",
    )
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="Path to the DynamoGraphDeployment config file",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="profiling_results",
        help="Path to the output results directory",
    )

    # --- search space and result reuse -------------------------------------
    parser.add_argument(
        "--min-num-gpus-per-engine",
        type=int,
        default=1,
        help="minimum number of GPUs per engine",
    )
    parser.add_argument(
        "--max-num-gpus-per-engine",
        type=int,
        default=8,
        help="maximum number of GPUs per engine",
    )
    parser.add_argument(
        "--skip-existing-results",
        action="store_true",
        help="Skip TP sizes that already have results in the output directory",
    )
    parser.add_argument(
        "--force-rerun",
        action="store_true",
        help="Force re-running all tests even if results already exist (overrides --skip-existing-results)",
    )

    # --- SLA targets ---------------------------------------------------------
    parser.add_argument(
        "--isl", type=int, default=3000, help="target input sequence length"
    )
    parser.add_argument(
        "--osl", type=int, default=500, help="target output sequence length"
    )
    parser.add_argument(
        "--ttft", type=int, default=50, help="target Time To First Token in ms"
    )
    parser.add_argument(
        "--itl", type=int, default=10, help="target Inter Token Latency in ms"
    )

    # below are arguments used for interpolating TTFT and ITL under different ISL/OSL
    parser.add_argument(
        "--max-context-length",
        type=int,
        default=16384,
        help="maximum context length supported by the served model",
    )
    parser.add_argument(
        "--prefill-interpolation-granularity",
        type=int,
        default=16,
        help="how many samples to benchmark to interpolate TTFT under different ISL",
    )
    parser.add_argument(
        "--decode-interpolation-granularity",
        type=int,
        default=6,
        help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length",
    )
    parser.add_argument(
        "--service-name",
        type=str,
        default="",
        help="Service name for port forwarding (default: {deployment_name}-frontend)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Dry run the profile job",
    )

    # --- MoE options ---------------------------------------------------------
    parser.add_argument(
        "--is-moe-model",
        action="store_true",
        dest="is_moe_model",
        help="Enable MoE (Mixture of Experts) model support, use TEP for prefill and DEP for decode",
    )
    parser.add_argument(
        "--num-gpus-per-node",
        type=int,
        default=8,
        help="Number of GPUs per node for MoE models - this will be the granularity when searching for the best TEP/DEP size",
    )

    # --- ai-configurator estimation (no default: these parse to None) -------
    parser.add_argument(
        "--use-ai-configurator",
        action="store_true",
        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
    )
    parser.add_argument(
        "--aic-system",
        type=str,
        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
    )
    parser.add_argument(
        "--aic-model-name",
        type=str,
        help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
    )
    parser.add_argument(
        "--backend-version",
        type=str,
        help="Specify backend version when using aiconfigurator to estimate perf.",
    )
    return parser


if __name__ == "__main__":
    # Script entry point: parse the CLI and drive the async profiling job.
    parser = build_arg_parser()
    args = parser.parse_args()

    asyncio.run(run_profile(args))