feat: Add --use-ai-configurator to profile_sla.py (#3079)

Signed-off-by: Ilya Sherstyuk <isherstyuk@nvidia.com> Signed-off-by: Ilya Sherstyuk <46343317+ilyasher@users.noreply.github.com> Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>

feat: Add --use-ai-configurator to profile_sla.py (#3079)
Signed-off-by: Ilya Sherstyuk <isherstyuk@nvidia.com> Signed-off-by: Ilya Sherstyuk <46343317+ilyasher@users.noreply.github.com> Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>
19948b7f · Ilya Sherstyuk · GitHub · 8534d170 · 19948b7f · 19948b7f
Unverified Commit 19948b7f authored Sep 19, 2025 by Ilya Sherstyuk Committed by GitHub Sep 19, 2025
8 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -24,6 +24,7 @@ import yaml
 from benchmarks.profiler.utils.config import CONFIG_MODIFIERS, WORKER_COMPONENT_NAMES
 from benchmarks.profiler.utils.defaults import DECODE_NUM_REQUESTS_RANGE
+from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
 from benchmarks.profiler.utils.genai_perf import benchmark_decode, benchmark_prefill
 from benchmarks.profiler.utils.plot import (
    plot_decode_performance,
@@ -35,8 +36,14 @@ from benchmarks.profiler.utils.profile_cache import (
    load_existing_decode_results,
    load_existing_prefill_results,
 )
-from benchmarks.profiler.utils.profile_decode import profile_decode
+from benchmarks.profiler.utils.profile_decode import (
-from benchmarks.profiler.utils.profile_prefill import profile_prefill
+    profile_decode,
+    profile_decode_aiconfigurator,
+)
+from benchmarks.profiler.utils.profile_prefill import (
+    profile_prefill,
+    profile_prefill_aiconfigurator,
+)
 from deploy.utils.dynamo_deployment import (
    DynamoDeploymentClient,
    cleanup_remaining_deployments,
@@ -86,6 +93,34 @@ async def run_profile(args):
        else:
            logger.info("Skip existing results disabled - will re-run all tests")
+        if args.use_ai_configurator:
+            if not args.aic_system:
+                raise ValueError(
+                    "Must provide --aic-system when using --use-ai-configurator."
+                )
+            if not args.aic_model_name:
+                raise ValueError(
+                    "Must provide --aic-model-name when using --use-ai-configurator."
+                )
+            if not args.backend_version:
+                raise ValueError(
+                    "Must provide --backend-version when using --use-ai-configurator."
+                )
+            logger.info("Will use aiconfigurator to estimate perf.")
+            ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
+                args.aic_model_name,
+                args.aic_system.lower(),
+                args.backend,
+                args.backend_version,
+            )
+        else:
+            if args.aic_system or args.aic_model_name or args.backend_version:
+                logger.warning(
+                    "Will ignore --aic-system, --aic-model-name, and/or --backend-version "
+                    "when not using --use-ai-configurator."
+                )
        # first profile prefill
        prefill_tp_size = []
        prefill_ttft = []
@@ -93,6 +128,8 @@ async def run_profile(args):
        logger.info("Profiling prefill...")
        prefill_config = config_modifier.convert_config(config, "prefill")
        frontend_port = config_modifier.get_port(config)
+        itl: float | None = None
+        thpt_per_gpu: float | None = None
        for tp_size in profile_tp_size:
            logger.info(f"Profiling prefill with TP size {tp_size}...")
@@ -125,8 +162,17 @@ async def run_profile(args):
            with open(prefill_config_fn, "w") as f:
                yaml.dump(prefill_config, f)
+            ttft = None
            if args.dry_run:
                logger.info("Skipping deployment creation in dry run mode")
+            elif args.use_ai_configurator:
+                logger.info("Using ai-configurator to estimate prefill latency.")
+                perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf(
+                    args.isl,
+                    tp_size=tp_size,
+                )
+                ttft = perf_dict["context_latency"]
+                logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms")
            else:
                client = DynamoDeploymentClient(
                    namespace=args.namespace,
@@ -161,15 +207,17 @@ async def run_profile(args):
                )
                if gap_result is not None:
                    ttft = gap_result["time_to_first_token"]["avg"]
-                    prefill_tp_size.append(tp_size)
-                    prefill_ttft.append(ttft)
-                    prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000)
                logger.info("Cleaning up deployment...")
                await client.delete_deployment()
                deployment_clients.remove(client)
                logger.info("Deployment deleted")
+            if ttft is not None:
+                prefill_tp_size.append(tp_size)
+                prefill_ttft.append(ttft)
+                prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000)
        # Plot the results as a 2D scatter plot
        if prefill_tp_size and prefill_ttft and prefill_thpt_per_gpu:
            plot_prefill_performance(
@@ -242,6 +290,15 @@ async def run_profile(args):
            if args.dry_run:
                logger.info("Skipping deployment creation in dry run mode")
+            elif args.use_ai_configurator:
+                # Compute max_concurrency and max_kv_tokens to know which
+                # num_request to sweep over.
+                max_concurrency = ai_configurator_perf_estimator.get_max_batch_size(
+                    args.isl, args.osl, tp_size=tp_size
+                )
+                max_kv_tokens = max_concurrency * (args.isl + args.osl)
            else:
                client = DynamoDeploymentClient(
                    namespace=args.namespace,
@@ -263,10 +320,14 @@ async def run_profile(args):
                    f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
                )
+                # Compute max_concurrency and max_kv_tokens to know which
+                # num_request to sweep over.
                max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
                    f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log"
                )
                max_concurrency = max_kv_tokens // (args.isl + args.osl)
+            if not args.dry_run:
                sweep_num_request = [
                    num for num in DECODE_NUM_REQUESTS_RANGE if num <= max_concurrency
                ]
@@ -276,8 +337,26 @@ async def run_profile(args):
                engine_decode_itl = []
                engine_decode_thpt_per_gpu = []
-                base_url = client.get_service_url()
                for num_request in sweep_num_request:
+                    itl = thpt_per_gpu = None
+                    if args.use_ai_configurator:
+                        logger.info("Using ai-configurator to estimate decode latency.")
+                        perf_dict = ai_configurator_perf_estimator.estimate_perf(
+                            args.isl,
+                            args.osl,
+                            num_request,
+                            mode="decode",
+                            tp_size=tp_size,
+                        )
+                        itl = perf_dict["tpot"]
+                        thpt_per_gpu = perf_dict["tokens/s/gpu"]
+                        logger.info(f"Estimated decode ITL: {itl:.2f}ms")
+                        logger.info(
+                            f"Estimated decode throughput per GPU: {thpt_per_gpu:.2f} tokens/s/GPU"
+                        )
+                    else:
+                        base_url = client.get_service_url()
                        genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}"
                        gap_result = benchmark_decode(
                            args.isl,
@@ -293,6 +372,8 @@ async def run_profile(args):
                            thpt_per_gpu = (
                                gap_result["output_token_throughput"]["avg"] / tp_size
                            )
+                    if itl is not None and thpt_per_gpu is not None:
                        engine_decode_itl.append(itl)
                        engine_decode_thpt_per_gpu.append(thpt_per_gpu)
                        decode_tp_size.append(tp_size)
@@ -301,16 +382,17 @@ async def run_profile(args):
                        decode_concurrency.append(num_request)
                        decode_kv_cache_size.append(max_kv_tokens)
-                logger.info("Cleaning up deployment...")
-                await client.delete_deployment()
-                deployment_clients.remove(client)
-                logger.info("Deployment deleted")
                # Store partial results for plotting later
                decode_results.append(
                    (tp_size, engine_decode_itl, engine_decode_thpt_per_gpu)
                )
+            if not args.dry_run and not args.use_ai_configurator:
+                logger.info("Cleaning up deployment...")
+                await client.delete_deployment()
+                deployment_clients.remove(client)
+                logger.info("Deployment deleted")
        # Plot all decode results after profiling is complete
        if decode_results:
            plot_decode_performance(decode_results, args.itl, args.output_dir)
@@ -418,6 +500,15 @@ async def run_profile(args):
        if args.dry_run:
            logger.info("Skipping deployment creation in dry run mode")
+        elif args.use_ai_configurator:
+            profile_prefill_aiconfigurator(
+                work_dir,
+                best_prefill_tp,  # num_gpus
+                args.max_context_length,
+                args.prefill_interpolation_granularity,
+                ai_configurator_perf_estimator,
+                tp_size=best_prefill_tp,
+            )
        else:
            client = DynamoDeploymentClient(
                namespace=args.namespace,
@@ -481,6 +572,19 @@ async def run_profile(args):
        if args.dry_run:
            logger.info("Skipping deployment creation in dry run mode")
+        elif args.use_ai_configurator:
+            max_kv_tokens = ai_configurator_perf_estimator.get_max_kv_tokens(
+                args.isl, args.osl, tp_size=best_decode_tp
+            )
+            profile_decode_aiconfigurator(
+                work_dir,
+                best_decode_tp,  # num_gpus
+                max_kv_tokens,
+                args.max_context_length,
+                args.decode_interpolation_granularity,
+                ai_configurator_perf_estimator,
+                tp_size=best_decode_tp,
+            )
        else:
            client = DynamoDeploymentClient(
                namespace=args.namespace,
@@ -627,6 +731,26 @@ if __name__ == "__main__":
        action="store_true",
        help="Dry run the profile job",
    )
+    parser.add_argument(
+        "--use-ai-configurator",
+        action="store_true",
+        help="Use ai-configurator to estimate benchmarking results instead of running actual deployment.",
+    )
+    parser.add_argument(
+        "--aic-system",
+        type=str,
+        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
+    )
+    parser.add_argument(
+        "--aic-model-name",
+        type=str,
+        help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
+    )
+    parser.add_argument(
+        "--backend-version",
+        type=str,
+        help="Specify backend version when using aiconfigurator to estimate perf.",
+    )
    args = parser.parse_args()
    asyncio.run(run_profile(args))
--- a/benchmarks/profiler/utils/estimate_perf.py
+++ b/benchmarks/profiler/utils/estimate_perf.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from typing import Any
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.INFO)
+formatter = logging.Formatter(
+    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
+)
+console_handler.setFormatter(formatter)
+logger.addHandler(console_handler)
+def _try_import_aiconfigurator():
+    """
+    Import the aiconfigurator SDK submodules used in this file and return the
+    top-level ``aiconfigurator`` package.
+    Returns:
+        module: The imported ``aiconfigurator`` package.
+    Raises:
+        ImportError: If aiconfigurator or one of its SDK submodules cannot be
+            imported. The original exception is re-raised after logging a hint.
+    """
+    # Lazy-import aiconfigurator because it's an optional dependency in profile_sla.py
+    try:
+        import aiconfigurator.sdk.backends.factory
+        import aiconfigurator.sdk.config
+        import aiconfigurator.sdk.inference_session
+        import aiconfigurator.sdk.models
+        import aiconfigurator.sdk.perf_database
+    except ImportError:
+        # Keep the original exception (type and traceback) but tell the user
+        # how to resolve the missing optional dependency.
+        logger.error(
+            "Failed to import aiconfigurator, which is required to estimate perf. "
+            "Install it with `pip install aiconfigurator`."
+        )
+        raise
+    return aiconfigurator
+class AIConfiguratorPerfEstimator:
+    """
+    This class is used to estimate the performance of a model using aiconfigurator.
+    An instance of this class stores information about the model, system, and backend.
+    Methods can be called to estimate prefill and/or decode perf for a given ISL, OSL,
+    batch_size, and parallelism config.
+    """
+    def __init__(
+        self,
+        model_name: str,  # e.g. "QWEN3_32B"
+        system: str,  # e.g. "h200_sxm"
+        backend: str,  # e.g. "trtllm"
+        version: str,  # e.g. "0.20.0"
+    ):
+        aiconfigurator = _try_import_aiconfigurator()
+        logger.info("Loading aiconfigurator database. This might take a few seconds...")
+        self.database = aiconfigurator.sdk.perf_database.get_database(
+            system=system,
+            backend=backend,
+            version=version,
+        )
+        if not self.database:
+            raise ValueError(
+                f"Database not found for system: {system}, backend: {backend}, version: {version}"
+            )
+        logger.info("aiconfigurator database loaded.")
+        self.backend = aiconfigurator.sdk.backends.factory.get_backend(backend)
+        # This is the aiconfigurator model name (such as QWEN3_32B or DEEPSEEK_V3)
+        # rather than the HF model name.
+        self.model_name = model_name
+    def _get_model(self, **model_config_kwargs):
+        aiconfigurator = _try_import_aiconfigurator()
+        # NOTE: MOE models error out unless moe_tp_size and moe_ep_size are provided.
+        model_config = aiconfigurator.sdk.config.ModelConfig(**model_config_kwargs)
+        model = aiconfigurator.sdk.models.get_model(self.model_name, model_config)
+        return model
+    def estimate_perf(
+        self,
+        isl: int,
+        osl: int,
+        batch_size: int,
+        mode: str = "full",
+        **model_config_kwargs,
+    ) -> dict[str, Any]:
+        """
+        Estimate the perf of this model + system + backend + ISL/OSL/model_config
+        using aiconfigurator.
+        Args:
+            isl: Input sequence length
+            osl: Output sequence length
+            batch_size: Batch size
+            mode: Indicates what perf data to estimate.
+                "full": Estimate prefill and decode perf.
+                "prefill": Only estimate context perf.
+                "decode": Only estimate decode perf.
+            **model_config_kwargs: aiconfigurator model config kwargs
+                                   (such as tp_size, moe_tp_size, etc).
+        Returns:
+            dict: Perf metrics returned by aiconfigurator
+        """
+        aiconfigurator = _try_import_aiconfigurator()
+        mode_to_aic_mode = {
+            "full": "static",
+            "prefill": "static_ctx",
+            "decode": "static_gen",
+        }
+        if mode not in mode_to_aic_mode:
+            raise ValueError(
+                f"Invalid mode: {mode}. Must be one of {list(mode_to_aic_mode.keys())}."
+            )
+        self.runtime_config = aiconfigurator.sdk.config.RuntimeConfig(
+            batch_size=batch_size,
+            beam_width=1,
+            isl=isl,
+            osl=osl,
+        )
+        model = self._get_model(**model_config_kwargs)
+        session = aiconfigurator.sdk.inference_session.InferenceSession(
+            model, self.database, self.backend
+        )
+        summary = session.run_static(
+            mode=mode_to_aic_mode[mode], runtime_config=self.runtime_config, stride=32
+        )
+        summary_df = summary.get_summary_df()
+        # Convert pd.Dataframe to dict since there's only one row
+        return summary_df.to_dict(orient="records")[0]
+    def estimate_prefill_perf(
+        self,
+        isl: int,
+        **model_config_kwargs,
+    ) -> dict[str, Any]:
+        """
+        Estimate the perf of this model + system + backend + etc assuming it is a prefill worker.
+        Args:
+            isl: Input sequence length
+            **model_config_kwargs: aiconfigurator model config kwargs
+                                   (such as tp_size, moe_tp_size, etc).
+        Returns:
+            dict: Perf metrics returned by aiconfigurator
+        """
+        return self.estimate_perf(
+            isl,
+            5,  # small osl
+            1,  # concurrency = 1
+            mode="prefill",
+            **model_config_kwargs,
+        )
+    def get_max_batch_size(
+        self,
+        isl: int,
+        osl: int,
+        **model_config_kwargs,
+    ) -> int:
+        """
+        Estimate the largest batch size that would fit on this GPU.
+        A batch size is considered to fit when its estimated memory usage is
+        strictly below the GPU memory capacity reported by the database.
+        Args:
+            isl: Input sequence length
+            osl: Output sequence length
+            **model_config_kwargs: aiconfigurator model config kwargs
+                                   (such as tp_size, moe_tp_size, etc).
+        Returns:
+            int: Estimated largest batch size that will fit on the system,
+                 or 0 if even batch size 1 does not fit.
+        """
+        model = self._get_model(**model_config_kwargs)
+        def get_mem_usage(bs: int):
+            # TODO: _get_memory_usage might be underestimating because
+            # 1. it doesn't account for runtime buffers
+            # 2. it calculates num_tokens = isl*bs which ignores osl
+            return self.backend._get_memory_usage(
+                model, self.database, bs, 1, isl, osl
+            )["total"]
+        # NOTE(review): assumes mem_capacity is in bytes and that
+        # _get_memory_usage reports GiB -- confirm against aiconfigurator.
+        max_memory_gb = self.database.system_spec["gpu"]["mem_capacity"] / (1024**3)
+        bs = 1
+        if get_mem_usage(bs) >= max_memory_gb:
+            # Model does not fit on GPU with the given model config.
+            # Use >= so that "exactly at capacity" is handled here, consistent
+            # with the strict < checks below; otherwise the binary search
+            # would start at min_bs = 0 and probe the backend with batch size 0.
+            return 0
+        # Step 1: find upper bound on batch size.
+        while get_mem_usage(bs) < max_memory_gb:
+            bs *= 2
+        # We know that bs // 2 will fit on GPU but bs will not.
+        min_bs = bs // 2
+        max_bs = bs
+        # Step 2: binary search for max batch size that fits on GPU.
+        while min_bs < max_bs:
+            test_bs = (min_bs + max_bs) // 2
+            if get_mem_usage(test_bs) < max_memory_gb:
+                # Because of the +1, the new value of min_bs might not fit on the GPU
+                # even though test_bs did fit. So at the end when min_bs and max_bs converge,
+                # we need to remember to subtract 1 from the result.
+                min_bs = test_bs + 1
+            else:
+                # max_bs is always a value that doesn't fit on the GPU.
+                max_bs = test_bs
+        return min_bs - 1  # see comment above
+    def get_max_kv_tokens(
+        self,
+        isl: int,
+        osl: int,
+        **model_config_kwargs,
+    ) -> int:
+        """
+        Estimate the max number of kv cache tokens that will fit on this GPU
+        for the given ISL, OSL, and model config.
+        Args:
+            isl: Input sequence length
+            osl: Output sequence length
+            **model_config_kwargs: aiconfigurator model config kwargs
+                                   (such as tp_size, moe_tp_size, etc).
+        Returns:
+            int: Estimated number of KV cache tokens that will fit on the system.
+        """
+        max_concurrency = self.get_max_batch_size(isl, osl, **model_config_kwargs)
+        return max_concurrency * (isl + osl)
--- a/benchmarks/profiler/utils/profile_decode.py
+++ b/benchmarks/profiler/utils/profile_decode.py
@@ -2,9 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 import logging
+from typing import Callable, Optional, Tuple
 import numpy as np
+from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
 from benchmarks.profiler.utils.genai_perf import benchmark_decode
 from benchmarks.profiler.utils.plot import plot_decode_3d_surface
@@ -19,15 +21,15 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)
-def profile_decode(
+def _profile_decode_helper(
    work_dir,
-    model_name,
-    tokenizer,
-    url,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
+    get_itl_and_thpt_per_gpu: Callable[
+        [int, int, int], Tuple[Optional[float], Optional[float]]
+    ],
 ):
    """interpolate ITL - Active_KV_Cache - Decode_Context_Length"""
    x_kv_usage = []
@@ -63,24 +65,13 @@ def profile_decode(
                max_concurrency // interpolation_granularity,
            )
        for num_request in sweep_num_request:
-            genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
+            itl, thpt_per_gpu = get_itl_and_thpt_per_gpu(isl, osl, num_request)
-            gap_result = benchmark_decode(
-                isl,
+            if itl is not None and thpt_per_gpu is not None:
-                osl,
-                num_request,
-                genai_perf_artifact_dir,
-                model_name,
-                tokenizer,
-                base_url=url,
-            )
-            if gap_result is not None:
-                itl = gap_result["inter_token_latency"]["avg"]
                x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens)
                y_context_length.append(isl + osl / 2)
                z_itl.append(itl)
-                z_thpt_per_gpu.append(
+                z_thpt_per_gpu.append(thpt_per_gpu)
-                    gap_result["output_token_throughput"]["avg"] / num_gpus
-                )
    # Save the data points to a .npz file
    save_path = f"{work_dir}/raw_data.npz"
@@ -100,3 +91,69 @@ def profile_decode(
    )
    return
+def profile_decode(
+    work_dir,
+    model_name,
+    tokenizer,
+    url,
+    num_gpus,
+    max_kv_tokens,
+    max_context_length,
+    interpolation_granularity,
+):
+    def get_itl_and_thpt_per_gpu(isl, osl, num_request):
+        genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
+        gap_result = benchmark_decode(
+            isl,
+            osl,
+            num_request,
+            genai_perf_artifact_dir,
+            model_name,
+            tokenizer,
+            base_url=url,
+        )
+        if gap_result is not None:
+            itl = gap_result["inter_token_latency"]["avg"]
+            thpt_per_gpu = gap_result["output_token_throughput"]["avg"] / num_gpus
+            return itl, thpt_per_gpu
+        return None, None
+    return _profile_decode_helper(
+        work_dir,
+        num_gpus,
+        max_kv_tokens,
+        max_context_length,
+        interpolation_granularity,
+        get_itl_and_thpt_per_gpu,
+    )
+def profile_decode_aiconfigurator(
+    work_dir,
+    num_gpus,
+    max_kv_tokens,
+    max_context_length,
+    interpolation_granularity,
+    ai_configurator_perf_estimator: AIConfiguratorPerfEstimator,
+    **model_config_kwargs,
+):
+    def get_itl_and_thpt_per_gpu(isl, osl, num_request):
+        perf_dict = ai_configurator_perf_estimator.estimate_perf(
+            isl,
+            osl,
+            num_request,
+            mode="decode",
+            **model_config_kwargs,
+        )
+        return perf_dict["tpot"], perf_dict["tokens/s/gpu"]
+    return _profile_decode_helper(
+        work_dir,
+        num_gpus,
+        max_kv_tokens,
+        max_context_length,
+        interpolation_granularity,
+        get_itl_and_thpt_per_gpu,
+    )
--- a/benchmarks/profiler/utils/profile_prefill.py
+++ b/benchmarks/profiler/utils/profile_prefill.py
@@ -2,9 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 import logging
+from typing import Callable, Optional
 import numpy as np
+from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
 from benchmarks.profiler.utils.genai_perf import benchmark_prefill
 from benchmarks.profiler.utils.plot import plot_prefill_interpolation
@@ -19,14 +21,12 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)
-def profile_prefill(
+def _profile_prefill_helper(
    work_dir,
-    model_name,
-    tokenizer,
-    url,
    num_gpus,
    max_context_length,
    interpolation_granularity,
+    get_ttft: Callable[[int], Optional[float]],
 ):
    prefill_isl = []
    prefill_ttft = []
@@ -36,17 +36,8 @@ def profile_prefill(
        max_context_length,
        (max_context_length - 100) // interpolation_granularity,
    ):
-        # run genai-perf
+        ttft = get_ttft(isl)
-        genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
+        if ttft is not None:
-        gap_result = benchmark_prefill(
-            isl,
-            genai_perf_artifact_dir,
-            model_name,
-            tokenizer,
-            base_url=url,
-        )
-        if gap_result is not None:
-            ttft = gap_result["time_to_first_token"]["avg"]
            prefill_isl.append(isl)
            prefill_ttft.append(ttft)
            prefill_thpt_per_gpu.append(isl / ttft / num_gpus * 1000)
@@ -78,3 +69,61 @@ def profile_prefill(
        )
    return
+def profile_prefill(
+    work_dir,
+    model_name,
+    tokenizer,
+    url,
+    num_gpus,
+    max_context_length,
+    interpolation_granularity,
+):
+    def get_ttft(isl):
+        genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
+        gap_result = benchmark_prefill(
+            isl,
+            genai_perf_artifact_dir,
+            model_name,
+            tokenizer,
+            base_url=url,
+        )
+        if gap_result is not None:
+            return gap_result["time_to_first_token"]["avg"]
+        return None
+    return _profile_prefill_helper(
+        work_dir,
+        num_gpus,
+        max_context_length,
+        interpolation_granularity,
+        get_ttft,
+    )
+def profile_prefill_aiconfigurator(
+    work_dir,
+    num_gpus,
+    max_context_length,
+    interpolation_granularity,
+    ai_configurator_perf_estimator: AIConfiguratorPerfEstimator,
+    **model_config_kwargs,
+):
+    def get_ttft(isl):
+        perf_dict = ai_configurator_perf_estimator.estimate_prefill_perf(
+            isl,
+            **model_config_kwargs,
+        )
+        ttft = perf_dict["context_latency"]
+        logger.info(f"Estimated prefill TTFT: {ttft:.2f}ms")
+        return ttft
+    return _profile_prefill_helper(
+        work_dir,
+        num_gpus,
+        max_context_length,
+        interpolation_granularity,
+        get_ttft,
+    )
--- a/container/deps/requirements.txt
+++ b/container/deps/requirements.txt
@@ -2,9 +2,10 @@
 # SPDX-License-Identifier: Apache-2.0
 accelerate==1.6.0
+aiconfigurator==0.2.0
 aiofiles
 av==15.0.0
-fastapi==0.115.6
+fastapi==0.115.12
 ftfy
 genai-perf==0.0.15
 grpcio-tools==1.66.0
@@ -26,7 +27,7 @@ prometheus-api-client
 prometheus_client
 prophet
 protobuf==5.29.5
-pydantic==2.7.1
+pydantic==2.10.6
 pyright
 PyYAML
 scikit-learn

--- a/docs/benchmarks/pre_deployment_profiling.md
+++ b/docs/benchmarks/pre_deployment_profiling.md
@@ -66,7 +66,7 @@ In prefill engine, prefills are usually done with batch size=1 and only the ISL
 ### Decode Interpolation Data
 In the decode engine, decode requests are added in flight, and the iteration time (or ITL) depends on both the context length and the real-time load of the engine. We capture the real-time load of the engine with the active kv usage and the average context length. The active kv usage determines the complexity of the memory-bound attention kernel, while the active kv usage divided by the average context length determines the complexity of the compute-bound MLP kernel. For example, the figure below shows the ITL of the DS-Distilled Llama 8b model on H100 TP4. The ITL grows near-linearly with active kv usage under a fixed context length, and the slope increases as the context length decreases.
-![images](../images/itl_interpolation.png)
+![images](../../docs/images/itl_interpolation.png)
 The script profiles the selected decode TP configuration across different active kv blocks and average context length.
@@ -266,7 +266,66 @@ If you see `ErrImagePull` or `ImagePullBackOff` errors with 401 unauthorized mes
 2. Verify the service account was created with the image pull secret:
  ```bash
-kubectl get serviceaccount dynamo-sa -n $NAMESPACE -o yaml
+  kubectl get serviceaccount dynamo-sa -n $NAMESPACE -o yaml
   ```
 3. The service account should show `imagePullSecrets` containing `nvcr-imagepullsecret`.
+## Running the Profiling Script with `aiconfigurator`
+The profiling script can be run much more quickly by using `aiconfigurator` to estimate perf numbers instead of running and benchmarking real dynamo deployments. To enable estimation using `aiconfigurator`, pass the `--use-ai-configurator` flag to the profiling script.
+**Advantages** of `--use-ai-configurator`:
+* Script will finish in seconds rather than hours.
+* No k8s or GPU access is required.
+**Disadvantages**:
+* Estimated perf could contain some error, especially when the input dimensions are out-of-distribution compared to the sampled values in aiconfigurator.
+* `aiconfigurator` has a limited list of supported models.
+* `aiconfigurator`'s database has a limited list of systems and backends.
+### Prerequisites
+You will need a virtual environment with `dynamo` installed. Either use the local dev environment or the docker images. If using the local environment, install the required dependencies:
+```bash
+pip install -r deploy/utils/requirements.txt
+```
+Additionally, install `aiconfigurator`:
+```bash
+pip install aiconfigurator
+```
+### Available Models, Systems, and Backends
+`aiconfigurator` supports a limited list of models, systems, and backends.
+You can use the `aiconfigurator` CLI to see the support matrix:
+```bash
+aiconfigurator cli --help
+```
+This will display:
+```
+...options...
+  --model {GPT_7B,GPT_13B,GPT_30B,GPT_66B,GPT_175B,LLAMA2_7B,LLAMA2_13B,LLAMA2_70B,LLAMA3.1_8B,LLAMA3.1_70B,LLAMA3.1_405B,MOE_Mixtral8x7B,MOE_Mixtral8x22B,DEEPSEEK_V3,KIMI_K2,QWEN2.5_1.5B,QWEN2.5_7B,QWEN2.5_32B,QWEN2.5_72B,QWEN3_32B,QWEN3_235B,QWEN3_480B,Nemotron_super_v1.1}
+                        Model name
+  --system {h100_sxm,h200_sxm}
+                        System name
+  --backend {trtllm,sglang,vllm}
+                        Backend name, suport trtllm for now
+  --version VERSION     Version, 0.20.0,1.0.0rc3 for trtllm
+...more options...
+```
+### Running the Script
+In addition to passing the `--use-ai-configurator` flag, you must also provide the `--aic-system`, `--aic-model-name`, and `--backend-version` arguments.
+Example command:
+```bash
+python3 profile_sla.py \
+   --config ../../components/backends/trtllm/deploy/disagg.yaml \
+   --use-ai-configurator \
+   --aic-system h200_sxm \
+   --aic-model-name QWEN3_32B \
+   --backend trtllm \
+   --backend-version 0.20.0
+```
+The output will be written to `./profiling_results/`.
--- a/tests/profiler/test_profile_sla_aiconfigurator.py
+++ b/tests/profiler/test_profile_sla_aiconfigurator.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Test suite for profile_sla aiconfigurator functionality.
+profile_sla should be able to use aiconfigurator functionality
+even without access to any GPU system.
+"""
+import sys
+from pathlib import Path
+import pytest
+# Add the project root to sys.path to enable imports
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+from benchmarks.profiler.profile_sla import run_profile  # noqa: E402
+class TestProfileSlaAiconfigurator:
+    """Test class for profile_sla aiconfigurator functionality."""
+    @pytest.fixture
+    def trtllm_args(self):
+        class Args:
+            backend = "trtllm"
+            config = "components/backends/trtllm/deploy/disagg.yaml"
+            output_dir = "/tmp/test_profiling_results"
+            namespace = "test-namespace"
+            min_num_gpus_per_engine = 1
+            max_num_gpus_per_engine = 8
+            skip_existing_results = False
+            force_rerun = False
+            isl = 3000
+            osl = 500
+            ttft = 50
+            itl = 10
+            max_context_length = 16384
+            prefill_interpolation_granularity = 16
+            decode_interpolation_granularity = 6
+            service_name = ""
+            dry_run = False
+            use_ai_configurator = True
+            aic_system = "h200_sxm"
+            aic_model_name = "QWEN3_32B"
+            backend_version = "0.20.0"
+        return Args()
+    @pytest.mark.pre_merge
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize(
+        "missing_arg", ["aic_system", "aic_model_name", "backend_version"]
+    )
+    async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg):
+        # Check that validation error happens when a required arg is missing.
+        setattr(trtllm_args, missing_arg, None)
+        with pytest.raises(ValueError):
+            await run_profile(trtllm_args)
+    @pytest.mark.pre_merge
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize(
+        "arg_name, bad_value",
+        [
+            # these values don't exist in the aiconfigurator database.
+            ("aic_system", "fake_gpu_system"),
+            ("backend_version", "0.1.0"),
+        ],
+    )
+    async def test_aiconfiguator_no_data(self, trtllm_args, arg_name, bad_value):
+        # Check that an appropriate error is raised when the system/model/backend
+        # is not found in the aiconfigurator database.
+        setattr(trtllm_args, arg_name, bad_value)
+        with pytest.raises(ValueError, match="Database not found"):
+            await run_profile(trtllm_args)
+    @pytest.mark.pre_merge
+    @pytest.mark.asyncio
+    async def test_trtllm_aiconfigurator_single_model(self, trtllm_args):
+        # Test that profile_sla works with the model & backend in the trtllm_args fixture.
+        await run_profile(trtllm_args)
+    @pytest.mark.asyncio
+    @pytest.mark.parametrize(
+        "backend, backend_version",
+        [
+            ("trtllm", "0.20.0"),
+            ("trtllm", "1.0.0rc3"),
+        ],
+    )
+    @pytest.mark.parametrize("model_name", ["QWEN3_32B", "GPT_7B", "LLAMA3.1_405B"])
+    async def test_trtllm_aiconfigurator_many(
+        self, trtllm_args, model_name, backend, backend_version
+    ):
+        # Test that profile_sla works with a variety of backend versions and model names.
+        trtllm_args.aic_model_name = model_name
+        trtllm_args.backend = backend
+        trtllm_args.backend_version = backend_version
+        await run_profile(trtllm_args)
--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
@@ -45,6 +45,10 @@ class TestProfileSLADryRun:
            decode_interpolation_granularity = 6
            service_name = ""
            dry_run = True
+            use_ai_configurator = False
+            aic_system = None
+            aic_model_name = None
+            backend_version = None
        return Args()
@@ -70,6 +74,10 @@ class TestProfileSLADryRun:
            decode_interpolation_granularity = 6
            service_name = ""
            dry_run = True
+            use_ai_configurator = False
+            aic_system = None
+            aic_model_name = None
+            backend_version = None
        return Args()
@@ -109,6 +117,10 @@ class TestProfileSLADryRun:
            decode_interpolation_granularity = 6
            service_name = ""
            dry_run = True
+            use_ai_configurator = False
+            aic_system = None
+            aic_model_name = None
+            backend_version = None
        return Args()