feat: DynamoPlanner profiler to use hf_id for AIConfigurator 0.4.0 (#4167)

Signed-off-by: Jason Zhou <jasonzho@jasonzho-mlt.client.nvidia.com> Signed-off-by: Jason Zhou <jasonzho@nvidia.com> Co-authored-by: Jason Zhou <jasonzho@jasonzho-mlt.client.nvidia.com>

feat: DynamoPlanner profiler to use hf_id for AIConfigurator 0.4.0 (#4167)
Signed-off-by: Jason Zhou <jasonzho@jasonzho-mlt.client.nvidia.com> Signed-off-by: Jason Zhou <jasonzho@nvidia.com> Co-authored-by: Jason Zhou <jasonzho@jasonzho-mlt.client.nvidia.com>
43986372 · Jason Zhou · GitHub · 7afb5431 · 43986372 · 43986372
Unverified Commit 43986372 authored Nov 10, 2025 by Jason Zhou Committed by GitHub Nov 10, 2025
13 changed files
--- a/ATTRIBUTIONS-Python.md
+++ b/ATTRIBUTIONS-Python.md
@@ -441,7 +441,7 @@ License: `Apache`
  - `Homepage`: https://github.com/huggingface/accelerate
-## aiconfigurator (0.2.0)
+## aiconfigurator (0.4.0)
 ### Licenses
 License: `Apache-2.0`
--- a/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
@@ -19,7 +19,7 @@ spec:
        # AI Configurator mode (fast simulation-based profiling)
        use_ai_configurator: true
        aic_system: h200_sxm
-        aic_model_name: QWEN3_32B
+        aic_hf_id: Qwen/Qwen3-32B
        aic_backend_version: "0.20.0"
      # SLA targets for profiling

--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -143,9 +143,6 @@ async def run_profile(args):
            assert args.backend in [
                "sglang"
            ], "MoE model support is only available for SGLang"
-            assert (
-                not args.use_ai_configurator
-            ), "MoE model is not supported in ai-configurator"
        else:
            logger.info(
                "Dense model profiling, sweeping TP size for prefill and decode"
@@ -204,26 +201,30 @@ async def run_profile(args):
                raise ValueError(
                    "Must provide --aic-system when using --use-ai-configurator."
                )
-            if not args.aic_model_name:
-                raise ValueError(
+            # Fallback to args.model if aic_hf_id is not provided
-                    "Must provide --aic-model-name when using --use-ai-configurator."
+            if not args.aic_hf_id:
+                if args.model:
+                    logger.info(
+                        f"--aic-hf-id not provided, using --model ({args.model}) as HuggingFace ID for AI configurator"
                    )
-            if not args.aic_backend_version:
+                    args.aic_hf_id = args.model
+                else:
                    raise ValueError(
-                    "Must provide --aic-backend-version when using --use-ai-configurator."
+                        "Must provide --aic-hf-id or --model when using --use-ai-configurator."
                    )
            logger.info("Using aiconfigurator to estimate performance...")
            ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
-                args.aic_model_name,
+                args.aic_hf_id,
                args.aic_system.lower(),
                args.aic_backend,
                args.aic_backend_version,
            )
        else:
-            if args.aic_system or args.aic_model_name or args.aic_backend_version:
+            if args.aic_system or args.aic_hf_id or args.aic_backend_version:
                logger.warning(
-                    "Ignoring --aic-system, --aic-model-name, and/or --backend-version "
+                    "Ignoring --aic-system, --aic-hf-id, and/or --backend-version "
                    "when not using --use-ai-configurator."
                )

--- a/benchmarks/profiler/utils/estimate_perf.py
+++ b/benchmarks/profiler/utils/estimate_perf.py
@@ -36,7 +36,7 @@ class AIConfiguratorPerfEstimator:
    def __init__(
        self,
-        model_name: str,  # e.g. "QWEN3_32B"
+        hf_id: str,  # e.g. "Qwen/Qwen3-32B"
        system: str,  # e.g. "h200_sxm"
        backend: str,  # e.g. "trtllm"
        version: str,  # e.g. "0.20.0"
@@ -44,6 +44,11 @@ class AIConfiguratorPerfEstimator:
        aiconfigurator = _try_import_aiconfigurator()
        logger.info("Loading aiconfigurator database. This might take a few seconds...")
+        if not version:
+            version = aiconfigurator.sdk.perf_database.get_latest_database_version(
+                system,
+                backend,
+            )
        self.database = aiconfigurator.sdk.perf_database.get_database(
            system=system,
            backend=backend,
@@ -56,10 +61,7 @@ class AIConfiguratorPerfEstimator:
        logger.info("aiconfigurator database loaded.")
        self.backend = aiconfigurator.sdk.backends.factory.get_backend(backend)
+        self.hf_id = hf_id
-        # This is the aiconfigurator model name (such as QWEN3_32B or DEEPSEEK_V3)
-        # rather than the HF model name.
-        self.model_name = model_name
    def _get_model(self, **model_config_kwargs):
        aiconfigurator = _try_import_aiconfigurator()
@@ -67,7 +69,7 @@ class AIConfiguratorPerfEstimator:
        # NOTE: MOE models error out unless moe_tp_size and moe_ep_size are provided.
        model_config = aiconfigurator.sdk.config.ModelConfig(**model_config_kwargs)
        model = aiconfigurator.sdk.models.get_model(
-            self.model_name, model_config, self.backend
+            self.hf_id, model_config, self.backend
        )
        return model

--- a/benchmarks/profiler/utils/profiler_argparse.py
+++ b/benchmarks/profiler/utils/profiler_argparse.py
@@ -80,7 +80,7 @@ def create_profiler_parser() -> argparse.Namespace:
            decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
            use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
            aic_system: String (target system for use with aiconfigurator, default: None)
-            aic_model_name: String (aiconfigurator name of the target model, default: None)
+            aic_hf_id: String (HuggingFace ID of the target model for use with aiconfigurator, e.g. Qwen/Qwen3-32B; falls back to --model when not provided, default: None)
            aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
            aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
            dry_run: Boolean (dry run the profile job, default: False)
@@ -260,10 +260,10 @@ def create_profiler_parser() -> argparse.Namespace:
        help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
    )
    parser.add_argument(
-        "--aic-model-name",
+        "--aic-hf-id",
        type=str,
-        default=config.get("sweep", {}).get("aic_model_name"),
+        default=config.get("sweep", {}).get("aic_hf_id"),
-        help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
+        help="aiconfigurator name of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)",
    )
    parser.add_argument(
        "--aic-backend",

--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@@ -40,7 +40,7 @@ classifiers = [
 ]
 dependencies = [
-    "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@11b6d821f1fbb34300bb0ed4945f647e89fb411a",
+    "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759",
    "networkx",
    "pandas",
    "pydantic>=2",

--- a/container/deps/requirements.txt
+++ b/container/deps/requirements.txt
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 accelerate==1.6.0
-aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@11b6d821f1fbb34300bb0ed4945f647e89fb411a
+aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759
 aiofiles
 aiperf @ git+https://github.com/ai-dynamo/aiperf.git@16dad7c02fcd959ba96823d7bfe7e681e5d5b41d
 av==15.0.0

--- a/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
+++ b/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
@@ -53,7 +53,7 @@ spec:
        # AI Configurator mode (fast simulation-based profiling, 20-30 seconds)
        use_ai_configurator: false  # Set to false for online profiling (2-4 hours)
        aic_system: h200_sxm  # Target GPU system for AI Configurator
-        aic_model_name: QWEN3_0.6B  # Model name for AI Configurator
+        aic_hf_id: Qwen/Qwen3-0.6B  # HuggingFace model ID for AI Configurator
        aic_backend_version: "0.20.0"  # Backend version for AI Configurator
      # SLA targets for profiling

--- a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
+++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
@@ -348,7 +348,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 							"sweep": map[string]interface{}{
 								"use_ai_configurator": true,
 								"aic_system":          "h200_sxm",
-								"aic_model_name":      "QWEN3_32B",
+								"aic_hf_id":           "Qwen/Qwen3-32B",
 								"aic_backend_version": "0.20.0",
 							},
 						}),
@@ -1058,7 +1058,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
 							"sweep": map[string]interface{}{
 								"use_ai_configurator": true,
 								"aic_system":          "h200_sxm",
-								"aic_model_name":      "QWEN3_32B",
+								"aic_hf_id":           "Qwen/Qwen3-32B",
 								"aic_backend_version": "0.20.0",
 							},
 						}),

--- a/docs/benchmarks/sla_driven_profiling.md
+++ b/docs/benchmarks/sla_driven_profiling.md
@@ -303,17 +303,12 @@ profilingConfig:
    sweep:
      use_ai_configurator: true
      aic_system: h200_sxm              # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
-      aic_model_name: QWEN3_32B         # AIC model identifier (see supported list)
+      aic_hf_id: Qwen/Qwen3-32B         # HuggingFace model ID
-      aic_backend_version: "0.20.0"     # TensorRT-LLM version: 0.20.0, 1.0.0rc3, 1.0.0rc6
+      aic_backend_version: "0.20.0"     # TensorRT-LLM version: 0.20.0, 1.0.0rc3
 ```
 **Supported configurations:** See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features)
-**Model name mapping examples:**
- `Qwen/Qwen3-32B` → `QWEN3_32B`
- `meta-llama/Llama-3.1-70B` → `LLAMA3.1_70B`
- `deepseek-ai/DeepSeek-V3` → `DEEPSEEK_V3`
 ### Planner Configuration (Optional)
 Pass arguments to the SLA planner:

--- a/docs/planner/sla_planner_quickstart.md
+++ b/docs/planner/sla_planner_quickstart.md
@@ -229,7 +229,7 @@ sweep:
 sweep:
  use_ai_configurator: true
  aic_system: h200_sxm
-  aic_model_name: QWEN3_32B
+  aic_hf_id: Qwen/Qwen3-32B
  aic_backend_version: "0.20.0"
 ```

--- a/tests/profiler/test_profile_sla_aiconfigurator.py
+++ b/tests/profiler/test_profile_sla_aiconfigurator.py
@@ -60,9 +60,9 @@ class TestProfileSlaAiconfigurator:
                self.dry_run = False
                self.use_ai_configurator = True
                self.aic_system = "h200_sxm"
-                self.aic_model_name = "QWEN3_32B"
+                self.aic_hf_id = "Qwen/Qwen3-32B"
                self.aic_backend = ""
-                self.aic_backend_version = "0.20.0"
+                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                # Provide minimal model_info to avoid HF queries
@@ -77,11 +77,10 @@ class TestProfileSlaAiconfigurator:
    @pytest.mark.pre_merge
    @pytest.mark.asyncio
-    @pytest.mark.parametrize(
+    @pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"])
-        "missing_arg", ["aic_system", "aic_model_name", "aic_backend_version"]
-    )
    async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg):
        # Check that validation error happens when a required arg is missing.
+        # Note: aic_backend_version is optional - when None, auto-detects latest version
        setattr(trtllm_args, missing_arg, None)
        with pytest.raises(ValueError):
            await run_profile(trtllm_args)
@@ -113,16 +112,23 @@ class TestProfileSlaAiconfigurator:
    @pytest.mark.parametrize(
        "backend, aic_backend_version",
        [
+            ("trtllm", None),
            ("trtllm", "0.20.0"),
            ("trtllm", "1.0.0rc3"),
        ],
    )
-    @pytest.mark.parametrize("model_name", ["QWEN3_32B", "LLAMA3.1_405B"])
+    @pytest.mark.parametrize(
+        "hf_model_id",
+        [
+            "Qwen/Qwen3-32B",
+            "meta-llama/Llama-3.1-405B",
+        ],
+    )
    async def test_trtllm_aiconfigurator_many(
-        self, trtllm_args, model_name, backend, aic_backend_version
+        self, trtllm_args, hf_model_id, backend, aic_backend_version
    ):
        # Test that profile_sla works with a variety of backend versions and HuggingFace model IDs.
-        trtllm_args.aic_model_name = model_name
+        trtllm_args.aic_hf_id = hf_model_id
        trtllm_args.backend = backend
        trtllm_args.aic_backend_version = aic_backend_version
        await run_profile(trtllm_args)
--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
@@ -67,7 +67,7 @@ class TestProfileSLADryRun:
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
@@ -109,7 +109,7 @@ class TestProfileSLADryRun:
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
@@ -164,7 +164,7 @@ class TestProfileSLADryRun:
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
@@ -212,7 +212,7 @@ class TestProfileSLADryRun:
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
@@ -282,7 +282,7 @@ class TestProfileSLADryRun:
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                # Set to 0 to trigger auto-generation path
@@ -345,7 +345,7 @@ class TestProfileSLADryRun:
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 0
@@ -407,7 +407,7 @@ class TestProfileSLADryRun:
                self.dry_run = True
                self.use_ai_configurator = False
                self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 0