chore: better error message for planner sweeping mode (#6844)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

chore: better error message for planner sweeping mode (#6844)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
d688aa68 · Hongkuan Zhou · GitHub · a60cdf59 · d688aa68 · d688aa68
Unverified Commit d688aa68 authored Mar 03, 2026 by Hongkuan Zhou Committed by GitHub Mar 03, 2026
5 changed files
--- a/components/src/dynamo/profiler/profile_sla.py
+++ b/components/src/dynamo/profiler/profile_sla.py
@@ -38,8 +38,8 @@ from dynamo.profiler.utils.dgdr_v1beta1_types import (
    DynamoGraphDeploymentRequestSpec,
 )
 from dynamo.profiler.utils.dgdr_validate import (
-    run_gate_checks,
-    validate_dgdr_for_profiler,
+    valid_dgdr_spec,
+    validate_dgdr_dynamo_features,
 )
 from dynamo.profiler.utils.profile_common import (
    ProfilerOperationalConfig,
@@ -67,6 +67,27 @@ def _check_auto_backend_support(model: str, system: str) -> bool:
    )


+def _needs_interpolation(dgdr: DynamoGraphDeploymentRequestSpec) -> bool:
+    """True when interpolation data will actually be consumed.
+
+    Only throughput-based scaling and the mocker backend use the
+    per-engine performance curves produced by ``run_interpolation``.
+    Load-based scaling does not require them.
+    """
+    if dgdr.features is None:
+        return False
+
+    planner = dgdr.features.planner
+    if planner and planner.enable_throughput_scaling:
+        return True
+
+    mocker = dgdr.features.mocker
+    if mocker and mocker.enabled:
+        return True
+
+    return False
+
+
 def _extract_profiler_params(dgdr: DynamoGraphDeploymentRequestSpec) -> tuple:
    """Pull all profiler parameters from dgdr and log them."""
    model = dgdr.model
@@ -311,9 +332,8 @@ async def run_profile(
    )

    try:
-        # Validate and normalise — after this, required fields are guaranteed non-None
-        validate_dgdr_for_profiler(dgdr)
-
+        # Validate DGDR spec — after this, required fields are guaranteed non-None
+        valid_dgdr_spec(dgdr)
        (
            model,
            backend,
@@ -327,12 +347,12 @@ async def run_profile(
            search_strategy,
            picking_mode,
        ) = _extract_profiler_params(dgdr)
-
        if backend == "auto":
            aic_supported = _check_auto_backend_support(model, system)
        else:
            aic_supported = check_model_hardware_support(model, system, backend)
-        run_gate_checks(dgdr, aic_supported, search_strategy, backend)
+        # Next, validate DGDR features; some checks depend on whether AIC supports this model/system/backend.
+        validate_dgdr_dynamo_features(dgdr, aic_supported)

        (
            pick_result,
@@ -361,9 +381,10 @@ async def run_profile(
        dgd_config = pick_result.get("dgd_config") if not ops.dry_run else None

        # ---------------------------------------------------------------
-        # Interpolation curves
+        # Interpolation curves — only needed when something consumes
+        # the per-engine performance data (throughput scaling or mocker).
        # ---------------------------------------------------------------
-        if not ops.dry_run and is_planner_enabled(dgdr) and dgd_config:
+        if not ops.dry_run and dgd_config and _needs_interpolation(dgdr):
            try:
                model_cfg = get_model_config_from_model_path(resolve_model_path(dgdr))
                sweep_max_context_length = model_cfg.get("max_position_embeddings", 0)

--- a/components/src/dynamo/profiler/rapid.py
+++ b/components/src/dynamo/profiler/rapid.py
@@ -132,6 +132,8 @@ def _run_naive_fallback(
                pvc_mount_path=dgdr.modelCache.pvcMountPath,
                pvc_path=dgdr.modelCache.pvcModelPath or "",
            )
+        else:
+            dgd_config = config_modifier.update_model(dgd_config, model_name=model)

    return {
        "best_config_df": pd.DataFrame(),

--- a/components/src/dynamo/profiler/utils/dgdr_validate.py
+++ b/components/src/dynamo/profiler/utils/dgdr_validate.py
@@ -39,7 +39,7 @@ from dynamo.profiler.utils.profile_common import is_planner_enabled
 logger = logging.getLogger(__name__)


-def validate_dgdr_for_profiler(
+def valid_dgdr_spec(
    dgdr: DynamoGraphDeploymentRequestSpec,
 ) -> DynamoGraphDeploymentRequestSpec:
    """Validate and normalise a DGDR spec for the profiler.
@@ -63,7 +63,7 @@ def validate_dgdr_for_profiler(
    _validate_required_fields(dgdr)
    _validate_workload(dgdr.workload)
    _validate_sla(dgdr.sla)
-    _validate_features(dgdr)
+    _validate_parallelization_sweeping_mode(dgdr)
    return dgdr


@@ -124,53 +124,47 @@ def _validate_sla(sla: SLASpec) -> None:
        )


-def run_gate_checks(
+def _validate_parallelization_sweeping_mode(
    dgdr: DynamoGraphDeploymentRequestSpec,
-    aic_supported: bool,
-    search_strategy: SearchStrategy,
-    backend: str,
 ) -> None:
-    """Raise ValueError or log warnings for unsupported combos.
-
-    Must be called after ``validate_dgdr_for_profiler``.
-    """
-    if is_planner_enabled(dgdr) and not aic_supported:
-        model = dgdr.model
-        system = dgdr.hardware.gpuSku.lower()
-        planner_cfg = dgdr.features.planner
-        if planner_cfg.enable_throughput_scaling:
-            raise ValueError(
-                "Throughput-based planner scaling requires AIC support, but "
-                f"{model} on {system}/{backend} is not supported by AIC. "
-                "Use a supported model/hardware/backend combination or disable throughput scaling."
-            )
-        if (
-            planner_cfg.pre_deployment_sweeping_mode
-            == PlannerPreDeploymentSweepMode.Rapid
-        ):
-            logger.warning(
-                "Planner pre-deployment sweeping mode is 'rapid' but AIC does not support "
-                "%s on %s/%s. Falling back to 'none' (no pre-deployment sweeping).",
-                model,
-                system,
-                backend,
-            )
-            planner_cfg.pre_deployment_sweeping_mode = (
-                PlannerPreDeploymentSweepMode.None_
-            )
-
-    if search_strategy == SearchStrategy.THOROUGH and backend == "auto":
+    # Auto backend selection is not supported for real-GPU sweeping (THOROUGH search strategy).
+    if dgdr.searchStrategy == SearchStrategy.THOROUGH and dgdr.backend == "auto":
        raise ValueError(
            "THOROUGH search strategy does not support 'auto' backend. "
            "Please specify a concrete backend (trtllm, vllm, sglang)."
        )


-def _validate_features(dgdr: DynamoGraphDeploymentRequestSpec) -> None:
+def validate_dgdr_dynamo_features(
+    dgdr: DynamoGraphDeploymentRequestSpec, aic_supported: bool
+) -> None:
    """Cross-field validation for features."""
    if not dgdr.features:
        return

+    # Planner
+    if is_planner_enabled(dgdr):
+        planner_cfg = dgdr.features.planner
+        # Throughput scaling needs engine performance data, which only pre-deployment sweeping generates.
+        if planner_cfg.enable_throughput_scaling:
+            planner_sweep_mode = planner_cfg.pre_deployment_sweeping_mode
+            if (
+                planner_sweep_mode is None
+                or planner_sweep_mode == PlannerPreDeploymentSweepMode.None_
+            ):
+                raise ValueError(
+                    "pre_deployment_sweeping_mode in PlannerConfig cannot be 'none' when enable_throughput_scaling is enabled. "
+                    "Throughput-based scaling requires pre-deployment sweeping to generate engine performance data."
+                )
+            elif (
+                planner_sweep_mode == PlannerPreDeploymentSweepMode.Rapid
+                and not aic_supported
+            ):
+                raise ValueError(
+                    f"AIC does not support {dgdr.model} on {dgdr.hardware.gpuSku.lower()} and {dgdr.backend}. "
+                    "pre_deployment_sweeping_mode in PlannerConfig can only be 'thorough' when AIC does not support the model/hardware/backend combination. "
+                )
+
    # Mocker requires pre-deployment sweeping
    if dgdr.features.mocker and dgdr.features.mocker.enabled and dgdr.features.planner:
        sweep_mode = dgdr.features.planner.pre_deployment_sweeping_mode

--- a/tests/profiler/test_helpers_profile_sla.py
+++ b/tests/profiler/test_helpers_profile_sla.py
@@ -41,7 +41,10 @@ try:
        SLASpec,
        WorkloadSpec,
    )
-    from dynamo.profiler.utils.dgdr_validate import run_gate_checks
+    from dynamo.profiler.utils.dgdr_validate import (
+        valid_dgdr_spec,
+        validate_dgdr_dynamo_features,
+    )
    from dynamo.profiler.utils.profile_common import ProfilerOperationalConfig
 except ImportError as e:
    pytest.skip(f"Skip (missing dependency): {e}", allow_module_level=True)
@@ -159,123 +162,211 @@ class TestExtractProfilerParams:


 # ---------------------------------------------------------------------------
-# run_gate_checks
+# valid_dgdr_spec
 # ---------------------------------------------------------------------------


-class TestRunGateChecks:
+class TestValidDgdrSpec:
    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_thorough_auto_backend_raises(self):
        """THOROUGH + 'auto' backend is rejected."""
-        dgdr = _make_dgdr()
+        dgdr = _make_dgdr(searchStrategy="thorough", backend="auto")
        with pytest.raises(ValueError, match="does not support 'auto' backend"):
-            run_gate_checks(
-                dgdr,
-                aic_supported=True,
-                search_strategy=SearchStrategy.THOROUGH,
-                backend="auto",
-            )
+            valid_dgdr_spec(dgdr)

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_thorough_concrete_backend_passes(self):
        """THOROUGH + concrete backend is fine."""
-        dgdr = _make_dgdr()
-        run_gate_checks(
-            dgdr,
-            aic_supported=True,
-            search_strategy=SearchStrategy.THOROUGH,
-            backend="trtllm",
-        )
+        dgdr = _make_dgdr(searchStrategy="thorough", backend="trtllm")
+        valid_dgdr_spec(dgdr)

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
    def test_rapid_auto_backend_passes(self):
        """RAPID allows 'auto' backend."""
-        dgdr = _make_dgdr()
-        run_gate_checks(
-            dgdr,
-            aic_supported=False,
-            search_strategy=SearchStrategy.RAPID,
-            backend="auto",
+        dgdr = _make_dgdr(backend="auto")
+        valid_dgdr_spec(dgdr)
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_missing_image_raises(self):
+        """image is required."""
+        dgdr = _make_dgdr(image="")
+        with pytest.raises(ValueError, match="image.*required"):
+            valid_dgdr_spec(dgdr)
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_missing_hardware_raises(self):
+        """hardware is required."""
+        dgdr = _make_dgdr(hardware=None)
+        with pytest.raises(ValueError, match="hardware.*required"):
+            valid_dgdr_spec(dgdr)
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_missing_gpu_sku_raises(self):
+        """hardware.gpuSku is required."""
+        dgdr = _make_dgdr(hardware=HardwareSpec(gpuSku="", numGpusPerNode=8))
+        with pytest.raises(ValueError, match="gpuSku.*required"):
+            valid_dgdr_spec(dgdr)
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_zero_gpus_per_node_raises(self):
+        """hardware.numGpusPerNode must be positive."""
+        dgdr = _make_dgdr(hardware=HardwareSpec(gpuSku="h200_sxm", numGpusPerNode=0))
+        with pytest.raises(ValueError, match="numGpusPerNode.*positive"):
+            valid_dgdr_spec(dgdr)
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_none_workload_gets_default(self):
+        """None workload is populated with a default WorkloadSpec."""
+        dgdr = _make_dgdr(workload=None)
+        valid_dgdr_spec(dgdr)
+        assert dgdr.workload is not None
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_none_sla_gets_default(self):
+        """None sla is populated with a default SLASpec."""
+        dgdr = _make_dgdr(sla=None)
+        valid_dgdr_spec(dgdr)
+        assert dgdr.sla is not None
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_both_concurrency_and_rate_raises(self):
+        """concurrency and requestRate are mutually exclusive."""
+        dgdr = _make_dgdr(
+            workload=WorkloadSpec(isl=4000, osl=1000, concurrency=10, requestRate=5.0)
        )
+        with pytest.raises(ValueError, match="concurrency.*requestRate"):
+            valid_dgdr_spec(dgdr)
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_negative_sla_ttft_raises(self):
+        """Negative SLA ttft must be rejected."""
+        dgdr = _make_dgdr(sla=SLASpec(ttft=-1.0, itl=30.0))
+        with pytest.raises(ValueError, match="ttft.*positive"):
+            valid_dgdr_spec(dgdr)
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_e2e_latency_clears_ttft_itl(self):
+        """e2eLatency takes precedence and nulls out ttft/itl."""
+        dgdr = _make_dgdr(sla=SLASpec(ttft=None, itl=None, e2eLatency=35000.0))
+        valid_dgdr_spec(dgdr)
+        assert dgdr.sla.ttft is None
+        assert dgdr.sla.itl is None
+        assert dgdr.sla.e2eLatency == 35000.0
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_missing_ttft_and_itl_and_e2e_raises(self):
+        """At least ttft+itl or e2eLatency must be provided."""
+        dgdr = _make_dgdr(sla=SLASpec(ttft=None, itl=None, e2eLatency=None))
+        with pytest.raises(ValueError, match="ttft.*itl.*e2eLatency"):
+            valid_dgdr_spec(dgdr)

+
+# ---------------------------------------------------------------------------
+# validate_dgdr_dynamo_features
+# ---------------------------------------------------------------------------
+
+
+class TestValidateDgdrDynamoFeatures:
    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
-    def test_no_planner_aic_unsupported_passes(self):
-        """No planner, AIC unsupported — no error."""
+    def test_no_features_passes(self):
+        """No features → no error."""
        dgdr = _make_dgdr()
-        run_gate_checks(
-            dgdr,
-            aic_supported=False,
-            search_strategy=SearchStrategy.RAPID,
-            backend="vllm",
-        )
+        validate_dgdr_dynamo_features(dgdr, aic_supported=False)

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
-    def test_planner_throughput_scaling_aic_unsupported_raises(self):
-        """Throughput-based planner scaling requires AIC support."""
+    def test_planner_throughput_scaling_aic_unsupported_rapid_sweep_raises(self):
+        """Throughput scaling + rapid sweep + AIC unsupported is rejected."""
        dgdr = _make_dgdr(
            features=FeaturesSpec(
                planner=_make_planner(
                    enable_throughput_scaling=True,
+                    pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,
                    backend="vllm",
                )
            )
        )
-        with pytest.raises(
-            ValueError, match="Throughput-based planner scaling requires AIC support"
-        ):
-            run_gate_checks(
-                dgdr,
-                aic_supported=False,
-                search_strategy=SearchStrategy.RAPID,
-                backend="vllm",
-            )
+        with pytest.raises(ValueError, match="AIC does not support"):
+            validate_dgdr_dynamo_features(dgdr, aic_supported=False)

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
-    def test_planner_rapid_sweep_aic_unsupported_mutates_to_none(self):
-        """Rapid pre-deployment sweep falls back to None when AIC is unsupported."""
+    def test_planner_throughput_scaling_aic_supported_passes(self):
+        """Throughput scaling + rapid sweep + AIC supported is fine."""
        planner = _make_planner(
-            enable_throughput_scaling=False,
-            enable_load_scaling=True,
            pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,
-            backend="vllm",
        )
        dgdr = _make_dgdr(features=FeaturesSpec(planner=planner))
-        run_gate_checks(
-            dgdr,
-            aic_supported=False,
-            search_strategy=SearchStrategy.RAPID,
-            backend="vllm",
-        )
+        validate_dgdr_dynamo_features(dgdr, aic_supported=True)
        assert (
            dgdr.features.planner.pre_deployment_sweeping_mode
-            == PlannerPreDeploymentSweepMode.None_
+            == PlannerPreDeploymentSweepMode.Rapid
        )

    @pytest.mark.pre_merge
    @pytest.mark.gpu_0
-    def test_planner_aic_supported_no_mutation(self):
-        """When AIC is supported, planner config is left unchanged."""
+    def test_planner_load_scaling_only_aic_unsupported_passes(self):
+        """Load scaling only (no throughput scaling) + AIC unsupported passes."""
        planner = _make_planner(
+            enable_throughput_scaling=False,
+            enable_load_scaling=True,
            pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,
+            backend="vllm",
        )
        dgdr = _make_dgdr(features=FeaturesSpec(planner=planner))
-        run_gate_checks(
-            dgdr,
-            aic_supported=True,
-            search_strategy=SearchStrategy.RAPID,
-            backend="trtllm",
-        )
+        validate_dgdr_dynamo_features(dgdr, aic_supported=False)
        assert (
            dgdr.features.planner.pre_deployment_sweeping_mode
            == PlannerPreDeploymentSweepMode.Rapid
        )

+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_mocker_enabled_sweep_none_raises(self):
+        """Mocker enabled + sweep mode None_ is rejected."""
+        dgdr = _make_dgdr(
+            features=FeaturesSpec(
+                planner=_make_planner(
+                    enable_throughput_scaling=False,
+                    enable_load_scaling=True,
+                    pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.None_,
+                ),
+                mocker=MockerSpec(enabled=True),
+            )
+        )
+        with pytest.raises(ValueError, match="cannot be 'none'.*mocker"):
+            validate_dgdr_dynamo_features(dgdr, aic_supported=True)
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_mocker_enabled_sweep_rapid_passes(self):
+        """Mocker enabled + sweep mode Rapid is fine."""
+        dgdr = _make_dgdr(
+            features=FeaturesSpec(
+                planner=_make_planner(
+                    enable_throughput_scaling=False,
+                    enable_load_scaling=True,
+                    pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,
+                ),
+                mocker=MockerSpec(enabled=True),
+            )
+        )
+        validate_dgdr_dynamo_features(dgdr, aic_supported=True)
+

 # ---------------------------------------------------------------------------
 # _write_final_output

--- a/tests/profiler/test_profile_sla_dgdr.py
+++ b/tests/profiler/test_profile_sla_dgdr.py
@@ -186,9 +186,7 @@ class TestRapidUnsupported:
            CONFIGS_DIR / "5b_rapid_unsupported_planner_throughput_error.yaml"
        )
        ops = _make_ops(tmp_path)
-        with pytest.raises(
-            ValueError, match="Throughput-based planner scaling requires AIC support"
-        ):
+        with pytest.raises(ValueError, match="AIC does not support"):
            asyncio.run(run_profile(dgdr, ops))