refactor: move core logics of DPP -> AIC and support static profiling (#6285)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com> Signed-off-by: Hannah Zhang <hannahz@nvidia.com> Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>

refactor: move core logics of DPP -> AIC and support static profiling (#6285)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com> Signed-off-by: Hannah Zhang <hannahz@nvidia.com> Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
4c648b11 · Hongkuan Zhou · GitHub · f6d4351f · 4c648b11 · 4c648b11
Unverified Commit 4c648b11 authored Feb 25, 2026 by Hongkuan Zhou Committed by GitHub Feb 26, 2026
7 changed files
--- a/tests/profiler/configs/9a_thorough_dsr1_sglang_overrides.yaml
+++ b/tests/profiler/configs/9a_thorough_dsr1_sglang_overrides.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Case 9a: Thorough sweep for DeepSeek-R1 on SGLang with DGD overrides.
+# Overrides are derived by comparing the base sglang disagg template
+# (examples/backends/sglang/deploy/disagg.yaml) with the DSR1 recipe
+# (recipes/deepseek-r1/sglang/disagg-8gpu/deploy.yaml), excluding
+# parallelization-mapping fields (--tp, --dp, --ep-size,
+# --enable-dp-attention) which are swept by the profiler.
+model: "deepseek-ai/DeepSeek-R1"
+backend: sglang
+image: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.8.0"
+hardware:
+  gpuSku: h200_sxm
+  totalGpus: 16
+  numGpusPerNode: 8
+modelCache:
+  pvcName: model-cache
+  pvcMountPath: /opt/model
+workload:
+  isl: 4000
+  osl: 1000
+sla:
+  ttft: 2000.0
+  itl: 50.0
+searchStrategy: thorough
+# Per-service DGD overrides. decode and prefill carry the same shared-memory
+# size, working directory, --mem-fraction-static and --watchdog-timeout values;
+# they differ only in the load-balancing flag each one passes.
+overrides:
+  dgd:
+    spec:
+      services:
+        decode:
+          sharedMemory:
+            size: 80Gi
+          extraPodSpec:
+            mainContainer:
+              workingDir: /workspace
+              args:
+                - --mem-fraction-static
+                - "0.75"  # quoted so YAML parses it as a string, not a float
+                - --prefill-round-robin-balance
+                - --watchdog-timeout
+                - "3600"  # quoted so YAML parses it as a string, not an int
+        prefill:
+          sharedMemory:
+            size: 80Gi
+          extraPodSpec:
+            mainContainer:
+              workingDir: /workspace
+              args:
+                - --mem-fraction-static
+                - "0.75"  # quoted so YAML parses it as a string, not a float
+                - --load-balance-method
+                - round_robin
+                - --watchdog-timeout
+                - "3600"  # quoted so YAML parses it as a string, not an int
--- a/tests/profiler/test_helpers_profile_sla.py
+++ b/tests/profiler/test_helpers_profile_sla.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for profile_sla.py private helper functions.
+
+These tests exercise each helper in isolation, without running the full
+profiling pipeline.  External I/O (DGD generation, deployment) is mocked
+where needed.
+"""
+
+import os
+import sys
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+import yaml
+
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+try:
+    from dynamo.planner.utils.planner_config import (
+        PlannerConfig,
+        PlannerPreDeploymentSweepMode,
+    )
+    from dynamo.profiler.profile_sla import (
+        _assemble_final_config,
+        _extract_profiler_params,
+        _write_final_output,
+    )
+    from dynamo.profiler.utils.config_modifiers.parallelization_mapping import (
+        PickedParallelConfig,
+    )
+    from dynamo.profiler.utils.defaults import SearchStrategy
+    from dynamo.profiler.utils.dgdr_v1beta1_types import (
+        DynamoGraphDeploymentRequestSpec,
+        FeaturesSpec,
+        HardwareSpec,
+        MockerSpec,
+        SLASpec,
+        WorkloadSpec,
+    )
+    from dynamo.profiler.utils.dgdr_validate import run_gate_checks
+    from dynamo.profiler.utils.profile_common import ProfilerOperationalConfig
+except ImportError as e:
+    pytest.skip(f"Skip (missing dependency): {e}", allow_module_level=True)
+
+
+# ---------------------------------------------------------------------------
+# Shared fixtures
+# ---------------------------------------------------------------------------
+
+
+def _make_dgdr(**overrides) -> DynamoGraphDeploymentRequestSpec:
+    """Build a minimal dgdr with all required fields set."""
+    base = dict(
+        model="Qwen/Qwen3-32B",
+        backend="trtllm",
+        image="nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest",
+        hardware=HardwareSpec(gpuSku="h200_sxm", totalGpus=8, numGpusPerNode=8),
+        workload=WorkloadSpec(isl=4000, osl=1000),
+        sla=SLASpec(ttft=2000.0, itl=50.0),
+    )
+    base.update(overrides)
+    return DynamoGraphDeploymentRequestSpec(**base)
+
+
+def _make_planner(**overrides) -> PlannerConfig:
+    base = dict(
+        enable_throughput_scaling=True,
+        enable_load_scaling=False,
+        pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,
+        mode="disagg",
+        backend="trtllm",
+    )
+    base.update(overrides)
+    return PlannerConfig(**base)
+
+
+def _make_ops(tmp_path, **kwargs) -> ProfilerOperationalConfig:
+    return ProfilerOperationalConfig(
+        output_dir=str(tmp_path / "out"),
+        **kwargs,
+    )
+
+
+# ---------------------------------------------------------------------------
+# _extract_profiler_params
+# ---------------------------------------------------------------------------
+
+
+class TestExtractProfilerParams:
+    """Checks on the 11-element tuple returned by _extract_profiler_params.
+
+    Every test unpacks the tuple positionally, so the tests also pin the
+    tuple's length and element order.
+    """
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_basic_ttft_itl(self):
+        """Returns correct values when ttft/itl SLA is used."""
+        dgdr = _make_dgdr()
+        (
+            model,
+            backend,
+            system,
+            total_gpus,
+            isl,
+            osl,
+            req_lat,
+            ttft,
+            tpot,
+            strategy,
+            picking,
+        ) = _extract_profiler_params(dgdr)
+
+        assert model == "Qwen/Qwen3-32B"
+        assert backend == "trtllm"
+        assert system == "h200_sxm"
+        assert total_gpus == 8
+        assert isl == 4000
+        assert osl == 1000
+        assert req_lat is None
+        assert ttft == 2000.0
+        assert tpot == 50.0
+        assert strategy == SearchStrategy.RAPID
+        assert picking == "default"
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_e2e_latency_sets_both_targets(self):
+        """Both ttft and tpot equal e2eLatency when it is set."""
+        dgdr = _make_dgdr(sla=SLASpec(ttft=None, itl=None, e2eLatency=35000.0))
+        _, _, _, _, _, _, req_lat, ttft, tpot, _, _ = _extract_profiler_params(dgdr)
+        assert req_lat == 35000.0
+        assert ttft == 35000.0
+        assert tpot == 35000.0
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_request_rate_yields_load_match_picking(self):
+        """requestRate present in workload → picking_mode == 'load_match'."""
+        dgdr = _make_dgdr(workload=WorkloadSpec(isl=4000, osl=1000, requestRate=5.0))
+        _, _, _, _, _, _, _, _, _, _, picking = _extract_profiler_params(dgdr)
+        assert picking == "load_match"
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_backend_lowercased(self):
+        """backend value is always lower-cased."""
+        # NOTE(review): the input below is already lower-case, so these
+        # assertions cannot detect a missing .lower(); a mixed-case input would
+        # be needed if the spec accepts one — confirm against the backend
+        # field's type.
+        dgdr = _make_dgdr(backend="trtllm")
+        _, backend, _, _, _, _, _, _, _, _, _ = _extract_profiler_params(dgdr)
+        assert backend == "trtllm"
+        assert backend == backend.lower()
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_thorough_strategy_extracted(self):
+        """searchStrategy: thorough is correctly reflected in the returned tuple."""
+        dgdr = _make_dgdr(searchStrategy="thorough")
+        _, _, _, _, _, _, _, _, _, strategy, _ = _extract_profiler_params(dgdr)
+        assert strategy == SearchStrategy.THOROUGH
+
+
+# ---------------------------------------------------------------------------
+# run_gate_checks
+# ---------------------------------------------------------------------------
+
+
+class TestRunGateChecks:
+    """Checks on run_gate_checks: which inputs raise, pass, or mutate the planner.
+
+    Tests without an assertion or pytest.raises block pass simply because the
+    call returns without raising.
+    """
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_thorough_auto_backend_raises(self):
+        """THOROUGH + 'auto' backend is rejected."""
+        dgdr = _make_dgdr()
+        with pytest.raises(ValueError, match="does not support 'auto' backend"):
+            run_gate_checks(
+                dgdr,
+                aic_supported=True,
+                search_strategy=SearchStrategy.THOROUGH,
+                backend="auto",
+            )
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_thorough_concrete_backend_passes(self):
+        """THOROUGH + concrete backend is fine."""
+        dgdr = _make_dgdr()
+        # No assertion: success means no exception was raised.
+        run_gate_checks(
+            dgdr,
+            aic_supported=True,
+            search_strategy=SearchStrategy.THOROUGH,
+            backend="trtllm",
+        )
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_rapid_auto_backend_passes(self):
+        """RAPID allows 'auto' backend."""
+        dgdr = _make_dgdr()
+        # No assertion: success means no exception was raised.
+        run_gate_checks(
+            dgdr,
+            aic_supported=False,
+            search_strategy=SearchStrategy.RAPID,
+            backend="auto",
+        )
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_no_planner_aic_unsupported_passes(self):
+        """No planner, AIC unsupported — no error."""
+        dgdr = _make_dgdr()
+        # No assertion: success means no exception was raised.
+        run_gate_checks(
+            dgdr,
+            aic_supported=False,
+            search_strategy=SearchStrategy.RAPID,
+            backend="vllm",
+        )
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_planner_throughput_scaling_aic_unsupported_raises(self):
+        """Throughput-based planner scaling requires AIC support."""
+        dgdr = _make_dgdr(
+            features=FeaturesSpec(
+                planner=_make_planner(
+                    enable_throughput_scaling=True,
+                    backend="vllm",
+                )
+            )
+        )
+        with pytest.raises(
+            ValueError, match="Throughput-based planner scaling requires AIC support"
+        ):
+            run_gate_checks(
+                dgdr,
+                aic_supported=False,
+                search_strategy=SearchStrategy.RAPID,
+                backend="vllm",
+            )
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_planner_rapid_sweep_aic_unsupported_mutates_to_none(self):
+        """Rapid pre-deployment sweep falls back to None when AIC is unsupported."""
+        planner = _make_planner(
+            enable_throughput_scaling=False,
+            enable_load_scaling=True,
+            pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,
+            backend="vllm",
+        )
+        dgdr = _make_dgdr(features=FeaturesSpec(planner=planner))
+        run_gate_checks(
+            dgdr,
+            aic_supported=False,
+            search_strategy=SearchStrategy.RAPID,
+            backend="vllm",
+        )
+        # The change is observed on dgdr itself, i.e. the planner config is
+        # expected to be updated in place rather than returned.
+        assert (
+            dgdr.features.planner.pre_deployment_sweeping_mode
+            == PlannerPreDeploymentSweepMode.None_
+        )
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_planner_aic_supported_no_mutation(self):
+        """When AIC is supported, planner config is left unchanged."""
+        planner = _make_planner(
+            pre_deployment_sweeping_mode=PlannerPreDeploymentSweepMode.Rapid,
+        )
+        dgdr = _make_dgdr(features=FeaturesSpec(planner=planner))
+        run_gate_checks(
+            dgdr,
+            aic_supported=True,
+            search_strategy=SearchStrategy.RAPID,
+            backend="trtllm",
+        )
+        assert (
+            dgdr.features.planner.pre_deployment_sweeping_mode
+            == PlannerPreDeploymentSweepMode.Rapid
+        )
+
+
+# ---------------------------------------------------------------------------
+# _write_final_output
+# ---------------------------------------------------------------------------
+
+
+class TestWriteFinalOutput:
+    """Checks on _write_final_output's return value and the final_config.yaml it writes.
+
+    Each test creates the output directory itself before calling the helper.
+    """
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_normal_config_writes_file_and_returns_true(self, tmp_path):
+        """A dict config yields True and a final_config.yaml that loads back equal."""
+        ops = _make_ops(tmp_path)
+        os.makedirs(ops.output_dir, exist_ok=True)
+        final_config = {"apiVersion": "v1", "kind": "Deployment"}
+
+        result = _write_final_output(ops, final_config)
+
+        assert result is True
+        out = Path(ops.output_dir) / "final_config.yaml"
+        assert out.exists()
+        assert yaml.safe_load(out.read_text()) == final_config
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_list_config_writes_multi_doc_yaml(self, tmp_path):
+        """A list of two configs yields True and a file holding two YAML documents."""
+        ops = _make_ops(tmp_path)
+        os.makedirs(ops.output_dir, exist_ok=True)
+        final_config = [{"kind": "A"}, {"kind": "B"}]
+
+        result = _write_final_output(ops, final_config)
+
+        assert result is True
+        out = Path(ops.output_dir) / "final_config.yaml"
+        # safe_load_all yields one object per YAML document in the file.
+        docs = list(yaml.safe_load_all(out.read_text()))
+        assert len(docs) == 2
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_none_config_not_dry_run_returns_false(self, tmp_path):
+        """With dry_run=False, a None config makes the helper return False."""
+        ops = _make_ops(tmp_path, dry_run=False)
+        os.makedirs(ops.output_dir, exist_ok=True)
+
+        result = _write_final_output(ops, None)
+
+        assert result is False
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_none_config_dry_run_writes_empty_yaml_and_returns_true(self, tmp_path):
+        """With dry_run=True, a None config returns True and writes a file that loads as None."""
+        ops = _make_ops(tmp_path, dry_run=True)
+        os.makedirs(ops.output_dir, exist_ok=True)
+
+        result = _write_final_output(ops, None)
+
+        assert result is True
+        out = Path(ops.output_dir) / "final_config.yaml"
+        assert out.exists()
+        assert yaml.safe_load(out.read_text()) is None  # empty YAML == None
+
+
+# ---------------------------------------------------------------------------
+# _assemble_final_config
+# ---------------------------------------------------------------------------
+
+
+class TestAssembleFinalConfig:
+    """Checks on which config object _assemble_final_config returns.
+
+    Results are compared by identity (``is``), so the helper must hand back
+    the exact object rather than a copy. Tests that enable the planner patch
+    generate_dgd_config_with_planner in dynamo.profiler.profile_sla so no real
+    DGD generation runs.
+    """
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_no_planner_no_mocker_returns_dgd_config_unchanged(self, tmp_path):
+        """Without planner or mocker features, the same dgd_config object comes back."""
+        dgdr = _make_dgdr()
+        ops = _make_ops(tmp_path)
+        dgd_config = {"kind": "DynamoGraphDeployment"}
+
+        result = _assemble_final_config(
+            dgdr,
+            ops,
+            dgd_config,
+            PickedParallelConfig(tp=1),
+            PickedParallelConfig(tp=1),
+        )
+
+        assert result is dgd_config
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_none_dgd_config_passes_through_as_none(self, tmp_path):
+        """A None dgd_config is returned as None."""
+        dgdr = _make_dgdr()
+        ops = _make_ops(tmp_path)
+
+        result = _assemble_final_config(
+            dgdr,
+            ops,
+            None,
+            PickedParallelConfig(tp=1),
+            PickedParallelConfig(tp=1),
+        )
+
+        assert result is None
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_planner_no_mocker_returns_real_config(self, tmp_path):
+        """With a planner but no mocker, the first element of the patched result is returned."""
+        dgdr = _make_dgdr(features=FeaturesSpec(planner=_make_planner()))
+        ops = _make_ops(tmp_path)
+        os.makedirs(ops.output_dir, exist_ok=True)
+        dgd_config = {"kind": "DGD"}
+        real_cfg = {"kind": "real"}
+        mocker_cfg = {"kind": "mocker"}
+
+        # The patched generator returns a (real, mocker) pair.
+        with patch(
+            "dynamo.profiler.profile_sla.generate_dgd_config_with_planner",
+            return_value=(real_cfg, mocker_cfg),
+        ):
+            result = _assemble_final_config(
+                dgdr,
+                ops,
+                dgd_config,
+                PickedParallelConfig(tp=1),
+                PickedParallelConfig(tp=1),
+            )
+
+        assert result is real_cfg
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_mocker_enabled_returns_mocker_config(self, tmp_path):
+        """With the mocker enabled, the second element of the patched result is returned."""
+        dgdr = _make_dgdr(
+            features=FeaturesSpec(
+                planner=_make_planner(),
+                mocker=MockerSpec(enabled=True),
+            )
+        )
+        ops = _make_ops(tmp_path)
+        os.makedirs(ops.output_dir, exist_ok=True)
+        dgd_config = {"kind": "DGD"}
+        real_cfg = {"kind": "real"}
+        mocker_cfg = {"kind": "mocker"}
+
+        # The patched generator returns a (real, mocker) pair.
+        with patch(
+            "dynamo.profiler.profile_sla.generate_dgd_config_with_planner",
+            return_value=(real_cfg, mocker_cfg),
+        ):
+            result = _assemble_final_config(
+                dgdr,
+                ops,
+                dgd_config,
+                PickedParallelConfig(tp=1),
+                PickedParallelConfig(tp=1),
+            )
+
+        assert result is mocker_cfg
--- a/tests/profiler/test_helpers_rapid.py
+++ b/tests/profiler/test_helpers_rapid.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for rapid.py private helper functions.
+
+Tests _run_naive_fallback and _run_default_sim in isolation; AIC simulation
+helpers (_run_autoscale_sim) require the full AIC stack and are covered by
+the end-to-end test suite.
+"""
+
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pandas as pd
+import pytest
+
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+try:
+    from dynamo.profiler.rapid import _run_default_sim, _run_naive_fallback
+    from dynamo.profiler.utils.dgdr_v1beta1_types import (
+        DynamoGraphDeploymentRequestSpec,
+        HardwareSpec,
+        ModelCacheSpec,
+        SLASpec,
+        WorkloadSpec,
+    )
+except ImportError as e:
+    pytest.skip(f"Skip (missing dependency): {e}", allow_module_level=True)
+
+
+# ---------------------------------------------------------------------------
+# Shared fixtures
+# ---------------------------------------------------------------------------
+
+
+def _make_dgdr(**overrides) -> DynamoGraphDeploymentRequestSpec:
+    base = dict(
+        model="Qwen/Qwen3-32B",
+        backend="vllm",
+        image="nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest",
+        hardware=HardwareSpec(gpuSku="l40s", totalGpus=4, numGpusPerNode=4),
+        workload=WorkloadSpec(isl=4000, osl=1000),
+        sla=SLASpec(ttft=2000.0, itl=50.0),
+    )
+    base.update(overrides)
+    return DynamoGraphDeploymentRequestSpec(**base)
+
+
+def _fake_modifier(update_image_return=None):
+    m = MagicMock()
+    m.update_image.return_value = update_image_return or {"kind": "DGD"}
+    m.update_model_from_pvc.return_value = {"kind": "DGD"}
+    return m
+
+
+# ---------------------------------------------------------------------------
+# _run_naive_fallback
+# ---------------------------------------------------------------------------
+
+
+class TestRunNaiveFallback:
+    """Checks on _run_naive_fallback with generate_naive_config patched out.
+
+    The PVC tests additionally replace CONFIG_MODIFIERS in dynamo.profiler.rapid
+    with a dict holding a single MagicMock modifier for "vllm".
+    """
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_returns_expected_structure(self):
+        """Result always has the four required keys with zeroed latencies."""
+        dgdr = _make_dgdr()
+        with patch(
+            "dynamo.profiler.rapid.generate_naive_config",
+            return_value={"artifacts": {}},
+        ):
+            result = _run_naive_fallback(dgdr, "Qwen/Qwen3-32B", 4, "l40s", "vllm")
+
+        # Superset check: extra keys in the result are tolerated.
+        assert set(result) >= {
+            "best_config_df",
+            "best_latencies",
+            "dgd_config",
+            "chosen_exp",
+        }
+        assert result["best_latencies"] == {
+            "ttft": 0.0,
+            "tpot": 0.0,
+            "request_latency": 0.0,
+        }
+        assert result["chosen_exp"] is None
+        assert isinstance(result["best_config_df"], pd.DataFrame)
+        assert result["best_config_df"].empty
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_empty_artifacts_yields_none_dgd_config(self):
+        """No k8s_deploy.yaml in artifacts → dgd_config is None."""
+        dgdr = _make_dgdr()
+        with patch(
+            "dynamo.profiler.rapid.generate_naive_config",
+            return_value={"artifacts": {}},
+        ):
+            result = _run_naive_fallback(dgdr, "Qwen/Qwen3-32B", 4, "l40s", "vllm")
+        assert result["dgd_config"] is None
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_with_pvc_calls_update_model_from_pvc(self):
+        """When modelCache.pvcName is set, update_model_from_pvc is called."""
+        dgdr = _make_dgdr(
+            modelCache=ModelCacheSpec(
+                pvcName="model-cache",
+                pvcModelPath="/model/qwen",
+                pvcMountPath="/opt/model-cache",
+            )
+        )
+        fake_modifier = _fake_modifier()
+        with (
+            patch(
+                "dynamo.profiler.rapid.generate_naive_config",
+                return_value={"artifacts": {"k8s_deploy.yaml": "kind: DGD"}},
+            ),
+            patch("dynamo.profiler.rapid.CONFIG_MODIFIERS", {"vllm": fake_modifier}),
+        ):
+            _run_naive_fallback(dgdr, "Qwen/Qwen3-32B", 4, "l40s", "vllm")
+
+        fake_modifier.update_model_from_pvc.assert_called_once()
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_without_pvc_skips_update_model_from_pvc(self):
+        """When no modelCache, update_model_from_pvc is not called."""
+        dgdr = _make_dgdr()
+        fake_modifier = _fake_modifier()
+        with (
+            patch(
+                "dynamo.profiler.rapid.generate_naive_config",
+                return_value={"artifacts": {"k8s_deploy.yaml": "kind: DGD"}},
+            ),
+            patch("dynamo.profiler.rapid.CONFIG_MODIFIERS", {"vllm": fake_modifier}),
+        ):
+            _run_naive_fallback(dgdr, "Qwen/Qwen3-32B", 4, "l40s", "vllm")
+
+        fake_modifier.update_model_from_pvc.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# _run_default_sim
+# ---------------------------------------------------------------------------
+
+
+class TestRunDefaultSim:
+    """Checks on _run_default_sim with its three collaborators patched.
+
+    build_default_task_configs, _execute_task_configs and
+    _generate_dgd_from_pick are all replaced in dynamo.profiler.rapid, so no
+    simulation runs.
+
+    NOTE(review): the positional arguments passed to _run_default_sim
+    presumably mean (dgdr, model, system, backend, total_gpus, isl, osl, ttft,
+    tpot, request_latency, picking_mode) — confirm against its signature. They
+    use "h200_sxm"/"trtllm"/8 while the dgdr fixture in this module is built
+    with "l40s"/"vllm"/4.
+    """
+
+    def _execute_return(self, chosen="disagg", ttft=100.0, tpot=10.0):
+        """Build a fake _execute_task_configs return value."""
+        best_df = pd.DataFrame([{"tp(p)": 1}])
+        latencies = {"ttft": ttft, "tpot": tpot, "request_latency": 0.0}
+        # 5-tuple; the third and fourth slots are left as None by these tests.
+        return chosen, {chosen: best_df}, None, None, {chosen: latencies}
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_returns_required_keys(self):
+        """Result carries the five expected keys and the chosen experiment name."""
+        dgdr = _make_dgdr()
+        with (
+            patch("dynamo.profiler.rapid.build_default_task_configs", return_value={}),
+            patch(
+                "dynamo.profiler.rapid._execute_task_configs",
+                return_value=self._execute_return(),
+            ),
+            patch(
+                "dynamo.profiler.rapid._generate_dgd_from_pick",
+                return_value={"kind": "DGD"},
+            ),
+        ):
+            result = _run_default_sim(
+                dgdr,
+                "Qwen/Qwen3-32B",
+                "h200_sxm",
+                "trtllm",
+                8,
+                4000,
+                1000,
+                2000.0,
+                50.0,
+                None,
+                "default",
+            )
+
+        # Superset check: extra keys in the result are tolerated.
+        assert set(result) >= {
+            "best_config_df",
+            "best_latencies",
+            "dgd_config",
+            "chosen_exp",
+            "task_configs",
+        }
+        assert result["chosen_exp"] == "disagg"
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_load_match_passes_load_kwargs(self):
+        """load_match picking mode forwards rate/concurrency/max_gpus to execute."""
+        dgdr = _make_dgdr(workload=WorkloadSpec(isl=4000, osl=1000, requestRate=5.0))
+        captured: dict = {}
+
+        # Records the keyword arguments _execute_task_configs is called with.
+        def fake_execute(task_configs, mode, top_n, **kwargs):
+            captured.update(kwargs)
+            return self._execute_return()
+
+        with (
+            patch("dynamo.profiler.rapid.build_default_task_configs", return_value={}),
+            patch(
+                "dynamo.profiler.rapid._execute_task_configs", side_effect=fake_execute
+            ),
+            patch("dynamo.profiler.rapid._generate_dgd_from_pick", return_value=None),
+        ):
+            _run_default_sim(
+                dgdr,
+                "Qwen/Qwen3-32B",
+                "h200_sxm",
+                "trtllm",
+                8,
+                4000,
+                1000,
+                2000.0,
+                50.0,
+                None,
+                "load_match",
+            )
+
+        assert "target_request_rate" in captured
+        assert captured["target_request_rate"] == 5.0
+        assert captured["max_total_gpus"] == 8
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_default_mode_passes_no_load_kwargs(self):
+        """default picking mode does not forward load-match kwargs."""
+        dgdr = _make_dgdr()
+        captured: dict = {}
+
+        # Records the keyword arguments _execute_task_configs is called with.
+        def fake_execute(task_configs, mode, top_n, **kwargs):
+            captured.update(kwargs)
+            return self._execute_return()
+
+        with (
+            patch("dynamo.profiler.rapid.build_default_task_configs", return_value={}),
+            patch(
+                "dynamo.profiler.rapid._execute_task_configs", side_effect=fake_execute
+            ),
+            patch("dynamo.profiler.rapid._generate_dgd_from_pick", return_value=None),
+        ):
+            _run_default_sim(
+                dgdr,
+                "Qwen/Qwen3-32B",
+                "h200_sxm",
+                "trtllm",
+                8,
+                4000,
+                1000,
+                2000.0,
+                50.0,
+                None,
+                "default",
+            )
+
+        assert "target_request_rate" not in captured
+        assert "max_total_gpus" not in captured
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_latencies_extracted_from_chosen_exp(self):
+        """best_latencies come from the chosen experiment's entry."""
+        dgdr = _make_dgdr()
+        with (
+            patch("dynamo.profiler.rapid.build_default_task_configs", return_value={}),
+            patch(
+                "dynamo.profiler.rapid._execute_task_configs",
+                return_value=self._execute_return(ttft=123.0, tpot=7.0),
+            ),
+            patch("dynamo.profiler.rapid._generate_dgd_from_pick", return_value=None),
+        ):
+            result = _run_default_sim(
+                dgdr,
+                "Qwen/Qwen3-32B",
+                "h200_sxm",
+                "trtllm",
+                8,
+                4000,
+                1000,
+                2000.0,
+                50.0,
+                None,
+                "default",
+            )
+
+        assert result["best_latencies"]["ttft"] == 123.0
+        assert result["best_latencies"]["tpot"] == 7.0
--- a/tests/profiler/test_helpers_thorough.py
+++ b/tests/profiler/test_helpers_thorough.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for thorough.py's _pick_thorough_best_config helper.
+
+Benchmarking helpers (_benchmark_prefill_candidates, _benchmark_decode_candidates)
+require live K8s deployments and are covered by the mocked end-to-end tests
+in test_profile_sla_dgdr.py.
+"""
+
+import sys
+from pathlib import Path
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+try:
+    from dynamo.profiler.thorough import _pick_thorough_best_config
+    from dynamo.profiler.utils.aic_dataframe import build_decode_row, build_prefill_row
+    from dynamo.profiler.utils.dgdr_v1beta1_types import (
+        DynamoGraphDeploymentRequestSpec,
+        HardwareSpec,
+        SLASpec,
+        WorkloadSpec,
+    )
+except ImportError as e:
+    pytest.skip(f"Skip (missing dependency): {e}", allow_module_level=True)
+
+
+# ---------------------------------------------------------------------------
+# Shared fixtures
+# ---------------------------------------------------------------------------
+
+
+def _make_dgdr(**overrides) -> DynamoGraphDeploymentRequestSpec:
+    base = dict(
+        model="Qwen/Qwen3-32B",
+        backend="trtllm",
+        image="nvcr.io/nvidia/ai-dynamo/dynamo-frontend:latest",
+        hardware=HardwareSpec(gpuSku="h200_sxm", totalGpus=8, numGpusPerNode=8),
+        workload=WorkloadSpec(isl=4000, osl=1000),
+        sla=SLASpec(ttft=2000.0, itl=50.0),
+    )
+    base.update(overrides)
+    return DynamoGraphDeploymentRequestSpec(**base)
+
+
+def _stub_dfs():
+    """Minimal prefill/decode DataFrames that satisfy pick function inputs.
+
+    Uses build_prefill_row / build_decode_row so the DataFrames contain all
+    columns expected by _build_disagg_summary_dict (called via
+    build_disagg_df_from_static in load_match / default paths).
+    """
+    prefill_row = build_prefill_row(
+        model="Qwen/Qwen3-32B",
+        isl=4000,
+        osl=1000,
+        ttft=50.0,
+        tp=1,
+        pp=1,
+        dp=1,
+        moe_tp=1,
+        moe_ep=1,
+        backend="trtllm",
+        system="h200_sxm",
+    )
+    decode_row = build_decode_row(
+        tpot=10.0,
+        thpt_per_gpu=100.0,
+        num_request=1,
+        num_gpus=1,
+        osl=1000,
+        tp=1,
+        pp=1,
+        dp=1,
+        moe_tp=1,
+        moe_ep=1,
+        backend="trtllm",
+        system="h200_sxm",
+    )
+    prefill_df = pd.DataFrame([prefill_row])
+    decode_df = pd.DataFrame([decode_row])
+    return prefill_df, decode_df
+
+
+def _mock_result():
+    return {
+        "best_config_df": pd.DataFrame(),
+        "best_latencies": {"ttft": 0.0, "tpot": 0.0, "request_latency": 0.0},
+    }
+
+
+# ---------------------------------------------------------------------------
+# _pick_thorough_best_config
+# ---------------------------------------------------------------------------
+
+
+class TestPickThoroughBestConfig:
+    """Verify how ``_pick_thorough_best_config`` dispatches to the pickers.
+
+    Every test patches one picker in ``dynamo.profiler.thorough`` with a mock
+    and inspects the call it receives, so the patched picker never runs.
+    """
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_autoscale_calls_pick_autoscale(self):
+        """autoscale mode delegates to pick_autoscale with ttft/tpot targets."""
+        prefill_df, decode_df = _stub_dfs()
+        dgdr = _make_dgdr()
+        mock_result = _mock_result()
+
+        # NOTE(review): the positional args after the mode look like ttft target,
+        # tpot target, request latency, GPU budget and the DGDR spec — inferred
+        # from the assertions in this class; confirm against the real signature.
+        with patch(
+            "dynamo.profiler.thorough.pick_autoscale", return_value=mock_result
+        ) as mock_pick:
+            result = _pick_thorough_best_config(
+                prefill_df,
+                decode_df,
+                "autoscale",
+                2000.0,
+                50.0,
+                None,
+                8,
+                dgdr,
+            )
+
+        mock_pick.assert_called_once_with(prefill_df, decode_df, 2000.0, 50.0)
+        # The picker's return value must be handed back as the very same object.
+        assert result is mock_result
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_load_match_uses_request_latency_when_set(self):
+        """load_match passes target_request_latency when request_latency is provided."""
+        prefill_df, decode_df = _stub_dfs()
+        # requestRate=5.0 is expected to surface as target_request_rate below.
+        dgdr = _make_dgdr(workload=WorkloadSpec(isl=4000, osl=1000, requestRate=5.0))
+
+        with patch(
+            "dynamo.profiler.thorough.pick_load_match", return_value=_mock_result()
+        ) as mock_pick:
+            _pick_thorough_best_config(
+                prefill_df,
+                decode_df,
+                "load_match",
+                2000.0,
+                50.0,
+                35000.0,
+                8,
+                dgdr,
+            )
+
+        kwargs = mock_pick.call_args.kwargs
+        assert kwargs["target_request_latency"] == 35000.0
+        # A request-latency target replaces the tpot target rather than joining it.
+        assert "target_tpot" not in kwargs
+        assert kwargs["target_request_rate"] == 5.0
+        assert kwargs["max_total_gpus"] == 8
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_load_match_falls_back_to_target_tpot(self):
+        """load_match passes target_tpot when no request_latency."""
+        prefill_df, decode_df = _stub_dfs()
+        dgdr = _make_dgdr()
+
+        with patch(
+            "dynamo.profiler.thorough.pick_load_match", return_value=_mock_result()
+        ) as mock_pick:
+            _pick_thorough_best_config(
+                prefill_df,
+                decode_df,
+                "load_match",
+                2000.0,
+                50.0,
+                None,
+                8,
+                dgdr,
+            )
+
+        kwargs = mock_pick.call_args.kwargs
+        assert kwargs["target_tpot"] == 50.0
+        assert "target_request_latency" not in kwargs
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_default_uses_request_latency_when_set(self):
+        """default mode passes target_request_latency when provided."""
+        prefill_df, decode_df = _stub_dfs()
+        dgdr = _make_dgdr()
+
+        with patch(
+            "dynamo.profiler.thorough.pick_default", return_value=_mock_result()
+        ) as mock_pick:
+            _pick_thorough_best_config(
+                prefill_df,
+                decode_df,
+                "default",
+                2000.0,
+                50.0,
+                35000.0,
+                8,
+                dgdr,
+            )
+
+        kwargs = mock_pick.call_args.kwargs
+        assert kwargs["target_request_latency"] == 35000.0
+        assert kwargs["total_gpus"] == 8
+        assert kwargs["serving_mode"] == "disagg"
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_default_falls_back_to_target_tpot(self):
+        """default mode passes target_tpot when no request_latency."""
+        prefill_df, decode_df = _stub_dfs()
+        dgdr = _make_dgdr()
+
+        with patch(
+            "dynamo.profiler.thorough.pick_default", return_value=_mock_result()
+        ) as mock_pick:
+            _pick_thorough_best_config(
+                prefill_df,
+                decode_df,
+                "default",
+                2000.0,
+                50.0,
+                None,
+                8,
+                dgdr,
+            )
+
+        kwargs = mock_pick.call_args.kwargs
+        assert kwargs["target_tpot"] == 50.0
+        assert "target_request_latency" not in kwargs
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_load_match_omits_workload_kwargs_when_no_workload(self):
+        """When dgdr.workload has no rate/concurrency, those kwargs are absent."""
+        prefill_df, decode_df = _stub_dfs()
+        dgdr = _make_dgdr()  # no requestRate or concurrency
+
+        # The seventh positional argument is 0 here (8 in the other tests); the
+        # test expects max_total_gpus to be left out of the picker call.
+        with patch(
+            "dynamo.profiler.thorough.pick_load_match", return_value=_mock_result()
+        ) as mock_pick:
+            _pick_thorough_best_config(
+                prefill_df,
+                decode_df,
+                "load_match",
+                2000.0,
+                50.0,
+                None,
+                0,
+                dgdr,
+            )
+
+        kwargs = mock_pick.call_args.kwargs
+        assert "target_request_rate" not in kwargs
+        assert "max_total_gpus" not in kwargs
--- a/tests/profiler/test_profile_sla_aiconfigurator.py
+++ b/tests/profiler/test_profile_sla_aiconfigurator.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-Test suite for profile_sla aiconfigurator functionality.
-
-profile_sla should be able to use aiconfigurator functionality
-even without access to any GPU system.
-"""
-
-import sys
-from pathlib import Path
-
-import pytest
-
-# Add the project root to sys.path to enable imports
-project_root = Path(__file__).parent.parent.parent
-sys.path.insert(0, str(project_root))
-
-try:
-    from dynamo.profiler.profile_sla import run_profile  # noqa: E402
-    from dynamo.profiler.utils.defaults import SearchStrategy  # noqa: E402
-    from dynamo.profiler.utils.model_info import ModelInfo  # noqa: E402
-except ImportError as _e:
-    pytest.skip(f"Skip testing (refactor in progress): {_e}", allow_module_level=True)
-
-pytestmark = [
-    pytest.mark.aiconfigurator,
-]
-
-
-# Override the logger fixture from conftest.py to prevent directory creation
-@pytest.fixture(autouse=True)
-def logger(request):
-    """Override the logger fixture to prevent test directory creation.
-
-    This replaces the logger fixture from tests/conftest.py that creates
-    directories named after each test.
-    """
-    # Simply do nothing - no directories created, no file handlers added
-    yield
-
-
-class TestProfileSlaAiconfigurator:
-    """Test class for profile_sla aiconfigurator functionality."""
-
-    @pytest.fixture
-    def llm_args(self, request):
-        class Args:
-            def __init__(self):
-                self.model = "Qwen/Qwen3-32B"  # Set to match aic_hf_id for consistency
-                self.dgd_image = ""
-                self.backend = "trtllm"
-                self.config = "examples/backends/trtllm/deploy/disagg.yaml"
-                # Use unique output directory per test for parallel execution
-                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
-                self.namespace = f"test-namespace-{request.node.name}"
-                self.min_num_gpus_per_engine = 1
-                self.max_num_gpus_per_engine = 8
-                self.skip_existing_results = False
-                self.force_rerun = False
-                self.isl = 3000
-                self.osl = 500
-                self.ttft = 50
-                self.itl = 10
-                self.prefill_interpolation_granularity = 16
-                self.decode_interpolation_granularity = 6
-                self.service_name = ""
-                self.dry_run = False
-                self.num_gpus_per_node = 8
-                self.deploy_after_profile = False
-                self.pick_with_webui = False
-                # Use RAPID strategy to leverage AI Configurator for perf estimation
-                # This avoids Kubernetes deployments while testing aiconfigurator functionality
-                self.search_strategy = SearchStrategy.RAPID
-                self.system = "h200_sxm"  # Must match aic_system for RAPID strategy
-                # Provide minimal model_info to avoid HF queries
-                self.model_info = ModelInfo(
-                    model_size=16384.0,
-                    architecture="TestArchitecture",
-                    is_moe=False,
-                    max_context_length=16384,
-                )
-
-        return Args()
-
-    @pytest.mark.pre_merge
-    @pytest.mark.gpu_0
-    @pytest.mark.performance
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.parametrize("missing_arg", ["system", "model"])
-    async def test_aiconfigurator_missing_args(self, llm_args, missing_arg):
-        # Check that validation error happens when a required arg is missing for RAPID strategy.
-        # These args are required when using SearchStrategy.RAPID with AI Configurator.
-        setattr(llm_args, missing_arg, None)
-        with pytest.raises(ValueError):
-            await run_profile(llm_args)
-
-    @pytest.mark.pre_merge
-    @pytest.mark.gpu_0
-    @pytest.mark.performance
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.parametrize(
-        "arg_name, bad_value",
-        [
-            # these values don't exist in the aiconfigurator database.
-            ("system", "fake_gpu_system"),
-        ],
-    )
-    async def test_aiconfigurator_no_data(self, llm_args, arg_name, bad_value):
-        # Check that an appropriate error is raised when the system/model/backend
-        # is not found in the aiconfigurator database.
-        setattr(llm_args, arg_name, bad_value)
-        with pytest.raises(ValueError, match="Database not found"):
-            await run_profile(llm_args)
-
-    @pytest.mark.trtllm
-    @pytest.mark.pre_merge
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.gpu_0
-    @pytest.mark.integration
-    async def test_trtllm_aiconfigurator_single_model(self, llm_args):
-        # Test that profile_sla works with the model & backend in the llm_args fixture.
-        await run_profile(llm_args)
-
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.gpu_1
-    @pytest.mark.integration
-    @pytest.mark.nightly
-    # fmt: off
-    @pytest.mark.parametrize(
-        "backend",
-        [
-            pytest.param("trtllm", marks=pytest.mark.trtllm),
-            pytest.param("vllm",   marks=pytest.mark.vllm),
-            pytest.param("sglang", marks=pytest.mark.sglang),
-        ],
-    )
-    # fmt: on
-    @pytest.mark.parametrize(
-        "hf_model_id",
-        [
-            "Qwen/Qwen3-32B",
-            "meta-llama/Llama-3.1-405B",
-        ],
-    )
-    async def test_aiconfigurator_dense_models(self, llm_args, hf_model_id, backend):
-        # Test that profile_sla works with a variety of backends and model names
-        # using AI Configurator's RAPID strategy for performance estimation.
-        # Backend version is not used with RAPID strategy - performance comes from AI Configurator.
-        llm_args.model = hf_model_id  # Used by RAPID strategy
-        llm_args.backend = backend
-        await run_profile(llm_args)
--- a/tests/profiler/test_profile_sla_dgdr.py
+++ b/tests/profiler/test_profile_sla_dgdr.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Test suite for profile_sla with DynamoGraphDeploymentRequestSpec input.
+
+Tests the new DGDR-based profiler entry point across different configurations:
+rapid/thorough, supported/unsupported, planner/no-planner, load-match, PVC, mocker.
+
+All tests are no-GPU (gpu_0) and pre_merge.
+"""
+
+import asyncio
+import os
+import sys
+from contextlib import ExitStack
+from pathlib import Path
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import numpy as np
+import pytest
+import yaml
+
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+try:
+    from dynamo.profiler.profile_sla import run_profile
+    from dynamo.profiler.utils.dgdr_v1beta1_types import (
+        BackendType,
+        DynamoGraphDeploymentRequestSpec,
+        SearchStrategy,
+    )
+    from dynamo.profiler.utils.profile_common import ProfilerOperationalConfig
+except ImportError as _e:
+    pytest.skip(f"Skip testing (refactor in progress): {_e}", allow_module_level=True)
+
+
+@pytest.fixture(autouse=True)
+def logger(request):
+    """Override the logger fixture to prevent test directory creation.
+
+    Autouse, so it applies to every test in this module; it performs no setup
+    or teardown and simply yields.
+    """
+    # NOTE(review): presumably shadows the ``logger`` fixture from
+    # tests/conftest.py — confirm that fixture still exists.
+    yield
+
+
+def _load_dgdr(yaml_path) -> DynamoGraphDeploymentRequestSpec:
+    """Parse a DGDR YAML file and validate it into a spec object.
+
+    Args:
+        yaml_path: Path to the YAML config file.
+
+    Returns:
+        The result of ``DynamoGraphDeploymentRequestSpec.model_validate`` on
+        the parsed YAML data.
+    """
+    # Read as UTF-8 explicitly so parsing does not depend on the locale's
+    # default encoding.
+    with open(yaml_path, encoding="utf-8") as f:
+        data = yaml.safe_load(f)
+    return DynamoGraphDeploymentRequestSpec.model_validate(data)
+
+
+def _make_ops(tmp_path, **overrides) -> ProfilerOperationalConfig:
+    """Create an operational config writing under ``tmp_path / "profiling_results"``.
+
+    Keyword overrides take precedence over the defaults (``dry_run=False``).
+    """
+    settings = {
+        "output_dir": str(tmp_path / "profiling_results"),
+        "dry_run": False,
+        **overrides,
+    }
+    return ProfilerOperationalConfig(**settings)
+
+
+# Directory (next to this test module) holding the YAML DGDR configs the tests load.
+CONFIGS_DIR = Path(__file__).parent / "configs"
+
+
+class TestRapidSupported:
+    """Rapid strategy with AIC-supported model (Qwen3-32B on h200_sxm/trtllm)."""
+
+    @staticmethod
+    def _profile(config_name, tmp_path):
+        """Load a config, run the profiler, return (spec, final config path)."""
+        request = _load_dgdr(CONFIGS_DIR / config_name)
+        op_config = _make_ops(tmp_path)
+        asyncio.run(run_profile(request, op_config))
+        return request, tmp_path / "profiling_results" / "final_config.yaml"
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_no_planner_no_load(self, tmp_path):
+        """Case 1: default picking mode, no planner, no target load."""
+        _, final_cfg = self._profile(
+            "1_rapid_supported_no_planner_no_load.yaml", tmp_path
+        )
+
+        assert final_cfg.exists()
+        parsed = yaml.safe_load(final_cfg.read_text())
+        assert parsed, "final_config.yaml should not be empty"
+        assert "spec" in parsed
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_no_planner_with_load(self, tmp_path):
+        """Case 2: load-match picking mode with requestRate."""
+        _, final_cfg = self._profile(
+            "2_rapid_supported_no_planner_with_load.yaml", tmp_path
+        )
+
+        assert final_cfg.exists()
+        parsed = yaml.safe_load(final_cfg.read_text())
+        assert parsed
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_pvc_no_planner_with_load(self, tmp_path):
+        """Case 2b: load-match with PVC model cache."""
+        _, final_cfg = self._profile(
+            "2b_rapid_supported_pvc_no_planner_with_load.yaml", tmp_path
+        )
+
+        assert final_cfg.exists()
+        parsed = yaml.safe_load(final_cfg.read_text())
+        assert parsed
+        mounted_pvcs = parsed.get("spec", {}).get("pvcs", [])
+        assert any(
+            pvc.get("name") == "model-cache" for pvc in mounted_pvcs
+        ), "PVC should be mounted"
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_e2e_latency_sla(self, tmp_path):
+        """Case 2c: e2eLatency SLA instead of ttft/itl."""
+        request, final_cfg = self._profile(
+            "2c_rapid_supported_e2e_latency.yaml", tmp_path
+        )
+
+        assert final_cfg.exists()
+        parsed = yaml.safe_load(final_cfg.read_text())
+        assert parsed
+        # With an e2eLatency SLA the spec is expected to carry no ttft/itl.
+        sla = request.sla
+        assert sla.ttft is None
+        assert sla.itl is None
+        assert sla.e2eLatency == 35000.0
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_both_concurrency_and_rate_rejected(self, tmp_path):
+        """Case 2d: both concurrency and requestRate should fail profiler validation."""
+        # Setup stays outside pytest.raises so only run_profile may raise.
+        request = _load_dgdr(
+            CONFIGS_DIR / "2d_rapid_both_concurrency_and_rate_error.yaml"
+        )
+        op_config = _make_ops(tmp_path)
+        with pytest.raises(ValueError, match="concurrency.*requestRate"):
+            asyncio.run(run_profile(request, op_config))
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_planner_rapid_sweep(self, tmp_path):
+        """Case 3: autoscale picking with planner + rapid pre-deployment sweep."""
+        _, final_cfg = self._profile(
+            "3_rapid_supported_planner_rapid_sweep.yaml", tmp_path
+        )
+
+        assert final_cfg.exists()
+        documents = list(yaml.safe_load_all(final_cfg.read_text()))
+        assert len(documents) >= 2, "Planner config should produce multi-doc YAML"
+        # The DGD is the last document of the multi-doc output.
+        services = documents[-1].get("spec", {}).get("services", {})
+        assert "Planner" in services, "Planner service should be added"
+
+
+class TestRapidUnsupported:
+    """Rapid strategy with AIC-unsupported model (Qwen3-32B on l40s/vllm)."""
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_no_planner_naive_fallback(self, tmp_path):
+        """Case 4: falls back to naive config generation."""
+        request = _load_dgdr(CONFIGS_DIR / "4_rapid_unsupported_no_planner.yaml")
+        op_config = _make_ops(tmp_path)
+        asyncio.run(run_profile(request, op_config))
+
+        final_cfg = tmp_path / "profiling_results" / "final_config.yaml"
+        assert final_cfg.exists()
+        parsed = yaml.safe_load(final_cfg.read_text())
+        assert parsed, "Naive fallback should produce a non-empty config"
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_planner_load_scaling_rapid_sweep_fallback(self, tmp_path):
+        """Case 5: planner with load scaling, rapid sweep falls back to none."""
+        request = _load_dgdr(CONFIGS_DIR / "5_rapid_unsupported_planner.yaml")
+        # Passing means run_profile finished without raising; nothing else is checked.
+        asyncio.run(run_profile(request, _make_ops(tmp_path)))
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_planner_throughput_scaling_raises(self, tmp_path):
+        """Case 5b: planner with throughput scaling on unsupported combo should fail."""
+        config_file = CONFIGS_DIR / "5b_rapid_unsupported_planner_throughput_error.yaml"
+        request = _load_dgdr(config_file)
+        op_config = _make_ops(tmp_path)
+        expected_message = "Throughput-based planner scaling requires AIC support"
+        with pytest.raises(ValueError, match=expected_message):
+            asyncio.run(run_profile(request, op_config))
+
+
+class TestThoroughDryRun:
+    """Thorough strategy tested with --dry-run (no real deployments)."""
+
+    @staticmethod
+    def _dry_run(config_name, tmp_path):
+        """Profile the named config with dry_run=True and return the final config path."""
+        request = _load_dgdr(CONFIGS_DIR / config_name)
+        op_config = _make_ops(tmp_path, dry_run=True)
+        asyncio.run(run_profile(request, op_config))
+        return tmp_path / "profiling_results" / "final_config.yaml"
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_no_planner_with_load(self, tmp_path):
+        """Case 6: thorough + load-match, dry-run."""
+        final_cfg = self._dry_run("6_thorough_no_planner_with_load.yaml", tmp_path)
+        assert final_cfg.exists()
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_planner_rapid_sweep(self, tmp_path):
+        """Case 7: thorough + planner + rapid pre-deployment sweep, dry-run."""
+        final_cfg = self._dry_run("7_thorough_planner_rapid_sweep.yaml", tmp_path)
+        assert final_cfg.exists()
+
+
+class TestMockerEnabled:
+    """Mocker feature flag selects mocker DGD over real worker DGD."""
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_mocker_config_selected(self, tmp_path):
+        """Case 3b: planner + mocker enabled, should produce mocker DGD."""
+        mocker_yaml = CONFIGS_DIR / "3b_rapid_supported_planner_rapid_sweep_mocker.yaml"
+        # The config is optional: skip instead of failing when it is missing.
+        if not mocker_yaml.exists():
+            pytest.skip("3b mocker config not found")
+
+        request = _load_dgdr(mocker_yaml)
+        op_config = _make_ops(tmp_path)
+        asyncio.run(run_profile(request, op_config))
+
+        assert (tmp_path / "profiling_results" / "final_config.yaml").exists()
+
+
+class TestGateChecks:
+    """Validate gate checks at profiler startup."""
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_thorough_auto_backend_rejected(self, tmp_path):
+        """Thorough + auto backend should raise ValueError."""
+        # Start from a valid rapid config, then switch it to the rejected combination.
+        request = _load_dgdr(CONFIGS_DIR / "1_rapid_supported_no_planner_no_load.yaml")
+        request.searchStrategy = SearchStrategy.Thorough
+        request.backend = BackendType.Auto
+        op_config = _make_ops(tmp_path)
+
+        with pytest.raises(ValueError, match="does not support 'auto' backend"):
+            asyncio.run(run_profile(request, op_config))
+
+
+class TestAutoBackend:
+    """Rapid strategy with auto backend resolution."""
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_no_planner_no_load(self, tmp_path):
+        """Case 11: auto backend, rapid, no planner, no target load."""
+        request = _load_dgdr(CONFIGS_DIR / "11_auto_rapid_no_planner_no_load.yaml")
+        # Precondition: the config really asks for the auto backend.
+        assert request.backend == BackendType.Auto
+        op_config = _make_ops(tmp_path)
+        asyncio.run(run_profile(request, op_config))
+
+        final_cfg = tmp_path / "profiling_results" / "final_config.yaml"
+        assert final_cfg.exists()
+        parsed = yaml.safe_load(final_cfg.read_text())
+        assert parsed, "final_config.yaml should not be empty"
+        assert "spec" in parsed
+
+
+class TestThoroughEdgeCases:
+    """Edge cases for thorough mode."""
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_empty_candidates_due_to_small_gpu(self, tmp_path):
+        """Case 8: DeepSeek-R1 on 1 L40S GPU; model too large, no candidates."""
+        request = _load_dgdr(CONFIGS_DIR / "8_thorough_empty_candidates.yaml")
+        op_config = _make_ops(tmp_path, dry_run=True)
+        asyncio.run(run_profile(request, op_config))
+
+        results_dir = tmp_path / "profiling_results"
+        assert (results_dir / "final_config.yaml").exists()
+
+        # The status file is optional; validate its status only when it was written.
+        status_path = results_dir / "profiler_status.yaml"
+        if status_path.exists():
+            reported = yaml.safe_load(status_path.read_text())
+            assert reported.get("status") in ("success", "failed")
+
+
+# ---------------------------------------------------------------------------
+# Helpers for mocking K8s deployment + benchmark functions
+# ---------------------------------------------------------------------------
+
+
+def _mock_deployment_client():
+    """Build a stand-in DynamoDeploymentClient whose coroutine methods are AsyncMocks."""
+    stub = MagicMock()
+    # Lifecycle methods are replaced with AsyncMock so they can be awaited.
+    for coroutine_name in (
+        "create_deployment",
+        "wait_for_deployment_ready",
+        "get_deployment_logs",
+        "delete_deployment",
+    ):
+        setattr(stub, coroutine_name, AsyncMock())
+    stub.get_service_url = MagicMock(return_value="http://mock:8000")
+    stub.deployment_name = "mock-deployment"
+    stub.base_log_dir = "/tmp"
+    return stub
+
+
+def _save_dummy_npz(output_dir: str):
+    """Write dummy ``raw_data.npz`` files for prefill and decode interpolation.
+
+    One file goes into each of ``selected_prefill_interpolation`` and
+    ``selected_decode_interpolation`` under ``output_dir``; the directories
+    are created when missing.
+    """
+    arrays_by_subdir = {
+        "selected_prefill_interpolation": {
+            "prefill_isl": np.array([500, 1000, 2000, 4000]),
+            "prefill_ttft": np.array([10.0, 20.0, 40.0, 80.0]),
+            "prefill_thpt_per_gpu": np.array([50000.0, 50000.0, 50000.0, 50000.0]),
+        },
+        "selected_decode_interpolation": {
+            "x_kv_usage": np.array([0.1, 0.3, 0.5, 0.8]),
+            "y_context_length": np.array([500, 1000, 2000, 4000]),
+            "z_itl": np.array([5.0, 6.0, 7.0, 8.0]),
+            "z_thpt_per_gpu": np.array([200.0, 180.0, 160.0, 140.0]),
+            "max_kv_tokens": np.array([100000]),
+        },
+    }
+    for subdir, arrays in arrays_by_subdir.items():
+        target_dir = os.path.join(output_dir, subdir)
+        os.makedirs(target_dir, exist_ok=True)
+        np.savez(os.path.join(target_dir, "raw_data.npz"), **arrays)
+
+
+# Decode service name per backend; _make_thorough_patches uses it as the mocked
+# return value of get_service_name_by_type.
+_DECODE_SVC_NAMES = {
+    "sglang": "decode",
+    "vllm": "VllmDecodeWorker",
+    "trtllm": "TRTLLMDecodeWorker",
+}
+
+
+def _make_thorough_patches(backend: str = "trtllm"):
+    """Return unstarted patches that stub out deployment and benchmarking in thorough mode.
+
+    The deployment client is replaced by a fresh mock per construction, the
+    benchmark helpers return fixed numbers, and the service-name lookup
+    returns the decode service for ``backend`` (unknown backends use the
+    trtllm name).
+    """
+    decode_service = _DECODE_SVC_NAMES.get(backend, "TRTLLMDecodeWorker")
+    client_patch = patch(
+        "dynamo.profiler.thorough.DynamoDeploymentClient",
+        side_effect=lambda **kw: _mock_deployment_client(),
+    )
+    fixed_returns = (
+        ("dynamo.profiler.thorough.get_prefill_ttft", 50.0),
+        ("dynamo.profiler.thorough.get_decode_itl_and_thpt_per_gpu", (8.0, 125.0)),
+        ("dynamo.profiler.thorough.get_num_request_range", [1, 4, 8]),
+        ("dynamo.profiler.thorough.get_service_name_by_type", decode_service),
+    )
+    return [client_patch] + [
+        patch(target, return_value=value) for target, value in fixed_returns
+    ]
+
+
+# Backward compat: existing tests use the trtllm-flavored list.
+# These patch objects are created once at import time and are started and
+# stopped by each test that uses them.
+_THOROUGH_PATCHES = _make_thorough_patches("trtllm")
+
+
+def _patch_kv_cache_log(backend: str = "trtllm"):
+    """Return a patch making the backend's KV-cache log lookup report 100000.
+
+    The patch targets ``get_kv_cache_size_from_dynamo_log`` on the modifier
+    registered for ``backend`` in ``CONFIG_MODIFIERS``.
+    """
+    from dynamo.profiler.utils.config_modifiers import CONFIG_MODIFIERS
+
+    return patch.object(
+        CONFIG_MODIFIERS[backend],
+        "get_kv_cache_size_from_dynamo_log",
+        return_value=100000,
+    )
+
+
+class TestThoroughMocked:
+    """Thorough mode with mocked K8s deployments and benchmark functions.
+
+    Only K8s client, AIPerf benchmarks, and log-file reads are mocked.
+    Enumeration, picking, and DGD generation run for real via AIC.
+    """
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_thorough_no_planner_with_load(self, tmp_path):
+        """Case 6 (mocked): thorough + load-match, full pipeline without real GPUs."""
+        dgdr = _load_dgdr(CONFIGS_DIR / "6_thorough_no_planner_with_load.yaml")
+        ops = _make_ops(tmp_path)
+
+        # ExitStack undoes every patch that was entered, even when a later
+        # patch fails to start, so no mock leaks into other tests.
+        with ExitStack() as stack:
+            stack.enter_context(_patch_kv_cache_log("trtllm"))
+            for p in _THOROUGH_PATCHES:
+                stack.enter_context(p)
+            asyncio.run(run_profile(dgdr, ops))
+
+        output = tmp_path / "profiling_results" / "final_config.yaml"
+        assert output.exists()
+        config = yaml.safe_load(output.read_text())
+        assert config, "Mocked thorough should produce a non-empty config"
+        assert "spec" in config
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_thorough_planner_thorough_sweep(self, tmp_path):
+        """Case 7b: thorough search + thorough interpolation, fully mocked."""
+        dgdr = _load_dgdr(CONFIGS_DIR / "7b_thorough_planner_thorough_sweep.yaml")
+        ops = _make_ops(tmp_path)
+
+        def write_dummy_interpolation(work_dir, *args, **kwargs):
+            # Stand-in for both profile_prefill and profile_decode: ignores its
+            # arguments and writes both dummy NPZ files under the output dir.
+            _save_dummy_npz(ops.output_dir)
+
+        interp_patches = [
+            patch(
+                "dynamo.profiler.interpolation.DynamoDeploymentClient",
+                side_effect=lambda **kw: _mock_deployment_client(),
+            ),
+            patch(
+                "dynamo.profiler.interpolation.profile_prefill",
+                side_effect=write_dummy_interpolation,
+            ),
+            patch(
+                "dynamo.profiler.interpolation.profile_decode",
+                side_effect=write_dummy_interpolation,
+            ),
+            patch(
+                "dynamo.profiler.interpolation.get_service_name_by_type",
+                return_value="TRTLLMWorker",
+            ),
+        ]
+
+        # Same rationale as above: a failing start must not leave earlier
+        # patches active.
+        with ExitStack() as stack:
+            stack.enter_context(_patch_kv_cache_log("trtllm"))
+            for p in _THOROUGH_PATCHES + interp_patches:
+                stack.enter_context(p)
+            asyncio.run(run_profile(dgdr, ops))
+
+        results_dir = tmp_path / "profiling_results"
+        output = results_dir / "final_config.yaml"
+        assert output.exists()
+        docs = list(yaml.safe_load_all(output.read_text()))
+        assert len(docs) >= 2, "Planner + profiling data should produce multi-doc YAML"
+
+        prefill_npz = results_dir / "selected_prefill_interpolation" / "raw_data.npz"
+        decode_npz = results_dir / "selected_decode_interpolation" / "raw_data.npz"
+        assert prefill_npz.exists(), "Prefill interpolation data should be saved"
+        assert decode_npz.exists(), "Decode interpolation data should be saved"
+
+
+# ---------------------------------------------------------------------------
+# Shared helper for mocked-thorough + override tests
+# ---------------------------------------------------------------------------
+
+
+def _run_mocked_thorough(dgdr, ops, backend: str):
+    """Run the full mocked-thorough pipeline for an arbitrary backend.
+
+    Args:
+        dgdr: Request spec passed to ``run_profile``.
+        ops: Operational config passed to ``run_profile``.
+        backend: Backend name used to pick the decode service name and the
+            config modifier whose KV-cache log lookup is patched.
+    """
+    thorough_patches = _make_thorough_patches(backend)
+    kv_patch = _patch_kv_cache_log(backend)
+
+    # ExitStack undoes every patch that was entered, even when a later patch
+    # fails to start, so a broken patch target cannot leak mocks.
+    with ExitStack() as stack:
+        stack.enter_context(kv_patch)
+        for p in thorough_patches:
+            stack.enter_context(p)
+        asyncio.run(run_profile(dgdr, ops))
+
+
+def _assert_overrides_applied(final_config_path: Path, dgdr):
+    """Check that the final DGD reflects the overrides requested in ``dgdr``.
+
+    The DGD is the last YAML document in ``final_config_path``. Top-level
+    ``envs``/``backendFramework`` overrides must exist in its spec. For each
+    overridden service that is present in the DGD, ``sharedMemory`` must exist
+    and every overridden main-container arg must appear in the service's args.
+    Overridden services absent from the DGD are skipped without failing.
+    """
+    assert final_config_path.exists(), "final_config.yaml should exist"
+    documents = list(yaml.safe_load_all(final_config_path.read_text()))
+    dgd = documents[-1] if documents else {}
+    assert dgd and "spec" in dgd, "DGD should have a spec"
+
+    wanted_spec = dgdr.overrides.dgd.get("spec", {})
+
+    for top_level_key in ("envs", "backendFramework"):
+        if top_level_key in wanted_spec:
+            assert (
+                top_level_key in dgd["spec"]
+            ), f"Override field spec.{top_level_key} should exist"
+
+    wanted_services = wanted_spec.get("services", {})
+    actual_services = dgd.get("spec", {}).get("services", {})
+    for svc_name, svc_ovr in wanted_services.items():
+        if svc_name not in actual_services:
+            continue
+        actual_svc = actual_services[svc_name]
+
+        if "sharedMemory" in svc_ovr:
+            assert (
+                "sharedMemory" in actual_svc
+            ), f"Override sharedMemory on {svc_name} should be applied"
+
+        wanted_container = svc_ovr.get("extraPodSpec", {}).get("mainContainer", {})
+        if "args" not in wanted_container:
+            continue
+        actual_container = actual_svc.get("extraPodSpec", {}).get("mainContainer", {})
+        actual_args = actual_container.get("args", [])
+        for arg in wanted_container["args"]:
+            assert (
+                arg in actual_args
+            ), f"Override arg '{arg}' should be in {svc_name} args"
+
+
+class TestThoroughMockedOverrides:
+    """Thorough + DGD overrides: verify overrides are applied end-to-end."""
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_9a_sglang_overrides(self, tmp_path):
+        """Case 9a: SGLang thorough sweep with DSR1 overrides."""
+        request = _load_dgdr(CONFIGS_DIR / "9a_thorough_dsr1_sglang_overrides.yaml")
+        op_config = _make_ops(tmp_path)
+        _run_mocked_thorough(request, op_config, "sglang")
+
+        final_cfg = tmp_path / "profiling_results" / "final_config.yaml"
+        _assert_overrides_applied(final_cfg, request)
+
+    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    def test_10_override_security_context(self, tmp_path):
+        """Case 10: imagePullSecrets injected via overrides into a new spec field."""
+        request = _load_dgdr(
+            CONFIGS_DIR / "10_thorough_override_security_context.yaml"
+        )
+        op_config = _make_ops(tmp_path)
+        _run_mocked_thorough(request, op_config, "trtllm")
+
+        final_cfg = tmp_path / "profiling_results" / "final_config.yaml"
+        assert final_cfg.exists(), "final_config.yaml should exist"
+        parsed = yaml.safe_load(final_cfg.read_text())
+        assert parsed and "spec" in parsed
+
+        pull_secrets = parsed["spec"].get("imagePullSecrets")
+        assert pull_secrets is not None, "imagePullSecrets should be present"
+        secret_names = [entry["name"] for entry in pull_secrets]
+        # Both secrets expected from the override config must be present.
+        assert "my-registry-secret" in secret_names
+        assert "nvcr-pull-secret" in secret_names
--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-Test suite for profile_sla dry-run functionality.
-
-This test ensures that the profile_sla script can successfully run in dry-run mode
-for vllm, sglang, and trtllm backends with their respective disagg.yaml configurations.
-"""
-
-import sys
-from pathlib import Path
-from unittest.mock import patch
-
-import pytest
-
-# Add the project root to sys.path to enable imports
-project_root = Path(__file__).parent.parent.parent
-sys.path.insert(0, str(project_root))
-
-try:
-    from dynamo.profiler.profile_sla import run_profile  # noqa: E402
-    from dynamo.profiler.utils.defaults import SearchStrategy  # noqa: E402
-    from dynamo.profiler.utils.model_info import ModelInfo  # noqa: E402
-    from dynamo.profiler.utils.search_space_autogen import (  # noqa: E402
-        auto_generate_search_space,
-    )
-except ImportError as _e:
-    pytest.skip(f"Skip testing (refactor in progress): {_e}", allow_module_level=True)
-
-
-# Override the logger fixture from conftest.py to prevent directory creation
-@pytest.fixture(autouse=True)
-def logger(request):
-    """Override the logger fixture to prevent test directory creation.
-
-    This replaces the logger fixture from tests/conftest.py that creates
-    directories named after each test.
-    """
-    # Simply do nothing - no directories created, no file handlers added
-    yield
-
-
-class TestProfileSLADryRun:
-    """Test class for profile_sla dry-run functionality."""
-
-    @pytest.fixture
-    def vllm_args(self, request):
-        """Create arguments for vllm backend dry-run test."""
-
-        class Args:
-            def __init__(self):
-                self.backend = "vllm"
-                self.config = "examples/backends/vllm/deploy/disagg.yaml"
-                # Use unique output directory per test for parallel execution
-                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
-                self.namespace = f"test-namespace-{request.node.name}"
-                self.model = ""
-                self.dgd_image = ""
-                self.min_num_gpus_per_engine = 1
-                self.max_num_gpus_per_engine = 8
-                self.skip_existing_results = False
-                self.force_rerun = False
-                self.isl = 3000
-                self.osl = 500
-                self.ttft = 50
-                self.itl = 10
-                self.max_context_length = 16384
-                self.prefill_interpolation_granularity = 16
-                self.decode_interpolation_granularity = 6
-                self.service_name = ""
-                self.dry_run = True
-                self.aic_system = None
-                self.aic_hf_id = None
-                self.aic_backend = ""
-                self.aic_backend_version = None
-                self.num_gpus_per_node = 8
-                self.search_strategy = SearchStrategy.THOROUGH
-                self.system = ""
-                self.deploy_after_profile = False
-                self.pick_with_webui = False
-                self.model_cache_pvc_name = ""
-                self.model_cache_pvc_path = ""
-                self.model_cache_pvc_mount_path = "/opt/model-cache"
-                # Provide minimal model_info to avoid HF queries
-                self.model_info = ModelInfo(
-                    model_size=16384.0,
-                    architecture="TestArchitecture",
-                    is_moe=False,
-                    max_context_length=self.max_context_length,
-                )
-
-        return Args()
-
-    @pytest.fixture
-    def sglang_args(self, request):
-        """Create arguments for sglang backend dry-run test."""
-
-        class Args:
-            def __init__(self):
-                self.backend = "sglang"
-                self.config = "examples/backends/sglang/deploy/disagg.yaml"
-                # Use unique output directory per test for parallel execution
-                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
-                self.namespace = f"test-namespace-{request.node.name}"
-                self.model = ""
-                self.dgd_image = ""
-                self.min_num_gpus_per_engine = 1
-                self.max_num_gpus_per_engine = 8
-                self.skip_existing_results = False
-                self.force_rerun = False
-                self.isl = 3000
-                self.osl = 500
-                self.ttft = 50
-                self.itl = 10
-                self.max_context_length = 16384
-                self.prefill_interpolation_granularity = 16
-                self.decode_interpolation_granularity = 6
-                self.service_name = ""
-                self.dry_run = True
-                self.aic_system = None
-                self.aic_hf_id = None
-                self.aic_backend = ""
-                self.aic_backend_version = None
-                self.num_gpus_per_node = 8
-                self.search_strategy = SearchStrategy.THOROUGH
-                self.system = ""
-                self.deploy_after_profile = False
-                self.pick_with_webui = False
-                self.model_cache_pvc_name = ""
-                self.model_cache_pvc_path = ""
-                self.model_cache_pvc_mount_path = "/opt/model-cache"
-                self.model_info = ModelInfo(
-                    model_size=16384.0,
-                    architecture="TestArchitecture",
-                    is_moe=False,
-                    max_context_length=self.max_context_length,
-                )
-
-        return Args()
-
-    @pytest.mark.pre_merge
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.gpu_0
-    @pytest.mark.integration
-    @pytest.mark.vllm
-    async def test_vllm_dryrun(self, vllm_args):
-        """Test that profile_sla dry-run works for vllm backend with disagg.yaml config."""
-        # Run the profile in dry-run mode - should complete without errors
-        await run_profile(vllm_args)
-
-    @pytest.mark.pre_merge
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.gpu_0
-    @pytest.mark.integration
-    @pytest.mark.sglang
-    async def test_sglang_dryrun(self, sglang_args):
-        """Test that profile_sla dry-run works for sglang backend with disagg.yaml config."""
-        # Run the profile in dry-run mode - should complete without errors
-        await run_profile(sglang_args)
-
-    @pytest.fixture
-    def trtllm_args(self, request):
-        """Create arguments for trtllm backend dry-run test."""
-
-        class Args:
-            def __init__(self):
-                self.backend = "trtllm"
-                self.config = "examples/backends/trtllm/deploy/disagg.yaml"
-                # Use unique output directory per test for parallel execution
-                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
-                self.namespace = f"test-namespace-{request.node.name}"
-                self.model = ""
-                self.dgd_image = ""
-                self.min_num_gpus_per_engine = 1
-                self.max_num_gpus_per_engine = 8
-                self.skip_existing_results = False
-                self.force_rerun = False
-                self.isl = 3000
-                self.osl = 500
-                self.ttft = 50
-                self.itl = 10
-                self.max_context_length = 16384
-                self.prefill_interpolation_granularity = 16
-                self.decode_interpolation_granularity = 6
-                self.service_name = ""
-                self.dry_run = True
-                self.aic_system = None
-                self.aic_hf_id = None
-                self.aic_backend = ""
-                self.aic_backend_version = None
-                self.num_gpus_per_node = 8
-                self.search_strategy = SearchStrategy.THOROUGH
-                self.system = ""
-                self.deploy_after_profile = False
-                self.pick_with_webui = False
-                self.model_cache_pvc_name = ""
-                self.model_cache_pvc_path = ""
-                self.model_cache_pvc_mount_path = "/opt/model-cache"
-                self.model_info = ModelInfo(
-                    model_size=16384.0,
-                    architecture="TestArchitecture",
-                    is_moe=False,
-                    max_context_length=self.max_context_length,
-                )
-
-        return Args()
-
-    @pytest.mark.pre_merge
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.gpu_0
-    @pytest.mark.integration
-    @pytest.mark.trtllm
-    async def test_trtllm_dryrun(self, trtllm_args):
-        """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config."""
-        # Run the profile in dry-run mode - should complete without errors
-        await run_profile(trtllm_args)
-
-    @pytest.fixture
-    def sglang_moe_args(self, request):
-        """Create arguments for trtllm backend dry-run test."""
-
-        class Args:
-            def __init__(self):
-                self.backend = "sglang"
-                self.config = "recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml"
-                # Use unique output directory per test for parallel execution
-                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
-                self.namespace = f"test-namespace-{request.node.name}"
-                self.model = ""
-                self.dgd_image = ""
-                self.min_num_gpus_per_engine = 8
-                self.max_num_gpus_per_engine = 32
-                self.skip_existing_results = False
-                self.force_rerun = False
-                self.isl = 3000
-                self.osl = 500
-                self.ttft = 50
-                self.itl = 10
-                self.max_context_length = 16384
-                self.prefill_interpolation_granularity = 16
-                self.decode_interpolation_granularity = 6
-                self.service_name = ""
-                self.dry_run = True
-                self.aic_system = None
-                self.aic_hf_id = None
-                self.aic_backend = ""
-                self.aic_backend_version = None
-                self.num_gpus_per_node = 8
-                self.search_strategy = SearchStrategy.THOROUGH
-                self.system = ""
-                self.deploy_after_profile = False
-                self.pick_with_webui = False
-                # Added in newer profiler versions; keep Args compatible with search_space_autogen
-                self.model_cache_pvc_name = ""
-                self.model_cache_pvc_path = ""
-                self.model_cache_pvc_mount_path = "/opt/model-cache"
-                self.model_info = ModelInfo(
-                    model_size=65536.0,
-                    architecture="TestMoEArchitecture",
-                    is_moe=True,
-                    max_context_length=self.max_context_length,
-                    num_experts=16,
-                )
-
-        return Args()
-
-    @pytest.mark.pre_merge
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.gpu_0
-    @pytest.mark.integration
-    @pytest.mark.sglang
-    async def test_sglang_moe_dryrun(self, sglang_moe_args):
-        """Test that profile_sla dry-run works for sglang backend with MoE config."""
-        # Run the profile in dry-run mode - should complete without errors
-        await run_profile(sglang_moe_args)
-
-    # Example tests with mocked GPU inventory
-    @pytest.fixture
-    def mock_h100_gpu_info(self):
-        """Mock GPU info for H100 80GB cluster."""
-        return {
-            "gpus_per_node": 8,
-            "model": "h100_sxm",
-            "vram": 81920,  # 80GB in MiB
-        }
-
-    @pytest.fixture
-    def mock_model_info(self):
-        """Mock model info for DeepSeek-R1-Distill-Llama-8B."""
-        return ModelInfo(
-            model_size=16384.0,  # 16GB model in MiB
-            architecture="LlamaForCausalLM",
-            is_moe=False,
-            max_context_length=16384,
-        )
-
-    @pytest.fixture
-    def vllm_args_with_model_autogen(self, request):
-        """Create arguments for vllm backend with model-based search space autogeneration."""
-
-        class Args:
-            def __init__(self):
-                self.backend = "vllm"
-                self.config = ""
-                # Use unique output directory per test for parallel execution
-                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
-                self.namespace = f"test-namespace-{request.node.name}"
-                self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
-                self.dgd_image = ""
-                # Set to 0 to trigger auto-generation path
-                self.min_num_gpus_per_engine = 0
-                self.max_num_gpus_per_engine = 0
-                self.skip_existing_results = False
-                self.force_rerun = False
-                self.isl = 3000
-                self.osl = 500
-                self.ttft = 50
-                self.itl = 10
-                self.max_context_length = 0
-                self.prefill_interpolation_granularity = 16
-                self.decode_interpolation_granularity = 6
-                self.service_name = ""
-                self.dry_run = True
-                self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
-                self.search_strategy = SearchStrategy.RAPID  # New top-level arg
-                # GPU discovery values (auto-populated by Operator)
-                self.num_gpus_per_node = 8
-                self.gpu_model = "H100-SXM5-80GB"
-                self.gpu_vram_mib = 81920
-                self.deploy_after_profile = False
-                self.pick_with_webui = False
-                self.model_cache_pvc_name = ""
-                self.model_cache_pvc_path = ""
-                self.model_cache_pvc_mount_path = "/opt/model-cache"
-
-        return Args()
-
-    @pytest.mark.pre_merge
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.integration
-    @pytest.mark.gpu_0
-    @pytest.mark.vllm
-    @patch("dynamo.profiler.utils.model_info.get_model_info")
-    async def test_profile_with_autogen_search_space_h100(
-        self,
-        mock_get_model_info,
-        vllm_args_with_model_autogen,
-        mock_model_info,
-    ):
-        """Test profile_sla with auto-generated search space on mocked H100 cluster.
-
-        This test demonstrates how search space is auto-generated based on model
-        size and available GPU memory. GPU info is provided via command-line
-        arguments injected by the Operator into the profiling config (DYN-2135).
-        """
-        # Configure the mock to return the appropriate model info
-        mock_get_model_info.return_value = mock_model_info
-
-        # Run the profile - the search space will be auto-generated
-        # based on the model and GPU info from args
-        auto_generate_search_space(vllm_args_with_model_autogen)
-        await run_profile(vllm_args_with_model_autogen)
-
-    @pytest.fixture
-    def sglang_args_with_model_autogen(self, request):
-        """Create arguments for sglang backend with model-based search space autogeneration."""
-
-        class Args:
-            def __init__(self):
-                self.backend = "sglang"
-                self.config = ""
-                # Use unique output directory per test for parallel execution
-                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
-                self.namespace = f"test-namespace-{request.node.name}"
-                self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
-                self.dgd_image = ""
-                self.min_num_gpus_per_engine = 0
-                self.max_num_gpus_per_engine = 0
-                self.skip_existing_results = False
-                self.force_rerun = False
-                self.isl = 3000
-                self.osl = 500
-                self.ttft = 50
-                self.itl = 10
-                self.max_context_length = 0
-                self.prefill_interpolation_granularity = 16
-                self.decode_interpolation_granularity = 6
-                self.service_name = ""
-                self.dry_run = True
-                self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
-                self.search_strategy = SearchStrategy.RAPID  # New top-level arg
-                # GPU discovery values (auto-populated by Operator)
-                self.num_gpus_per_node = 8
-                self.gpu_model = "H100-SXM5-80GB"
-                self.gpu_vram_mib = 81920
-                self.deploy_after_profile = False
-                self.pick_with_webui = False
-                self.model_cache_pvc_name = ""
-                self.model_cache_pvc_path = ""
-                self.model_cache_pvc_mount_path = "/opt/model-cache"
-
-        return Args()
-
-    @pytest.mark.pre_merge
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.gpu_0
-    @pytest.mark.integration
-    @pytest.mark.sglang
-    @pytest.mark.skip(
-        reason="Blocked on AI Configurator database format: sglang 0.5.6.post2 database "
-        "is in legacy format missing 'gemm_dtype' field. "
-        "See: KeyError in aiconfigurator/sdk/perf_database.py"
-    )
-    @patch("dynamo.profiler.utils.model_info.get_model_info")
-    async def test_sglang_profile_with_autogen_search_space_h100(
-        self,
-        mock_get_model_info,
-        sglang_args_with_model_autogen,
-        mock_model_info,
-    ):
-        """Test profile_sla with auto-generated search space for sglang on mocked H100 cluster.
-
-        This test demonstrates how search space is auto-generated based on model
-        size and available GPU memory for sglang backend. GPU info is provided via
-        command-line arguments injected by the Operator into the profiling config (DYN-2135).
-
-        NOTE: Currently skipped due to AI Configurator database format issue.
-        The sglang 0.5.6.post2 database for h100_sxm is in legacy format and missing
-        the required 'gemm_dtype' field, causing KeyError during database loading.
-        """
-        # Configure the mock to return the appropriate model info
-        mock_get_model_info.return_value = mock_model_info
-
-        # Run the profile - the search space will be auto-generated
-        # based on the model and GPU info from args
-        auto_generate_search_space(sglang_args_with_model_autogen)
-        await run_profile(sglang_args_with_model_autogen)
-
-    @pytest.fixture
-    def trtllm_args_with_model_autogen(self, request):
-        """Create arguments for trtllm backend with model-based search space autogeneration."""
-
-        class Args:
-            def __init__(self):
-                self.backend = "trtllm"
-                self.config = ""
-                # Use unique output directory per test for parallel execution
-                self.output_dir = f"/tmp/test_profiling_results_{request.node.name}"
-                self.namespace = f"test-namespace-{request.node.name}"
-                self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # Specify model for autogen
-                self.dgd_image = ""
-                self.min_num_gpus_per_engine = 0
-                self.max_num_gpus_per_engine = 0
-                self.skip_existing_results = False
-                self.force_rerun = False
-                self.isl = 3000
-                self.osl = 500
-                self.ttft = 50
-                self.itl = 10
-                self.max_context_length = 0
-                self.prefill_interpolation_granularity = 16
-                self.decode_interpolation_granularity = 6
-                self.service_name = ""
-                self.dry_run = True
-                self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
-                self.search_strategy = SearchStrategy.RAPID  # New top-level arg
-                # GPU discovery values (auto-populated by Operator)
-                self.num_gpus_per_node = 8
-                self.gpu_model = "H100-SXM5-80GB"
-                self.gpu_vram_mib = 81920
-                self.deploy_after_profile = False
-                self.pick_with_webui = False
-                self.model_cache_pvc_name = ""
-                self.model_cache_pvc_path = ""
-                self.model_cache_pvc_mount_path = "/opt/model-cache"
-
-        return Args()
-
-    @pytest.mark.pre_merge
-    @pytest.mark.parallel
-    @pytest.mark.asyncio
-    @pytest.mark.gpu_0
-    @pytest.mark.integration
-    @pytest.mark.trtllm
-    @patch("dynamo.profiler.utils.model_info.get_model_info")
-    async def test_trtllm_profile_with_autogen_search_space_h100(
-        self,
-        mock_get_model_info,
-        trtllm_args_with_model_autogen,
-        mock_model_info,
-    ):
-        """Test profile_sla with auto-generated search space for trtllm on mocked H100 cluster.
-
-        This test demonstrates how search space is auto-generated based on model
-        size and available GPU memory for trtllm backend. GPU info is provided via
-        command-line arguments injected by the Operator into the profiling config (DYN-2135).
-        """
-        # Configure the mock to return the appropriate model info
-        mock_get_model_info.return_value = mock_model_info
-
-        # Run the profile - the search space will be auto-generated
-        # based on the model and GPU info from args
-        auto_generate_search_space(trtllm_args_with_model_autogen)
-        await run_profile(trtllm_args_with_model_autogen)
-
-    # Unit tests for search_strategy and system attributes
-    @pytest.mark.pre_merge
-    @pytest.mark.unit
-    @pytest.mark.gpu_0
-    def test_vllm_args_has_search_strategy(self, vllm_args):
-        """Test that vllm_args fixture has search_strategy attribute."""
-        assert hasattr(vllm_args, "search_strategy")
-        assert vllm_args.search_strategy == SearchStrategy.THOROUGH
-        assert hasattr(vllm_args, "system")
-        assert vllm_args.system == ""
-
-    @pytest.mark.pre_merge
-    @pytest.mark.unit
-    @pytest.mark.gpu_0
-    def test_sglang_args_has_search_strategy(self, sglang_args):
-        """Test that sglang_args fixture has search_strategy attribute."""
-        assert hasattr(sglang_args, "search_strategy")
-        assert sglang_args.search_strategy == SearchStrategy.THOROUGH
-        assert hasattr(sglang_args, "system")
-        assert sglang_args.system == ""
-
-    @pytest.mark.pre_merge
-    @pytest.mark.unit
-    @pytest.mark.gpu_0
-    def test_trtllm_args_has_search_strategy(self, trtllm_args):
-        """Test that trtllm_args fixture has search_strategy attribute."""
-        assert hasattr(trtllm_args, "search_strategy")
-        assert trtllm_args.search_strategy == SearchStrategy.THOROUGH
-        assert hasattr(trtllm_args, "system")
-        assert trtllm_args.system == ""
-
-    @pytest.mark.pre_merge
-    @pytest.mark.unit
-    @pytest.mark.gpu_0
-    def test_sglang_moe_args_has_search_strategy(self, sglang_moe_args):
-        """Test that sglang_moe_args fixture has search_strategy attribute."""
-        assert hasattr(sglang_moe_args, "search_strategy")
-        assert sglang_moe_args.search_strategy == SearchStrategy.THOROUGH
-        assert hasattr(sglang_moe_args, "system")
-        assert sglang_moe_args.system == ""
-
-    @pytest.mark.pre_merge
-    @pytest.mark.unit
-    @pytest.mark.gpu_0
-    def test_model_autogen_args_have_rapid_strategy(
-        self,
-        vllm_args_with_model_autogen,
-        sglang_args_with_model_autogen,
-        trtllm_args_with_model_autogen,
-    ):
-        """Test that model autogen fixtures have RAPID search strategy and GPU info."""
-        for args_fixture in [
-            vllm_args_with_model_autogen,
-            sglang_args_with_model_autogen,
-            trtllm_args_with_model_autogen,
-        ]:
-            assert hasattr(args_fixture, "search_strategy")
-            assert args_fixture.search_strategy == SearchStrategy.RAPID
-            assert hasattr(args_fixture, "system")
-            assert args_fixture.system == "h100_sxm"
-            # Verify GPU discovery attributes
-            assert hasattr(args_fixture, "num_gpus_per_node")
-            assert args_fixture.num_gpus_per_node == 8
-            assert hasattr(args_fixture, "gpu_model")
-            assert args_fixture.gpu_model == "H100-SXM5-80GB"
-            assert hasattr(args_fixture, "gpu_vram_mib")
-            assert args_fixture.gpu_vram_mib == 81920