feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with...

feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with automatic injection (#6224) Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with...
feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with automatic injection (#6224) Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
d56439ec · hhzhang16 · GitHub · 233a1e9a · d56439ec · 233a1e9a
Unverified Commit d56439ec authored Feb 13, 2026 by hhzhang16 Committed by GitHub Feb 14, 2026
8 changed files
--- a/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest_test.go
+++ b/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest_test.go
@@ -28,6 +28,7 @@ import (
 func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
 	validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}`
+	validConfigWithHardware := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}, "hardware": {"numGpusPerNode": 8, "gpuModel": "H100-SXM5-80GB", "gpuVramMib": 81920}}`
 	configWithDifferentBackend := `{"engine": {"backend": "sglang"}}`
 	configWithDifferentModel := `{"deployment": {"model": "different-model"}}`
 	invalidYAML := `{invalid yaml`
@@ -128,65 +129,19 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
 			errMsg:        "spec.profilingConfig.config is required and must not be empty",
 		},
 		{
-			name: "enableGpuDiscovery true for cluster-wide operator",
+			name: "namespace-restricted operator (GPU discovery will fail gracefully)",
 			request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "test-dgdr",
 					Namespace: "default",
 				},
 				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
-					Model:              "llama-3-8b",
+					Model:   "llama-3-8b",
-					Backend:            "vllm",
+					Backend: "vllm",
-					EnableGpuDiscovery: true,
-					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
-						ProfilerImage: "profiler:latest",
-						Config: &apiextensionsv1.JSON{
-							Raw: []byte(validConfig),
-						},
-					},
-				},
-			},
-			isClusterWide: true,
-			wantErr:       false,
-		},
-		{
-			name: "enableGpuDiscovery true for namespace-restricted operator",
-			request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      "test-dgdr",
-					Namespace: "default",
-				},
-				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
-					Model:              "llama-3-8b",
-					Backend:            "vllm",
-					EnableGpuDiscovery: true,
-					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
-						ProfilerImage: "profiler:latest",
-						Config: &apiextensionsv1.JSON{
-							Raw: []byte(validConfig),
-						},
-					},
-				},
-			},
-			isClusterWide: false,
-			wantErr:       true,
-			errMsg:        "spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config",
-		},
-		{
-			name: "enableGpuDiscovery false for namespace-restricted operator",
-			request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      "test-dgdr",
-					Namespace: "default",
-				},
-				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
-					Model:              "llama-3-8b",
-					Backend:            "vllm",
-					EnableGpuDiscovery: false,
 					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
 						ProfilerImage: "profiler:latest",
 						Config: &apiextensionsv1.JSON{
-							Raw: []byte(validConfig),
+							Raw: []byte(validConfigWithHardware),
 						},
 					},
 				},
@@ -263,16 +218,15 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
 			expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)",
 		},
 		{
-			name: "multiple errors (missing profiler image, missing config, and enableGpuDiscovery for namespace-restricted)",
+			name: "multiple errors (missing profiler image and missing config)",
 			request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "test-dgdr",
 					Namespace: "default",
 				},
 				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
-					Model:              "llama-3-8b",
+					Model:   "llama-3-8b",
-					Backend:            "vllm",
+					Backend: "vllm",
-					EnableGpuDiscovery: true,
 					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
 						ProfilerImage: "",
 						Config:        nil,
@@ -281,9 +235,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
 			},
 			isClusterWide: false,
 			wantErr:       true,
-			errMsg:        "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty\nspec.enableGpuDiscovery can only be set to true for cluster-wide operators",
+			errMsg:        "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty",
 			errContains:   true,
 		},
+		// TODO: Add test for invalid GPU range (min > max) validation
+		// The validation logic is in place (lines 148-152 of dynamographdeploymentrequest.go)
+		// but needs proper test coverage
 	}
 	for _, tt := range tests {

--- a/deploy/utils/gpu_inventory.py
+++ b/deploy/utils/gpu_inventory.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-import argparse
-import json
-import logging
-import re
-import shutil
-import subprocess
-import time
-import uuid
-from dataclasses import asdict, dataclass
-from typing import Dict, List, Optional, Tuple, Union
-from kubernetes import client, config
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-console_handler = logging.StreamHandler()
-console_handler.setLevel(logging.INFO)
-formatter = logging.Formatter(
-    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
-)
-console_handler.setFormatter(formatter)
-logger.addHandler(console_handler)
-def run_command(cmd: List[str], capture_output: bool = True, exit_on_error: bool = True):  # type: ignore
-    try:
-        return subprocess.run(cmd, capture_output=capture_output, text=True, check=True)
-    except subprocess.CalledProcessError as e:  # pragma: no cover - passthrough
-        if exit_on_error:
-            logger.error(f"Command failed: {' '.join(cmd)}")
-            if e.stdout:
-                logger.error(e.stdout)
-            if e.stderr:
-                logger.error(e.stderr)
-            raise RuntimeError(f"Command failed: {' '.join(cmd)}")
-        raise
-NVIDIA_PREFIX = "nvidia.com/"
-LABEL_GPU_COUNT = f"{NVIDIA_PREFIX}gpu.count"
-LABEL_GPU_PRODUCT = f"{NVIDIA_PREFIX}gpu.product"
-LABEL_GPU_MEMORY = f"{NVIDIA_PREFIX}gpu.memory"  # MiB per GPU
-LABEL_MIG_CAPABLE = f"{NVIDIA_PREFIX}mig.capable"
-@dataclass
-class NodeGpuInventory:
-    node_name: str
-    gpu_count: Optional[int]
-    gpu_product: Optional[str]
-    gpu_memory_mib: Optional[int]
-    mig_capable: Optional[bool]
-    allocatable_gpu: Optional[int]
-    mig_resources: Dict[str, str]
-    def to_dict(self) -> Dict[str, Union[str, int, bool, Dict[str, str], None]]:
-        return asdict(self)
-def _parse_int(value: Optional[str]) -> Optional[int]:
-    if value is None:
-        return None
-    try:
-        return int(value)
-    except (TypeError, ValueError):
-        match = re.search(r"\d+", str(value))
-        return int(match.group(0)) if match else None
-def _bool_from_str(value: Optional[str]) -> Optional[bool]:
-    if value is None:
-        return None
-    s = str(value).strip().lower()
-    if s in {"true", "1", "yes"}:
-        return True
-    if s in {"false", "0", "no"}:
-        return False
-    return None
-def _normalize_node(node: Union[client.V1Node, Dict]) -> Dict:
-    # Convert V1Node to dict for uniform access
-    if hasattr(node, "to_dict"):
-        return node.to_dict()
-    return node  # assume already dict
-def _extract_inventory(node_obj: Dict) -> NodeGpuInventory:
-    meta = node_obj.get("metadata", {})
-    status = node_obj.get("status", {})
-    labels = meta.get("labels", {}) or {}
-    node_name = meta.get("name", "<unknown>")
-    gpu_product = labels.get(LABEL_GPU_PRODUCT)
-    gpu_memory_mib = _parse_int(labels.get(LABEL_GPU_MEMORY))
-    mig_capable = _bool_from_str(labels.get(LABEL_MIG_CAPABLE))
-    # Prefer GFD-reported GPU count if present; otherwise use allocatable nvidia.com/gpu
-    gpu_count = _parse_int(labels.get(LABEL_GPU_COUNT))
-    alloc = status.get("allocatable", {}) or {}
-    alloc_gpu = _parse_int(alloc.get(f"{NVIDIA_PREFIX}gpu"))
-    if gpu_count is None:
-        gpu_count = alloc_gpu
-    # Collect MIG resource keys and counts if present
-    mig_resources: Dict[str, str] = {
-        k: str(v)
-        for k, v in alloc.items()
-        if isinstance(k, str)
-        and k.startswith(f"{NVIDIA_PREFIX}mig-")
-        and _parse_int(str(v))
-    }
-    return NodeGpuInventory(
-        node_name=node_name,
-        gpu_count=gpu_count,
-        gpu_product=gpu_product,
-        gpu_memory_mib=gpu_memory_mib,
-        mig_capable=mig_capable,
-        allocatable_gpu=alloc_gpu,
-        mig_resources=mig_resources,
-    )
-def _list_nodes_via_client() -> List[Dict]:
-    # Assume running inside a Kubernetes pod with service account
-    try:
-        config.load_incluster_config()
-    except Exception as e:
-        raise RuntimeError(
-            f"Failed to load in-cluster Kubernetes config. Ensure this runs in a pod with a service account. Error: {e}"
-        )
-    v1 = client.CoreV1Api()
-    items = v1.list_node().items  # type: ignore[attr-defined]
-    return [_normalize_node(n) for n in items]
-def _list_nodes_via_kubectl() -> List[Dict]:
-    if not shutil.which("kubectl"):
-        raise RuntimeError("kubectl not found in PATH for fallback")
-    result = run_command(["kubectl", "get", "nodes", "-o", "json"], capture_output=True)
-    data = json.loads(result.stdout)
-    return data.get("items", [])
-def collect_gpu_inventory(
-    prefer_client: bool = True,
-) -> Tuple[List[NodeGpuInventory], str]:
-    sources_tried: List[str] = []
-    errors: List[str] = []
-    def _via_client() -> List[NodeGpuInventory]:
-        items = _list_nodes_via_client()
-        return [_extract_inventory(n) for n in items]
-    def _via_kubectl() -> List[NodeGpuInventory]:
-        items = _list_nodes_via_kubectl()
-        return [_extract_inventory(n) for n in items]
-    if prefer_client:
-        try:
-            sources_tried.append("kubernetes-client")
-            return _via_client(), ",".join(sources_tried)
-        except Exception as e:
-            errors.append(str(e))
-            try:
-                sources_tried.append("kubectl-json")
-                return _via_kubectl(), ",".join(sources_tried)
-            except Exception as e2:
-                errors.append(str(e2))
-                raise RuntimeError("Failed to list nodes: " + " | ".join(errors))
-    else:
-        try:
-            sources_tried.append("kubectl-json")
-            return _via_kubectl(), ",".join(sources_tried)
-        except Exception as e:
-            errors.append(str(e))
-            try:
-                sources_tried.append("kubernetes-client")
-                return _via_client(), ",".join(sources_tried)
-            except Exception as e2:
-                errors.append(str(e2))
-                raise RuntimeError("Failed to list nodes: " + " | ".join(errors))
-def _format_gib(mib: Optional[int]) -> str:
-    if mib is None:
-        return ""
-    return f"{mib/1024:.1f} GiB"
-def print_table(rows: List[NodeGpuInventory], show_mig: bool = False) -> None:
-    headers = ["NODE", "GPUS", "MODEL", "VRAM/GPU", "MIG"]
-    table: List[List[str]] = []
-    for r in rows:
-        mig_str = ""
-        if r.mig_capable is True:
-            if r.mig_resources:
-                mig_str = ",".join(
-                    f"{k.split('/')[-1]}={v}"
-                    for k, v in sorted(r.mig_resources.items())
-                )
-            else:
-                mig_str = "capable"
-        elif r.mig_capable is False:
-            mig_str = "no"
-        table.append(
-            [
-                r.node_name,
-                "" if r.gpu_count is None else str(r.gpu_count),
-                r.gpu_product or "",
-                _format_gib(r.gpu_memory_mib),
-                mig_str if show_mig else ("yes" if r.mig_capable else ""),
-            ]
-        )
-    # Compute column widths
-    widths = [len(h) for h in headers]
-    for row in table:
-        for i, cell in enumerate(row):
-            widths[i] = max(widths[i], len(cell))
-    def _fmt_row(row: List[str]) -> str:
-        return "  ".join(cell.ljust(widths[i]) for i, cell in enumerate(row))
-    logger.info(_fmt_row(headers))
-    logger.info(_fmt_row(["-" * w for w in widths]))
-    for row in table:
-        logger.info(_fmt_row(row))
-def aggregate_valued_rows(
-    rows: List[NodeGpuInventory],
-) -> Tuple[Optional[NodeGpuInventory], int]:
-    """Aggregate rows that have meaningful GPU metadata.
-    Preference order when multiple distinct values exist:
-    1) Larger GPUs per node (gpu_count)
-    2) Larger VRAM per GPU (gpu_memory_mib)
-    Returns (selected_row_like, distinct_count).
-    """
-    valued: List[NodeGpuInventory] = [
-        r for r in rows if (r.gpu_product is not None or r.gpu_memory_mib is not None)
-    ]
-    if not valued:
-        return None, 0
-    # Group by (product, vram_mib)
-    from collections import defaultdict
-    groups: Dict[
-        Tuple[Optional[str], Optional[int]],
-        Dict[str, object],
-    ] = defaultdict(lambda: {"max_gpu": 0, "rows": []})
-    for r in valued:
-        key = (r.gpu_product, r.gpu_memory_mib)
-        meta = groups[key]
-        meta["rows"].append(r)  # type: ignore[attr-defined, index]
-        # Use known gpu_count if available for ranking
-        if r.gpu_count is not None:
-            meta["max_gpu"] = max(int(meta["max_gpu"]), int(r.gpu_count))  # type: ignore[arg-type, call-overload, index]
-    def sort_key(
-        item: Tuple[
-            Tuple[Optional[str], Optional[int]],
-            Dict[str, object],
-        ]
-    ):
-        (prod, mem_mib), meta = item
-        max_gpu = int(meta["max_gpu"])  # type: ignore[arg-type, call-overload, index]
-        mem_val = mem_mib if mem_mib is not None else -1
-        return (max_gpu, mem_val)
-    selected_key, selected_meta = sorted(groups.items(), key=sort_key, reverse=True)[0]
-    sel_prod, sel_mem_mib = selected_key
-    sel_gpu = int(selected_meta["max_gpu"])  # type: ignore[arg-type, call-overload, index]
-    selected = NodeGpuInventory(
-        node_name="<aggregate>",
-        gpu_count=sel_gpu if sel_gpu > 0 else None,
-        gpu_product=sel_prod,
-        gpu_memory_mib=sel_mem_mib,
-        mig_capable=None,
-        allocatable_gpu=None,
-        mig_resources={},
-    )
-    return selected, len(groups)
-def _get_current_namespace(default: str = "default") -> str:
-    try:
-        with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") as f:
-            return f.read().strip() or default
-    except Exception:
-        return default
-def enrich_with_smi(
-    rows: List[NodeGpuInventory],
-    namespace: Optional[str] = None,
-    timeout_seconds: int = 180,
-) -> None:
-    """For nodes missing product/memory labels, schedule a short-lived pod on each node
-    that requests 1 GPU and runs nvidia-smi to capture model and memory.
-    Requires permissions: create/get/delete pods and get pods/log in the namespace.
-    """
-    ns = namespace or _get_current_namespace()
-    try:
-        config.load_incluster_config()
-    except Exception:
-        pass
-    v1 = client.CoreV1Api()
-    for inv in rows:
-        if not inv.gpu_count or (
-            inv.gpu_product is not None and inv.gpu_memory_mib is not None
-        ):
-            continue
-        pod_name = f"gpu-inv-smi-{uuid.uuid4().hex[:6]}"
-        container = client.V1Container(
-            name="smi",
-            image="nvidia/cuda:12.3.2-base-ubuntu22.04",
-            command=["bash", "-lc"],
-            args=[
-                "nvidia-smi --query-gpu=name,memory.total --format=csv,noheader,nounits"
-            ],
-            resources=client.V1ResourceRequirements(
-                limits={"nvidia.com/gpu": "1", "cpu": "100m", "memory": "128Mi"},
-                requests={"nvidia.com/gpu": "1", "cpu": "50m", "memory": "64Mi"},
-            ),
-        )
-        pod = client.V1Pod(
-            api_version="v1",
-            kind="Pod",
-            metadata=client.V1ObjectMeta(name=pod_name, namespace=ns),
-            spec=client.V1PodSpec(
-                restart_policy="Never",
-                node_name=inv.node_name,
-                containers=[container],
-            ),
-        )
-        logs = ""
-        try:
-            v1.create_namespaced_pod(namespace=ns, body=pod)
-            start = time.time()
-            while time.time() - start < timeout_seconds:
-                p = v1.read_namespaced_pod(name=pod_name, namespace=ns)
-                phase = (p.status.phase or "").lower()
-                if phase in ("succeeded", "failed"):
-                    break
-                time.sleep(2)
-            try:
-                logs = v1.read_namespaced_pod_log(name=pod_name, namespace=ns)
-            except Exception:
-                logs = ""
-        finally:
-            try:
-                v1.delete_namespaced_pod(
-                    name=pod_name, namespace=ns, body=client.V1DeleteOptions()
-                )
-            except Exception:
-                pass
-        for line in logs.splitlines():
-            parts = [x.strip() for x in line.split(",")]
-            if len(parts) >= 2 and parts[0]:
-                inv.gpu_product = inv.gpu_product or parts[0]
-                mem_match = re.search(r"\d+", parts[1])
-                if mem_match:
-                    inv.gpu_memory_mib = inv.gpu_memory_mib or int(mem_match.group(0))
-                break
-def get_gpu_summary(
-    prefer_client: bool = True, enrich_smi: bool = True
-) -> Dict[str, object]:
-    """Return an aggregate GPU summary for the cluster.
-    Selection policy when multiple values exist: prefer higher GPUs per node,
-    then higher VRAM/GPU. Returns dict with keys: gpus_per_node, model, vram.
-    If model/VRAM unavailable anywhere, returns {"gpus_per_node": max_gpus, "model": "", "vram": 0}.
-    """
-    # TODO: use proper tools (i.e., DCGM) to get GPU inventory
-    rows, _ = collect_gpu_inventory(prefer_client=prefer_client)
-    if enrich_smi:
-        enrich_with_smi(rows)
-    agg, _distinct = aggregate_valued_rows(rows)
-    if agg is None:
-        # Fallback to max GPUs only
-        max_gpus = 0
-        for r in rows:
-            if r.gpu_count is not None:
-                max_gpus = max(max_gpus, int(r.gpu_count))
-        return {"gpus_per_node": max_gpus, "model": "", "vram": 0}
-    gpus_val = int(agg.gpu_count) if agg.gpu_count is not None else 0
-    model_val = agg.gpu_product or ""
-    vram_val = int(agg.gpu_memory_mib) if agg.gpu_memory_mib is not None else 0
-    return {
-        "gpus_per_node": gpus_val,
-        "model": model_val,
-        "vram": vram_val,
-    }
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Report GPU inventory per Kubernetes node (count, SKU, VRAM)."
-    )
-    parser.add_argument(
-        "--format",
-        "-o",
-        choices=["table", "json"],
-        default="table",
-        help="Output format",
-    )
-    parser.add_argument(
-        "--prefer",
-        choices=["client", "kubectl"],
-        default="client",
-        help="Prefer Kubernetes Python client or kubectl JSON fallback",
-    )
-    parser.add_argument(
-        "--show-mig",
-        action="store_true",
-        help="In table output, show MIG resource types and counts",
-    )
-    parser.add_argument(
-        "--enrich-smi",
-        action="store_true",
-        help="Schedule short-lived pods per node to fetch model/VRAM via nvidia-smi",
-    )
-    parser.add_argument(
-        "--aggregate",
-        action="store_true",
-        help="Print a single representative (GPUs per node, MODEL, VRAM/GPU). Warn if multiple values exist",
-    )
-    args = parser.parse_args()
-    prefer_client = args.prefer == "client"
-    rows, source = collect_gpu_inventory(prefer_client=prefer_client)
-    if args.enrich_smi:
-        enrich_with_smi(rows)
-    if args.format == "json":
-        payload = {
-            "source": source,
-            "items": [r.to_dict() for r in rows],
-        }
-        logger.info(json.dumps(payload, indent=2))
-        return
-    # Table output
-    print_table(rows, show_mig=args.show_mig)
-    if args.aggregate:
-        agg, distinct = aggregate_valued_rows(rows)
-        if agg is None:
-            logger.warning("No nodes expose MODEL/VRAM; cannot aggregate")
-            return
-        if distinct > 1:
-            logger.warning(
-                f"Multiple distinct GPU model/VRAM pairs detected across nodes: {distinct}. Showing highest GPUs per node, then highest VRAM/GPU."
-            )
-        # Print concise aggregate line
-        model = agg.gpu_product or ""
-        vram = _format_gib(agg.gpu_memory_mib)
-        gpus = agg.gpu_count if agg.gpu_count is not None else ""
-        logger.info(f"Aggregate => GPUS: {gpus}  MODEL: {model}  VRAM/GPU: {vram}")
-if __name__ == "__main__":
-    main()
--- a/docs/components/profiler/profiler_guide.md
+++ b/docs/components/profiler/profiler_guide.md
--- a/docs/kubernetes/api_reference.md
+++ b/docs/kubernetes/api_reference.md
--- a/docs/pages/components/profiler/profiler-guide.md
+++ b/docs/pages/components/profiler/profiler-guide.md
@@ -227,14 +227,15 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#
 ### Automatic GPU Discovery
-Cluster-scoped operators can optionally enable automatic GPU discovery:
+The operator automatically discovers GPU resources from your Kubernetes cluster nodes when available. GPU discovery provides:
-```yaml
+- Hardware information (GPU model, VRAM, GPUs per node)
-spec:
+- Automatic calculation of profiling search space based on model size
-  enableGpuDiscovery: true
+- Hardware system identifier for AI Configurator integration
-```
+**Permissions**: GPU discovery requires cluster-wide node read permissions. Cluster-scoped operators automatically have these permissions. Namespace-restricted operators can also use GPU discovery if granted node read permissions via RBAC.
-This is only available with cluster-scoped operators (`namespaceRestriction.enabled=false`) as it requires cluster-wide node access permissions.
+If GPU discovery is unavailable (no permissions or no GPU labels), the profiler will use manually specified hardware configuration or defaults.
 ## Configuration

--- a/docs/pages/kubernetes/api-reference.md
+++ b/docs/pages/kubernetes/api-reference.md
@@ -462,8 +462,8 @@ _Appears in:_
 | `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. |  | Required: \{\} <br /> |
 | `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. |  | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
 | `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false |  |
-| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU<br />resources from the Kubernetes cluster nodes. When enabled, the profiler will override<br />any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,<br />numGpusPerNode) with values detected from the cluster.<br />Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\} <br /> |
+| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes<br />cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),<br />discovered GPU configuration is used as defaults when hardware configuration is not manually<br />specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values<br />always take precedence over auto-discovered values. If GPU discovery fails (e.g.,<br />namespace-restricted operator without node permissions), manual hardware config is required.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />model and backend fields and should not be specified in this config. |  | Required: \{\} <br /> |
-| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />modelName and backend fields and should not be specified in this config. |  | Required: \{\} <br /> |
+| `enableGpuDiscovery` _boolean_ | EnableGPUDiscovery previously controlled whether the operator attempted to discover GPU hardware from cluster nodes.<br />DEPRECATED: This field will be removed in v1beta1. GPU discovery is now always<br />attempted automatically. Setting this field has no effect - the operator will always try to discover<br />GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped<br />operator without permissions), manual hardware configuration is required regardless of this setting. | true | Optional: \{\} <br /> |
 | `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false |  |
 | `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. |  | Optional: \{\} <br /> |

--- a/tests/profiler/test_profile_sla_aiconfigurator.py
+++ b/tests/profiler/test_profile_sla_aiconfigurator.py
@@ -18,6 +18,7 @@ project_root = Path(__file__).parent.parent.parent
 sys.path.insert(0, str(project_root))
 from dynamo.profiler.profile_sla import run_profile  # noqa: E402
+from dynamo.profiler.utils.defaults import SearchStrategy  # noqa: E402
 from dynamo.profiler.utils.model_info import ModelInfo  # noqa: E402
 pytestmark = [
@@ -44,7 +45,7 @@ class TestProfileSlaAiconfigurator:
    def llm_args(self, request):
        class Args:
            def __init__(self):
-                self.model = ""
+                self.model = "Qwen/Qwen3-32B"  # Set to match aic_hf_id for consistency
                self.dgd_image = ""
                self.backend = "trtllm"
                self.config = "examples/backends/trtllm/deploy/disagg.yaml"
@@ -63,14 +64,13 @@ class TestProfileSlaAiconfigurator:
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = False
-                self.use_ai_configurator = True
-                self.aic_system = "h200_sxm"
-                self.aic_hf_id = "Qwen/Qwen3-32B"
-                self.aic_backend = ""
-                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                self.pick_with_webui = False
+                # Use RAPID strategy to leverage AI Configurator for perf estimation
+                # This avoids Kubernetes deployments while testing aiconfigurator functionality
+                self.search_strategy = SearchStrategy.RAPID
+                self.system = "h200_sxm"  # Must match aic_system for RAPID strategy
                # Provide minimal model_info to avoid HF queries
                self.model_info = ModelInfo(
                    model_size=16384.0,
@@ -86,10 +86,10 @@ class TestProfileSlaAiconfigurator:
    @pytest.mark.performance
    @pytest.mark.parallel
    @pytest.mark.asyncio
-    @pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"])
+    @pytest.mark.parametrize("missing_arg", ["system", "model"])
    async def test_aiconfigurator_missing_args(self, llm_args, missing_arg):
-        # Check that validation error happens when a required arg is missing.
+        # Check that validation error happens when a required arg is missing for RAPID strategy.
-        # Note: aic_backend_version is optional - when None, auto-detects latest version
+        # These args are required when using SearchStrategy.RAPID with AI Configurator.
        setattr(llm_args, missing_arg, None)
        with pytest.raises(ValueError):
            await run_profile(llm_args)
@@ -103,8 +103,7 @@ class TestProfileSlaAiconfigurator:
        "arg_name, bad_value",
        [
            # these values don't exist in the aiconfigurator database.
-            ("aic_system", "fake_gpu_system"),
+            ("system", "fake_gpu_system"),
-            ("aic_backend_version", "0.1.0"),
        ],
    )
    async def test_aiconfigurator_no_data(self, llm_args, arg_name, bad_value):
@@ -131,14 +130,11 @@ class TestProfileSlaAiconfigurator:
    @pytest.mark.nightly
    # fmt: off
    @pytest.mark.parametrize(
-        "backend, aic_backend_version",
+        "backend",
        [
-            pytest.param("trtllm", None,          marks=pytest.mark.trtllm),
+            pytest.param("trtllm", marks=pytest.mark.trtllm),
-            pytest.param("trtllm", "1.2.0rc5",    marks=pytest.mark.trtllm),
+            pytest.param("vllm",   marks=pytest.mark.vllm),
-            pytest.param("vllm",   None,          marks=pytest.mark.vllm),
+            pytest.param("sglang", marks=pytest.mark.sglang),
-            pytest.param("vllm",   "0.12.0",      marks=pytest.mark.vllm),
-            pytest.param("sglang", None,          marks=pytest.mark.sglang),
-            pytest.param("sglang", "0.5.6.post2", marks=pytest.mark.sglang),
        ],
    )
    # fmt: on
@@ -149,11 +145,10 @@ class TestProfileSlaAiconfigurator:
            "meta-llama/Llama-3.1-405B",
        ],
    )
-    async def test_aiconfigurator_dense_models(
+    async def test_aiconfigurator_dense_models(self, llm_args, hf_model_id, backend):
-        self, llm_args, hf_model_id, backend, aic_backend_version
+        # Test that profile_sla works with a variety of backends and model names
-    ):
+        # using AI Configurator's RAPID strategy for performance estimation.
-        # Test that profile_sla works with a variety of backend versions and model names.
+        # Backend version is not used with RAPID strategy - performance comes from AI Configurator.
-        llm_args.aic_hf_id = hf_model_id
+        llm_args.model = hf_model_id  # Used by RAPID strategy
        llm_args.backend = backend
-        llm_args.aic_backend_version = aic_backend_version
        await run_profile(llm_args)
--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py