feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with...

feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with automatic injection (#6224) Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with...
feat: migrate GPU discovery from Dynamo Profiler to Dynamo Operator with automatic injection (#6224) Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
d56439ec · hhzhang16 · GitHub · 233a1e9a · d56439ec · 233a1e9a
Unverified Commit d56439ec authored Feb 13, 2026 by hhzhang16 Committed by GitHub Feb 14, 2026
8 changed files
--- a/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest_test.go
+++ b/deploy/operator/internal/webhook/validation/dynamographdeploymentrequest_test.go
@@ -28,6 +28,7 @@ import (

 func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
 	validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}`
+	validConfigWithHardware := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}, "hardware": {"numGpusPerNode": 8, "gpuModel": "H100-SXM5-80GB", "gpuVramMib": 81920}}`
 	configWithDifferentBackend := `{"engine": {"backend": "sglang"}}`
 	configWithDifferentModel := `{"deployment": {"model": "different-model"}}`
 	invalidYAML := `{invalid yaml`
@@ -128,65 +129,19 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
 			errMsg:        "spec.profilingConfig.config is required and must not be empty",
 		},
 		{
-			name: "enableGpuDiscovery true for cluster-wide operator",
+			name: "namespace-restricted operator (GPU discovery will fail gracefully)",
 			request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "test-dgdr",
 					Namespace: "default",
 				},
 				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
-					Model:              "llama-3-8b",
-					Backend:            "vllm",
-					EnableGpuDiscovery: true,
-					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
-						ProfilerImage: "profiler:latest",
-						Config: &apiextensionsv1.JSON{
-							Raw: []byte(validConfig),
-						},
-					},
-				},
-			},
-			isClusterWide: true,
-			wantErr:       false,
-		},
-		{
-			name: "enableGpuDiscovery true for namespace-restricted operator",
-			request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      "test-dgdr",
-					Namespace: "default",
-				},
-				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
-					Model:              "llama-3-8b",
-					Backend:            "vllm",
-					EnableGpuDiscovery: true,
-					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
-						ProfilerImage: "profiler:latest",
-						Config: &apiextensionsv1.JSON{
-							Raw: []byte(validConfig),
-						},
-					},
-				},
-			},
-			isClusterWide: false,
-			wantErr:       true,
-			errMsg:        "spec.enableGpuDiscovery can only be set to true for cluster-wide operators. Namespace-restricted operators cannot access cluster nodes for GPU discovery. Please set enableGpuDiscovery to false and provide hardware configuration (hardware.min_num_gpus_per_engine, hardware.max_num_gpus_per_engine, hardware.num_gpus_per_node) in spec.profilingConfig.config",
-		},
-		{
-			name: "enableGpuDiscovery false for namespace-restricted operator",
-			request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      "test-dgdr",
-					Namespace: "default",
-				},
-				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
-					Model:              "llama-3-8b",
-					Backend:            "vllm",
-					EnableGpuDiscovery: false,
+					Model:   "llama-3-8b",
+					Backend: "vllm",
 					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
 						ProfilerImage: "profiler:latest",
 						Config: &apiextensionsv1.JSON{
-							Raw: []byte(validConfig),
+							Raw: []byte(validConfigWithHardware),
 						},
 					},
 				},
@@ -263,16 +218,15 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
 			expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)",
 		},
 		{
-			name: "multiple errors (missing profiler image, missing config, and enableGpuDiscovery for namespace-restricted)",
+			name: "multiple errors (missing profiler image and missing config)",
 			request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "test-dgdr",
 					Namespace: "default",
 				},
 				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
-					Model:              "llama-3-8b",
-					Backend:            "vllm",
-					EnableGpuDiscovery: true,
+					Model:   "llama-3-8b",
+					Backend: "vllm",
 					ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
 						ProfilerImage: "",
 						Config:        nil,
@@ -281,9 +235,12 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
 			},
 			isClusterWide: false,
 			wantErr:       true,
-			errMsg:        "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty\nspec.enableGpuDiscovery can only be set to true for cluster-wide operators",
+			errMsg:        "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty",
 			errContains:   true,
 		},
+		// TODO: Add test for invalid GPU range (min > max) validation
+		// The validation logic is in place (lines 148-152 of dynamographdeploymentrequest.go)
+		// but needs proper test coverage
 	}

 	for _, tt := range tests {

--- a/deploy/utils/gpu_inventory.py
+++ b/deploy/utils/gpu_inventory.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import argparse
-import json
-import logging
-import re
-import shutil
-import subprocess
-import time
-import uuid
-from dataclasses import asdict, dataclass
-from typing import Dict, List, Optional, Tuple, Union
-
-from kubernetes import client, config
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-console_handler = logging.StreamHandler()
-console_handler.setLevel(logging.INFO)
-formatter = logging.Formatter(
-    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
-)
-console_handler.setFormatter(formatter)
-logger.addHandler(console_handler)
-
-
-def run_command(cmd: List[str], capture_output: bool = True, exit_on_error: bool = True):  # type: ignore
-    try:
-        return subprocess.run(cmd, capture_output=capture_output, text=True, check=True)
-    except subprocess.CalledProcessError as e:  # pragma: no cover - passthrough
-        if exit_on_error:
-            logger.error(f"Command failed: {' '.join(cmd)}")
-            if e.stdout:
-                logger.error(e.stdout)
-            if e.stderr:
-                logger.error(e.stderr)
-            raise RuntimeError(f"Command failed: {' '.join(cmd)}")
-        raise
-
-
-NVIDIA_PREFIX = "nvidia.com/"
-LABEL_GPU_COUNT = f"{NVIDIA_PREFIX}gpu.count"
-LABEL_GPU_PRODUCT = f"{NVIDIA_PREFIX}gpu.product"
-LABEL_GPU_MEMORY = f"{NVIDIA_PREFIX}gpu.memory"  # MiB per GPU
-LABEL_MIG_CAPABLE = f"{NVIDIA_PREFIX}mig.capable"
-
-
-@dataclass
-class NodeGpuInventory:
-    node_name: str
-    gpu_count: Optional[int]
-    gpu_product: Optional[str]
-    gpu_memory_mib: Optional[int]
-    mig_capable: Optional[bool]
-    allocatable_gpu: Optional[int]
-    mig_resources: Dict[str, str]
-
-    def to_dict(self) -> Dict[str, Union[str, int, bool, Dict[str, str], None]]:
-        return asdict(self)
-
-
-def _parse_int(value: Optional[str]) -> Optional[int]:
-    if value is None:
-        return None
-    try:
-        return int(value)
-    except (TypeError, ValueError):
-        match = re.search(r"\d+", str(value))
-        return int(match.group(0)) if match else None
-
-
-def _bool_from_str(value: Optional[str]) -> Optional[bool]:
-    if value is None:
-        return None
-    s = str(value).strip().lower()
-    if s in {"true", "1", "yes"}:
-        return True
-    if s in {"false", "0", "no"}:
-        return False
-    return None
-
-
-def _normalize_node(node: Union[client.V1Node, Dict]) -> Dict:
-    # Convert V1Node to dict for uniform access
-    if hasattr(node, "to_dict"):
-        return node.to_dict()
-    return node  # assume already dict
-
-
-def _extract_inventory(node_obj: Dict) -> NodeGpuInventory:
-    meta = node_obj.get("metadata", {})
-    status = node_obj.get("status", {})
-    labels = meta.get("labels", {}) or {}
-
-    node_name = meta.get("name", "<unknown>")
-    gpu_product = labels.get(LABEL_GPU_PRODUCT)
-    gpu_memory_mib = _parse_int(labels.get(LABEL_GPU_MEMORY))
-    mig_capable = _bool_from_str(labels.get(LABEL_MIG_CAPABLE))
-
-    # Prefer GFD-reported GPU count if present; otherwise use allocatable nvidia.com/gpu
-    gpu_count = _parse_int(labels.get(LABEL_GPU_COUNT))
-
-    alloc = status.get("allocatable", {}) or {}
-    alloc_gpu = _parse_int(alloc.get(f"{NVIDIA_PREFIX}gpu"))
-
-    if gpu_count is None:
-        gpu_count = alloc_gpu
-
-    # Collect MIG resource keys and counts if present
-    mig_resources: Dict[str, str] = {
-        k: str(v)
-        for k, v in alloc.items()
-        if isinstance(k, str)
-        and k.startswith(f"{NVIDIA_PREFIX}mig-")
-        and _parse_int(str(v))
-    }
-
-    return NodeGpuInventory(
-        node_name=node_name,
-        gpu_count=gpu_count,
-        gpu_product=gpu_product,
-        gpu_memory_mib=gpu_memory_mib,
-        mig_capable=mig_capable,
-        allocatable_gpu=alloc_gpu,
-        mig_resources=mig_resources,
-    )
-
-
-def _list_nodes_via_client() -> List[Dict]:
-    # Assume running inside a Kubernetes pod with service account
-    try:
-        config.load_incluster_config()
-    except Exception as e:
-        raise RuntimeError(
-            f"Failed to load in-cluster Kubernetes config. Ensure this runs in a pod with a service account. Error: {e}"
-        )
-
-    v1 = client.CoreV1Api()
-    items = v1.list_node().items  # type: ignore[attr-defined]
-    return [_normalize_node(n) for n in items]
-
-
-def _list_nodes_via_kubectl() -> List[Dict]:
-    if not shutil.which("kubectl"):
-        raise RuntimeError("kubectl not found in PATH for fallback")
-    result = run_command(["kubectl", "get", "nodes", "-o", "json"], capture_output=True)
-    data = json.loads(result.stdout)
-    return data.get("items", [])
-
-
-def collect_gpu_inventory(
-    prefer_client: bool = True,
-) -> Tuple[List[NodeGpuInventory], str]:
-    sources_tried: List[str] = []
-    errors: List[str] = []
-
-    def _via_client() -> List[NodeGpuInventory]:
-        items = _list_nodes_via_client()
-        return [_extract_inventory(n) for n in items]
-
-    def _via_kubectl() -> List[NodeGpuInventory]:
-        items = _list_nodes_via_kubectl()
-        return [_extract_inventory(n) for n in items]
-
-    if prefer_client:
-        try:
-            sources_tried.append("kubernetes-client")
-            return _via_client(), ",".join(sources_tried)
-        except Exception as e:
-            errors.append(str(e))
-            try:
-                sources_tried.append("kubectl-json")
-                return _via_kubectl(), ",".join(sources_tried)
-            except Exception as e2:
-                errors.append(str(e2))
-                raise RuntimeError("Failed to list nodes: " + " | ".join(errors))
-    else:
-        try:
-            sources_tried.append("kubectl-json")
-            return _via_kubectl(), ",".join(sources_tried)
-        except Exception as e:
-            errors.append(str(e))
-            try:
-                sources_tried.append("kubernetes-client")
-                return _via_client(), ",".join(sources_tried)
-            except Exception as e2:
-                errors.append(str(e2))
-                raise RuntimeError("Failed to list nodes: " + " | ".join(errors))
-
-
-def _format_gib(mib: Optional[int]) -> str:
-    if mib is None:
-        return ""
-    return f"{mib/1024:.1f} GiB"
-
-
-def print_table(rows: List[NodeGpuInventory], show_mig: bool = False) -> None:
-    headers = ["NODE", "GPUS", "MODEL", "VRAM/GPU", "MIG"]
-    table: List[List[str]] = []
-    for r in rows:
-        mig_str = ""
-        if r.mig_capable is True:
-            if r.mig_resources:
-                mig_str = ",".join(
-                    f"{k.split('/')[-1]}={v}"
-                    for k, v in sorted(r.mig_resources.items())
-                )
-            else:
-                mig_str = "capable"
-        elif r.mig_capable is False:
-            mig_str = "no"
-
-        table.append(
-            [
-                r.node_name,
-                "" if r.gpu_count is None else str(r.gpu_count),
-                r.gpu_product or "",
-                _format_gib(r.gpu_memory_mib),
-                mig_str if show_mig else ("yes" if r.mig_capable else ""),
-            ]
-        )
-
-    # Compute column widths
-    widths = [len(h) for h in headers]
-    for row in table:
-        for i, cell in enumerate(row):
-            widths[i] = max(widths[i], len(cell))
-
-    def _fmt_row(row: List[str]) -> str:
-        return "  ".join(cell.ljust(widths[i]) for i, cell in enumerate(row))
-
-    logger.info(_fmt_row(headers))
-    logger.info(_fmt_row(["-" * w for w in widths]))
-    for row in table:
-        logger.info(_fmt_row(row))
-
-
-def aggregate_valued_rows(
-    rows: List[NodeGpuInventory],
-) -> Tuple[Optional[NodeGpuInventory], int]:
-    """Aggregate rows that have meaningful GPU metadata.
-
-    Preference order when multiple distinct values exist:
-    1) Larger GPUs per node (gpu_count)
-    2) Larger VRAM per GPU (gpu_memory_mib)
-    Returns (selected_row_like, distinct_count).
-    """
-    valued: List[NodeGpuInventory] = [
-        r for r in rows if (r.gpu_product is not None or r.gpu_memory_mib is not None)
-    ]
-    if not valued:
-        return None, 0
-
-    # Group by (product, vram_mib)
-    from collections import defaultdict
-
-    groups: Dict[
-        Tuple[Optional[str], Optional[int]],
-        Dict[str, object],
-    ] = defaultdict(lambda: {"max_gpu": 0, "rows": []})
-    for r in valued:
-        key = (r.gpu_product, r.gpu_memory_mib)
-        meta = groups[key]
-        meta["rows"].append(r)  # type: ignore[attr-defined, index]
-        # Use known gpu_count if available for ranking
-        if r.gpu_count is not None:
-            meta["max_gpu"] = max(int(meta["max_gpu"]), int(r.gpu_count))  # type: ignore[arg-type, call-overload, index]
-
-    def sort_key(
-        item: Tuple[
-            Tuple[Optional[str], Optional[int]],
-            Dict[str, object],
-        ]
-    ):
-        (prod, mem_mib), meta = item
-        max_gpu = int(meta["max_gpu"])  # type: ignore[arg-type, call-overload, index]
-        mem_val = mem_mib if mem_mib is not None else -1
-        return (max_gpu, mem_val)
-
-    selected_key, selected_meta = sorted(groups.items(), key=sort_key, reverse=True)[0]
-    sel_prod, sel_mem_mib = selected_key
-    sel_gpu = int(selected_meta["max_gpu"])  # type: ignore[arg-type, call-overload, index]
-
-    selected = NodeGpuInventory(
-        node_name="<aggregate>",
-        gpu_count=sel_gpu if sel_gpu > 0 else None,
-        gpu_product=sel_prod,
-        gpu_memory_mib=sel_mem_mib,
-        mig_capable=None,
-        allocatable_gpu=None,
-        mig_resources={},
-    )
-
-    return selected, len(groups)
-
-
-def _get_current_namespace(default: str = "default") -> str:
-    try:
-        with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") as f:
-            return f.read().strip() or default
-    except Exception:
-        return default
-
-
-def enrich_with_smi(
-    rows: List[NodeGpuInventory],
-    namespace: Optional[str] = None,
-    timeout_seconds: int = 180,
-) -> None:
-    """For nodes missing product/memory labels, schedule a short-lived pod on each node
-    that requests 1 GPU and runs nvidia-smi to capture model and memory.
-
-    Requires permissions: create/get/delete pods and get pods/log in the namespace.
-    """
-    ns = namespace or _get_current_namespace()
-    try:
-        config.load_incluster_config()
-    except Exception:
-        pass
-
-    v1 = client.CoreV1Api()
-
-    for inv in rows:
-        if not inv.gpu_count or (
-            inv.gpu_product is not None and inv.gpu_memory_mib is not None
-        ):
-            continue
-
-        pod_name = f"gpu-inv-smi-{uuid.uuid4().hex[:6]}"
-        container = client.V1Container(
-            name="smi",
-            image="nvidia/cuda:12.3.2-base-ubuntu22.04",
-            command=["bash", "-lc"],
-            args=[
-                "nvidia-smi --query-gpu=name,memory.total --format=csv,noheader,nounits"
-            ],
-            resources=client.V1ResourceRequirements(
-                limits={"nvidia.com/gpu": "1", "cpu": "100m", "memory": "128Mi"},
-                requests={"nvidia.com/gpu": "1", "cpu": "50m", "memory": "64Mi"},
-            ),
-        )
-
-        pod = client.V1Pod(
-            api_version="v1",
-            kind="Pod",
-            metadata=client.V1ObjectMeta(name=pod_name, namespace=ns),
-            spec=client.V1PodSpec(
-                restart_policy="Never",
-                node_name=inv.node_name,
-                containers=[container],
-            ),
-        )
-
-        logs = ""
-        try:
-            v1.create_namespaced_pod(namespace=ns, body=pod)
-            start = time.time()
-            while time.time() - start < timeout_seconds:
-                p = v1.read_namespaced_pod(name=pod_name, namespace=ns)
-                phase = (p.status.phase or "").lower()
-                if phase in ("succeeded", "failed"):
-                    break
-                time.sleep(2)
-            try:
-                logs = v1.read_namespaced_pod_log(name=pod_name, namespace=ns)
-            except Exception:
-                logs = ""
-        finally:
-            try:
-                v1.delete_namespaced_pod(
-                    name=pod_name, namespace=ns, body=client.V1DeleteOptions()
-                )
-            except Exception:
-                pass
-
-        for line in logs.splitlines():
-            parts = [x.strip() for x in line.split(",")]
-            if len(parts) >= 2 and parts[0]:
-                inv.gpu_product = inv.gpu_product or parts[0]
-                mem_match = re.search(r"\d+", parts[1])
-                if mem_match:
-                    inv.gpu_memory_mib = inv.gpu_memory_mib or int(mem_match.group(0))
-                break
-
-
-def get_gpu_summary(
-    prefer_client: bool = True, enrich_smi: bool = True
-) -> Dict[str, object]:
-    """Return an aggregate GPU summary for the cluster.
-
-    Selection policy when multiple values exist: prefer higher GPUs per node,
-    then higher VRAM/GPU. Returns dict with keys: gpus_per_node, model, vram.
-    If model/VRAM unavailable anywhere, returns {"gpus_per_node": max_gpus, "model": "", "vram": 0}.
-    """
-    # TODO: use proper tools (i.e., DCGM) to get GPU inventory
-    rows, _ = collect_gpu_inventory(prefer_client=prefer_client)
-    if enrich_smi:
-        enrich_with_smi(rows)
-
-    agg, _distinct = aggregate_valued_rows(rows)
-    if agg is None:
-        # Fallback to max GPUs only
-        max_gpus = 0
-        for r in rows:
-            if r.gpu_count is not None:
-                max_gpus = max(max_gpus, int(r.gpu_count))
-        return {"gpus_per_node": max_gpus, "model": "", "vram": 0}
-
-    gpus_val = int(agg.gpu_count) if agg.gpu_count is not None else 0
-    model_val = agg.gpu_product or ""
-    vram_val = int(agg.gpu_memory_mib) if agg.gpu_memory_mib is not None else 0
-    return {
-        "gpus_per_node": gpus_val,
-        "model": model_val,
-        "vram": vram_val,
-    }
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Report GPU inventory per Kubernetes node (count, SKU, VRAM)."
-    )
-    parser.add_argument(
-        "--format",
-        "-o",
-        choices=["table", "json"],
-        default="table",
-        help="Output format",
-    )
-    parser.add_argument(
-        "--prefer",
-        choices=["client", "kubectl"],
-        default="client",
-        help="Prefer Kubernetes Python client or kubectl JSON fallback",
-    )
-    parser.add_argument(
-        "--show-mig",
-        action="store_true",
-        help="In table output, show MIG resource types and counts",
-    )
-    parser.add_argument(
-        "--enrich-smi",
-        action="store_true",
-        help="Schedule short-lived pods per node to fetch model/VRAM via nvidia-smi",
-    )
-    parser.add_argument(
-        "--aggregate",
-        action="store_true",
-        help="Print a single representative (GPUs per node, MODEL, VRAM/GPU). Warn if multiple values exist",
-    )
-
-    args = parser.parse_args()
-
-    prefer_client = args.prefer == "client"
-    rows, source = collect_gpu_inventory(prefer_client=prefer_client)
-
-    if args.enrich_smi:
-        enrich_with_smi(rows)
-
-    if args.format == "json":
-        payload = {
-            "source": source,
-            "items": [r.to_dict() for r in rows],
-        }
-        logger.info(json.dumps(payload, indent=2))
-        return
-
-    # Table output
-    print_table(rows, show_mig=args.show_mig)
-
-    if args.aggregate:
-        agg, distinct = aggregate_valued_rows(rows)
-        if agg is None:
-            logger.warning("No nodes expose MODEL/VRAM; cannot aggregate")
-            return
-        if distinct > 1:
-            logger.warning(
-                f"Multiple distinct GPU model/VRAM pairs detected across nodes: {distinct}. Showing highest GPUs per node, then highest VRAM/GPU."
-            )
-        # Print concise aggregate line
-        model = agg.gpu_product or ""
-        vram = _format_gib(agg.gpu_memory_mib)
-        gpus = agg.gpu_count if agg.gpu_count is not None else ""
-        logger.info(f"Aggregate => GPUS: {gpus}  MODEL: {model}  VRAM/GPU: {vram}")
-
-
-if __name__ == "__main__":
-    main()
--- a/docs/components/profiler/profiler_guide.md
+++ b/docs/components/profiler/profiler_guide.md
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+-->
+
+# Profiler Guide
+
+This guide covers deployment, configuration, integration, and troubleshooting for the Dynamo Profiler.
+
+## What is a DynamoGraphDeploymentRequest (DGDR)?
+
+A **DynamoGraphDeploymentRequest (DGDR)** is a Kubernetes Custom Resource that serves as the primary interface for users to request model deployments with specific performance and resource constraints. You specify:
+
+- **What** model you want to deploy (`model`)
+- **How** it should perform (SLA targets: `ttft`, `itl`)
+- **Where** it should run (optional GPU preferences)
+- **Which** backend to use (`backend`: vllm, sglang, or trtllm)
+- **Which** images to use (`profilingConfig.profilerImage`, `deploymentOverrides.workersImage`)
+
+The Dynamo Operator watches for DGDRs and automatically:
+1. Discovers available GPU resources in your cluster
+2. Runs profiling (online or offline) to find optimal configurations
+3. Generates an optimized DynamoGraphDeployment (DGD) configuration
+4. Deploys the DGD to your cluster
+
+**Relationship to DGD:**
+- **DGDR**: High-level "intent" - what you want deployed
+- **DGD**: Low-level "implementation" - how it's deployed
+
+## Support Matrix
+
+| Backend | Dense Models | MoE Models |
+|---------|-------------|------------|
+| vLLM | ✅ | 🚧 |
+| SGLang | ✅ | ✅ |
+| TensorRT-LLM | ✅ | 🚧 |
+
+The profiler sweeps over the following parallelization mappings for prefill and decode:
+
+| Model Architecture | Prefill Parallelization Mapping | Decode Parallelization Mapping |
+|---------|-------------|------------|
+| MLA+MoE (DeepseekV3ForCausalLM, DeepseekV32ForCausalLM) | TEP, DEP | TEP, DEP |
+| GQA+MoE (Qwen3MoeForCausalLM) | TP, TEP, DEP | TP, TEP, DEP |
+| Other Models | TP | TP |
+
+> [!NOTE]
+> Exact model x parallelization mapping support is dependent on the backend. The profiler does not guarantee that the recommended P/D engine configuration is supported and bug-free by the backend.
+
+## Deployment
+
+### Kubernetes Deployment (DGDR)
+
+The recommended deployment method is through DGDRs. Sample configurations are provided in `benchmarks/profiler/deploy/`:
+
+| Sample | Description |
+|--------|-------------|
+| `profile_sla_dgdr.yaml` | Standard online profiling with AIPerf |
+| `profile_sla_aic_dgdr.yaml` | Fast offline profiling with AI Configurator |
+| `profile_sla_moe_dgdr.yaml` | MoE model profiling (SGLang) |
+
+#### Container Images
+
+Each DGDR requires container images for profiling and deployment:
+
+- **`profilingConfig.profilerImage`** (Required): Container image for the profiling job. Must contain the profiler code and dependencies.
+- **`deploymentOverrides.workersImage`** (Optional): Container image for DGD worker components (frontend, workers, planner). If omitted, uses image from the base config file.
+
+```yaml
+spec:
+  profilingConfig:
+    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+  deploymentOverrides:
+    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+```
+
+#### Quick Start: Deploy with DGDR
+
+**Step 1: Create Your DGDR**
+
+Use a sample configuration or create your own:
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeploymentRequest
+metadata:
+  name: my-model-profiling
+spec:
+  model: "Qwen/Qwen3-0.6B"
+  backend: vllm
+  profilingConfig:
+    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+    config:
+      sla:
+        isl: 3000
+        osl: 150
+        ttft: 200.0
+        itl: 20.0
+  deploymentOverrides:
+    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+  autoApply: true
+```
+
+**Step 2: Apply the DGDR**
+
+```bash
+export NAMESPACE=your-namespace
+kubectl apply -f my-profiling-dgdr.yaml -n $NAMESPACE
+```
+
+**Step 3: Monitor Progress**
+
+```bash
+# View status
+kubectl get dgdr -n $NAMESPACE
+
+# Detailed status
+kubectl describe dgdr my-model-profiling -n $NAMESPACE
+
+# Watch profiling job logs
+kubectl logs -f job/profile-my-model-profiling -n $NAMESPACE
+```
+
+**DGDR Status States:**
+- `Pending`: Initial state, preparing to profile
+- `Profiling`: Running profiling job (20-30 seconds for AIC, 2-4 hours for online)
+- `Deploying`: Generating and applying DGD configuration
+- `Ready`: DGD successfully deployed and running
+- `Failed`: Error occurred (check events for details)
+
+**Step 4: Access Your Deployment**
+
+```bash
+# Find the frontend service
+kubectl get svc -n $NAMESPACE | grep frontend
+
+# Port-forward to access locally
+kubectl port-forward svc/<deployment>-frontend 8000:8000 -n $NAMESPACE
+
+# Test the endpoint
+curl http://localhost:8000/v1/models
+```
+
+> [!NOTE]
+> DGDRs are **immutable**. To update SLAs or configuration, delete the existing DGDR and create a new one.
+
+### Direct Script Execution
+
+For advanced use cases or local development:
+
+```bash
+python -m benchmarks.profiler.profile_sla \
+  --backend vllm \
+  --config path/to/disagg.yaml \
+  --model meta-llama/Llama-3-8B \
+  --ttft 200 --itl 15 \
+  --isl 3000 --osl 150 \
+  --min-num-gpus 1 \
+  --max-num-gpus 8
+```
+
+## Profiling Method
+
+The profiler follows a 5-step process:
+
+1. **Hardware Setup**: Uses defaults or user-specified hardware configuration. Optionally, cluster-scoped operators can enable automatic GPU discovery to detect specifications from cluster nodes.
+2. **Identify Sweep Ranges**: Automatically determine minimum and maximum number of GPUs per engine. Minimum is determined by the model size and GPU VRAM. Maximum is set to one node for dense models and 4 nodes for MoE models.
+3. **Parallelization Mapping Sweep**: Test performance of engines with different parallelization mappings using the input ISL and OSL.
+   - For dense models, test different TP sizes for both prefill and decode.
+   - For MoE models (SGLang), evaluate both TEP and DEP as candidates for prefill and decode.
+   - **Prefill**:
+     - TP/TEP: Measure TTFT with batch size = 1 (assuming ISL is long enough to saturate compute) without KV reuse.
+     - DEP: Attention uses data parallelism. Send a single burst with total concurrency `attention_dp_size × attn_dp_num_req_ratio` (defaults to 4) and compute the reported TTFT as `time_to_first_token.max / attn_dp_num_req_ratio` from the AIPerf summary of that burst.
+   ![Prefill Performance](../../images/h100_prefill_performance.png)
+   - **Decode**: Measure the ITL under different numbers of in-flight requests, from 1 to the maximum the KV cache can hold. To measure ITL without being affected by piggy-backed prefill requests, the script enables KV-reuse and warms up the engine by issuing the same prompts before measuring.
+   ![Decode Performance](../../images/h100_decode_performance.png)
+4. **Recommendation**: Select optimal parallelization mapping for prefill and decode that achieves the highest per-GPU throughput while adhering to the SLA on TTFT and ITL.
+5. **In-Depth Profiling on the Recommended P/D Engine**: Interpolate TTFT with ISL and ITL with active KV cache and decode context length for more accurate performance estimation.
+![ITL Interpolation](../../images/pd_interpolation.png)
+   - **Prefill**: Measures TTFT and throughput per GPU across different input lengths with batch size=1.
+   - **Decode**: Measures ITL and throughput per GPU under various KV cache loads and decode context lengths.
+
+### AIPerf on Real Engines
+
+Profiles your model by creating real test deployments in Kubernetes and measuring their performance.
+
+- **Duration**: 2-4 hours
+- **Accuracy**: Highest (real measurements)
+- **GPU Requirements**: Full access to test different parallelization mappings
+- **Backends**: vLLM, SGLang, TensorRT-LLM
+
+```yaml
+profilingConfig:
+  config:
+    sweep:
+      useAiConfigurator: false  # Default
+```
+
+### AI Configurator Simulation
+
+Uses performance simulation to rapidly estimate optimal configurations without running real deployments.
+
+- **Duration**: 20-30 seconds
+- **Accuracy**: Estimated (may have errors for unusual configurations)
+- **GPU Requirements**: None
+- **Backends**: TensorRT-LLM only (vLLM/SGLang coming soon)
+
+```yaml
+profilingConfig:
+  config:
+    sweep:
+      useAiConfigurator: true
+      aicSystem: h200_sxm
+      aicHfId: Qwen/Qwen3-32B
+      aicBackendVersion: "0.20.0"      # TRT-LLM version simulated by AIC
+```
+
+> [!NOTE]
+> `aicBackendVersion` specifies the TensorRT-LLM version that AI Configurator simulates. See the [AI Configurator supported features](https://github.com/ai-dynamo/aiconfigurator#supported-features) for available versions.
+
+**Currently supports:**
+- **Backends**: TensorRT-LLM (versions 0.20.0, 1.0.0rc3, 1.0.0rc6)
+- **Systems**: H100 SXM, H200 SXM, B200 SXM, GB200 SXM, A100 SXM
+- **Models**: Wide range including GPT, Llama, Mixtral, DeepSeek, Qwen, and more
+
+See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features) for the full list.
+
+### Automatic GPU Discovery
+
+The operator automatically discovers GPU resources from your Kubernetes cluster nodes when available. GPU discovery provides:
+
+- Hardware information (GPU model, VRAM, GPUs per node)
+- Automatic calculation of profiling search space based on model size
+- Hardware system identifier for AI Configurator integration
+
+**Permissions**: GPU discovery requires cluster-wide node read permissions. Cluster-scoped operators automatically have these permissions. Namespace-restricted operators can also use GPU discovery if granted node read permissions via RBAC.
+
+If GPU discovery is unavailable (no permissions or no GPU labels), the profiler will use manually specified hardware configuration or defaults.
+
+## Configuration
+
+### DGDR Configuration Structure
+
+All profiler configuration goes under `spec.profilingConfig.config`:
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeploymentRequest
+metadata:
+  name: my-deployment
+spec:
+  model: "Qwen/Qwen3-0.6B"
+  backend: vllm
+
+  profilingConfig:
+    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+    configMapRef:                  # Optional: base DGD config
+      name: my-config
+      key: disagg.yaml
+
+    config:
+      sla: { ... }
+      hardware: { ... }
+      sweep: { ... }
+      planner: { ... }
+
+  deploymentOverrides:
+    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+```
+
+### SLA Configuration (Required)
+
+```yaml
+sla:
+  isl: 3000      # Average input sequence length (tokens)
+  osl: 150       # Average output sequence length (tokens)
+  ttft: 200.0    # Target Time To First Token (milliseconds)
+  itl: 20.0      # Target Inter-Token Latency (milliseconds)
+```
+
+- **ISL/OSL**: Based on your expected traffic patterns
+- **TTFT**: First token latency target (lower = more GPUs needed, affects prefill engine)
+- **ITL**: Token generation latency target (lower = more GPUs needed, affects decode engine)
+- **Trade-offs**: Tighter SLAs require more GPU resources
+
+### Hardware Configuration (Optional)
+
+```yaml
+hardware:
+  minNumGpusPerEngine: 2      # Auto-determined from model size and VRAM if not provided
+  maxNumGpusPerEngine: 8      # Maximum GPUs to test
+  numGpusPerNode: 8           # GPUs per node (for multi-node MoE)
+  gpuType: h200_sxm           # GPU type hint (informational, auto-detected)
+```
+
+- **minNumGpusPerEngine**: Skip small TP sizes if your model is large
+- **maxNumGpusPerEngine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error))
+- **numGpusPerNode**: Determine the upper bound of GPUs per node for dense models and configure Grove for multi-node MoE engines
+- **gpuType**: Informational only, auto-detected by the controller. For AI Configurator, use `aicSystem` in the [sweep configuration](#ai-configurator-configuration) instead
+
+> [!TIP]
+> If you don't specify hardware constraints, the controller auto-detects based on your model size and available cluster resources.
+
+### Sweep Configuration (Optional)
+
+```yaml
+sweep:
+  useAiConfigurator: false              # Use real profiling (default)
+  prefillInterpolationGranularity: 16   # Samples for prefill TTFT curve
+  decodeInterpolationGranularity: 6     # Samples for decode ITL curve
+```
+
+- **useAiConfigurator**: Set to `true` for 20-30 second profiling (TensorRT-LLM only)
+- **prefillInterpolationGranularity**: Samples for prefill TTFT curve (lower = faster but less accurate)
+- **decodeInterpolationGranularity**: Samples for decode ITL curve. Since ITL interpolation is 3D and takes longer, we default to fewer samples. Increasing this value may quadratically increase profiling time.
+
+### AI Configurator Configuration
+
+Required if `useAiConfigurator: true`:
+
+```yaml
+sweep:
+  useAiConfigurator: true
+  aicSystem: h200_sxm              # h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
+  aicHfId: Qwen/Qwen3-32B         # HuggingFace model ID
+  aicBackendVersion: "0.20.0"      # TensorRT-LLM version
+```
+
+### Planner Configuration (Optional)
+
+Pass arguments to the SLA planner:
+
+```yaml
+planner:
+  planner_min_endpoint: 2                    # Minimum endpoints to maintain
+  planner_adjustment_interval: 60            # Adjustment interval (seconds)
+  planner_load_predictor: linear             # Load prediction method
+```
+
+> [!NOTE]
+> Planner arguments use `planner_` prefix. See the AI Configurator documentation for full list.
+
+### Model Cache PVC (Advanced)
+
+For large models, use a pre-populated PVC containing model weights instead of downloading from HuggingFace:
+
+```yaml
+deployment:
+  modelCache:
+    pvcName: "model-cache"
+    pvcPath: "hub/models--deepseek-ai--DeepSeek-R1"
+    mountPath: "/opt/model-cache"
+```
+
+Requirements:
+- The PVC must exist in the same namespace as the DGDR
+- The model weights must be accessible at `{mountPath}/{pvcPath}`
+
+### Engine Configuration (Auto-configured)
+
+The controller automatically injects these from high-level fields:
+
+```yaml
+# You specify:
+spec:
+  model: "Qwen/Qwen3-0.6B"
+  backend: vllm
+
+# Controller auto-injects:
+profilingConfig:
+  config:
+    deployment:
+      model: "Qwen/Qwen3-0.6B"
+    engine:
+      backend: vllm
+      config: /path/to/configmap
+```
+
+You should **not** manually set `deployment.model` or `engine.backend` in `profilingConfig.config`.
+
+### Using Existing DGD Configs (ConfigMap)
+
+Reference an existing DGD config via ConfigMap:
+
+```bash
+kubectl create configmap my-config \
+  --from-file=disagg.yaml=/path/to/your/disagg.yaml \
+  --namespace $NAMESPACE \
+  --dry-run=client -o yaml | kubectl apply -f -
+```
+
+```yaml
+profilingConfig:
+  configMapRef:
+    name: my-config
+    key: disagg.yaml
+```
+
+The profiler uses the DGD config as a **base template**, then optimizes it based on your SLA targets.
+
+### CLI Arguments
+
+| Argument | Type | Default | Description |
+|----------|------|---------|-------------|
+| `--backend` | string | - | Inference backend: vllm, sglang, trtllm |
+| `--config` | string | - | Path to DGD YAML config file |
+| `--model` | string | - | HuggingFace model ID |
+| `--ttft` | float | - | Target TTFT in milliseconds |
+| `--itl` | float | - | Target ITL in milliseconds |
+| `--isl` | int | - | Average input sequence length |
+| `--osl` | int | - | Average output sequence length |
+| `--min-num-gpus` | int | auto | Minimum GPUs per engine |
+| `--max-num-gpus` | int | 8 | Maximum GPUs per engine |
+| `--use-ai-configurator` | flag | false | Use offline AI Configurator |
+| `--pick-with-webui` | flag | false | Launch interactive WebUI |
+| `--webui-port` | int | 8000 | Port for WebUI |
+
+> [!NOTE]
+> CLI arguments map to DGDR config fields: `--min-num-gpus` = `hardware.minNumGpusPerEngine`, `--max-num-gpus` = `hardware.maxNumGpusPerEngine`, `--use-ai-configurator` = `sweep.useAiConfigurator`. See [DGDR Configuration Structure](#dgdr-configuration-structure) for all field mappings.
+
+## Integration
+
+### With SLA Planner
+
+The Profiler generates interpolation data that the SLA Planner uses for autoscaling decisions.
+
+**Prefill Interpolation** (`selected_prefill_interpolation/raw_data.npz`):
+- `prefill_isl`: 1D array of input sequence lengths tested
+- `prefill_ttft`: 1D array of TTFTs (ms) at each ISL
+- `prefill_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each ISL
+
+**Decode Interpolation** (`selected_decode_interpolation/raw_data.npz`):
+- `max_kv_tokens`: Total KV tokens capacity in decode engine
+- `x_kv_usage`: 1D array of active KV usage percentages [0, 1]
+- `y_context_length`: 1D array of average context lengths tested
+- `z_itl`: 1D array of ITLs (ms) at each (KV usage, context length) point
+- `z_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each point
+
+### With Dynamo Operator
+
+When using DGDR, the Dynamo Operator:
+
+1. Creates profiling jobs automatically
+2. Stores profiling data in ConfigMaps (`planner-profile-data`)
+3. Generates optimized DGD configurations
+4. Deploys the DGD with SLA Planner integration
+
+The generated DGD is tracked via labels:
+```yaml
+metadata:
+  labels:
+    dgdr.nvidia.com/name: my-deployment
+    dgdr.nvidia.com/namespace: your-namespace
+```
+
+### With Observability
+
+Monitor profiling jobs:
+
+```bash
+kubectl logs -f job/profile-<dgdr-name> -n $NAMESPACE
+kubectl describe dgdr <name> -n $NAMESPACE
+```
+
+## Advanced Topics
+
+### Manual Deployment Control
+
+Disable auto-deployment to review the generated DGD before applying:
+
+```yaml
+spec:
+  autoApply: false
+```
+
+Then manually extract and apply:
+
+```bash
+# Extract generated DGD from DGDR status
+kubectl get dgdr my-deployment -n $NAMESPACE -o jsonpath='{.status.generatedDeployment}' | kubectl apply -f -
+
+# Or save to file for review
+kubectl get dgdr my-deployment -n $NAMESPACE -o jsonpath='{.status.generatedDeployment}' > my-dgd.yaml
+```
+
+### Mocker Deployment
+
+Deploy a mocker deployment that simulates engines without GPUs:
+
+```yaml
+spec:
+  model: <model-name>
+  backend: trtllm
+  useMocker: true    # Deploy mocker instead of real backend
+  autoApply: true
+```
+
+Profiling still runs against the real backend to collect performance data. The mocker uses this data to simulate realistic timing behavior. Useful for large-scale experiments, testing Planner behavior, and validating configurations.
+
+### Accessing Profiling Artifacts
+
+By default, profiling data is stored in ConfigMaps. For detailed artifacts (plots, logs, raw data), attach a PVC:
+
+```yaml
+profilingConfig:
+  outputPVC: "dynamo-pvc"
+```
+
+**ConfigMaps (always created):**
+- `dgdr-output-<name>`: Generated DGD configuration
+- `planner-profile-data`: Profiling data for Planner (JSON)
+
+**PVC artifacts (optional):**
+- Performance plots (PNGs)
+- DGD configurations for each profiled deployment
+- AIPerf profiling artifacts
+- Raw profiling data (`.npz` files)
+- Profiler logs
+
+Access PVC results:
+```bash
+kubectl apply -f deploy/utils/manifests/pvc-access-pod.yaml -n $NAMESPACE
+kubectl wait --for=condition=Ready pod/pvc-access-pod -n $NAMESPACE --timeout=60s
+kubectl cp $NAMESPACE/pvc-access-pod:/data ./profiling-results
+kubectl delete pod pvc-access-pod -n $NAMESPACE
+```
+
+### Output Performance Plots
+
+The profiler generates plots to visualize performance data:
+
+**Parallelization Mapping Sweep Plots:**
+- `prefill_performance.png`: TTFT vs Parallelization Mapping size
+- `decode_performance.png`: ITL vs Parallelization Mapping size and in-flight requests
+
+**In-Depth Profiling Plots:**
+- `selected_prefill_interpolation/prefill_ttft_interpolation.png`: TTFT vs ISL
+- `selected_prefill_interpolation/prefill_throughput_interpolation.png`: Throughput vs ISL
+- `selected_decode_interpolation/decode_itl_interplation.png`: ITL vs KV usage and context length
+- `selected_decode_interpolation/decode_throughput_interpolation.png`: Throughput vs KV usage and context length
+
+## Runtime Profiling (SGLang)
+
+SGLang workers expose profiling endpoints for runtime performance analysis:
+
+```bash
+# Start profiling
+curl -X POST http://localhost:9090/engine/start_profile \
+  -H "Content-Type: application/json" \
+  -d '{"output_dir": "/tmp/profiler_output"}'
+
+# Run inference requests...
+
+# Stop profiling
+curl -X POST http://localhost:9090/engine/stop_profile
+```
+
+View traces using Chrome's `chrome://tracing`, [Perfetto UI](https://ui.perfetto.dev/), or TensorBoard.
+
+## Troubleshooting
+
+### Profiling Takes Too Long
+
+**Solution 1**: Use AI Configurator for rapid profiling (TensorRT-LLM only):
+```yaml
+sweep:
+  useAiConfigurator: true
+```
+
+**Solution 2**: Reduce search space:
+```yaml
+hardware:
+  minNumGpusPerEngine: 4  # Skip TP1, TP2
+  maxNumGpusPerEngine: 8  # Don't test beyond TP8
+```
+
+### SLA Cannot Be Met
+
+**Symptoms**: Profiler reports no configuration meets targets
+
+**Solutions:**
+1. Relax SLA targets (increase TTFT/ITL)
+2. Add more GPU resources
+3. Try a different backend
+4. Use a smaller model
+
+### AI Configurator: Attention Head Constraint Error
+
+**Symptoms**: Profiling fails with error:
+```text
+AssertionError: num_heads <N> should be divisible by tp_size <M> and the division result should be >= 4
+```
+
+**Cause**: AI Configurator requires **≥4 attention heads per GPU**. Small models with few heads cannot use high TP sizes.
+
+**Affected Models:**
+- **Qwen3-0.6B** (16 heads): Max TP = 4
+- **GPT-2** (12 heads): Max TP = 3
+- Most models **<1B parameters**: May hit this constraint
+
+**Solution**: Limit `maxNumGpusPerEngine`:
+```yaml
+hardware:
+  maxNumGpusPerEngine: 4  # For Qwen3-0.6B (16 heads / 4 = max TP of 4)
+```
+
+**Calculate Max TP**: `max_tp = num_attention_heads / 4`
+
+> [!NOTE]
+> This is an AI Configurator limitation. Online profiling doesn't have this constraint.
+
+### Image Pull Errors
+
+**Symptoms**: `ErrImagePull` or `ImagePullBackOff`
+
+**Solution**: Ensure image pull secrets are configured:
+```bash
+kubectl create secret docker-registry nvcr-imagepullsecret \
+  --docker-server=nvcr.io \
+  --docker-username='$oauthtoken' \
+  --docker-password=<NGC_API_KEY> \
+  --namespace <your-namespace>
+```
+
+### Out of Memory During Profiling
+
+**Symptoms**: OOM errors in profiling jobs
+
+**Solutions:**
+1. Reduce `gpu_memory_utilization` in engine config
+2. Reduce `--max-context-length`
+3. Skip larger TP configurations
+4. Use fewer GPUs per test
+
+### Unsupported Parallelization Mapping in Backend
+
+**Symptoms**: Startup/runtime error in the backend (e.g., prime number of attention heads constraining TP to 1, or backend not supporting different TP sizes for prefill and decode).
+
+**Solutions:**
+1. Contact the backend to add support and bump backend version in Dynamo
+2. Constrain the max and min number of GPUs per engine to the supported range
+
+## See Also
+
+- [DGDR Examples](../../../components/src/dynamo/profiler/deploy/) - Complete DGDR YAML examples
+- [DGDR API Reference](/docs/kubernetes/api_reference.md) - DGDR specification
+- [Profiler Arguments Reference](https://github.com/ai-dynamo/dynamo/blob/main/components/src/dynamo/profiler/utils/profiler_argparse.py) - Full CLI reference
--- a/docs/kubernetes/api_reference.md
+++ b/docs/kubernetes/api_reference.md
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+> **⚠️ Important**: This documentation is automatically generated from source code.
+> Do not edit this file directly.
+
+# API Reference
+
+## Packages
+- [nvidia.com/v1alpha1](#nvidiacomv1alpha1)
+
+
+## nvidia.com/v1alpha1
+
+Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group.
+
+This package defines the DynamoGraphDeploymentRequest (DGDR) custom resource, which provides
+a high-level, SLA-driven interface for deploying machine learning models on Dynamo.
+
+Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group.
+
+### Resource Types
+- [DynamoCheckpoint](#dynamocheckpoint)
+- [DynamoComponentDeployment](#dynamocomponentdeployment)
+- [DynamoGraphDeployment](#dynamographdeployment)
+- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
+- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
+- [DynamoModel](#dynamomodel)
+
+
+
+#### Autoscaling
+
+
+
+Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
+with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
+for migration guidance. This field will be removed in a future API version.
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `enabled` _boolean_ | Deprecated: This field is ignored. |  |  |
+| `minReplicas` _integer_ | Deprecated: This field is ignored. |  |  |
+| `maxReplicas` _integer_ | Deprecated: This field is ignored. |  |  |
+| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | Deprecated: This field is ignored. |  |  |
+| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | Deprecated: This field is ignored. |  |  |
+
+
+
+
+#### CheckpointMode
+
+_Underlying type:_ _string_
+
+CheckpointMode defines how checkpoint creation is handled
+
+_Validation:_
+- Enum: [Auto Manual]
+
+_Appears in:_
+- [ServiceCheckpointConfig](#servicecheckpointconfig)
+
+| Field | Description |
+| --- | --- |
+| `Auto` | CheckpointModeAuto means the DGD controller will automatically create a Checkpoint CR<br /> |
+| `Manual` | CheckpointModeManual means the user must create the Checkpoint CR themselves<br /> |
+
+
+#### ComponentKind
+
+_Underlying type:_ _string_
+
+ComponentKind represents the type of underlying Kubernetes resource.
+
+_Validation:_
+- Enum: [PodClique PodCliqueScalingGroup Deployment LeaderWorkerSet]
+
+_Appears in:_
+- [ServiceReplicaStatus](#servicereplicastatus)
+
+| Field | Description |
+| --- | --- |
+| `PodClique` | ComponentKindPodClique represents a PodClique resource.<br /> |
+| `PodCliqueScalingGroup` | ComponentKindPodCliqueScalingGroup represents a PodCliqueScalingGroup resource.<br /> |
+| `Deployment` | ComponentKindDeployment represents a Deployment resource.<br /> |
+| `LeaderWorkerSet` | ComponentKindLeaderWorkerSet represents a LeaderWorkerSet resource.<br /> |
+
+
+#### ConfigMapKeySelector
+
+
+
+ConfigMapKeySelector selects a specific key from a ConfigMap.
+Used to reference external configuration data stored in ConfigMaps.
+
+
+
+_Appears in:_
+- [ProfilingConfigSpec](#profilingconfigspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `name` _string_ | Name of the ConfigMap containing the desired data. |  | Required: \{\} <br /> |
+| `key` _string_ | Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml". | disagg.yaml |  |
+
+
+#### DeploymentOverridesSpec
+
+
+
+DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments.
+When autoApply is enabled, these overrides are applied to the generated DGD resource.
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `name` _string_ | Name is the desired name for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR name. |  | Optional: \{\} <br /> |
+| `namespace` _string_ | Namespace is the desired namespace for the created DynamoGraphDeployment.<br />If not specified, defaults to the DGDR namespace. |  | Optional: \{\} <br /> |
+| `labels` _object (keys:string, values:string)_ | Labels are additional labels to add to the DynamoGraphDeployment metadata.<br />These are merged with auto-generated labels from the profiling process. |  | Optional: \{\} <br /> |
+| `annotations` _object (keys:string, values:string)_ | Annotations are additional annotations to add to the DynamoGraphDeployment metadata. |  | Optional: \{\} <br /> |
+| `workersImage` _string_ | WorkersImage specifies the container image to use for DynamoGraphDeployment worker components.<br />This image is used for both temporary DGDs created during online profiling and the final DGD.<br />If omitted, the image from the base config file (e.g., disagg.yaml) is used.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" |  | Optional: \{\} <br /> |
+
+
+#### DeploymentStatus
+
+
+
+DeploymentStatus tracks the state of an auto-created DynamoGraphDeployment.
+This status is populated when autoApply is enabled and a DGD is created.
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentRequestStatus](#dynamographdeploymentrequeststatus)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `name` _string_ | Name is the name of the created DynamoGraphDeployment. |  |  |
+| `namespace` _string_ | Namespace is the namespace of the created DynamoGraphDeployment. |  |  |
+| `state` _string_ | State is the current state of the DynamoGraphDeployment.<br />This value is mirrored from the DGD's status.state field. |  |  |
+| `created` _boolean_ | Created indicates whether the DGD has been successfully created.<br />Used to prevent recreation if the DGD is manually deleted by users. |  |  |
+
+
+
+
+#### DynamoCheckpoint
+
+
+
+DynamoCheckpoint is the Schema for the dynamocheckpoints API
+It represents a container checkpoint that can be used to restore pods to a warm state
+
+
+
+
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
+| `kind` _string_ | `DynamoCheckpoint` | | |
+| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
+| `spec` _[DynamoCheckpointSpec](#dynamocheckpointspec)_ |  |  |  |
+| `status` _[DynamoCheckpointStatus](#dynamocheckpointstatus)_ |  |  |  |
+
+
+
+
+#### DynamoCheckpointIdentity
+
+
+
+DynamoCheckpointIdentity defines the inputs that determine checkpoint equivalence
+Two checkpoints with the same identity hash are considered equivalent
+
+
+
+_Appears in:_
+- [DynamoCheckpointSpec](#dynamocheckpointspec)
+- [ServiceCheckpointConfig](#servicecheckpointconfig)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `model` _string_ | Model is the model identifier (e.g., "meta-llama/Llama-3-70B") |  | Required: \{\} <br /> |
+| `backendFramework` _string_ | BackendFramework is the runtime framework (vllm, sglang, trtllm) |  | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
+| `dynamoVersion` _string_ | DynamoVersion is the Dynamo platform version (optional)<br />If not specified, version is not included in identity hash<br />This ensures checkpoint compatibility across Dynamo releases |  | Optional: \{\} <br /> |
+| `tensorParallelSize` _integer_ | TensorParallelSize is the tensor parallel configuration | 1 | Minimum: 1 <br />Optional: \{\} <br /> |
+| `pipelineParallelSize` _integer_ | PipelineParallelSize is the pipeline parallel configuration | 1 | Minimum: 1 <br />Optional: \{\} <br /> |
+| `dtype` _string_ | Dtype is the data type (fp16, bf16, fp8, etc.) |  | Optional: \{\} <br /> |
+| `maxModelLen` _integer_ | MaxModelLen is the maximum sequence length |  | Minimum: 1 <br />Optional: \{\} <br /> |
+| `extraParameters` _object (keys:string, values:string)_ | ExtraParameters are additional parameters that affect the checkpoint hash<br />Use for any framework-specific or custom parameters not covered above |  | Optional: \{\} <br /> |
+
+
+#### DynamoCheckpointJobConfig
+
+
+
+DynamoCheckpointJobConfig defines the configuration for the checkpoint creation Job
+
+
+
+_Appears in:_
+- [DynamoCheckpointSpec](#dynamocheckpointspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `podTemplateSpec` _[PodTemplateSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#podtemplatespec-v1-core)_ | PodTemplateSpec allows customizing the checkpoint Job pod<br />This should include the container that runs the workload to be checkpointed |  | Required: \{\} <br /> |
+| `activeDeadlineSeconds` _integer_ | ActiveDeadlineSeconds specifies the maximum time the Job can run | 3600 | Optional: \{\} <br /> |
+| `backoffLimit` _integer_ | BackoffLimit specifies the number of retries before marking the Job failed | 3 | Optional: \{\} <br /> |
+| `ttlSecondsAfterFinished` _integer_ | TTLSecondsAfterFinished specifies how long to keep the Job after completion | 300 | Optional: \{\} <br /> |
+
+
+#### DynamoCheckpointPhase
+
+_Underlying type:_ _string_
+
+DynamoCheckpointPhase represents the current phase of the checkpoint lifecycle
+
+_Validation:_
+- Enum: [Pending Creating Ready Failed]
+
+_Appears in:_
+- [DynamoCheckpointStatus](#dynamocheckpointstatus)
+
+| Field | Description |
+| --- | --- |
+| `Pending` | DynamoCheckpointPhasePending indicates the checkpoint CR has been created but the Job has not started<br /> |
+| `Creating` | DynamoCheckpointPhaseCreating indicates the checkpoint Job is running<br /> |
+| `Ready` | DynamoCheckpointPhaseReady indicates the checkpoint tar file is available on the PVC<br /> |
+| `Failed` | DynamoCheckpointPhaseFailed indicates the checkpoint creation failed<br /> |
+
+
+#### DynamoCheckpointSpec
+
+
+
+DynamoCheckpointSpec defines the desired state of DynamoCheckpoint
+
+
+
+_Appears in:_
+- [DynamoCheckpoint](#dynamocheckpoint)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the inputs that determine checkpoint equivalence |  | Required: \{\} <br /> |
+| `job` _[DynamoCheckpointJobConfig](#dynamocheckpointjobconfig)_ | Job defines the configuration for the checkpoint creation Job |  | Required: \{\} <br /> |
+
+
+#### DynamoCheckpointStatus
+
+
+
+DynamoCheckpointStatus defines the observed state of DynamoCheckpoint
+
+
+
+_Appears in:_
+- [DynamoCheckpoint](#dynamocheckpoint)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `phase` _[DynamoCheckpointPhase](#dynamocheckpointphase)_ | Phase represents the current phase of the checkpoint lifecycle |  | Enum: [Pending Creating Ready Failed] <br />Optional: \{\} <br /> |
+| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity<br />This hash is used to identify equivalent checkpoints |  | Optional: \{\} <br /> |
+| `location` _string_ | Location is the full URI/path to the checkpoint in the storage backend<br />For PVC: same as TarPath (e.g., /checkpoints/\{hash\}.tar)<br />For S3: s3://bucket/prefix/\{hash\}.tar<br />For OCI: oci://registry/repo:\{hash\} |  | Optional: \{\} <br /> |
+| `storageType` _[DynamoCheckpointStorageType](#dynamocheckpointstoragetype)_ | StorageType indicates the storage backend type used for this checkpoint |  | Enum: [pvc s3 oci] <br />Optional: \{\} <br /> |
+| `jobName` _string_ | JobName is the name of the checkpoint creation Job |  | Optional: \{\} <br /> |
+| `createdAt` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | CreatedAt is the timestamp when the checkpoint tar was created |  | Optional: \{\} <br /> |
+| `message` _string_ | Message provides additional information about the current state |  | Optional: \{\} <br /> |
+| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represent the latest available observations of the checkpoint's state |  | Optional: \{\} <br /> |
+
+
+#### DynamoCheckpointStorageType
+
+_Underlying type:_ _string_
+
+DynamoCheckpointStorageType defines the supported storage backends for checkpoints
+
+_Validation:_
+- Enum: [pvc s3 oci]
+
+_Appears in:_
+- [DynamoCheckpointStatus](#dynamocheckpointstatus)
+
+
+
+#### DynamoComponentDeployment
+
+
+
+DynamoComponentDeployment is the Schema for the dynamocomponentdeployments API
+
+
+
+
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
+| `kind` _string_ | `DynamoComponentDeployment` | | |
+| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
+| `spec` _[DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)_ | Spec defines the desired state for this Dynamo component deployment. |  |  |
+
+
+#### DynamoComponentDeploymentSharedSpec
+
+
+
+
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `annotations` _object (keys:string, values:string)_ | Annotations to add to generated Kubernetes resources for this component<br />(such as Pod, Service, and Ingress when applicable). |  |  |
+| `labels` _object (keys:string, values:string)_ | Labels to add to generated Kubernetes resources for this component. |  |  |
+| `serviceName` _string_ | The name of the component |  |  |
+| `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). |  |  |
+| `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). |  |  |
+| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.<br />The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component |  | Optional: \{\} <br /> |
+| `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace |  |  |
+| `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,<br />GPUs/devices, and any runtime-specific resources. |  |  |
+| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter<br />with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md<br />for migration guidance. This field will be removed in a future API version. |  |  |
+| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. |  |  |
+| `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as<br />environment variables in the component containers. |  |  |
+| `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. |  |  |
+| `ingress` _[IngressSpec](#ingressspec)_ | Ingress config to expose the component outside the cluster (or through a service mesh). |  |  |
+| `modelRef` _[ModelReference](#modelreference)_ | ModelRef references a model that this component serves<br />When specified, a headless service will be created for endpoint discovery |  | Optional: \{\} <br /> |
+| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size). |  |  |
+| `extraPodMetadata` _[ExtraPodMetadata](#extrapodmetadata)_ | ExtraPodMetadata adds labels/annotations to the created Pods. |  | Optional: \{\} <br /> |
+| `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.<br />It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field<br />that allows overriding the main container configuration. |  | Optional: \{\} <br /> |
+| `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. |  |  |
+| `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. |  |  |
+| `replicas` _integer_ | Replicas is the desired number of Pods for this component.<br />When scalingAdapter is enabled, this field is managed by the<br />DynamoGraphDeploymentScalingAdapter and should not be modified directly. |  | Minimum: 0 <br /> |
+| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. |  |  |
+| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled, replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. |  | Optional: \{\} <br /> |
+| `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br />Only applicable when ComponentType is "epp". |  | Optional: \{\} <br /> |
+| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from a checkpoint files for faster cold start. |  | Optional: \{\} <br /> |
+
+
+#### DynamoComponentDeploymentSpec
+
+
+
+DynamoComponentDeploymentSpec defines the desired state of DynamoComponentDeployment
+
+
+
+_Appears in:_
+- [DynamoComponentDeployment](#dynamocomponentdeployment)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm") |  | Enum: [sglang vllm trtllm] <br /> |
+| `annotations` _object (keys:string, values:string)_ | Annotations to add to generated Kubernetes resources for this component<br />(such as Pod, Service, and Ingress when applicable). |  |  |
+| `labels` _object (keys:string, values:string)_ | Labels to add to generated Kubernetes resources for this component. |  |  |
+| `serviceName` _string_ | The name of the component |  |  |
+| `componentType` _string_ | ComponentType indicates the role of this component (for example, "main"). |  |  |
+| `subComponentType` _string_ | SubComponentType indicates the sub-role of this component (for example, "prefill"). |  |  |
+| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.<br />The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component |  | Optional: \{\} <br /> |
+| `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace |  |  |
+| `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,<br />GPUs/devices, and any runtime-specific resources. |  |  |
+| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter<br />with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md<br />for migration guidance. This field will be removed in a future API version. |  |  |
+| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. |  |  |
+| `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as<br />environment variables in the component containers. |  |  |
+| `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. |  |  |
+| `ingress` _[IngressSpec](#ingressspec)_ | Ingress config to expose the component outside the cluster (or through a service mesh). |  |  |
+| `modelRef` _[ModelReference](#modelreference)_ | ModelRef references a model that this component serves<br />When specified, a headless service will be created for endpoint discovery |  | Optional: \{\} <br /> |
+| `sharedMemory` _[SharedMemorySpec](#sharedmemoryspec)_ | SharedMemory controls the tmpfs mounted at /dev/shm (enable/disable and size). |  |  |
+| `extraPodMetadata` _[ExtraPodMetadata](#extrapodmetadata)_ | ExtraPodMetadata adds labels/annotations to the created Pods. |  | Optional: \{\} <br /> |
+| `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.<br />It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field<br />that allows overriding the main container configuration. |  | Optional: \{\} <br /> |
+| `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. |  |  |
+| `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. |  |  |
+| `replicas` _integer_ | Replicas is the desired number of Pods for this component.<br />When scalingAdapter is enabled, this field is managed by the<br />DynamoGraphDeploymentScalingAdapter and should not be modified directly. |  | Minimum: 0 <br /> |
+| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. |  |  |
+| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled, replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. |  | Optional: \{\} <br /> |
+| `eppConfig` _[EPPConfig](#eppconfig)_ | EPPConfig defines EPP-specific configuration options for Endpoint Picker Plugin components.<br />Only applicable when ComponentType is "epp". |  | Optional: \{\} <br /> |
+| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from a checkpoint files for faster cold start. |  | Optional: \{\} <br /> |
+
+
+#### DynamoGraphDeployment
+
+
+
+DynamoGraphDeployment is the Schema for the dynamographdeployments API.
+
+
+
+
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
+| `kind` _string_ | `DynamoGraphDeployment` | | |
+| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
+| `spec` _[DynamoGraphDeploymentSpec](#dynamographdeploymentspec)_ | Spec defines the desired state for this graph deployment. |  |  |
+| `status` _[DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)_ | Status reflects the current observed state of this graph deployment. |  |  |
+
+
+#### DynamoGraphDeploymentRequest
+
+
+
+DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
+It serves as the primary interface for users to request model deployments with
+specific performance and resource constraints, enabling SLA-driven deployments.
+
+Lifecycle:
+ 1. Initial → Pending: Validates spec and prepares for profiling
+ 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
+ 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
+ 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
+ 5. Ready: Terminal state when DGD is operational or spec is available
+ 6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
+
+The spec becomes immutable once profiling starts. Users must delete and recreate
+the DGDR to modify configuration after this point.
+
+
+
+
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
+| `kind` _string_ | `DynamoGraphDeploymentRequest` | | |
+| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
+| `spec` _[DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)_ | Spec defines the desired state for this deployment request. |  |  |
+| `status` _[DynamoGraphDeploymentRequestStatus](#dynamographdeploymentrequeststatus)_ | Status reflects the current observed state of this deployment request. |  |  |
+
+
+#### DynamoGraphDeploymentRequestSpec
+
+
+
+DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.
+This CRD serves as the primary interface for users to request model deployments with
+specific performance constraints and resource requirements, enabling SLA-driven deployments.
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. |  | Required: \{\} <br /> |
+| `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. |  | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
+| `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false |  |
+| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes<br />cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),<br />discovered GPU configuration is used as defaults when hardware configuration is not manually<br />specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values<br />always take precedence over auto-discovered values. If GPU discovery fails (e.g.,<br />namespace-restricted operator without node permissions), manual hardware config is required.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />modelName and backend fields and should not be specified in this config. |  | Required: \{\} <br /> |
+| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false |  |
+| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. |  | Optional: \{\} <br /> |
+
+
+#### DynamoGraphDeploymentRequestStatus
+
+
+
+DynamoGraphDeploymentRequestStatus represents the observed state of a DynamoGraphDeploymentRequest.
+The controller updates this status as the DGDR progresses through its lifecycle.
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `state` _string_ | State is a high-level textual status of the deployment request lifecycle.<br />Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"<br />Empty string ("") represents the initial state before initialization. |  |  |
+| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. |  | Optional: \{\} <br /> |
+| `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. |  |  |
+| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. |  |  |
+| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.<br />Format: "configmap/<name>" |  | Optional: \{\} <br /> |
+| `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification<br />including metadata, based on profiling results. Users can extract this to create<br />a DGD manually, or it's used automatically when autoApply is true.<br />Stored as RawExtension to preserve all fields including metadata.<br />For mocker backends, this contains the mocker DGD spec. |  | EmbeddedResource: \{\} <br />Optional: \{\} <br /> |
+| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. |  | Optional: \{\} <br /> |
+
+
+#### DynamoGraphDeploymentScalingAdapter
+
+
+
+DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services
+within a DynamoGraphDeployment. It implements the Kubernetes scale
+subresource, enabling integration with HPA, KEDA, and custom autoscalers.
+
+The adapter acts as an intermediary between autoscalers and the DGD,
+ensuring that only the adapter controller modifies the DGD's service replicas.
+This prevents conflicts when multiple autoscaling mechanisms are in play.
+
+
+
+
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
+| `kind` _string_ | `DynamoGraphDeploymentScalingAdapter` | | |
+| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
+| `spec` _[DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)_ |  |  |  |
+| `status` _[DynamoGraphDeploymentScalingAdapterStatus](#dynamographdeploymentscalingadapterstatus)_ |  |  |  |
+
+
+#### DynamoGraphDeploymentScalingAdapterSpec
+
+
+
+DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `replicas` _integer_ | Replicas is the desired number of replicas for the target service.<br />This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. |  | Minimum: 0 <br />Required: \{\} <br /> |
+| `dgdRef` _[DynamoGraphDeploymentServiceRef](#dynamographdeploymentserviceref)_ | DGDRef references the DynamoGraphDeployment and the specific service to scale. |  | Required: \{\} <br /> |
+
+
+#### DynamoGraphDeploymentScalingAdapterStatus
+
+
+
+DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `replicas` _integer_ | Replicas is the current number of replicas for the target service.<br />This is synced from the DGD's service replicas and is required for the scale subresource. |  | Optional: \{\} <br /> |
+| `selector` _string_ | Selector is a label selector string for the pods managed by this adapter.<br />Required for HPA compatibility via the scale subresource. |  | Optional: \{\} <br /> |
+| `lastScaleTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | LastScaleTime is the last time the adapter scaled the target service. |  | Optional: \{\} <br /> |
+
+
+#### DynamoGraphDeploymentServiceRef
+
+
+
+DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `name` _string_ | Name of the DynamoGraphDeployment |  | MinLength: 1 <br />Required: \{\} <br /> |
+| `serviceName` _string_ | ServiceName is the key name of the service within the DGD's spec.services map to scale |  | MinLength: 1 <br />Required: \{\} <br /> |
+
+
+#### DynamoGraphDeploymentSpec
+
+
+
+DynamoGraphDeploymentSpec defines the desired state of DynamoGraphDeployment.
+
+
+
+_Appears in:_
+- [DynamoGraphDeployment](#dynamographdeployment)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `pvcs` _[PVC](#pvc) array_ | PVCs defines a list of persistent volume claims that can be referenced by components.<br />Each PVC must have a unique name that can be referenced in component specifications. |  | MaxItems: 100 <br />Optional: \{\} <br /> |
+| `services` _object (keys:string, values:[DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec))_ | Services are the services to deploy as part of this deployment. |  | MaxProperties: 25 <br />Optional: \{\} <br /> |
+| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs are environment variables applied to all services in the deployment unless<br />overridden by service-specific configuration. |  | Optional: \{\} <br /> |
+| `backendFramework` _string_ | BackendFramework specifies the backend framework (e.g., "sglang", "vllm", "trtllm"). |  | Enum: [sglang vllm trtllm] <br /> |
+| `restart` _[Restart](#restart)_ | Restart specifies the restart policy for the graph deployment. |  | Optional: \{\} <br /> |
+
+
+#### DynamoGraphDeploymentStatus
+
+
+
+DynamoGraphDeploymentStatus defines the observed state of DynamoGraphDeployment.
+
+
+
+_Appears in:_
+- [DynamoGraphDeployment](#dynamographdeployment)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `state` _string_ | State is a high-level textual status of the graph deployment lifecycle. |  |  |
+| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the graph deployment.<br />The slice is merged by type on patch updates. |  |  |
+| `services` _object (keys:string, values:[ServiceReplicaStatus](#servicereplicastatus))_ | Services contains per-service replica status information.<br />The map key is the service name from spec.services. |  | Optional: \{\} <br /> |
+| `restart` _[RestartStatus](#restartstatus)_ | Restart contains the status of the restart of the graph deployment. |  | Optional: \{\} <br /> |
+| `checkpoints` _object (keys:string, values:[ServiceCheckpointStatus](#servicecheckpointstatus))_ | Checkpoints contains per-service checkpoint status information.<br />The map key is the service name from spec.services. |  | Optional: \{\} <br /> |
+
+
+#### DynamoModel
+
+
+
+DynamoModel is the Schema for the dynamo models API
+
+
+
+
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
+| `kind` _string_ | `DynamoModel` | | |
+| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
+| `spec` _[DynamoModelSpec](#dynamomodelspec)_ |  |  |  |
+| `status` _[DynamoModelStatus](#dynamomodelstatus)_ |  |  |  |
+
+
+#### DynamoModelSpec
+
+
+
+DynamoModelSpec defines the desired state of DynamoModel
+
+
+
+_Appears in:_
+- [DynamoModel](#dynamomodel)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `modelName` _string_ | ModelName is the full model identifier (e.g., "meta-llama/Llama-3.3-70B-Instruct-lora") |  | Required: \{\} <br /> |
+| `baseModelName` _string_ | BaseModelName is the base model identifier that matches the service label<br />This is used to discover endpoints via headless services |  | Required: \{\} <br /> |
+| `modelType` _string_ | ModelType specifies the type of model (e.g., "base", "lora", "adapter") | base | Enum: [base lora adapter] <br />Optional: \{\} <br /> |
+| `source` _[ModelSource](#modelsource)_ | Source specifies the model source location (only applicable for lora model type) |  | Optional: \{\} <br /> |
+
+
+#### DynamoModelStatus
+
+
+
+DynamoModelStatus defines the observed state of DynamoModel
+
+
+
+_Appears in:_
+- [DynamoModel](#dynamomodel)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `endpoints` _[EndpointInfo](#endpointinfo) array_ | Endpoints is the current list of all endpoints for this model |  | Optional: \{\} <br /> |
+| `readyEndpoints` _integer_ | ReadyEndpoints is the count of endpoints that are ready |  |  |
+| `totalEndpoints` _integer_ | TotalEndpoints is the total count of endpoints |  |  |
+| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions represents the latest available observations of the model's state |  | Optional: \{\} <br /> |
+
+
+#### EPPConfig
+
+
+
+EPPConfig contains configuration for EPP (Endpoint Picker Plugin) components.
+EPP is responsible for intelligent endpoint selection and KV-aware routing.
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `configMapRef` _[ConfigMapKeySelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#configmapkeyselector-v1-core)_ | ConfigMapRef references a user-provided ConfigMap containing EPP configuration.<br />The ConfigMap should contain EndpointPickerConfig YAML.<br />Mutually exclusive with Config. |  | Optional: \{\} <br /> |
+| `config` _[EndpointPickerConfig](#endpointpickerconfig)_ | Config allows specifying EPP EndpointPickerConfig directly as a structured object.<br />The operator will marshal this to YAML and create a ConfigMap automatically.<br />Mutually exclusive with ConfigMapRef.<br />One of ConfigMapRef or Config must be specified (no default configuration).<br />Uses the upstream type from github.com/kubernetes-sigs/gateway-api-inference-extension |  | Type: object <br />Optional: \{\} <br /> |
+
+
+#### EndpointInfo
+
+
+
+EndpointInfo represents a single endpoint (pod) serving the model
+
+
+
+_Appears in:_
+- [DynamoModelStatus](#dynamomodelstatus)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `address` _string_ | Address is the full address of the endpoint (e.g., "http://10.0.1.5:9090") |  |  |
+| `podName` _string_ | PodName is the name of the pod serving this endpoint |  | Optional: \{\} <br /> |
+| `ready` _boolean_ | Ready indicates whether the endpoint is ready to serve traffic<br />For LoRA models: true if the POST /loras request succeeded with a 2xx status code<br />For base models: always false (no probing performed) |  |  |
+
+
+#### ExtraPodMetadata
+
+
+
+
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `annotations` _object (keys:string, values:string)_ |  |  |  |
+| `labels` _object (keys:string, values:string)_ |  |  |  |
+
+
+#### ExtraPodSpec
+
+
+
+
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `mainContainer` _[Container](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#container-v1-core)_ |  |  |  |
+
+
+#### IngressSpec
+
+
+
+
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `enabled` _boolean_ | Enabled exposes the component through an ingress or virtual service when true. |  |  |
+| `host` _string_ | Host is the base host name to route external traffic to this component. |  |  |
+| `useVirtualService` _boolean_ | UseVirtualService indicates whether to configure a service-mesh VirtualService instead of a standard Ingress. |  |  |
+| `virtualServiceGateway` _string_ | VirtualServiceGateway optionally specifies the gateway name to attach the VirtualService to. |  |  |
+| `hostPrefix` _string_ | HostPrefix is an optional prefix added before the host. |  |  |
+| `annotations` _object (keys:string, values:string)_ | Annotations to set on the generated Ingress/VirtualService resources. |  |  |
+| `labels` _object (keys:string, values:string)_ | Labels to set on the generated Ingress/VirtualService resources. |  |  |
+| `tls` _[IngressTLSSpec](#ingresstlsspec)_ | TLS holds the TLS configuration used by the Ingress/VirtualService. |  |  |
+| `hostSuffix` _string_ | HostSuffix is an optional suffix appended after the host. |  |  |
+| `ingressControllerClassName` _string_ | IngressControllerClassName selects the ingress controller class (e.g., "nginx"). |  |  |
+
+
+#### IngressTLSSpec
+
+
+
+
+
+
+
+_Appears in:_
+- [IngressSpec](#ingressspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `secretName` _string_ | SecretName is the name of a Kubernetes Secret containing the TLS certificate and key. |  |  |
+
+
+
+
+#### ModelReference
+
+
+
+ModelReference identifies a model served by this component
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `name` _string_ | Name is the base model identifier (e.g., "llama-3-70b-instruct-v1") |  | Required: \{\} <br /> |
+| `revision` _string_ | Revision is the model revision/version (optional) |  | Optional: \{\} <br /> |
+
+
+#### ModelSource
+
+
+
+ModelSource defines the source location of a model
+
+
+
+_Appears in:_
+- [DynamoModelSpec](#dynamomodelspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `uri` _string_ | URI is the model source URI<br />Supported formats:<br />- S3: s3://bucket/path/to/model<br />- HuggingFace: hf://org/model@revision_sha |  | Required: \{\} <br /> |
+
+
+#### MultinodeSpec
+
+
+
+
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `nodeCount` _integer_ | Indicates the number of nodes to deploy for multinode components.<br />Total number of GPUs is NumberOfNodes * GPU limit.<br />Must be greater than 1. | 2 | Minimum: 2 <br /> |
+
+
+#### PVC
+
+
+
+
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `create` _boolean_ | Create indicates to create a new PVC |  |  |
+| `name` _string_ | Name is the name of the PVC |  | Required: \{\} <br /> |
+| `storageClass` _string_ | StorageClass to be used for PVC creation. Required when create is true. |  |  |
+| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ | Size of the volume in Gi, used during PVC creation. Required when create is true. |  |  |
+| `volumeAccessMode` _[PersistentVolumeAccessMode](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#persistentvolumeaccessmode-v1-core)_ | VolumeAccessMode is the volume access mode of the PVC. Required when create is true. |  |  |
+
+
+#### ProfilingConfigSpec
+
+
+
+ProfilingConfigSpec defines configuration for the profiling process.
+This structure maps directly to the profile_sla.py config format.
+See benchmarks/profiler/utils/profiler_argparse.py for the complete schema.
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `config` _[JSON](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#json-v1-apiextensions-k8s-io)_ | Config is the profiling configuration as arbitrary JSON/YAML. This will be passed directly to the profiler.<br />The profiler will validate the configuration and report any errors. |  | Optional: \{\} <br />Type: object <br /> |
+| `configMapRef` _[ConfigMapKeySelector](#configmapkeyselector)_ | ConfigMapRef is an optional reference to a ConfigMap containing the DynamoGraphDeployment<br />base config file (disagg.yaml). This is separate from the profiling config above.<br />The path to this config will be set as engine.config in the profiling config. |  | Optional: \{\} <br /> |
+| `profilerImage` _string_ | ProfilerImage specifies the container image to use for profiling jobs.<br />This image contains the profiler code and dependencies needed for SLA-based profiling.<br />Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1" |  | Required: \{\} <br /> |
+| `outputPVC` _string_ | OutputPVC is an optional PersistentVolumeClaim name for storing profiling output.<br />If specified, all profiling artifacts (logs, plots, configs, raw data) will be written<br />to this PVC instead of an ephemeral emptyDir volume. This allows users to access<br />complete profiling results after the job completes by mounting the PVC.<br />The PVC must exist in the same namespace as the DGDR.<br />If not specified, profiling uses emptyDir and only essential data is saved to ConfigMaps.<br />Note: ConfigMaps are still created regardless of this setting for planner integration. |  | Optional: \{\} <br /> |
+| `resources` _[ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourcerequirements-v1-core)_ | Resources specifies the compute resource requirements for the profiling job container.<br />If not specified, no resource requests or limits are set. |  | Optional: \{\} <br /> |
+| `tolerations` _[Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#toleration-v1-core) array_ | Tolerations allows the profiling job to be scheduled on nodes with matching taints.<br />For example, to schedule on GPU nodes, add a toleration for the nvidia.com/gpu taint. |  | Optional: \{\} <br /> |
+
+
+#### ResourceItem
+
+
+
+
+
+
+
+_Appears in:_
+- [Resources](#resources)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `cpu` _string_ | CPU specifies the CPU resource request/limit (e.g., "1000m", "2") |  |  |
+| `memory` _string_ | Memory specifies the memory resource request/limit (e.g., "4Gi", "8Gi") |  |  |
+| `gpu` _string_ | GPU indicates the number of GPUs to request.<br />Total number of GPUs is NumberOfNodes * GPU in case of multinode deployment. |  |  |
+| `gpuType` _string_ | GPUType can specify a custom GPU type, e.g. "gpu.intel.com/xe"<br />By default if not specified, the GPU type is "nvidia.com/gpu" |  |  |
+| `custom` _object (keys:string, values:string)_ | Custom specifies additional custom resource requests/limits |  |  |
+
+
+#### Resources
+
+
+
+Resources defines requested and limits for a component, including CPU, memory,
+GPUs/devices, and any runtime-specific resources.
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `requests` _[ResourceItem](#resourceitem)_ | Requests specifies the minimum resources required by the component |  |  |
+| `limits` _[ResourceItem](#resourceitem)_ | Limits specifies the maximum resources allowed for the component |  |  |
+| `claims` _[ResourceClaim](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourceclaim-v1-core) array_ | Claims specifies resource claims for dynamic resource allocation |  |  |
+
+
+#### Restart
+
+
+
+
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentSpec](#dynamographdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `id` _string_ | ID is an arbitrary string that triggers a restart when changed.<br />Any modification to this value will initiate a restart of the graph deployment according to the strategy. |  | MinLength: 1 <br />Required: \{\} <br /> |
+| `strategy` _[RestartStrategy](#restartstrategy)_ | Strategy specifies the restart strategy for the graph deployment. |  | Optional: \{\} <br /> |
+
+
+#### RestartPhase
+
+_Underlying type:_ _string_
+
+
+
+
+
+_Appears in:_
+- [RestartStatus](#restartstatus)
+
+| Field | Description |
+| --- | --- |
+| `Pending` |  |
+| `Restarting` |  |
+| `Completed` |  |
+| `Failed` |  |
+
+
+#### RestartStatus
+
+
+
+RestartStatus contains the status of the restart of the graph deployment.
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `observedID` _string_ | ObservedID is the restart ID that has been observed and is being processed.<br />Matches the Restart.ID field in the spec. |  |  |
+| `phase` _[RestartPhase](#restartphase)_ | Phase is the phase of the restart. |  |  |
+| `inProgress` _string array_ | InProgress contains the names of the services that are currently being restarted. |  | Optional: \{\} <br /> |
+
+
+#### RestartStrategy
+
+
+
+
+
+
+
+_Appears in:_
+- [Restart](#restart)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `type` _[RestartStrategyType](#restartstrategytype)_ | Type specifies the restart strategy type. | Sequential | Enum: [Sequential Parallel] <br /> |
+| `order` _string array_ | Order specifies the order in which the services should be restarted. |  | Optional: \{\} <br /> |
+
+
+#### RestartStrategyType
+
+_Underlying type:_ _string_
+
+
+
+
+
+_Appears in:_
+- [RestartStrategy](#restartstrategy)
+
+| Field | Description |
+| --- | --- |
+| `Sequential` |  |
+| `Parallel` |  |
+
+
+#### ScalingAdapter
+
+
+
+ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter
+for replica management. When enabled, the DGDSA owns the replicas field and
+external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource.
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `enabled` _boolean_ | Enabled indicates whether the ScalingAdapter should be enabled for this service.<br />When true, a DGDSA is created and owns the replicas field.<br />When false (default), no DGDSA is created and replicas can be modified directly in the DGD. | false | Optional: \{\} <br /> |
+
+
+#### ServiceCheckpointConfig
+
+
+
+ServiceCheckpointConfig configures checkpointing for a DGD service
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `enabled` _boolean_ | Enabled indicates whether checkpointing is enabled for this service | false | Optional: \{\} <br /> |
+| `mode` _[CheckpointMode](#checkpointmode)_ | Mode defines how checkpoint creation is handled<br />- Auto: DGD controller creates Checkpoint CR automatically<br />- Manual: User must create Checkpoint CR | Auto | Enum: [Auto Manual] <br />Optional: \{\} <br /> |
+| `checkpointRef` _string_ | CheckpointRef references an existing Checkpoint CR to use<br />If specified, Identity is ignored and this checkpoint is used directly |  | Optional: \{\} <br /> |
+| `identity` _[DynamoCheckpointIdentity](#dynamocheckpointidentity)_ | Identity defines the checkpoint identity for hash computation<br />Used when Mode is Auto or when looking up existing checkpoints<br />Required when checkpointRef is not specified |  | Optional: \{\} <br /> |
+
+
+#### ServiceCheckpointStatus
+
+
+
+ServiceCheckpointStatus contains checkpoint information for a single service.
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `checkpointName` _string_ | CheckpointName is the name of the associated Checkpoint CR |  | Optional: \{\} <br /> |
+| `identityHash` _string_ | IdentityHash is the computed hash of the checkpoint identity |  | Optional: \{\} <br /> |
+| `ready` _boolean_ | Ready indicates if the checkpoint is ready for use |  | Optional: \{\} <br /> |
+
+
+#### ServiceReplicaStatus
+
+
+
+ServiceReplicaStatus contains replica information for a single service.
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentStatus](#dynamographdeploymentstatus)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `componentKind` _[ComponentKind](#componentkind)_ | ComponentKind is the underlying resource kind (e.g., "PodClique", "PodCliqueScalingGroup", "Deployment", "LeaderWorkerSet"). |  | Enum: [PodClique PodCliqueScalingGroup Deployment LeaderWorkerSet] <br /> |
+| `componentName` _string_ | ComponentName is the name of the underlying resource. |  |  |
+| `replicas` _integer_ | Replicas is the total number of non-terminated replicas.<br />Required for all component kinds. |  | Minimum: 0 <br /> |
+| `updatedReplicas` _integer_ | UpdatedReplicas is the number of replicas at the current/desired revision.<br />Required for all component kinds. |  | Minimum: 0 <br /> |
+| `readyReplicas` _integer_ | ReadyReplicas is the number of ready replicas.<br />Populated for PodClique, Deployment, and LeaderWorkerSet.<br />Not available for PodCliqueScalingGroup.<br />When nil, the field is omitted from the API response. |  | Minimum: 0 <br />Optional: \{\} <br /> |
+| `availableReplicas` _integer_ | AvailableReplicas is the number of available replicas.<br />For Deployment: replicas ready for >= minReadySeconds.<br />For PodCliqueScalingGroup: replicas where all constituent PodCliques have >= MinAvailable ready pods.<br />Not available for PodClique or LeaderWorkerSet.<br />When nil, the field is omitted from the API response. |  | Minimum: 0 <br />Optional: \{\} <br /> |
+
+
+#### SharedMemorySpec
+
+
+
+
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `disabled` _boolean_ |  |  |  |
+| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#quantity-resource-api)_ |  |  |  |
+
+
+#### VolumeMount
+
+
+
+VolumeMount references a PVC defined at the top level for volumes to be mounted by the component
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `name` _string_ | Name references a PVC name defined in the top-level PVCs map |  | Required: \{\} <br /> |
+| `mountPoint` _string_ | MountPoint specifies where to mount the volume.<br />If useAsCompilationCache is true and mountPoint is not specified,<br />a backend-specific default will be used. |  |  |
+| `useAsCompilationCache` _boolean_ | UseAsCompilationCache indicates this volume should be used as a compilation cache.<br />When true, backend-specific environment variables will be set and default mount points may be used. | false |  |
+
+
+# Operator Default Values Injection
+
+The Dynamo operator automatically applies default values to various fields when they are not explicitly specified in your deployments. These defaults include:
+
+- **Health Probes**: Startup, liveness, and readiness probes are configured differently for frontend, worker, and planner components. For example, worker components receive a startup probe with a 2-hour timeout (720 failures × 10 seconds) to accommodate long model loading times.
+
+- **Security Context**: All components receive `fsGroup: 1000` by default to ensure proper file permissions for mounted volumes. This can be overridden via the `extraPodSpec.securityContext` field.
+
+- **Shared Memory**: All components receive an 8Gi shared memory volume mounted at `/dev/shm` by default (can be disabled or resized via the `sharedMemory` field).
+
+- **Environment Variables**: Components automatically receive environment variables like `DYN_NAMESPACE`, `DYN_PARENT_DGD_K8S_NAME`, `DYNAMO_PORT`, and backend-specific variables.
+
+- **Pod Configuration**: Default `terminationGracePeriodSeconds` of 60 seconds and `restartPolicy: Always`.
+
+- **Autoscaling**: When enabled without explicit metrics, defaults to CPU-based autoscaling with 80% target utilization.
+
+- **Backend-Specific Behavior**: For multinode deployments, probes are automatically modified or removed for worker nodes depending on the backend framework (VLLM, SGLang, or TensorRT-LLM).
+
+## Pod Specification Defaults
+
+All components receive the following pod-level defaults unless overridden:
+
+- **`terminationGracePeriodSeconds`**: `60` seconds
+- **`restartPolicy`**: `Always`
+
+## Security Context
+
+The operator automatically applies default security context settings to all components to ensure proper file permissions, particularly for mounted volumes:
+
+- **`fsGroup`**: `1000` - Sets the group ownership of mounted volumes and any files created in those volumes
+
+This default ensures that non-root containers can write to mounted volumes (like model caches or persistent storage) without permission issues. The `fsGroup` setting is particularly important for:
+- Model downloads and caching
+- Compilation cache directories
+- Persistent volume claims (PVCs)
+- SSH key generation in multinode deployments
+
+### Overriding Security Context
+
+To override the default security context, specify your own `securityContext` in the `extraPodSpec` of your component:
+
+```yaml
+services:
+  YourWorker:
+    extraPodSpec:
+      securityContext:
+        fsGroup: 2000  # Custom group ID
+        runAsUser: 1000
+        runAsGroup: 1000
+        runAsNonRoot: true
+```
+
+**Important**: When you provide *any* `securityContext` object in `extraPodSpec`, the operator will not inject any defaults. This gives you complete control over the security context, including the ability to run as root (by omitting `runAsNonRoot` or setting it to `false`).
+
+### OpenShift and Security Context Constraints
+
+In OpenShift environments with Security Context Constraints (SCCs), you may need to omit explicit UID/GID values to allow OpenShift's admission controllers to assign them dynamically:
+
+```yaml
+services:
+  YourWorker:
+    extraPodSpec:
+      securityContext:
+        # Omit fsGroup to let OpenShift assign it based on SCC
+        # OpenShift will inject the appropriate UID range
+```
+
+Alternatively, if you want to keep the default `fsGroup: 1000` behavior and are certain your cluster allows it, you don't need to specify anything - the operator defaults will work.
+
+## Shared Memory Configuration
+
+Shared memory is enabled by default for all components:
+
+- **Enabled**: `true` (unless explicitly disabled via `sharedMemory.disabled`)
+- **Size**: `8Gi`
+- **Mount Path**: `/dev/shm`
+- **Volume Type**: `emptyDir` with `memory` medium
+
+To disable shared memory or customize the size, use the `sharedMemory` field in your component specification.
+
+## Health Probes by Component Type
+
+The operator applies different default health probes based on the component type.
+
+### Frontend Components
+
+Frontend components receive the following probe configurations:
+
+**Liveness Probe:**
+- **Type**: HTTP GET
+- **Path**: `/health`
+- **Port**: `http` (8000)
+- **Initial Delay**: 60 seconds
+- **Period**: 60 seconds
+- **Timeout**: 30 seconds
+- **Failure Threshold**: 10
+
+**Readiness Probe:**
+- **Type**: Exec command
+- **Command**: `curl -s http://localhost:${DYNAMO_PORT}/health | jq -e ".status == \"healthy\""`
+- **Initial Delay**: 60 seconds
+- **Period**: 60 seconds
+- **Timeout**: 30 seconds
+- **Failure Threshold**: 10
+
+### Worker Components
+
+Worker components receive the following probe configurations:
+
+**Liveness Probe:**
+- **Type**: HTTP GET
+- **Path**: `/live`
+- **Port**: `system` (9090)
+- **Period**: 5 seconds
+- **Timeout**: 30 seconds
+- **Failure Threshold**: 1
+
+**Readiness Probe:**
+- **Type**: HTTP GET
+- **Path**: `/health`
+- **Port**: `system` (9090)
+- **Period**: 10 seconds
+- **Timeout**: 30 seconds
+- **Failure Threshold**: 60
+
+**Startup Probe:**
+- **Type**: HTTP GET
+- **Path**: `/live`
+- **Port**: `system` (9090)
+- **Period**: 10 seconds
+- **Timeout**: 5 seconds
+- **Failure Threshold**: 720 (allows up to 2 hours for startup: 10s × 720 = 7200s)
+
+:::{note}
+For larger models (typically >70B parameters) or slower storage systems, you may need to increase the `failureThreshold` to allow more time for model loading. Calculate the required threshold based on your expected startup time: `failureThreshold = (expected_startup_seconds / period)`. Override the startup probe in your component specification if the default 2-hour window is insufficient.
+:::
+
+### Multinode Deployment Probe Modifications
+
+For multinode deployments, the operator modifies probes based on the backend framework and node role:
+
+#### VLLM Backend
+
+The operator automatically selects between two deployment modes based on parallelism configuration:
+
+**Tensor/Pipeline Parallel Mode** (when `world_size > GPUs_per_node`):
+- Uses Ray for distributed execution (`--distributed-executor-backend ray`)
+- **Leader nodes**: Starts Ray head and runs vLLM; all probes remain active
+- **Worker nodes**: Run Ray agents only; all probes (liveness, readiness, startup) are removed
+
+**Data Parallel Mode** (when `world_size × data_parallel_size > GPUs_per_node`):
+- **Worker nodes**: All probes (liveness, readiness, startup) are removed
+- **Leader nodes**: All probes remain active
+
+#### SGLang Backend
+- **Worker nodes**: All probes (liveness, readiness, startup) are removed
+
+#### TensorRT-LLM Backend
+- **Leader nodes**: All probes remain unchanged
+- **Worker nodes**:
+  - Liveness and startup probes are removed
+  - Readiness probe is replaced with a TCP socket check on SSH port (2222):
+    - **Initial Delay**: 20 seconds
+    - **Period**: 20 seconds
+    - **Timeout**: 5 seconds
+    - **Failure Threshold**: 10
+
+## Environment Variables
+
+The operator automatically injects environment variables based on component type and configuration:
+
+### All Components
+
+- **`DYN_NAMESPACE`**: The Dynamo namespace for the component
+- **`DYN_PARENT_DGD_K8S_NAME`**: The parent DynamoGraphDeployment Kubernetes resource name
+- **`DYN_PARENT_DGD_K8S_NAMESPACE`**: The parent DynamoGraphDeployment Kubernetes namespace
+
+### Frontend Components
+
+- **`DYNAMO_PORT`**: `8000`
+- **`DYN_HTTP_PORT`**: `8000`
+
+### Worker Components
+
+- **`DYN_SYSTEM_PORT`**: `9090` (automatically enables the system metrics server)
+- **`DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS`**: `["generate"]`
+- **`DYN_SYSTEM_ENABLED`**: `true` (needed for runtime images 0.6.1 and older)
+
+### Planner Components
+
+- **`PLANNER_PROMETHEUS_PORT`**: `9085`
+
+### VLLM Backend (with compilation cache)
+
+When a volume mount is configured with `useAsCompilationCache: true`:
+- **`VLLM_CACHE_ROOT`**: Set to the mount point of the cache volume
+
+## Service Account
+
+Planner components automatically receive the following service account:
+
+- **`serviceAccountName`**: `planner-serviceaccount`
+
+## Image Pull Secrets
+
+The operator automatically discovers and injects image pull secrets for container images. When a component specifies a container image, the operator:
+
+1. Scans all Kubernetes secrets of type `kubernetes.io/dockerconfigjson` in the component's namespace
+2. Extracts the docker registry server URLs from each secret's authentication configuration
+3. Matches the container image's registry host against the discovered registry URLs
+4. Automatically injects matching secrets as `imagePullSecrets` in the pod specification
+
+This eliminates the need to manually specify image pull secrets for each component. The operator maintains an internal index of docker secrets and their associated registries, refreshing this index periodically.
+
+**To disable automatic image pull secret discovery** for a specific component, add the following annotation:
+
+```yaml
+annotations:
+  nvidia.com/disable-image-pull-secret-discovery: "true"
+```
+
+## Autoscaling Defaults
+
+When autoscaling is enabled but no metrics are specified, the operator applies:
+
+- **Default Metric**: CPU utilization
+- **Target Average Utilization**: `80%`
+
+## Port Configurations
+
+Default container ports are configured based on component type:
+
+### Frontend Components
+- **Port**: 8000
+- **Protocol**: TCP
+- **Name**: `http`
+
+### Worker Components
+- **Port**: 9090
+- **Protocol**: TCP
+- **Name**: `system`
+
+### Planner Components
+- **Port**: 9085
+- **Protocol**: TCP
+- **Name**: `metrics`
+
+## Backend-Specific Configurations
+
+### VLLM
+- **Ray Head Port**: 6379 (for Ray cluster coordination in multinode TP/PP deployments)
+- **Data Parallel RPC Port**: 13445 (for data parallel multinode deployments)
+
+### SGLang
+- **Distribution Init Port**: 29500 (for multinode deployments)
+
+### TensorRT-LLM
+- **SSH Port**: 2222 (for multinode MPI communication)
+- **OpenMPI Environment**: `OMPI_MCA_orte_keep_fqdn_hostnames=1`
+
+## Implementation Reference
+
+For users who want to understand the implementation details or contribute to the operator, the default values described in this document are set in the following source files:
+
+- **Health Probes, Security Context & Pod Specifications**: [`internal/dynamo/graph.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/graph.go) - Contains the main logic for applying default probes, security context, environment variables, shared memory, and pod configurations
+- **Component-Specific Defaults**:
+  - [`internal/dynamo/component_frontend.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_frontend.go)
+  - [`internal/dynamo/component_worker.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_worker.go)
+  - [`internal/dynamo/component_planner.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/component_planner.go)
+- **Image Pull Secrets**: [`internal/secrets/docker.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/secrets/docker.go) - Implements the docker secret indexer and automatic discovery
+- **Backend-Specific Behavior**:
+  - [`internal/dynamo/backend_vllm.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_vllm.go)
+  - [`internal/dynamo/backend_sglang.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_sglang.go)
+  - [`internal/dynamo/backend_trtllm.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/dynamo/backend_trtllm.go)
+- **Constants & Annotations**: [`internal/consts/consts.go`](https://github.com/ai-dynamo/dynamo/blob/main/deploy/operator/internal/consts/consts.go) - Defines annotation keys and other constants
+
+## Notes
+
+- All these defaults can be overridden by explicitly specifying values in your DynamoComponentDeployment or DynamoGraphDeployment resources
+- User-specified probes (via `livenessProbe`, `readinessProbe`, or `startupProbe` fields) take precedence over operator defaults
+- For security context, if you provide *any* `securityContext` in `extraPodSpec`, no defaults will be injected, giving you full control
+- For multinode deployments, some defaults are modified or removed as described above to accommodate distributed execution patterns
+- The `extraPodSpec.mainContainer` field can be used to override probe configurations set by the operator
--- a/docs/pages/components/profiler/profiler-guide.md
+++ b/docs/pages/components/profiler/profiler-guide.md
@@ -227,14 +227,15 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#

 ### Automatic GPU Discovery

-Cluster-scoped operators can optionally enable automatic GPU discovery:
+The operator automatically discovers GPU resources from your Kubernetes cluster nodes when available. GPU discovery provides:

-```yaml
-spec:
-  enableGpuDiscovery: true
-```
+- Hardware information (GPU model, VRAM, GPUs per node)
+- Automatic calculation of profiling search space based on model size
+- Hardware system identifier for AI Configurator integration
+
+**Permissions**: GPU discovery requires cluster-wide node read permissions. Cluster-scoped operators automatically have these permissions. Namespace-restricted operators can also use GPU discovery if granted node read permissions via RBAC.

-This is only available with cluster-scoped operators (`namespaceRestriction.enabled=false`) as it requires cluster-wide node access permissions.
+If GPU discovery is unavailable (no permissions or no GPU labels), the profiler will use manually specified hardware configuration or defaults.

 ## Configuration


--- a/docs/pages/kubernetes/api-reference.md
+++ b/docs/pages/kubernetes/api-reference.md
@@ -462,8 +462,8 @@ _Appears in:_
 | `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. |  | Required: \{\} <br /> |
 | `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. |  | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
 | `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false |  |
-| `enableGpuDiscovery` _boolean_ | EnableGpuDiscovery controls whether the profiler should automatically discover GPU<br />resources from the Kubernetes cluster nodes. When enabled, the profiler will override<br />any manually specified hardware configuration (minNumGpusPerEngine, maxNumGpusPerEngine,<br />numGpusPerNode) with values detected from the cluster.<br />Requires cluster-wide node access permissions - only available with cluster-scoped operators. | false | Optional: \{\} <br /> |
-| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />modelName and backend fields and should not be specified in this config. |  | Required: \{\} <br /> |
+| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes<br />cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),<br />discovered GPU configuration is used as defaults when hardware configuration is not manually<br />specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values<br />always take precedence over auto-discovered values. If GPU discovery fails (e.g.,<br />namespace-restricted operator without node permissions), manual hardware config is required.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />modelName and backend fields and should not be specified in this config. |  | Required: \{\} <br /> |
+| `enableGpuDiscovery` _boolean_ | EnableGPUDiscovery controls whether the operator attempts to discover GPU hardware from cluster nodes.<br />DEPRECATED: This field is deprecated and will be removed in v1beta1. GPU discovery is now always<br />attempted automatically. Setting this field has no effect - the operator will always try to discover<br />GPU hardware when node read permissions are available. If discovery is unavailable (e.g., namespace-scoped<br />operator without permissions), manual hardware configuration is required regardless of this setting. | true | Optional: \{\} <br /> |
 | `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false |  |
 | `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. |  | Optional: \{\} <br /> |


--- a/tests/profiler/test_profile_sla_aiconfigurator.py
+++ b/tests/profiler/test_profile_sla_aiconfigurator.py
@@ -18,6 +18,7 @@ project_root = Path(__file__).parent.parent.parent
 sys.path.insert(0, str(project_root))

 from dynamo.profiler.profile_sla import run_profile  # noqa: E402
+from dynamo.profiler.utils.defaults import SearchStrategy  # noqa: E402
 from dynamo.profiler.utils.model_info import ModelInfo  # noqa: E402

 pytestmark = [
@@ -44,7 +45,7 @@ class TestProfileSlaAiconfigurator:
    def llm_args(self, request):
        class Args:
            def __init__(self):
-                self.model = ""
+                self.model = "Qwen/Qwen3-32B"  # Set to match aic_hf_id for consistency
                self.dgd_image = ""
                self.backend = "trtllm"
                self.config = "examples/backends/trtllm/deploy/disagg.yaml"
@@ -63,14 +64,13 @@ class TestProfileSlaAiconfigurator:
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = False
-                self.use_ai_configurator = True
-                self.aic_system = "h200_sxm"
-                self.aic_hf_id = "Qwen/Qwen3-32B"
-                self.aic_backend = ""
-                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
                self.pick_with_webui = False
+                # Use RAPID strategy to leverage AI Configurator for perf estimation
+                # This avoids Kubernetes deployments while testing aiconfigurator functionality
+                self.search_strategy = SearchStrategy.RAPID
+                self.system = "h200_sxm"  # Must match aic_system for RAPID strategy
                # Provide minimal model_info to avoid HF queries
                self.model_info = ModelInfo(
                    model_size=16384.0,
@@ -86,10 +86,10 @@ class TestProfileSlaAiconfigurator:
    @pytest.mark.performance
    @pytest.mark.parallel
    @pytest.mark.asyncio
-    @pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"])
+    @pytest.mark.parametrize("missing_arg", ["system", "model"])
    async def test_aiconfigurator_missing_args(self, llm_args, missing_arg):
-        # Check that validation error happens when a required arg is missing.
-        # Note: aic_backend_version is optional - when None, auto-detects latest version
+        # Check that validation error happens when a required arg is missing for RAPID strategy.
+        # These args are required when using SearchStrategy.RAPID with AI Configurator.
        setattr(llm_args, missing_arg, None)
        with pytest.raises(ValueError):
            await run_profile(llm_args)
@@ -103,8 +103,7 @@ class TestProfileSlaAiconfigurator:
        "arg_name, bad_value",
        [
            # these values don't exist in the aiconfigurator database.
-            ("aic_system", "fake_gpu_system"),
-            ("aic_backend_version", "0.1.0"),
+            ("system", "fake_gpu_system"),
        ],
    )
    async def test_aiconfigurator_no_data(self, llm_args, arg_name, bad_value):
@@ -131,14 +130,11 @@ class TestProfileSlaAiconfigurator:
    @pytest.mark.nightly
    # fmt: off
    @pytest.mark.parametrize(
-        "backend, aic_backend_version",
+        "backend",
        [
-            pytest.param("trtllm", None,          marks=pytest.mark.trtllm),
-            pytest.param("trtllm", "1.2.0rc5",    marks=pytest.mark.trtllm),
-            pytest.param("vllm",   None,          marks=pytest.mark.vllm),
-            pytest.param("vllm",   "0.12.0",      marks=pytest.mark.vllm),
-            pytest.param("sglang", None,          marks=pytest.mark.sglang),
-            pytest.param("sglang", "0.5.6.post2", marks=pytest.mark.sglang),
+            pytest.param("trtllm", marks=pytest.mark.trtllm),
+            pytest.param("vllm",   marks=pytest.mark.vllm),
+            pytest.param("sglang", marks=pytest.mark.sglang),
        ],
    )
    # fmt: on
@@ -149,11 +145,10 @@ class TestProfileSlaAiconfigurator:
            "meta-llama/Llama-3.1-405B",
        ],
    )
-    async def test_aiconfigurator_dense_models(
-        self, llm_args, hf_model_id, backend, aic_backend_version
-    ):
-        # Test that profile_sla works with a variety of backend versions and model names.
-        llm_args.aic_hf_id = hf_model_id
+    async def test_aiconfigurator_dense_models(self, llm_args, hf_model_id, backend):
+        # Test that profile_sla works with a variety of backends and model names
+        # using AI Configurator's RAPID strategy for performance estimation.
+        # Backend version is not used with RAPID strategy - performance comes from AI Configurator.
+        llm_args.model = hf_model_id  # Used by RAPID strategy
        llm_args.backend = backend
-        llm_args.aic_backend_version = aic_backend_version
        await run_profile(llm_args)
--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
@@ -19,6 +19,7 @@ project_root = Path(__file__).parent.parent.parent
 sys.path.insert(0, str(project_root))

 from dynamo.profiler.profile_sla import run_profile  # noqa: E402
+from dynamo.profiler.utils.defaults import SearchStrategy  # noqa: E402
 from dynamo.profiler.utils.model_info import ModelInfo  # noqa: E402
 from dynamo.profiler.utils.search_space_autogen import (  # noqa: E402
    auto_generate_search_space,
@@ -66,12 +67,13 @@ class TestProfileSLADryRun:
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
-                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
+                self.search_strategy = SearchStrategy.THOROUGH
+                self.system = ""
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
@@ -113,12 +115,13 @@ class TestProfileSLADryRun:
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
-                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
+                self.search_strategy = SearchStrategy.THOROUGH
+                self.system = ""
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
@@ -181,12 +184,13 @@ class TestProfileSLADryRun:
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
-                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
+                self.search_strategy = SearchStrategy.THOROUGH
+                self.system = ""
                self.deploy_after_profile = False
                self.pick_with_webui = False
                self.model_cache_pvc_name = ""
@@ -238,12 +242,13 @@ class TestProfileSLADryRun:
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
-                self.use_ai_configurator = False
                self.aic_system = None
                self.aic_hf_id = None
                self.aic_backend = ""
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
+                self.search_strategy = SearchStrategy.THOROUGH
+                self.system = ""
                self.deploy_after_profile = False
                self.pick_with_webui = False
                # Added in newer profiler versions; keep Args compatible with search_space_autogen
@@ -318,16 +323,14 @@ class TestProfileSLADryRun:
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
-                self.use_ai_configurator = False
-                self.aic_system = None
-                self.aic_hf_id = None
-                self.aic_backend = ""
-                self.aic_backend_version = None
-                # Set to 0 to trigger auto-generation path
-                self.num_gpus_per_node = 0
+                self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
+                self.search_strategy = SearchStrategy.RAPID  # New top-level arg
+                # GPU discovery values (auto-populated by Operator)
+                self.num_gpus_per_node = 8
+                self.gpu_model = "H100-SXM5-80GB"
+                self.gpu_vram_mib = 81920
                self.deploy_after_profile = False
                self.pick_with_webui = False
-                self.enable_gpu_discovery = True
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
@@ -340,27 +343,24 @@ class TestProfileSLADryRun:
    @pytest.mark.integration
    @pytest.mark.gpu_0
    @pytest.mark.vllm
-    @patch("dynamo.profiler.utils.search_space_autogen.get_gpu_summary")
-    @patch("dynamo.profiler.utils.search_space_autogen.get_model_info")
+    @patch("dynamo.profiler.utils.model_info.get_model_info")
    async def test_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
-        mock_get_gpu_summary,
        vllm_args_with_model_autogen,
-        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
-        size and available GPU memory.
+        size and available GPU memory. GPU info is provided via command-line
+        arguments injected by the Operator into the profiling config (DYN-2135).
        """
-        # Configure the mocks to return the appropriate info
+        # Configure the mock to return the appropriate model info
        mock_get_model_info.return_value = mock_model_info
-        mock_get_gpu_summary.return_value = mock_h100_gpu_info

        # Run the profile - the search space will be auto-generated
-        # based on the model and mocked GPU info
+        # based on the model and GPU info from args
        auto_generate_search_space(vllm_args_with_model_autogen)
        await run_profile(vllm_args_with_model_autogen)

@@ -390,15 +390,14 @@ class TestProfileSLADryRun:
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
-                self.use_ai_configurator = False
-                self.aic_system = None
-                self.aic_hf_id = None
-                self.aic_backend = ""
-                self.aic_backend_version = None
-                self.num_gpus_per_node = 0
+                self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
+                self.search_strategy = SearchStrategy.RAPID  # New top-level arg
+                # GPU discovery values (auto-populated by Operator)
+                self.num_gpus_per_node = 8
+                self.gpu_model = "H100-SXM5-80GB"
+                self.gpu_vram_mib = 81920
                self.deploy_after_profile = False
                self.pick_with_webui = False
-                self.enable_gpu_discovery = True
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
@@ -411,27 +410,33 @@ class TestProfileSLADryRun:
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.sglang
-    @patch("dynamo.profiler.utils.search_space_autogen.get_gpu_summary")
-    @patch("dynamo.profiler.utils.search_space_autogen.get_model_info")
+    @pytest.mark.skip(
+        reason="Blocked on AI Configurator database format: sglang 0.5.6.post2 database "
+        "is in legacy format missing 'gemm_dtype' field. "
+        "See: KeyError in aiconfigurator/sdk/perf_database.py"
+    )
+    @patch("dynamo.profiler.utils.model_info.get_model_info")
    async def test_sglang_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
-        mock_get_gpu_summary,
        sglang_args_with_model_autogen,
-        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for sglang on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
-        size and available GPU memory for sglang backend.
+        size and available GPU memory for sglang backend. GPU info is provided via
+        command-line arguments injected by the Operator into the profiling config (DYN-2135).
+
+        NOTE: Currently skipped due to AI Configurator database format issue.
+        The sglang 0.5.6.post2 database for h100_sxm is in legacy format and missing
+        the required 'gemm_dtype' field, causing KeyError during database loading.
        """
-        # Configure the mocks to return the appropriate info
+        # Configure the mock to return the appropriate model info
        mock_get_model_info.return_value = mock_model_info
-        mock_get_gpu_summary.return_value = mock_h100_gpu_info

        # Run the profile - the search space will be auto-generated
-        # based on the model and mocked GPU info
+        # based on the model and GPU info from args
        auto_generate_search_space(sglang_args_with_model_autogen)
        await run_profile(sglang_args_with_model_autogen)

@@ -461,15 +466,14 @@ class TestProfileSLADryRun:
                self.decode_interpolation_granularity = 6
                self.service_name = ""
                self.dry_run = True
-                self.use_ai_configurator = False
-                self.aic_system = None
-                self.aic_hf_id = None
-                self.aic_backend = ""
-                self.aic_backend_version = None
-                self.num_gpus_per_node = 0
+                self.system = "h100_sxm"  # Renamed from aic_system, moved to hardware
+                self.search_strategy = SearchStrategy.RAPID  # New top-level arg
+                # GPU discovery values (auto-populated by Operator)
+                self.num_gpus_per_node = 8
+                self.gpu_model = "H100-SXM5-80GB"
+                self.gpu_vram_mib = 81920
                self.deploy_after_profile = False
                self.pick_with_webui = False
-                self.enable_gpu_discovery = True
                self.model_cache_pvc_name = ""
                self.model_cache_pvc_path = ""
                self.model_cache_pvc_mount_path = "/opt/model-cache"
@@ -482,26 +486,91 @@ class TestProfileSLADryRun:
    @pytest.mark.gpu_0
    @pytest.mark.integration
    @pytest.mark.trtllm
-    @patch("dynamo.profiler.utils.search_space_autogen.get_gpu_summary")
-    @patch("dynamo.profiler.utils.search_space_autogen.get_model_info")
+    @patch("dynamo.profiler.utils.model_info.get_model_info")
    async def test_trtllm_profile_with_autogen_search_space_h100(
        self,
        mock_get_model_info,
-        mock_get_gpu_summary,
        trtllm_args_with_model_autogen,
-        mock_h100_gpu_info,
        mock_model_info,
    ):
        """Test profile_sla with auto-generated search space for trtllm on mocked H100 cluster.

        This test demonstrates how search space is auto-generated based on model
-        size and available GPU memory for trtllm backend.
+        size and available GPU memory for trtllm backend. GPU info is provided via
+        command-line arguments injected by the Operator into the profiling config (DYN-2135).
        """
-        # Configure the mocks to return the appropriate info
+        # Configure the mock to return the appropriate model info
        mock_get_model_info.return_value = mock_model_info
-        mock_get_gpu_summary.return_value = mock_h100_gpu_info

        # Run the profile - the search space will be auto-generated
-        # based on the model and mocked GPU info
+        # based on the model and GPU info from args
        auto_generate_search_space(trtllm_args_with_model_autogen)
        await run_profile(trtllm_args_with_model_autogen)
+
+    # Unit tests for search_strategy and system attributes
+    @pytest.mark.pre_merge
+    @pytest.mark.unit
+    @pytest.mark.gpu_0
+    def test_vllm_args_has_search_strategy(self, vllm_args):
+        """Test that vllm_args fixture has search_strategy attribute."""
+        assert hasattr(vllm_args, "search_strategy")
+        assert vllm_args.search_strategy == SearchStrategy.THOROUGH
+        assert hasattr(vllm_args, "system")
+        assert vllm_args.system == ""
+
+    @pytest.mark.pre_merge
+    @pytest.mark.unit
+    @pytest.mark.gpu_0
+    def test_sglang_args_has_search_strategy(self, sglang_args):
+        """Test that sglang_args fixture has search_strategy attribute."""
+        assert hasattr(sglang_args, "search_strategy")
+        assert sglang_args.search_strategy == SearchStrategy.THOROUGH
+        assert hasattr(sglang_args, "system")
+        assert sglang_args.system == ""
+
+    @pytest.mark.pre_merge
+    @pytest.mark.unit
+    @pytest.mark.gpu_0
+    def test_trtllm_args_has_search_strategy(self, trtllm_args):
+        """Test that trtllm_args fixture has search_strategy attribute."""
+        assert hasattr(trtllm_args, "search_strategy")
+        assert trtllm_args.search_strategy == SearchStrategy.THOROUGH
+        assert hasattr(trtllm_args, "system")
+        assert trtllm_args.system == ""
+
+    @pytest.mark.pre_merge
+    @pytest.mark.unit
+    @pytest.mark.gpu_0
+    def test_sglang_moe_args_has_search_strategy(self, sglang_moe_args):
+        """Test that sglang_moe_args fixture has search_strategy attribute."""
+        assert hasattr(sglang_moe_args, "search_strategy")
+        assert sglang_moe_args.search_strategy == SearchStrategy.THOROUGH
+        assert hasattr(sglang_moe_args, "system")
+        assert sglang_moe_args.system == ""
+
+    @pytest.mark.pre_merge
+    @pytest.mark.unit
+    @pytest.mark.gpu_0
+    def test_model_autogen_args_have_rapid_strategy(
+        self,
+        vllm_args_with_model_autogen,
+        sglang_args_with_model_autogen,
+        trtllm_args_with_model_autogen,
+    ):
+        """Test that model autogen fixtures have RAPID search strategy and GPU info."""
+        for args_fixture in [
+            vllm_args_with_model_autogen,
+            sglang_args_with_model_autogen,
+            trtllm_args_with_model_autogen,
+        ]:
+            assert hasattr(args_fixture, "search_strategy")
+            assert args_fixture.search_strategy == SearchStrategy.RAPID
+            assert hasattr(args_fixture, "system")
+            assert args_fixture.system == "h100_sxm"
+            # Verify GPU discovery attributes
+            assert hasattr(args_fixture, "num_gpus_per_node")
+            assert args_fixture.num_gpus_per_node == 8
+            assert hasattr(args_fixture, "gpu_model")
+            assert args_fixture.gpu_model == "H100-SXM5-80GB"
+            assert hasattr(args_fixture, "gpu_vram_mib")
+            assert args_fixture.gpu_vram_mib == 81920