docs: V1beta1 dgdr docs (#6647)

Signed-off-by: ashnamehrotra <ashnamehrotra@gmail.com> Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

docs: V1beta1 dgdr docs (#6647)
Signed-off-by: ashnamehrotra <ashnamehrotra@gmail.com> Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
d8b7d394 · Ashna Mehrotra · GitHub · 9df78cb6 · d8b7d394 · d8b7d394
Unverified Commit d8b7d394 authored Feb 27, 2026 by Ashna Mehrotra Committed by GitHub Feb 27, 2026
20 changed files
--- a/components/src/dynamo/profiler/deploy/profile_sla_aic_dgdr.yaml
+++ b/components/src/dynamo/profiler/deploy/profile_sla_aic_dgdr.yaml
@@ -2,27 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 # DynamoGraphDeploymentRequest for AI Configurator-based profiling
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: sla-aic
 spec:
  model: Qwen/Qwen3-32B
  backend: trtllm
+  image: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
-  # ProfilingConfig maps directly to the profile_sla.py config format
-  profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
-    # NOTE: any image built before January 10 and any release prior to 0.8.1
-    # will need to use snake_case within profilingConfig.config
-    config:
-      searchStrategy: rapid
-      sla:
-        isl: 3000
-        osl: 150
-        ttft: 500.0
-        itl: 30.0
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag"
-  autoApply: true
--- a/components/src/dynamo/profiler/deploy/profile_sla_dgdr.yaml
+++ b/components/src/dynamo/profiler/deploy/profile_sla_dgdr.yaml
@@ -2,27 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 # DynamoGraphDeploymentRequest for online profiling (actual deployment testing)
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: sla-online
 spec:
  model: Qwen/Qwen3-0.6B
  backend: vllm
+  image: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag" # tag must be at least 1.0.0
-  # ProfilingConfig maps directly to the profile_sla.py config format
+  searchStrategy: thorough
-  profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
-    # NOTE: any image built before January 10 and any release prior to 0.8.1
-    # will need to use snake_case within profilingConfig.config
-    config:
-      searchStrategy: thorough
-      sla:
-        isl: 3000
-        osl: 150
-        ttft: 200.0
-        itl: 20.0
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag"
-  autoApply: true
--- a/components/src/dynamo/profiler/deploy/profile_sla_moe_dgdr.yaml
+++ b/components/src/dynamo/profiler/deploy/profile_sla_moe_dgdr.yaml
@@ -2,49 +2,20 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 # DynamoGraphDeploymentRequest for MoE model profiling
-# Converted from profile_sla_moe_job.yaml
+apiVersion: nvidia.com/v1beta1
-apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: sla-moe
 spec:
  model: deepseek-ai/DeepSeek-R1
  backend: sglang
+  image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:my-tag"
+  searchStrategy: rapid
-  # ProfilingConfig maps directly to the profile_sla.py config format
+  modelCache:
-  profilingConfig:
+    pvcName: "model-cache"                      # Name of PVC containing model weights
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
+    pvcModelPath: "deepseek-r1"                  # Subpath within PVC where model is stored
-    # NOTE: any image built before January 10 and any release prior to 0.8.1
-    # will need to use snake_case within profilingConfig.config
-    config:
-      # 0.8.1 and later: Model cache PVC to access model weights
-      deployment:
-        modelCache:
-          pvcName: "model-cache"                      # Name of PVC containing model weights
-          pvcPath: "deepseek-r1"                      # Subpath within PVC where model is stored
-      searchStrategy: rapid
-      hardware:
-        # for h200, sweep over 8-16 GPUs per engine
-        minNumGpusPerEngine: 8
-        maxNumGpusPerEngine: 16
-        numGpusPerNode: 8  # Override auto-discovered value if different
-      sla:
-        isl: 3000
-        osl: 150
-        ttft: 200.0
-        itl: 20.0
-    # Reference to ConfigMap containing the DGD base config
-    # For MoE models, this should point to the appropriate disagg config
-    # Original path: /sgl-workspace/dynamo/recipes/deepseek-r1/sglang/disagg-16gpu/deploy.yaml
-    configMapRef:
-      name: deepseek-r1-config
-      key: tep16p-dep16d-disagg.yaml
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:my-tag"
-  autoApply: true
+  hardware:
+    # for h200 nodes (8 GPUs per node)
+    numGpusPerNode: 8  # Override auto-discovered value if different
--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
@@ -9108,7 +9108,7 @@ spec:
                conditions:
                  description: |-
                    Conditions contains the latest observed conditions of the deployment request.
-                    Standard condition types include: Validated, ProfilingComplete, DeploymentReady.
+                    Standard condition types include: Succeeded, Validation, Profiling, SpecGenerated, DeploymentReady.
                  items:
                    description: Condition contains details for one aspect of the current state of this API Resource.
                    properties:

--- a/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
+++ b/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
@@ -458,7 +458,7 @@ type DynamoGraphDeploymentRequestStatus struct {
 	ProfilingJobName string `json:"profilingJobName,omitempty"`
 	// Conditions contains the latest observed conditions of the deployment request.
-	// Standard condition types include: Validated, ProfilingComplete, DeploymentReady.
+	// Standard condition types include: Succeeded, Validation, Profiling, SpecGenerated, DeploymentReady.
 	// +optional
 	// +listType=map
 	// +listMapKey=type

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
@@ -9108,7 +9108,7 @@ spec:
                conditions:
                  description: |-
                    Conditions contains the latest observed conditions of the deployment request.
-                    Standard condition types include: Validated, ProfilingComplete, DeploymentReady.
+                    Standard condition types include: Succeeded, Validation, Profiling, SpecGenerated, DeploymentReady.
                  items:
                    description: Condition contains details for one aspect of the current state of this API Resource.
                    properties:

--- a/deploy/operator/config/samples/kustomization.yaml
+++ b/deploy/operator/config/samples/kustomization.yaml
@@ -18,7 +18,7 @@ resources:
 - nvidia.com_v1alpha1_dynamocomponentdeployment.yaml
 - nvidia.com_v1alpha1_dynamocomponent.yaml
 - nvidia.com_v1alpha1_dynamographdeployment.yaml
- nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
+- nvidia.com_v1beta1_dynamographdeploymentrequest.yaml
 - nvidia.com_v1alpha1_dynamomodel.yaml
 - nvidia.com_v1alpha1_dynamocheckpoint.yaml
 #+kubebuilder:scaffold:manifestskustomizesamples
--- a/deploy/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
+++ b/deploy/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeploymentRequest
-metadata:
-  name: example-llm-sla
-spec:
-  # Model is a high-level identifier for the model being deployed (required - injected into profilingConfig.config.deployment.model)
-  model: Qwen/Qwen3-0.6B
-  # Backend to use for profiling (required - injected into profilingConfig.config.engine.backend)
-  backend: trtllm
-  # ProfilerImage is the container image to use for profiling jobs (required)
-  profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.9.0"
-  # ProfilingConfig maps directly to the profile_sla.py config format
-  # See dynamo/profiler/utils/profiler_argparse.py for complete schema
-  # Note: deployment.model and engine.backend are automatically set from model and backend above
-  profilingConfig:
-    config:
-      # Optional: Output directory for profiling results (defaults to /data in the Job)
-      # output_dir: "profiling_results"
-      # Engine configuration
-      engine:
-        maxContextLength: 16384  # will override max context length of the model if provided
-      # Search strategy: 'rapid' for AI Configurator estimation (20-30s), 'thorough' for actual deployments (2-4h)
-      searchStrategy: thorough
-      # Hardware configuration
-      # Note: Operator auto-discovers GPU info from cluster nodes when available
-      hardware:
-        minNumGpusPerEngine: 1  # Minimum GPUs to test
-        maxNumGpusPerEngine: 4  # Maximum GPUs to test
-        numGpusPerNode: 8       # GPUs per node (optional - auto-discovered if not specified)
-        system: h200_sxm        # Hardware system (optional - auto-detected if not specified)
-        # gpuModel: "H200-SXM"  # GPU model (optional - auto-discovered)
-        # gpuVramMib: 141557    # GPU VRAM in MiB (optional - auto-discovered)
-      # Sweep/profiling configuration
-      sweep:
-        prefillInterpolationGranularity: 16  # Samples for TTFT interpolation
-        decodeInterpolationGranularity: 6    # Samples for ITL interpolation
-      # SLA targets for profiling
-      sla:
-        isl: 3000  # Input sequence length
-        osl: 500   # Output sequence length
-        ttft: 50.0  # Time To First Token target (milliseconds)
-        itl: 10.0   # Inter-Token Latency target (milliseconds)
-      # Optional: Planner-specific arguments
-      # planner:
-      #   plannerMinEndpoint: 2
-      #   # Add any other planner args here
-    # Reference to ConfigMap containing the DGD base config (disagg.yaml)
-    # The path to this file will be automatically set as engine.config
-    configMapRef:
-      name: my-profiling-config
-      key: disagg.yaml  # defaults to "disagg.yaml"
-  # Optional: Automatically create DynamoGraphDeployment after profiling
-  autoApply: true  # default is false
-  # Optional: Override metadata for auto-created DGD (only used when autoApply: true)
-  # deploymentOverrides:
-  #   name: my-custom-dgd-name
-  #   namespace: production
-  #   labels:
-  #     team: ml-platform
-  #   annotations:
-  #     description: "Auto-generated from DGDR"
--- a/deploy/operator/config/samples/nvidia.com_v1beta1_dynamographdeploymentrequest.yaml
+++ b/deploy/operator/config/samples/nvidia.com_v1beta1_dynamographdeploymentrequest.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1beta1
+kind: DynamoGraphDeploymentRequest
+metadata:
+  name: example-llm-sla
+spec:
+  # Model is a high-level identifier for the model being deployed (required)
+  model: Qwen/Qwen3-0.6B
+  # Backend to use for profiling and deployment
+  backend: trtllm
+  # Image is the container image reference for the profiling job
+  image: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.9.0"
+  # SearchStrategy controls the profiling search depth
+  # "rapid" for fast sweep; "thorough" for deeper exploration
+  searchStrategy: thorough
+  # Hardware describes the hardware resources available for profiling and deployment.
+  # In cluster-scoped mode the operator auto-discovers GPU info from cluster nodes
+  # (via GPU Feature Discovery labels), so these fields are optional.
+  # In namespace-restricted mode, auto-discovery is intentionally disabled because
+  # the operator lacks permission to list cluster nodes. Validation will reject the
+  # DGDR before any profiling runs, so you must explicitly set numGpusPerNode,
+  # gpuSku, and vramMb.
+  hardware:
+    numGpusPerNode: 8       # GPUs per node (required in namespace-restricted mode)
+    gpuSku: h200_sxm        # Hardware system (required in namespace-restricted mode)
+    # vramMb: 141557        # GPU VRAM in MiB (required in namespace-restricted mode)
+  # Workload defines the expected workload characteristics
+  workload:
+    isl: 3000  # Input sequence length
+    osl: 500   # Output sequence length
+  # SLA defines service-level agreement targets for profiling optimization
+  sla:
+    ttft: 50.0  # Time To First Token target (milliseconds)
+    itl: 10.0   # Inter-Token Latency target (milliseconds)
+  # Optional: Features controls optional Dynamo platform features
+  # features:
+  #   planner:
+  #     plannerMinEndpoint: 2
+  #   mocker:
+  #     enabled: false
+  # Optional: Overrides allows customizing the profiling job and generated DGD
+  # overrides:
+  #   profilingJob: { ... }
+  #   dgd: { ... }
+  # Optional: Automatically create DynamoGraphDeployment after profiling
+  autoApply: true  # default is true
--- a/docs/components/profiler/profiler_guide.md
+++ b/docs/components/profiler/profiler_guide.md
@@ -12,10 +12,10 @@ This guide covers deployment, configuration, integration, and troubleshooting fo
 A **DynamoGraphDeploymentRequest (DGDR)** is a Kubernetes Custom Resource that serves as the primary interface for users to request model deployments with specific performance and resource constraints. You specify:
 - **What** model you want to deploy (`model`)
- **How** it should perform (SLA targets: `ttft`, `itl`)
+- **How** it should perform (SLA targets: `sla.ttft`, `sla.itl`)
- **Where** it should run (optional GPU preferences)
+- **Where** it should run (optional GPU preferences via `hardware`)
- **Which** backend to use (`backend`: sglang, trtllm, or vllm)
+- **Which** backend to use (`backend`: auto, vllm, sglang, or trtllm)
- **Which** images to use (`profilingConfig.profilerImage`, `deploymentOverrides.workersImage`)
+- **Which** image to use (`image`)
 The Dynamo Operator watches for DGDRs and automatically:
 1. Discovers available GPU resources in your cluster
@@ -60,17 +60,13 @@ The recommended deployment method is through DGDRs. Sample configurations are pr
 #### Container Images
-Each DGDR requires container images for profiling and deployment:
+Each DGDR uses a single container image for profiling and deployment:
- **`profilingConfig.profilerImage`** (Required): Container image for the profiling job. Must contain the profiler code and dependencies.
+- **`image`** (Optional): Container image for the profiling job. Must contain the profiler code and dependencies.
- **`deploymentOverrides.workersImage`** (Optional): Container image for DGD worker components (frontend, workers, planner). If omitted, uses image from the base config file.
 ```yaml
 spec:
-  profilingConfig:
+  image: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
 ```
 #### Quick Start: Deploy with DGDR
@@ -80,23 +76,23 @@ spec:
 Use a sample configuration or create your own:
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: my-model-profiling
 spec:
  model: "Qwen/Qwen3-0.6B"
  backend: vllm
-  profilingConfig:
+  image: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
-    config:
+  workload:
-      sla:
+    isl: 3000
-        isl: 3000
+    osl: 150
-        osl: 150
-        ttft: 200.0
+  sla:
-        itl: 20.0
+    ttft: 200.0
-  deploymentOverrides:
+    itl: 20.0
-    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
  autoApply: true
 ```
@@ -120,11 +116,12 @@ kubectl describe dgdr my-model-profiling -n $NAMESPACE
 kubectl logs -f job/profile-my-model-profiling -n $NAMESPACE
 ```
-**DGDR Status States:**
+**DGDR Status Phases:**
 - `Pending`: Initial state, preparing to profile
 - `Profiling`: Running profiling job (20-30 seconds for AIC, 2-4 hours for online)
+- `Ready`: Profiling complete, generated DGD spec available in status
 - `Deploying`: Generating and applying DGD configuration
- `Ready`: DGD successfully deployed and running
+- `Deployed`: DGD successfully deployed and running
 - `Failed`: Error occurred (check events for details)
 **Step 4: Access Your Deployment**
@@ -143,21 +140,6 @@ curl http://localhost:8000/v1/models
 > [!NOTE]
 > DGDRs are **immutable**. To update SLAs or configuration, delete the existing DGDR and create a new one.
-### Direct Script Execution
-For advanced use cases or local development:
-```bash
-python -m benchmarks.profiler.profile_sla \
-  --backend vllm \
-  --config path/to/disagg.yaml \
-  --model meta-llama/Llama-3-8B \
-  --ttft 200 --itl 15 \
-  --isl 3000 --osl 150 \
-  --min-num-gpus 1 \
-  --max-num-gpus 8
-```
 ## Profiling Method
 The profiler follows a 5-step process:
@@ -188,11 +170,11 @@ Profiles your model by creating real test deployments in Kubernetes and measurin
 - **GPU Requirements**: Full access to test different parallelization mappings
 - **Backends**: SGLang, TensorRT-LLM, vLLM
+Online (AIPerf-based) profiling is not the default strategy (`rapid` is). Set `searchStrategy: thorough` to enable comprehensive real-engine profiling:
 ```yaml
-profilingConfig:
+spec:
-  config:
+  searchStrategy: thorough  # Deep exploration with real engine profiling
-    sweep:
-      useAiConfigurator: false  # Default
 ```
 ### AI Configurator Simulation
@@ -202,16 +184,13 @@ Uses performance simulation to rapidly estimate optimal configurations without r
 - **Duration**: 20-30 seconds
 - **Accuracy**: Estimated (may have errors for unusual configurations)
 - **GPU Requirements**: None
- **Backends**: TensorRT-LLM only (SGLang/vLLM coming soon)
+- **Backends**: All backends (vLLM, SGLang, TensorRT-LLM)
+AI Configurator simulation is enabled by default via `searchStrategy: rapid`:
 ```yaml
-profilingConfig:
+spec:
-  config:
+  searchStrategy: rapid  # Fast profiling with AI Configurator simulation
-    sweep:
-      useAiConfigurator: true
-      aicSystem: h200_sxm
-      aicHfId: Qwen/Qwen3-32B
-      aicBackendVersion: "0.20.0"      # TRT-LLM version simulated by AIC
 ```
 > [!NOTE]
@@ -240,39 +219,33 @@ If GPU discovery is unavailable (no permissions or no GPU labels), the profiler
 ### DGDR Configuration Structure
-All profiler configuration goes under `spec.profilingConfig.config`:
+All profiler configuration is provided through the v1beta1 DGDR spec fields:
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: my-deployment
 spec:
  model: "Qwen/Qwen3-0.6B"
  backend: vllm
+  image: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
-  profilingConfig:
+  workload: { ... }
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+  sla: { ... }
-    configMapRef:                  # Optional: base DGD config
+  hardware: { ... }
-      name: my-config
+  features: { ... }
-      key: disagg.yaml
+  overrides: { ... }
-    config:
-      sla: { ... }
-      hardware: { ... }
-      sweep: { ... }
-      planner: { ... }
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
 ```
-### SLA Configuration (Required)
+### SLA Configuration (Optional)
 ```yaml
-sla:
+workload:
  isl: 3000      # Average input sequence length (tokens)
  osl: 150       # Average output sequence length (tokens)
+sla:
  ttft: 200.0    # Target Time To First Token (milliseconds)
  itl: 20.0      # Target Inter-Token Latency (milliseconds)
 ```
@@ -286,54 +259,40 @@ sla:
 ```yaml
 hardware:
-  minNumGpusPerEngine: 2      # Auto-determined from model size and VRAM if not provided
+  gpuSku: h200_sxm            # GPU SKU identifier (auto-detected)
-  maxNumGpusPerEngine: 8      # Maximum GPUs to test
+  vramMb: 141557              # VRAM per GPU in MiB
+  totalGpus: 16               # Total GPUs available in the cluster
  numGpusPerNode: 8           # GPUs per node (for multi-node MoE)
-  gpuType: h200_sxm           # GPU type hint (informational, auto-detected)
 ```
- **minNumGpusPerEngine**: Skip small TP sizes if your model is large
- **maxNumGpusPerEngine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error))
 - **numGpusPerNode**: Determine the upper bound of GPUs per node for dense models and configure Grove for multi-node MoE engines
- **gpuType**: Informational only, auto-detected by the controller. For AI Configurator, use `aicSystem` in the [sweep configuration](#ai-configurator-configuration) instead
+- **gpuSku**: GPU SKU identifier, auto-detected by the controller
 > [!TIP]
 > If you don't specify hardware constraints, the controller auto-detects based on your model size and available cluster resources.
-### Sweep Configuration (Optional)
+### Search Strategy (Optional)
-```yaml
+Controls the profiling search depth:
-sweep:
-  useAiConfigurator: false              # Use real profiling (default)
-  prefillInterpolationGranularity: 16   # Samples for prefill TTFT curve
-  decodeInterpolationGranularity: 6     # Samples for decode ITL curve
-```
- **useAiConfigurator**: Set to `true` for 20-30 second profiling (TensorRT-LLM only)
- **prefillInterpolationGranularity**: Samples for prefill TTFT curve (lower = faster but less accurate)
- **decodeInterpolationGranularity**: Samples for decode ITL curve. Since ITL interpolation is 3D and takes longer, we default to fewer samples. Increasing this value may quadratically increase profiling time.
-### AI Configurator Configuration
-Required if `useAiConfigurator: true`:
 ```yaml
-sweep:
+spec:
-  useAiConfigurator: true
+  searchStrategy: rapid   # "rapid" (default) for fast sweep; "thorough" for deeper exploration
-  aicSystem: h200_sxm              # h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
-  aicHfId: Qwen/Qwen3-32B         # HuggingFace model ID
-  aicBackendVersion: "0.20.0"      # TensorRT-LLM version
 ```
+- **rapid**: Performs a fast sweep over parallelization mappings (default)
+- **thorough**: Explores more configurations for potentially better results
 ### Planner Configuration (Optional)
-Pass arguments to the SLA planner:
+Pass arguments to the SLA planner via the features section:
 ```yaml
-planner:
+features:
-  planner_min_endpoint: 2                    # Minimum endpoints to maintain
+  planner:
-  planner_adjustment_interval: 60            # Adjustment interval (seconds)
+    planner_min_endpoint: 2                    # Minimum endpoints to maintain
-  planner_load_predictor: linear             # Load prediction method
+    planner_adjustment_interval: 60            # Adjustment interval (seconds)
+    planner_load_predictor: linear             # Load prediction method
 ```
 > [!NOTE]
@@ -344,11 +303,10 @@ planner:
 For large models, use a pre-populated PVC containing model weights instead of downloading from HuggingFace:
 ```yaml
-deployment:
+modelCache:
-  modelCache:
+  pvcName: "model-cache"
-    pvcName: "model-cache"
+  pvcModelPath: "hub/models--deepseek-ai--DeepSeek-R1"
-    pvcPath: "hub/models--deepseek-ai--DeepSeek-R1"
+  pvcMountPath: "/opt/model-cache"
-    mountPath: "/opt/model-cache"
 ```
 Requirements:
@@ -357,7 +315,7 @@ Requirements:
 ### Engine Configuration (Auto-configured)
-The controller automatically injects these from high-level fields:
+The controller automatically handles model and backend configuration from high-level fields:
 ```yaml
 # You specify:
@@ -365,58 +323,28 @@ spec:
  model: "Qwen/Qwen3-0.6B"
  backend: vllm
-# Controller auto-injects:
+# Controller auto-injects into the profiling job
-profilingConfig:
-  config:
-    deployment:
-      model: "Qwen/Qwen3-0.6B"
-    engine:
-      backend: vllm
-      config: /path/to/configmap
 ```
-You should **not** manually set `deployment.model` or `engine.backend` in `profilingConfig.config`.
+You should **not** manually set model or backend in profiling config overrides.
-### Using Existing DGD Configs (ConfigMap)
-Reference an existing DGD config via ConfigMap:
+### Using Existing DGD Configs
-```bash
+Provide a base DGD config via the overrides section:
-kubectl create configmap my-config \
-  --from-file=disagg.yaml=/path/to/your/disagg.yaml \
-  --namespace $NAMESPACE \
-  --dry-run=client -o yaml | kubectl apply -f -
-```
 ```yaml
-profilingConfig:
+overrides:
-  configMapRef:
+  dgd:
-    name: my-config
+    apiVersion: nvidia.com/v1alpha1
-    key: disagg.yaml
+    kind: DynamoGraphDeployment
+    metadata:
+      name: my-dgd
+    spec:
+      # ... your base DGD spec
 ```
 The profiler uses the DGD config as a **base template**, then optimizes it based on your SLA targets.
-### CLI Arguments
-| Argument | Type | Default | Description |
-|----------|------|---------|-------------|
-| `--backend` | string | - | Inference backend: sglang, trtllm, vllm |
-| `--config` | string | - | Path to DGD YAML config file |
-| `--model` | string | - | HuggingFace model ID |
-| `--ttft` | float | - | Target TTFT in milliseconds |
-| `--itl` | float | - | Target ITL in milliseconds |
-| `--isl` | int | - | Average input sequence length |
-| `--osl` | int | - | Average output sequence length |
-| `--min-num-gpus` | int | auto | Minimum GPUs per engine |
-| `--max-num-gpus` | int | 8 | Maximum GPUs per engine |
-| `--use-ai-configurator` | flag | false | Use offline AI Configurator |
-| `--pick-with-webui` | flag | false | Launch interactive WebUI |
-| `--webui-port` | int | 8000 | Port for WebUI |
-> [!NOTE]
-> CLI arguments map to DGDR config fields: `--min-num-gpus` = `hardware.minNumGpusPerEngine`, `--max-num-gpus` = `hardware.maxNumGpusPerEngine`, `--use-ai-configurator` = `sweep.useAiConfigurator`. See [DGDR Configuration Structure](#dgdr-configuration-structure) for all field mappings.
 ## Integration
 ### With SLA Planner
@@ -476,10 +404,10 @@ Then manually extract and apply:
 ```bash
 # Extract generated DGD from DGDR status
-kubectl get dgdr my-deployment -n $NAMESPACE -o jsonpath='{.status.generatedDeployment}' | kubectl apply -f -
+kubectl get dgdr my-deployment -n $NAMESPACE -o jsonpath='{.status.profilingResults.selectedConfig}' | kubectl apply -f -
 # Or save to file for review
-kubectl get dgdr my-deployment -n $NAMESPACE -o jsonpath='{.status.generatedDeployment}' > my-dgd.yaml
+kubectl get dgdr my-deployment -n $NAMESPACE -o jsonpath='{.status.profilingResults.selectedConfig}' > my-dgd.yaml
 ```
 ### Mocker Deployment
@@ -490,7 +418,9 @@ Deploy a mocker deployment that simulates engines without GPUs:
 spec:
  model: <model-name>
  backend: trtllm
-  useMocker: true    # Deploy mocker instead of real backend
+  features:
+    mocker:
+      enabled: true    # Deploy mocker instead of real backend
  autoApply: true
 ```
@@ -498,11 +428,17 @@ Profiling still runs against the real backend to collect performance data. The m
 ### Accessing Profiling Artifacts
-By default, profiling data is stored in ConfigMaps. For detailed artifacts (plots, logs, raw data), attach a PVC:
+By default, profiling data is stored in ConfigMaps. For detailed artifacts (plots, logs, raw data), attach a PVC via overrides:
 ```yaml
-profilingConfig:
+overrides:
-  outputPVC: "dynamo-pvc"
+  profilingJob:
+    template:
+      spec:
+        volumes:
+        - name: profiling-output
+          persistentVolumeClaim:
+            claimName: "dynamo-pvc"
 ```
 **ConfigMaps (always created):**
@@ -560,17 +496,18 @@ View traces using Chrome's `chrome://tracing`, [Perfetto UI](https://ui.perfetto
 ### Profiling Takes Too Long
-**Solution 1**: Use AI Configurator for rapid profiling (TensorRT-LLM only):
+**Solution 1**: Use `searchStrategy: rapid` for fast AI Configurator profiling (supported for all backends):
 ```yaml
-sweep:
+spec:
-  useAiConfigurator: true
+  searchStrategy: rapid
 ```
-**Solution 2**: Reduce search space:
+**Solution 2**: Reduce search space by specifying hardware constraints in the DGDR:
 ```yaml
-hardware:
+spec:
-  minNumGpusPerEngine: 4  # Skip TP1, TP2
+  hardware:
-  maxNumGpusPerEngine: 8  # Don't test beyond TP8
+    numGpusPerNode: 4
+    totalGpus: 8
 ```
 ### SLA Cannot Be Met

--- a/docs/kubernetes/api_reference.md
+++ b/docs/kubernetes/api_reference.md
@@ -430,19 +430,17 @@ DynamoGraphDeployment is the Schema for the dynamographdeployments API.
 DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
-It serves as the primary interface for users to request model deployments with
+It provides a simplified, SLA-driven interface for deploying inference models on Dynamo.
-specific performance and resource constraints, enabling SLA-driven deployments.
+Users specify a model and optional performance targets; the controller handles profiling,
+configuration selection, and deployment.
 Lifecycle:
- 1. Initial → Pending: Validates spec and prepares for profiling
+ 1. Pending: Spec validated, preparing for profiling
- 2. Pending → Profiling: Creates and runs profiling job (online or AIC)
+ 2. Profiling: Profiling job is running to discover optimal configurations
- 3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
+ 3. Ready: Profiling complete, generated DGD spec available in status
- 4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
+ 4. Deploying: DGD is being created and rolled out (when autoApply=true)
- 5. Ready: Terminal state when DGD is operational or spec is available
+ 5. Deployed: DGD is running and healthy
- 6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
+ 6. Failed: An unrecoverable error occurred
-The spec becomes immutable once profiling starts. Users must delete and recreate
-the DGDR to modify configuration after this point.
@@ -450,7 +448,7 @@ the DGDR to modify configuration after this point.
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
+| `apiVersion` _string_ | `nvidia.com/v1beta1` | | |
 | `kind` _string_ | `DynamoGraphDeploymentRequest` | | |
 | `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
 | `spec` _[DynamoGraphDeploymentRequestSpec](#dynamographdeploymentrequestspec)_ | Spec defines the desired state for this deployment request. |  |  |
@@ -462,8 +460,7 @@ the DGDR to modify configuration after this point.
 DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.
-This CRD serves as the primary interface for users to request model deployments with
+Only the Model field is required; all other fields are optional and have sensible defaults.
-specific performance constraints and resource requirements, enabling SLA-driven deployments.
@@ -472,12 +469,17 @@ _Appears in:_
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />This is a high-level identifier for easy reference in kubectl output and logs.<br />The controller automatically sets this value in profilingConfig.config.deployment.model. |  | Required: \{\} <br /> |
+| `model` _string_ | Model specifies the model to deploy (e.g., "Qwen/Qwen3-0.6B", "meta-llama/Llama-3-70b").<br />Can be a HuggingFace ID or a private model name. |  | Required: \{\} <br />MinLength: 1 <br /> |
-| `backend` _string_ | Backend specifies the inference backend for profiling.<br />The controller automatically sets this value in profilingConfig.config.engine.backend.<br />Profiling runs on real GPUs or via AIC simulation to collect performance data. |  | Enum: [vllm sglang trtllm] <br />Required: \{\} <br /> |
+| `backend` _[BackendType](#backendtype)_ | Backend specifies the inference backend to use for profiling and deployment. | auto | Enum: [auto sglang trtllm vllm] <br /> |
-| `useMocker` _boolean_ | UseMocker indicates whether to deploy a mocker DynamoGraphDeployment instead of<br />a real backend deployment. When true, the deployment uses simulated engines that<br />don't require GPUs, using the profiling data to simulate realistic timing behavior.<br />Mocker is available in all backend images and useful for large-scale experiments.<br />Profiling still runs against the real backend (specified above) to collect performance data. | false |  |
+| `image` _string_ | Image is the container image reference for the profiling job. |  | Optional: \{\} <br /> |
-| `profilingConfig` _[ProfilingConfigSpec](#profilingconfigspec)_ | ProfilingConfig provides the complete configuration for the profiling job.<br />Note: GPU discovery is automatically attempted to detect GPU resources from Kubernetes<br />cluster nodes. If the operator has node read permissions (cluster-wide or explicitly granted),<br />discovered GPU configuration is used as defaults when hardware configuration is not manually<br />specified (minNumGpusPerEngine, maxNumGpusPerEngine, numGpusPerNode). User-specified values<br />always take precedence over auto-discovered values. If GPU discovery fails (e.g.,<br />namespace-restricted operator without node permissions), manual hardware config is required.<br />This configuration is passed directly to the profiler.<br />The structure matches the profile_sla config format exactly (see ProfilingConfigSpec for schema).<br />Note: deployment.model and engine.backend are automatically set from the high-level<br />modelName and backend fields and should not be specified in this config. |  | Required: \{\} <br /> |
+| `modelCache` _[ModelCacheSpec](#modelcachespec)_ | ModelCache provides optional PVC configuration for pre-downloaded model weights. |  | Optional: \{\} <br /> |
-| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, only the spec is generated and stored in status.<br />Users can then manually create a DGD using the generated spec. | false |  |
+| `hardware` _[HardwareSpec](#hardwarespec)_ | Hardware describes the hardware resources available for profiling and deployment. |  | Optional: \{\} <br /> |
-| `deploymentOverrides` _[DeploymentOverridesSpec](#deploymentoverridesspec)_ | DeploymentOverrides allows customizing metadata for the auto-created DGD.<br />Only applicable when AutoApply is true. |  | Optional: \{\} <br /> |
+| `workload` _[WorkloadSpec](#workloadspec)_ | Workload defines the expected workload characteristics for SLA-based profiling. |  | Optional: \{\} <br /> |
+| `sla` _[SLASpec](#slaspec)_ | SLA defines service-level agreement targets that drive profiling optimization. |  | Optional: \{\} <br /> |
+| `overrides` _[OverridesSpec](#overridesspec)_ | Overrides allows customizing the profiling job and the generated DynamoGraphDeployment. |  | Optional: \{\} <br /> |
+| `features` _[FeaturesSpec](#featuresspec)_ | Features controls optional Dynamo platform features in the generated deployment. |  | Optional: \{\} <br /> |
+| `searchStrategy` _[SearchStrategy](#searchstrategy)_ | SearchStrategy controls the profiling search depth. | rapid | Enum: [rapid thorough] <br /> |
+| `autoApply` _boolean_ | AutoApply indicates whether to automatically create a DynamoGraphDeployment<br />after profiling completes. If false, the generated spec is stored in status<br />for manual review and application. | true |  |
 #### DynamoGraphDeploymentRequestStatus
@@ -485,7 +487,6 @@ _Appears in:_
 DynamoGraphDeploymentRequestStatus represents the observed state of a DynamoGraphDeploymentRequest.
-The controller updates this status as the DGDR progresses through its lifecycle.
@@ -494,13 +495,14 @@ _Appears in:_
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `state` _string_ | State is a high-level textual status of the deployment request lifecycle.<br />Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"<br />Empty string ("") represents the initial state before initialization. |  |  |
+| `phase` _[DGDRPhase](#dgdrphase)_ | Phase is the high-level lifecycle phase of the deployment request. |  | Enum: [Pending Profiling Ready Deploying Deployed Failed] <br /> |
-| `backend` _string_ | Backend is extracted from profilingConfig.config.engine.backend for display purposes.<br />This field is populated by the controller and shown in kubectl output. |  | Optional: \{\} <br /> |
+| `profilingPhase` _[ProfilingPhase](#profilingphase)_ | ProfilingPhase indicates the current sub-phase of the profiling pipeline.<br />Only meaningful when Phase is "Profiling". |  | Optional: \{\} <br /> |
-| `observedGeneration` _integer_ | ObservedGeneration reflects the generation of the most recently observed spec.<br />Used to detect spec changes and enforce immutability after profiling starts. |  |  |
+| `dgdName` _string_ | DGDName is the name of the generated or created DynamoGraphDeployment. |  | Optional: \{\} <br /> |
-| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.<br />Conditions are merged by type on patch updates. |  |  |
+| `profilingJobName` _string_ | ProfilingJobName is the name of the Kubernetes Job running the profiler. |  | Optional: \{\} <br /> |
-| `profilingResults` _string_ | ProfilingResults contains a reference to the ConfigMap holding profiling data.<br />Format: "configmap/<name>" |  | Optional: \{\} <br /> |
+| `observedGeneration` _integer_ | ObservedGeneration is the most recent generation observed by the controller. |  |  |
-| `generatedDeployment` _[RawExtension](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#rawextension-runtime-pkg)_ | GeneratedDeployment contains the full generated DynamoGraphDeployment specification<br />including metadata, based on profiling results. Users can extract this to create<br />a DGD manually, or it's used automatically when autoApply is true.<br />Stored as RawExtension to preserve all fields including metadata.<br />For mocker backends, this contains the mocker DGD spec. |  | EmbeddedResource: \{\} <br />Optional: \{\} <br /> |
+| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Succeeded, Validation, Profiling, SpecGenerated, DeploymentReady. |  |  |
-| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. |  | Optional: \{\} <br /> |
+| `profilingResults` _[ProfilingResultsStatus](#profilingresultsstatus)_ | ProfilingResults contains the output of the profiling process including<br />Pareto-optimal configurations and the selected deployment configuration. |  | Optional: \{\} <br /> |
+| `deploymentInfo` _[DeploymentInfoStatus](#deploymentinfostatus)_ | DeploymentInfo tracks the state of the deployed DynamoGraphDeployment. |  | Optional: \{\} <br /> |
 #### DynamoGraphDeploymentScalingAdapter

--- a/docs/pages/components/planner/planner-examples.md
+++ b/docs/pages/components/planner/planner-examples.md
@@ -13,32 +13,14 @@ Practical examples for deploying the SLA Planner with throughput-based scaling.
 The simplest way to deploy with the SLA planner. Uses AI Configurator for offline profiling (20-30 seconds instead of hours):
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: sla-aic
 spec:
  model: Qwen/Qwen3-32B
  backend: vllm
+  image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:my-tag"
-  profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
-    config:
-      sla:
-        isl: 3000
-        osl: 150
-        ttft: 200
-        itl: 20
-      sweep:
-        useAiConfigurator: true
-        aicSystem: h200_sxm
-        aicHfId: Qwen/Qwen3-32B
-        aicBackendVersion: "0.20.0"
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
-  autoApply: true
 ```
 Deploy:
@@ -52,31 +34,14 @@ kubectl apply -f components/src/dynamo/profiler/deploy/profile_sla_aic_dgdr.yaml
 Standard online profiling runs real GPU measurements for more accurate results. Takes 2-4 hours:
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: sla-online
 spec:
  model: meta-llama/Llama-3.3-70B-Instruct
  backend: vllm
+  image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:my-tag"
-  profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
-    config:
-      sla:
-        isl: 3000
-        osl: 150
-        ttft: 200
-        itl: 20
-      sweep:
-        useAiConfigurator: false
-        prefillInterpolationGranularity: 16
-        decodeInterpolationGranularity: 6
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
-  autoApply: true
 ```
 Deploy:
@@ -89,7 +54,7 @@ Available sample DGDRs in `components/src/dynamo/profiler/deploy/`:
 - **`profile_sla_aic_dgdr.yaml`**: Fast offline profiling using AI Configurator
 - **`profile_sla_moe_dgdr.yaml`**: Online profiling for MoE models (SGLang)
-> **Profiling Config Cases**: Prior to 0.8.1, fields under `profilingConfig.config` use snake_case. Starting 0.8.1, fields use camelCase. There is backwards compatibility to snake_case, but example DGDRs use camelCase.
+> **Note**: Starting with Dynamo 1.0.0 (DGDR API version v1beta1), DGDRs are configured through structured spec fields (e.g., `spec.workload`, `spec.sla`, `spec.hardware`) instead of the nested `profilingConfig.config` blob used in v1alpha1.
 ## Kubernetes Examples
@@ -98,29 +63,14 @@ Available sample DGDRs in `components/src/dynamo/profiler/deploy/`:
 For Mixture-of-Experts models like DeepSeek-R1, use the SGLang backend:
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: sla-moe
 spec:
  model: deepseek-ai/DeepSeek-R1
  backend: sglang
+  image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:my-tag"
-  profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
-    config:
-      sla:
-        isl: 4000
-        osl: 500
-        ttft: 300
-        itl: 10
-      sweep:
-        useAiConfigurator: false
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
-  autoApply: true
 ```
 Deploy:
@@ -144,60 +94,36 @@ kubectl create configmap deepseek-r1-config \
 **Step 2: Reference it in your DGDR:**
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: deepseek-r1
 spec:
  model: deepseek-ai/DeepSeek-R1
  backend: sglang
+  image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:my-tag"
-  profilingConfig:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
-    configMapRef:
-      name: deepseek-r1-config
-      key: disagg.yaml  # Must match the key used in --from-file
-    config:
-      sla:
-        isl: 4000
-        osl: 500
-        ttft: 300
-        itl: 10
-      sweep:
-        useAiConfigurator: true
-        aicSystem: h200_sxm
-        aicHfId: deepseek-ai/DeepSeek-V3
-        aicBackendVersion: "0.20.0"
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1"
-  autoApply: true
 ```
 The profiler uses the DGD config from the ConfigMap as a **base template**, then optimizes it based on your SLA targets. The controller automatically injects `spec.model` and `spec.backend` into the final configuration.
 ### Inline Configuration (Simple Use Cases)
-For simple use cases without a custom DGD config, provide profiler configuration directly. The profiler auto-generates a basic DGD configuration:
+For simple use cases without a custom DGD config, provide the configuration directly in the v1beta1 DGDR spec fields. The profiler auto-generates a basic DGD configuration:
 ```yaml
-profilingConfig:
+spec:
-  config:
+  workload:
-    sla:
+    isl: 8000
-      isl: 8000
+    osl: 200
-      osl: 200
-      ttft: 200.0
+  sla:
-      itl: 10.0
+    ttft: 200.0
+    itl: 10.0
-    hardware:
-      minNumGpusPerEngine: 2
+  hardware:
-      maxNumGpusPerEngine: 8
+    gpuSku: h200_sxm
-      gpuType: h200_sxm
+  searchStrategy: rapid
-    sweep:
-      prefillInterpolationGranularity: 16
-      decodeInterpolationGranularity: 6
 ```
 ### Mocker Deployment (Testing)
@@ -211,20 +137,11 @@ Deploy a mocker backend that simulates GPU timing behavior without real GPUs. Us
 spec:
  model: <model-name>
  backend: trtllm  # Real backend for profiling
-  useMocker: true  # Deploy mocker instead of real backend
+  features:
+    mocker:
-  profilingConfig:
+      enabled: true  # Deploy mocker instead of real backend
-    profilerImage: "nvcr.io/nvidia/dynamo/trtllm-runtime:<image-tag>"
-    config:
+  image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:my-tag"
-      sla:
-        isl: 3000
-        osl: 150
-        ttft: 200
-        itl: 20
-      sweep:
-        useAiConfigurator: true
-        aicSystem: h100_sxm
-  autoApply: true
 ```
 Profiling runs against the real backend (via GPUs or AIC). The mocker deployment then uses profiling data to simulate realistic timing.
@@ -314,10 +231,9 @@ See `components/planner/test/test_virtual_connector.py` for a full working examp
 Pass planner-specific settings through the DGDR:
 ```yaml
-profilingConfig:
+features:
-  config:
+  planner:
-    planner:
+    plannerMinEndpoint: 2
-      plannerMinEndpoint: 2
 ```
 ### Review Before Deploy (autoApply: false)
@@ -334,7 +250,7 @@ After profiling completes:
 ```bash
 # Extract and review generated DGD
 kubectl get dgdr sla-aic -n $NAMESPACE \
-  -o jsonpath='{.status.generatedDeployment}' > my-dgd.yaml
+  -o jsonpath='{.status.profilingResults.selectedConfig}' > my-dgd.yaml
 # Review and modify as needed
 vi my-dgd.yaml
@@ -349,14 +265,13 @@ Save detailed profiling artifacts (plots, logs, raw data) to a PVC:
 ```yaml
 spec:
-  profilingConfig:
+  workload:
-    outputPVC: "dynamo-pvc"
+    isl: 3000
-    config:
+    osl: 150
-      sla:
-        isl: 3000
+  sla:
-        osl: 150
+    ttft: 200
-        ttft: 200
+    itl: 20
-        itl: 20
 ```
 Setup:

--- a/docs/pages/components/profiler/README.md
+++ b/docs/pages/components/profiler/README.md
@@ -30,25 +30,22 @@ The Dynamo Profiler is an automated performance analysis tool that measures mode
 The recommended way to profile models is through DGDRs, which automate the entire profiling and deployment workflow.
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: my-model-profiling
 spec:
  model: "Qwen/Qwen3-0.6B"
  backend: vllm
+  image: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
-  profilingConfig:
+  workload:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+    isl: 3000      # Average input sequence length
-    config:
+    osl: 150       # Average output sequence length
-      sla:
-        isl: 3000      # Average input sequence length
-        osl: 150       # Average output sequence length
-        ttft: 200.0    # Target Time To First Token (ms)
-        itl: 20.0      # Target Inter-Token Latency (ms)
-  deploymentOverrides:
+  sla:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+    ttft: 200.0    # Target Time To First Token (ms)
+    itl: 20.0      # Target Inter-Token Latency (ms)
  autoApply: true
 ```
@@ -59,42 +56,18 @@ kubectl apply -f my-profiling-dgdr.yaml -n $NAMESPACE
 ### Using AI Configurator (Fast Offline Profiling)
-For TensorRT-LLM, use AI Configurator for rapid profiling (~30 seconds):
+AI Configurator (AIC) enables rapid offline profiling (~30 seconds) and supports all backends (vLLM, SGLang, TensorRT-LLM). Since `searchStrategy: rapid` is the default, AIC is used automatically unless you explicitly set `searchStrategy: thorough`.
-```yaml
-profilingConfig:
-  config:
-    sweep:
-      useAiConfigurator: true
-      aicSystem: h200_sxm
-      aicHfId: Qwen/Qwen3-32B
-      aicBackendVersion: "0.20.0"
-```
-### Direct Script Usage (Advanced)
-For advanced scenarios, run the profiler directly:
-```bash
-python -m dynamo.profiler.profile_sla \
-  --backend vllm \
-  --config path/to/disagg.yaml \
-  --model meta-llama/Llama-3-8B \
-  --ttft 200 --itl 15 \
-  --isl 3000 --osl 150
-```
 ## Configuration
 | Parameter | Default | Description |
 |-----------|---------|-------------|
-| `sla.isl` | - | Average input sequence length (tokens) |
+| `workload.isl` | 4000 | Average input sequence length (tokens) |
-| `sla.osl` | - | Average output sequence length (tokens) |
+| `workload.osl` | 1000 | Average output sequence length (tokens) |
-| `sla.ttft` | - | Target Time To First Token (milliseconds) |
+| `sla.ttft` | 2000 | Target Time To First Token (milliseconds) |
-| `sla.itl` | - | Target Inter-Token Latency (milliseconds) |
+| `sla.itl` | 30 | Target Inter-Token Latency (milliseconds) |
-| `sweep.useAiConfigurator` | `false` | Use offline simulation instead of real profiling |
+| `hardware.numGpusPerNode` | auto | Number of GPUs per node |
-| `hardware.minNumGpusPerEngine` | auto | Minimum GPUs per engine (auto-detected from model size) |
+| `hardware.gpuSku` | auto | GPU SKU identifier |
-| `hardware.maxNumGpusPerEngine` | 8 | Maximum GPUs per engine |
 ## Profiling Methods

--- a/docs/pages/components/profiler/profiler-examples.md
+++ b/docs/pages/components/profiler/profiler-examples.md
@@ -4,7 +4,7 @@
 title: Profiler Examples
 ---
-Complete examples for profiling with DGDRs, the interactive WebUI, and direct script usage.
+Complete examples for profiling with DGDRs.
 ## DGDR Examples
@@ -13,32 +13,22 @@ Complete examples for profiling with DGDRs, the interactive WebUI, and direct sc
 Standard online profiling with real GPU measurements:
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: vllm-dense-online
 spec:
  model: "Qwen/Qwen3-0.6B"
  backend: vllm
+  image: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
-  profilingConfig:
+  workload:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+    isl: 3000
-    config:
+    osl: 150
-      sla:
-        isl: 3000
-        osl: 150
-        ttft: 200.0
-        itl: 20.0
-      hardware:
+  sla:
-        minNumGpusPerEngine: 1
+    ttft: 200.0
-        maxNumGpusPerEngine: 8
+    itl: 20.0
-      sweep:
-        useAiConfigurator: false
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
  autoApply: true
 ```
@@ -48,31 +38,22 @@ spec:
 Fast offline profiling with AI Configurator (~30 seconds):
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: trtllm-aic-offline
 spec:
  model: "Qwen/Qwen3-32B"
  backend: trtllm
+  image: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.9.0"
-  profilingConfig:
+  workload:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.9.0"
+    isl: 4000
-    config:
+    osl: 500
-      sla:
-        isl: 4000
-        osl: 500
-        ttft: 300.0
-        itl: 10.0
-      sweep:
-        useAiConfigurator: true
-        aicSystem: h200_sxm  # Also supports h100_sxm, b200_sxm, gb200_sxm, a100_sxm
-        aicHfId: Qwen/Qwen3-32B
-        aicBackendVersion: "0.20.0"
-  deploymentOverrides:
+  sla:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.9.0"
+    ttft: 300.0
+    itl: 10.0
  autoApply: true
 ```
@@ -82,32 +63,25 @@ spec:
 Multi-node MoE profiling with SGLang:
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: sglang-moe
 spec:
  model: "deepseek-ai/DeepSeek-R1"
  backend: sglang
+  image: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
-  profilingConfig:
+  workload:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
+    isl: 2048
-    config:
+    osl: 512
-      sla:
-        isl: 2048
-        osl: 512
-        ttft: 300.0
-        itl: 25.0
-      hardware:
+  sla:
-        numGpusPerNode: 8
+    ttft: 300.0
-        maxNumGpusPerEngine: 32
+    itl: 25.0
-      engine:
+  hardware:
-        isMoeModel: true
+    numGpusPerNode: 8
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
  autoApply: true
 ```
@@ -125,136 +99,24 @@ kubectl create configmap deepseek-r1-config \
 ```
 ```yaml
-apiVersion: nvidia.com/v1alpha1
+apiVersion: nvidia.com/v1beta1
 kind: DynamoGraphDeploymentRequest
 metadata:
  name: deepseek-r1
 spec:
  model: deepseek-ai/DeepSeek-R1
  backend: sglang
+  image: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
-  profilingConfig:
+  workload:
-    profilerImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
+    isl: 4000
-    configMapRef:
+    osl: 500
-      name: deepseek-r1-config
-      key: disagg.yaml
-    config:
-      sla:
-        isl: 4000
-        osl: 500
-        ttft: 300
-        itl: 10
-      sweep:
-        useAiConfigurator: true
-        aicSystem: h200_sxm
-        aicHfId: deepseek-ai/DeepSeek-V3
-        aicBackendVersion: "0.20.0"
-  deploymentOverrides:
-    workersImage: "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.9.0"
-  autoApply: true
-```
-## Interactive WebUI
-Launch an interactive configuration selection interface:
-```bash
-python -m dynamo.profiler.profile_sla \
-  --backend trtllm \
-  --config path/to/disagg.yaml \
-  --pick-with-webui \
-  --use-ai-configurator \
-  --model Qwen/Qwen3-32B-FP8 \
-  --aic-system h200_sxm \
-  --ttft 200 --itl 15
-```
-The WebUI launches on port 8000 by default (configurable with `--webui-port`).
-### Features
- **Interactive Charts**: Visualize prefill TTFT, decode ITL, and GPU hours analysis with hover-to-highlight synchronization between charts and tables
- **Pareto-Optimal Analysis**: The GPU Hours table shows pareto-optimal configurations balancing latency and throughput
- **DGD Config Preview**: Click "Show Config" on any row to view the corresponding DynamoGraphDeployment YAML
- **GPU Cost Estimation**: Toggle GPU cost display to convert GPU hours to cost ($/1000 requests)
- **SLA Visualization**: Red dashed lines indicate your TTFT and ITL targets
-### Selection Methods
-1. **GPU Hours Table** (recommended): Click any row to select both prefill and decode configurations at once based on the pareto-optimal combination
-2. **Individual Selection**: Click one row in the Prefill table AND one row in the Decode table to manually choose each
-### Example DGD Config Output
-When you click "Show Config", you see a DynamoGraphDeployment configuration:
+  sla:
+    ttft: 300
+    itl: 10
-```yaml
+  autoApply: true
-# DynamoGraphDeployment Configuration
-# Prefill: 1 GPU(s), TP=1
-# Decode: 4 GPU(s), TP=4
-# Model: Qwen/Qwen3-32B-FP8
-# Backend: trtllm
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-spec:
-  services:
-    PrefillWorker:
-      subComponentType: prefill
-      replicas: 1
-      extraPodSpec:
-        mainContainer:
-          args:
-          - --tensor-parallel-size=1
-    DecodeWorker:
-      subComponentType: decode
-      replicas: 1
-      extraPodSpec:
-        mainContainer:
-          args:
-          - --tensor-parallel-size=4
-```
-Once you select a configuration, the full DGD CRD is saved as `config_with_planner.yaml`.
-## Direct Script Examples
-### Basic Profiling
-```bash
-python -m dynamo.profiler.profile_sla \
-  --backend vllm \
-  --config path/to/disagg.yaml \
-  --model meta-llama/Llama-3-8B \
-  --ttft 200 --itl 15 \
-  --isl 3000 --osl 150
-```
-### With GPU Constraints
-```bash
-python -m dynamo.profiler.profile_sla \
-  --backend sglang \
-  --config examples/backends/sglang/deploy/disagg.yaml \
-  --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
-  --ttft 200 --itl 15 \
-  --isl 3000 --osl 150 \
-  --min-num-gpus 2 \
-  --max-num-gpus 8
-```
-### AI Configurator (Offline)
-```bash
-python -m dynamo.profiler.profile_sla \
-  --backend trtllm \
-  --config path/to/disagg.yaml \
-  --use-ai-configurator \
-  --model Qwen/Qwen3-32B-FP8 \
-  --aic-system h200_sxm \
-  --ttft 200 --itl 15 \
-  --isl 4000 --osl 500
 ```
 ## SGLang Runtime Profiling

--- a/docs/pages/components/profiler/profiler-guide.md
+++ b/docs/pages/components/profiler/profiler-guide.md
@@ -12,6 +12,12 @@ The profiler accepts a `DynamoGraphDeploymentRequestSpec` (DGDR) as input and us
 ## Workflow
+A DGDR specifies:
+- **What** model you want to deploy (`model`)
+- **How** it should perform (SLA targets: `sla.ttft`, `sla.itl`)
+- **Where** it should run (optional GPU preferences via `hardware`)
+- **Which** backend to use (`backend`: auto, vllm, sglang, or trtllm)
+- **Which** image to use (`image`)
 The profiler follows this pipeline:
 ```mermaid
@@ -150,37 +156,481 @@ The profiler enforces these rules at startup:
 | SLA unachievable | Warning logged, SLA updated to best achievable value. |
 | Load-match needs more GPUs than available | Warning logged. |
-## CLI Usage
+## Support Matrix
+| Backend | Dense Models | MoE Models |
+|---------|-------------|------------|
+| vLLM | ✅ | 🚧 |
+| SGLang | ✅ | ✅ |
+| TensorRT-LLM | ✅ | 🚧 |
+The profiler sweeps over the following parallelization mappings for prefill and decode:
+| Model Architecture | Prefill Parallelization Mapping | Decode Parallelization Mapping |
+|---------|-------------|------------|
+| MLA+MoE (DeepseekV3ForCausalLM, DeepseekV32ForCausalLM) | TEP, DEP | TEP, DEP |
+| GQA+MoE (Qwen3MoeForCausalLM) | TP, TEP, DEP | TP, TEP, DEP |
+| Other Models | TP | TP |
+> [!NOTE]
+> Exact model × parallelization mapping support depends on the backend. The profiler does not guarantee that the backend supports the recommended P/D engine configuration or runs it without bugs.
+## Deployment
+### Kubernetes Deployment (DGDR)
+The recommended deployment method is through DGDRs. Sample configurations are provided in `components/src/dynamo/profiler/deploy/`:
+| Sample | Description |
+|--------|-------------|
+| `profile_sla_dgdr.yaml` | Standard online profiling with AIPerf |
+| `profile_sla_aic_dgdr.yaml` | Fast offline profiling with AI Configurator |
+| `profile_sla_moe_dgdr.yaml` | MoE model profiling (SGLang) |
+#### Container Images
+Each DGDR uses a container image for profiling and deployment:
+- **`image`** (Optional): Container image for the profiling job. Must contain the profiler code and dependencies.
+```yaml
+spec:
+  image: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+```
+#### Quick Start: Deploy with DGDR
-The profiler can be run directly for local development and testing:
+**Step 1: Create Your DGDR**
+Use a sample configuration or create your own:
+```yaml
+apiVersion: nvidia.com/v1beta1
+kind: DynamoGraphDeploymentRequest
+metadata:
+  name: my-model-profiling
+spec:
+  model: "Qwen/Qwen3-0.6B"
+  backend: vllm
+  image: "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0"
+```
+**Step 2: Apply the DGDR**
 ```bash
-python -m dynamo.profiler --config <spec.yaml>
+export NAMESPACE=your-namespace
+kubectl apply -f my-profiling-dgdr.yaml -n $NAMESPACE
 ```
-Where `<spec.yaml>` is a DGDR spec (JSON or YAML file, or inline JSON string).
+**Step 3: Monitor Progress**
-### Operational flags
+```bash
+# View status
+kubectl get dgdr -n $NAMESPACE
-| Flag | Default | Description |
+# Detailed status
-|------|---------|-------------|
+kubectl describe dgdr my-model-profiling -n $NAMESPACE
-| `--output-dir` | `profiling_results` | Directory for output files |
-| `--deployment-timeout` | `3600` | Max seconds to wait for K8s deployment readiness |
-| `--prefill-interpolation-granularity` | `16` | Number of ISL samples for prefill interpolation |
-| `--decode-interpolation-granularity` | `6` | Number of samples for decode interpolation |
-| `--dry-run` | `false` | Skip all deployments and benchmarking (dev mode) |
-### Output
+# Watch profiling job logs
+kubectl logs -f job/profile-my-model-profiling -n $NAMESPACE
+```
-The profiler writes `final_config.yaml` to the output directory. When the planner is enabled, this is a multi-document YAML containing ConfigMaps + DGD. The `profiler_status.yaml` file tracks job status (`success` / `failed`).
+**DGDR Status Phases:**
+- `Pending`: Initial state, preparing to profile
+- `Profiling`: Running profiling job (20-30 seconds for AIC, 2-4 hours for online)
+- `Ready`: Profiling complete, generated DGD spec available in status
+- `Deploying`: Generating and applying DGD configuration
+- `Deployed`: DGD successfully deployed and running
+- `Failed`: Error occurred (check events for details)
-## Support Matrix
+**Step 4: Access Your Deployment**
-| Backend | Dense Models | MoE Models |
+```bash
-|---------|-------------|------------|
+# Find the frontend service
-| vLLM | ✅ | 🚧 |
+kubectl get svc -n $NAMESPACE | grep frontend
-| SGLang | ✅ | ✅ |
-| TensorRT-LLM | ✅ | 🚧 |
+# Port-forward to access locally
+kubectl port-forward svc/<deployment>-frontend 8000:8000 -n $NAMESPACE
+# Test the endpoint
+curl http://localhost:8000/v1/models
+```
+> [!NOTE]
+> DGDRs are **immutable**. To update SLAs or configuration, delete the existing DGDR and create a new one.
+## Profiling Method
+The profiler follows a 5-step process:
+1. **Hardware Setup**: Uses defaults or user-specified hardware configuration. Optionally, cluster-scoped operators can enable automatic GPU discovery to detect specifications from cluster nodes.
+2. **Identify Sweep Ranges**: Automatically determine minimum and maximum number of GPUs per engine. Minimum is determined by the model size and GPU VRAM. Maximum is set to one node for dense models and 4 nodes for MoE models.
+3. **Parallelization Mapping Sweep**: Test performance of engines with different parallelization mappings using the input ISL and OSL.
+   - For dense models, test different TP sizes for both prefill and decode.
+   - For MoE models (SGLang), evaluate both TEP and DEP as candidates for prefill and decode.
+   - **Prefill**:
+     - TP/TEP: Measure TTFT with batch size = 1 (assuming ISL is long enough to saturate compute) without KV reuse.
+     - DEP: Attention uses data parallelism. Send a single burst with total concurrency `attention_dp_size × attn_dp_num_req_ratio` (defaults to 4) and compute the reported TTFT as `time_to_first_token.max / attn_dp_num_req_ratio` from the AIPerf summary of that burst.
+   ![Prefill Performance](../../../assets/img/h100-prefill-performance.png)
+   - **Decode**: Measure the ITL under different numbers of in-flight requests, from 1 to the maximum the KV cache can hold. To measure ITL without being affected by piggy-backed prefill requests, the script enables KV-reuse and warms up the engine by issuing the same prompts before measuring.
+   ![Decode Performance](../../../assets/img/h100-decode-performance.png)
+4. **Recommendation**: Select optimal parallelization mapping for prefill and decode that achieves the highest per-GPU throughput while adhering to the SLA on TTFT and ITL.
+5. **In-Depth Profiling on the Recommended P/D Engine**: Interpolate TTFT with ISL and ITL with active KV cache and decode context length for more accurate performance estimation.
+![ITL Interpolation](../../../assets/img/pd-interpolation.png)
+   - **Prefill**: Measures TTFT and throughput per GPU across different input lengths with batch size=1.
+   - **Decode**: Measures ITL and throughput per GPU under various KV cache loads and decode context lengths.
+### AIPerf on Real Engines
+Profiles your model by creating real test deployments in Kubernetes and measuring their performance.
+- **Duration**: 2-4 hours
+- **Accuracy**: Highest (real measurements)
+- **GPU Requirements**: Full access to test different parallelization mappings
+- **Backends**: vLLM, SGLang, TensorRT-LLM
+Use `searchStrategy: thorough` for comprehensive real-engine profiling with AIPerf (the default strategy is `rapid`):
+```yaml
+spec:
+  searchStrategy: thorough  # Deep exploration with real engine profiling
+```
+### AI Configurator Simulation
+Uses performance simulation to rapidly estimate optimal configurations without running real deployments.
+- **Duration**: 20-30 seconds
+- **Accuracy**: Estimated (may have errors for unusual configurations)
+- **GPU Requirements**: None
+- **Backends**: TensorRT-LLM only (vLLM/SGLang coming soon)
+AI Configurator is used by default with `searchStrategy: rapid`:
+```yaml
+spec:
+  searchStrategy: rapid  # Fast profiling with AI Configurator simulation (default)
+```
+> [!NOTE]
+> `aicBackendVersion` specifies the TensorRT-LLM version that AI Configurator simulates. See the [AI Configurator supported features](https://github.com/ai-dynamo/aiconfigurator#supported-features) for available versions.
+**Currently supports:**
+- **Backends**: TensorRT-LLM (versions 0.20.0, 1.0.0rc3, 1.0.0rc6)
+- **Systems**: H100 SXM, H200 SXM, B200 SXM, GB200 SXM, A100 SXM
+- **Models**: Wide range including GPT, Llama, Mixtral, DeepSeek, Qwen, and more
+See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features) for the full list.
+### Automatic GPU Discovery
+The operator automatically discovers GPU resources from cluster nodes, providing hardware info (GPU model, VRAM, GPUs per node) and automatic profiling search space calculation.
+**Requirements:**
+- **Cluster-scoped operators**: Have node read permissions by default
+- **Namespace-scoped operators**: GPU discovery is enabled by default when installing via Helm — the chart provisions the required ClusterRole/ClusterRoleBinding automatically
+**For namespace-scoped operators**, GPU discovery is controlled by a Helm value:
+```bash
+# GPU discovery enabled (default) — Helm provisions read-only node access automatically
+helm install dynamo-platform ... --set dynamo-operator.gpuDiscovery.enabled=true
+# GPU discovery disabled — you must provide hardware config manually in each DGDR
+helm install dynamo-platform ... --set dynamo-operator.gpuDiscovery.enabled=false
+```
+If GPU discovery is disabled, provide hardware config manually in the DGDR:
+```yaml
+spec:
+  hardware:
+    numGpusPerNode: 8
+    gpuSku: "H100-SXM5-80GB"
+    vramMb: 81920
+```
+If GPU discovery is disabled and no manual hardware config is provided, the DGDR will be rejected at admission time.
+## Configuration
+### DGDR Configuration Structure
+All profiler configuration is provided through the v1beta1 DGDR spec fields:
+```yaml
+apiVersion: nvidia.com/v1beta1
+kind: DynamoGraphDeploymentRequest
+metadata:
+  name: my-deployment
+spec:
+  model: "Qwen/Qwen3-0.6B"
+  backend: vllm
+  image: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.9.0"
+  searchStrategy: rapid  # or thorough
+  autoApply: true
+  workload: { ... }
+  sla: { ... }
+  hardware: { ... }
+  features: { ... }
+  overrides: { ... }
+```
+### SLA Configuration (Optional)
+```yaml
+workload:
+  isl: 3000      # Average input sequence length (tokens)
+  osl: 150       # Average output sequence length (tokens)
+sla:
+  ttft: 200.0    # Target Time To First Token (milliseconds)
+  itl: 20.0      # Target Inter-Token Latency (milliseconds)
+```
+- **ISL/OSL**: Based on your expected traffic patterns
+- **TTFT**: First token latency target (lower = more GPUs needed, affects prefill engine)
+- **ITL**: Token generation latency target (lower = more GPUs needed, affects decode engine)
+- **Trade-offs**: Tighter SLAs require more GPU resources
+### Hardware Configuration (Optional)
+```yaml
+hardware:
+  gpuSku: h200_sxm            # GPU SKU identifier (auto-detected)
+  vramMb: 81920               # VRAM per GPU in MiB
+  totalGpus: 16               # Total GPUs available in the cluster
+  numGpusPerNode: 8           # GPUs per node (for multi-node MoE)
+```
+- **numGpusPerNode**: Determines the upper bound of GPUs per node for dense models and configures Grove for multi-node MoE engines
+- **gpuSku**: GPU SKU identifier, auto-detected by the controller
+> [!TIP]
+> If you don't specify hardware constraints, the controller auto-detects them based on your model size and available cluster resources.
+### Search Strategy (Optional)
+Controls the profiling search depth:
+```yaml
+spec:
+  searchStrategy: rapid   # "rapid" (default) for fast sweep; "thorough" for deeper exploration
+```
+- **rapid**: Performs a fast sweep over parallelization mappings (default)
+- **thorough**: Explores more configurations for potentially better results
+### Planner Configuration (Optional)
+Pass arguments to the SLA planner via the features section:
+```yaml
+features:
+  planner:
+    planner_min_endpoint: 2                    # Minimum endpoints to maintain
+    planner_adjustment_interval: 60            # Adjustment interval (seconds)
+    planner_load_predictor: linear             # Load prediction method
+```
+> [!NOTE]
+> Planner arguments use `planner_` prefix. See [SLA Planner documentation](../planner/planner-guide.md) for full list.
+### Model Cache PVC (Advanced)
+For large models, use a pre-populated PVC containing model weights instead of downloading from HuggingFace:
+```yaml
+modelCache:
+  pvcName: "model-cache"
+  pvcModelPath: "hub/models--deepseek-ai--DeepSeek-R1"
+  pvcMountPath: "/opt/model-cache"
+```
+Requirements:
+- The PVC must exist in the same namespace as the DGDR
+- The model weights must be accessible at `{pvcMountPath}/{pvcModelPath}`
+### Engine Configuration (Auto-configured)
+The controller automatically handles model and backend configuration from high-level fields:
+```yaml
+# You specify:
+spec:
+  model: "Qwen/Qwen3-0.6B"
+  backend: vllm
+# Controller auto-injects into the profiling job
+```
+You should **not** manually set model or backend in profiling config overrides.
+### Using Existing DGD Configs
+Provide a base DGD config via the overrides section:
+```yaml
+overrides:
+  dgd:
+    apiVersion: nvidia.com/v1alpha1
+    kind: DynamoGraphDeployment
+    metadata:
+      name: my-dgd
+    spec:
+      # ... your base DGD spec
+```
+The profiler uses the DGD config as a **base template**, then optimizes it based on your SLA targets.
+## Integration
+### With SLA Planner
+The Profiler generates interpolation data that the SLA Planner uses for autoscaling decisions.
+**Prefill Interpolation** (`selected_prefill_interpolation/raw_data.npz`):
+- `prefill_isl`: 1D array of input sequence lengths tested
+- `prefill_ttft`: 1D array of TTFTs (ms) at each ISL
+- `prefill_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each ISL
+**Decode Interpolation** (`selected_decode_interpolation/raw_data.npz`):
+- `max_kv_tokens`: Total KV tokens capacity in decode engine
+- `x_kv_usage`: 1D array of active KV usage fractions in [0, 1]
+- `y_context_length`: 1D array of average context lengths tested
+- `z_itl`: 1D array of ITLs (ms) at each (KV usage, context length) point
+- `z_thpt_per_gpu`: 1D array of throughput (tokens/s/GPU) at each point
+### With Dynamo Operator
+When using DGDR, the Dynamo Operator:
+1. Creates profiling jobs automatically
+2. Stores profiling data in ConfigMaps (`planner-profile-data`)
+3. Generates optimized DGD configurations
+4. Deploys the DGD with SLA Planner integration
+The generated DGD is tracked via labels:
+```yaml
+metadata:
+  labels:
+    dgdr.nvidia.com/name: my-deployment
+    dgdr.nvidia.com/namespace: your-namespace
+```
+### With Observability
+Monitor profiling jobs:
+```bash
+kubectl logs -f job/profile-<dgdr-name> -n $NAMESPACE
+kubectl describe dgdr <name> -n $NAMESPACE
+```
+## Advanced Topics
+### Manual Deployment Control
+Disable auto-deployment to review the generated DGD before applying:
+```yaml
+spec:
+  autoApply: false
+```
+Then manually extract and apply:
+```bash
+# Extract generated DGD from DGDR status
+kubectl get dgdr my-deployment -n $NAMESPACE -o jsonpath='{.status.profilingResults.selectedConfig}' | kubectl apply -f -
+# Or save to file for review
+kubectl get dgdr my-deployment -n $NAMESPACE -o jsonpath='{.status.profilingResults.selectedConfig}' > my-dgd.yaml
+```
+### Mocker Deployment
+Deploy a mocker deployment that simulates engines without GPUs:
+```yaml
+spec:
+  model: <model-name>
+  backend: trtllm
+  features:
+    mocker:
+      enabled: true    # Deploy mocker instead of real backend
+  autoApply: true
+```
+Profiling still runs against the real backend to collect performance data. The mocker uses this data to simulate realistic timing behavior. Useful for large-scale experiments, testing Planner behavior, and validating configurations.
+### Accessing Profiling Artifacts
+By default, profiling data is stored in ConfigMaps. For detailed artifacts (plots, logs, raw data), attach a PVC via overrides:
+```yaml
+overrides:
+  profilingJob:
+    template:
+      spec:
+        volumes:
+        - name: profiling-output
+          persistentVolumeClaim:
+            claimName: "dynamo-pvc"
+```
+**ConfigMaps (always created):**
+- `dgdr-output-<name>`: Generated DGD configuration
+- `planner-profile-data`: Profiling data for Planner (JSON)
+**PVC artifacts (optional):**
+- Performance plots (PNGs)
+- DGD configurations for each profiled deployment
+- AIPerf profiling artifacts
+- Raw profiling data (`.npz` files)
+- Profiler logs
+Access PVC results:
+```bash
+kubectl apply -f deploy/utils/manifests/pvc-access-pod.yaml -n $NAMESPACE
+kubectl wait --for=condition=Ready pod/pvc-access-pod -n $NAMESPACE --timeout=60s
+kubectl cp $NAMESPACE/pvc-access-pod:/data ./profiling-results
+kubectl delete pod pvc-access-pod -n $NAMESPACE
+```
+### Output Performance Plots
+The profiler generates plots to visualize performance data:
+**Parallelization Mapping Sweep Plots:**
+- `prefill_performance.png`: TTFT vs Parallelization Mapping size
+- `decode_performance.png`: ITL vs Parallelization Mapping size and in-flight requests
+**In-Depth Profiling Plots:**
+- `selected_prefill_interpolation/prefill_ttft_interpolation.png`: TTFT vs ISL
+- `selected_prefill_interpolation/prefill_throughput_interpolation.png`: Throughput vs ISL
+- `selected_decode_interpolation/decode_itl_interplation.png`: ITL vs KV usage and context length
+- `selected_decode_interpolation/decode_throughput_interpolation.png`: Throughput vs KV usage and context length
+## Runtime Profiling (SGLang)
+SGLang workers expose profiling endpoints for runtime performance analysis:
+```bash
+# Start profiling
+curl -X POST http://localhost:9090/engine/start_profile \
+  -H "Content-Type: application/json" \
+  -d '{"output_dir": "/tmp/profiler_output"}'
+# Run inference requests...
+# Stop profiling
+curl -X POST http://localhost:9090/engine/stop_profile
+```
+View traces using Chrome's `chrome://tracing`, [Perfetto UI](https://ui.perfetto.dev/), or TensorBoard.
 ## Troubleshooting

--- a/docs/pages/kubernetes/api-reference.md
+++ b/docs/pages/kubernetes/api-reference.md
@@ -1355,7 +1355,7 @@ _Appears in:_
 | `profilingPhase` _[ProfilingPhase](#profilingphase)_ | ProfilingPhase indicates the current sub-phase of the profiling pipeline.<br />Only meaningful when Phase is "Profiling". Cleared when profiling completes or fails. |  | Enum: [Initializing SweepingPrefill SweepingDecode SelectingConfig BuildingCurves GeneratingDGD Done] <br />Optional: \{\} <br /> |
 | `dgdName` _string_ | DGDName is the name of the generated or created DynamoGraphDeployment. |  | Optional: \{\} <br /> |
 | `profilingJobName` _string_ | ProfilingJobName is the name of the Kubernetes Job running the profiler. |  | Optional: \{\} <br /> |
-| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Validated, ProfilingComplete, DeploymentReady. |  | Optional: \{\} <br /> |
+| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#condition-v1-meta) array_ | Conditions contains the latest observed conditions of the deployment request.<br />Standard condition types include: Succeeded, Validation, Profiling, SpecGenerated, DeploymentReady. |  | Optional: \{\} <br /> |
 | `profilingResults` _[ProfilingResultsStatus](#profilingresultsstatus)_ | ProfilingResults contains the output of the profiling process including<br />Pareto-optimal configurations and the selected deployment configuration. |  | Optional: \{\} <br /> |
 | `deploymentInfo` _[DeploymentInfoStatus](#deploymentinfostatus)_ | DeploymentInfo tracks the state of the deployed DynamoGraphDeployment.<br />Populated when a DGD has been created (either via autoApply or manually). |  | Optional: \{\} <br /> |
 | `observedGeneration` _integer_ | ObservedGeneration is the most recent generation observed by the controller. |  | Optional: \{\} <br /> |

--- a/docs/pages/kubernetes/installation-guide.md
+++ b/docs/pages/kubernetes/installation-guide.md
@@ -203,12 +203,10 @@ When GPU discovery is disabled, you must provide hardware configuration manually
 ```yaml
 spec:
-  profilingConfig:
+  hardware:
-    config:
+    numGpusPerNode: 8
-      hardware:
+    gpuSku: "H100-SXM5-80GB"
-        numGpusPerNode: 8
+    vramMb: 81920
-        gpuModel: "H100-SXM5-80GB"
-        gpuVramMib: 81920
 ```
 > **Note**: If GPU discovery is disabled and no hardware config is provided, the DGDR will be rejected at admission time with a clear error message.

--- a/docs/pages/kubernetes/observability/operator-metrics.md
+++ b/docs/pages/kubernetes/observability/operator-metrics.md
@@ -91,7 +91,7 @@ All metrics use the `dynamo_operator` namespace prefix.
  - `"not_ready"` - Resource exists but is not operational (DCD, DM, DGDSA)
  - `"unknown"` - State cannot be determined (default for empty status)
  - DGD uses: `"pending"`, `"successful"`, `"failed"` from `.status.state`
-  - DGDR uses: `"Pending"`, `"Profiling"`, `"Deploying"`, `"Ready"`, `"DeploymentDeleted"`, `"Failed"` from `.status.state`
+  - DGDR uses: `"Pending"`, `"Profiling"`, `"Ready"`, `"Deploying"`, `"Deployed"`, `"Failed"` from `.status.phase`
 ## Example Queries

--- a/docs/pages/templates/backend-readme.md
+++ b/docs/pages/templates/backend-readme.md
@@ -38,7 +38,7 @@ title: Backend README
 ### Kubernetes
 ```yaml
-# Add DGDR example - use apiVersion: nvidia.com/v1alpha1
+# Add DGDR example - use apiVersion: nvidia.com/v1beta1
 # See recipes/ folder for production examples
 ```

--- a/docs/pages/templates/component-readme.md
+++ b/docs/pages/templates/component-readme.md
@@ -31,9 +31,9 @@ title: Component README
 ### Kubernetes
 ```yaml
-# Add DGDR example - use apiVersion: nvidia.com/v1alpha1
+# Add DGDR example - use apiVersion: nvidia.com/v1beta1
 # Example pattern (from Router):
-# apiVersion: nvidia.com/v1alpha1
+# apiVersion: nvidia.com/v1beta1
 # kind: DynamoGraphDeployment
 # metadata:
 #   name: <component>-deployment