feat: add DGDR custom resource (#3489)

Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
Signed-off-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

feat: add DGDR custom resource (#3489)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
Signed-off-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: Hannah Zhang <hannahz@nvidia.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
57cdb9a1 · Julien Mancuso · GitHub · 66fd6f84 · 57cdb9a1 · 57cdb9a1
Unverified Commit 57cdb9a1 authored Oct 17, 2025 by Julien Mancuso Committed by GitHub Oct 17, 2025
20 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -596,10 +596,11 @@ async def run_profile(args):
            try:
                await client.wait_for_deployment_ready()
                logger.info("Deployment is ready")
+
                skip_profile = False
            except TimeoutError:
                logger.error(
-                    "Deployment failed to become ready within timeout, skipping profiling"
+                    "Deployment or model failed to become ready within timeout, skipping profiling"
                )
                skip_profile = True


--- a/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeploymentrequests.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.16.4
+    helm.sh/resource-policy: keep
+  name: dynamographdeploymentrequests.nvidia.com
+spec:
+  group: nvidia.com
+  names:
+    kind: DynamoGraphDeploymentRequest
+    listKind: DynamoGraphDeploymentRequestList
+    plural: dynamographdeploymentrequests
+    shortNames:
+      - dgdr
+    singular: dynamographdeploymentrequest
+  scope: Namespaced
+  versions:
+    - additionalPrinterColumns:
+        - jsonPath: .spec.modelName
+          name: Model
+          type: string
+        - jsonPath: .spec.backend
+          name: Backend
+          type: string
+        - jsonPath: .status.state
+          name: State
+          type: string
+        - jsonPath: .status.deployment.state
+          name: DGD-State
+          type: string
+        - jsonPath: .metadata.creationTimestamp
+          name: Age
+          type: date
+      name: v1alpha1
+      schema:
+        openAPIV3Schema:
+          description: |-
+            DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
+            It serves as the primary interface for users to request model deployments with
+            specific performance and resource constraints, enabling SLA-driven deployments.
+
+            Lifecycle:
+             1. Initial → Pending: Validates spec and prepares for profiling
+             2. Pending → Profiling: Creates and runs profiling job (online or AIC)
+             3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
+             4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
+             5. Ready: Terminal state when DGD is operational or spec is available
+             6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
+
+            The spec becomes immutable once profiling starts. Users must delete and recreate
+            the DGDR to modify configuration after this point.
+          properties:
+            apiVersion:
+              description: |-
+                APIVersion defines the versioned schema of this representation of an object.
+                Servers should convert recognized schemas to the latest internal value, and
+                may reject unrecognized values.
+                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+              type: string
+            kind:
+              description: |-
+                Kind is a string value representing the REST resource this object represents.
+                Servers may infer this from the endpoint the client submits requests to.
+                Cannot be updated.
+                In CamelCase.
+                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+              type: string
+            metadata:
+              type: object
+            spec:
+              description: Spec defines the desired state for this deployment request.
+              properties:
+                autoApply:
+                  default: false
+                  description: |-
+                    AutoApply indicates whether to automatically create a DynamoGraphDeployment
+                    after profiling completes. If false, only the spec is generated and stored in status.
+                    Users can then manually create a DGD using the generated spec.
+                  type: boolean
+                backend:
+                  default: trtllm
+                  description: |-
+                    Backend specifies the inference backend framework to use.
+                    Supported values are: "vllm", "sglang", "trtllm".
+                  enum:
+                    - vllm
+                    - sglang
+                    - trtllm
+                  type: string
+                deploymentOverrides:
+                  description: |-
+                    DeploymentOverrides allows customizing metadata for the auto-created DGD.
+                    Only applicable when AutoApply is true.
+                  properties:
+                    annotations:
+                      additionalProperties:
+                        type: string
+                      description: Annotations are additional annotations to add to the DynamoGraphDeployment metadata.
+                      type: object
+                    labels:
+                      additionalProperties:
+                        type: string
+                      description: |-
+                        Labels are additional labels to add to the DynamoGraphDeployment metadata.
+                        These are merged with auto-generated labels from the profiling process.
+                      type: object
+                    name:
+                      description: |-
+                        Name is the desired name for the created DynamoGraphDeployment.
+                        If not specified, defaults to the DGDR name.
+                      type: string
+                    namespace:
+                      description: |-
+                        Namespace is the desired namespace for the created DynamoGraphDeployment.
+                        If not specified, defaults to the DGDR namespace.
+                      type: string
+                  type: object
+                gpu:
+                  description: |-
+                    GPU defines optional GPU type and resource specifications.
+                    These constraints guide the profiler to find configurations within specified bounds.
+                  properties:
+                    maxNumGPUsPerEngine:
+                      default: 8
+                      description: |-
+                        MaxNumGPUsPerEngine specifies the maximum number of GPUs per engine for profiling.
+                        The profiler will not consider configurations with more GPUs than this value.
+                      minimum: 1
+                      type: integer
+                    minNumGPUsPerEngine:
+                      default: 1
+                      description: |-
+                        MinNumGPUsPerEngine specifies the minimum number of GPUs per engine for profiling.
+                        The profiler will not consider configurations with fewer GPUs than this value.
+                      minimum: 1
+                      type: integer
+                    type:
+                      description: |-
+                        Type specifies the GPU type to target (e.g., "h200", "h100", "a100").
+                        If specified, profiling will focus on configurations optimized for this GPU type.
+                      type: string
+                  type: object
+                modelName:
+                  description: |-
+                    ModelName specifies the model to deploy (e.g., "meta/llama3-70b").
+                    This should be a valid model identifier that the profiler can resolve.
+                  type: string
+                online:
+                  default: false
+                  description: |-
+                    Online indicates whether to use online profiler (true) or AI Configurator (false).
+                    Online profiling uses real deployments for accurate measurements (2-4 hours).
+                    Offline profiling uses AI Configurator for fast simulation-based profiling (20-30 seconds).
+                  type: boolean
+                profilingConfig:
+                  description: |-
+                    ProfilingConfig provides custom configuration for the profiling job.
+                    Applicable to both online and offline (AIC) profiling modes.
+                  properties:
+                    configMapRef:
+                      description: |-
+                        ConfigMapRef is a reference to a ConfigMap containing profiling configuration.
+                        The ConfigMap should contain a key (default: "disagg.yaml") with the configuration file.
+                        This configuration is used by both online and offline (AIC) profiling modes.
+                      properties:
+                        key:
+                          default: disagg.yaml
+                          description: Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml".
+                          type: string
+                        name:
+                          description: Name of the ConfigMap containing the desired data.
+                          type: string
+                      required:
+                        - name
+                      type: object
+                  type: object
+                sla:
+                  description: |-
+                    SLA defines the Service Level Agreement profiling targets.
+                    The profiler uses these targets to find an optimal deployment configuration.
+                  properties:
+                    isl:
+                      default: 3000
+                      description: |-
+                        ISL is the Input Sequence Length for profiling.
+                        Defines the length of input sequences to use during profiling tests.
+                      minimum: 1
+                      type: integer
+                    itl:
+                      default: 10
+                      description: |-
+                        ITL is the target Inter-Token Latency in milliseconds.
+                        This represents the maximum time allowed between consecutive tokens in the output.
+                      type: integer
+                    osl:
+                      default: 500
+                      description: |-
+                        OSL is the Output Sequence Length for profiling.
+                        Defines the expected length of output sequences to generate during profiling tests.
+                      minimum: 1
+                      type: integer
+                    ttft:
+                      default: 50
+                      description: |-
+                        TTFT is the target Time To First Token in milliseconds.
+                        This represents the maximum time allowed from request submission to receiving the first token.
+                      type: integer
+                  type: object
+              required:
+                - modelName
+                - sla
+              type: object
+            status:
+              description: Status reflects the current observed state of this deployment request.
+              properties:
+                conditions:
+                  description: |-
+                    Conditions contains the latest observed conditions of the deployment request.
+                    Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.
+                    Conditions are merged by type on patch updates.
+                  items:
+                    description: Condition contains details for one aspect of the current state of this API Resource.
+                    properties:
+                      lastTransitionTime:
+                        description: |-
+                          lastTransitionTime is the last time the condition transitioned from one status to another.
+                          This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
+                        format: date-time
+                        type: string
+                      message:
+                        description: |-
+                          message is a human readable message indicating details about the transition.
+                          This may be an empty string.
+                        maxLength: 32768
+                        type: string
+                      observedGeneration:
+                        description: |-
+                          observedGeneration represents the .metadata.generation that the condition was set based upon.
+                          For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                          with respect to the current state of the instance.
+                        format: int64
+                        minimum: 0
+                        type: integer
+                      reason:
+                        description: |-
+                          reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                          Producers of specific condition types may define expected values and meanings for this field,
+                          and whether the values are considered a guaranteed API.
+                          The value should be a CamelCase string.
+                          This field may not be empty.
+                        maxLength: 1024
+                        minLength: 1
+                        pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                        type: string
+                      status:
+                        description: status of the condition, one of True, False, Unknown.
+                        enum:
+                          - "True"
+                          - "False"
+                          - Unknown
+                        type: string
+                      type:
+                        description: type of condition in CamelCase or in foo.example.com/CamelCase.
+                        maxLength: 316
+                        pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                        type: string
+                    required:
+                      - lastTransitionTime
+                      - message
+                      - reason
+                      - status
+                      - type
+                    type: object
+                  type: array
+                deployment:
+                  description: |-
+                    Deployment tracks the auto-created DGD when AutoApply is true.
+                    Contains name, namespace, state, and creation status of the managed DGD.
+                  properties:
+                    created:
+                      description: |-
+                        Created indicates whether the DGD has been successfully created.
+                        Used to prevent recreation if the DGD is manually deleted by users.
+                      type: boolean
+                    name:
+                      description: Name is the name of the created DynamoGraphDeployment.
+                      type: string
+                    namespace:
+                      description: Namespace is the namespace of the created DynamoGraphDeployment.
+                      type: string
+                    state:
+                      description: |-
+                        State is the current state of the DynamoGraphDeployment.
+                        This value is mirrored from the DGD's status.state field.
+                      type: string
+                  type: object
+                generatedDeployment:
+                  description: |-
+                    GeneratedDeployment contains the full generated DynamoGraphDeployment specification
+                    including metadata, based on profiling results. Users can extract this to create
+                    a DGD manually, or it's used automatically when autoApply is true.
+                    Stored as RawExtension to preserve all fields including metadata.
+                  type: object
+                  x-kubernetes-embedded-resource: true
+                  x-kubernetes-preserve-unknown-fields: true
+                observedGeneration:
+                  description: |-
+                    ObservedGeneration reflects the generation of the most recently observed spec.
+                    Used to detect spec changes and enforce immutability after profiling starts.
+                  format: int64
+                  type: integer
+                profilingResults:
+                  description: |-
+                    ProfilingResults contains a reference to the ConfigMap holding profiling data.
+                    Format: "configmap/<name>"
+                  type: string
+                state:
+                  description: |-
+                    State is a high-level textual status of the deployment request lifecycle.
+                    Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
+                    Empty string ("") represents the initial state before initialization.
+                  type: string
+              type: object
+          type: object
+      served: true
+      storage: true
+      subresources:
+        status: {}
--- a/deploy/cloud/helm/platform/README.md
+++ b/deploy/cloud/helm/platform/README.md
@@ -132,6 +132,7 @@ The chart includes built-in validation to prevent all operator conflicts:
 | dynamo-operator.dynamo.metrics.prometheusEndpoint | string | `""` | Endpoint that services can use to retrieve metrics. If set, dynamo operator will automatically inject the PROMETHEUS_ENDPOINT environment variable into services it manages. Users can override the value of the PROMETHEUS_ENDPOINT environment variable by modifying the corresponding deployment's environment variables |
 | dynamo-operator.dynamo.mpiRun.secretName | string | `"mpi-run-ssh-secret"` | Name of the secret containing the SSH key for MPI Run |
 | dynamo-operator.dynamo.mpiRun.sshKeygen.enabled | bool | `true` | Whether to enable SSH key generation for MPI Run |
+| dynamo-operator.dynamo.dgdr.profilerImage | string | `""` | Container image to use for profiling jobs (both online and offline/AIC). Must be set to create DynamoGraphDeploymentRequests |
 | grove.enabled | bool | `false` | Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide |
 | kai-scheduler.enabled | bool | `false` | Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide |
 | etcd.enabled | bool | `true` | Whether to enable etcd deployment, disable if you want to use an external etcd instance. For complete configuration options, see: https://github.com/bitnami/charts/tree/main/bitnami/etcd , all etcd settings should be prefixed with "etcd." |

--- a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml
+++ b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml
@@ -124,7 +124,11 @@ spec:
          - --mpi-run-ssh-secret-name={{ .Values.dynamo.mpiRun.secretName }}
          - --mpi-run-ssh-secret-namespace={{ .Release.Namespace }}
        {{- end }}
+        {{- if .Values.dynamo.dgdr.profilerImage }}
+          - --profiler-image={{ .Values.dynamo.dgdr.profilerImage }}
+        {{- end }}
        {{- if not .Values.namespaceRestriction.enabled }}
+          - --dgdr-profiling-cluster-role-name={{ include "dynamo-operator.fullname" . }}-dgdr-profiling
          - --planner-cluster-role-name={{ include "dynamo-operator.fullname" . }}-planner
        {{- end }}
        command:

--- a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml
+++ b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml
@@ -359,6 +359,7 @@ rules:
  - nvidia.com
  resources:
  - dynamocomponentdeployments
+  - dynamographdeploymentrequests
  - dynamographdeployments
  verbs:
  - create
@@ -372,6 +373,7 @@ rules:
  - nvidia.com
  resources:
  - dynamocomponentdeployments/finalizers
+  - dynamographdeploymentrequests/finalizers
  - dynamographdeployments/finalizers
  verbs:
  - update
@@ -379,6 +381,7 @@ rules:
  - nvidia.com
  resources:
  - dynamocomponentdeployments/status
+  - dynamographdeploymentrequests/status
  - dynamographdeployments/status
  verbs:
  - get

--- a/deploy/cloud/helm/platform/components/operator/templates/profiling-job-rbac.yaml
+++ b/deploy/cloud/helm/platform/components/operator/templates/profiling-job-rbac.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+{{- if .Values.namespaceRestriction.enabled }}
+# Namespace-restricted mode: Role + ServiceAccount + RoleBinding
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: dgdr-profiling-job
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
+    app.kubernetes.io/component: dgdr-profiling
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: dgdr-profiling-job
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
+    app.kubernetes.io/component: dgdr-profiling
+rules:
+# ConfigMaps - needed for saving profiling results
+- apiGroups: [""]
+  resources: ["configmaps"]
+  verbs: ["create", "get", "update", "patch", "delete"]
+# DynamoGraphDeploymentRequests - needed to get DGDR info
+- apiGroups: ["nvidia.com"]
+  resources: ["dynamographdeploymentrequests"]
+  verbs: ["get"]
+# DynamoGraphDeployments - needed for online profiling to create test deployments
+# The operator will handle creating the actual pods, services, and deployments
+- apiGroups: ["nvidia.com"]
+  resources: ["dynamographdeployments"]
+  verbs: ["get", "create", "delete", "list", "watch"]
+# Pods - needed for listing pods by label selector and getting logs from test deployments
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["list", "get"]
+- apiGroups: [""]
+  resources: ["pods/log"]
+  verbs: ["get"]
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: dgdr-profiling-job
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
+    app.kubernetes.io/component: dgdr-profiling
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: dgdr-profiling-job
+subjects:
+- kind: ServiceAccount
+  name: dgdr-profiling-job
+  namespace: {{ .Release.Namespace }}
+{{- else }}
+# Cluster-wide mode: ClusterRole for DGDR profiling jobs
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "dynamo-operator.fullname" . }}-dgdr-profiling
+  labels:
+    {{- include "dynamo-operator.labels" . | nindent 4 }}
+    app.kubernetes.io/component: dgdr-profiling
+rules:
+# ConfigMaps - needed for saving profiling results
+- apiGroups: [""]
+  resources: ["configmaps"]
+  verbs: ["create", "get", "update", "patch", "delete"]
+# DynamoGraphDeploymentRequests - needed to get DGDR info
+- apiGroups: ["nvidia.com"]
+  resources: ["dynamographdeploymentrequests"]
+  verbs: ["get"]
+# DynamoGraphDeployments - needed for online profiling to create test deployments
+# The operator will handle creating the actual pods, services, and deployments
+- apiGroups: ["nvidia.com"]
+  resources: ["dynamographdeployments"]
+  verbs: ["get", "create", "delete", "list", "watch"]
+# Pods - needed for listing pods by label selector and getting logs from test deployments
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["list", "get"]
+- apiGroups: [""]
+  resources: ["pods/log"]
+  verbs: ["get"]
+{{- end }}
+# End of DGDR profiling job RBAC definitions.
--- a/deploy/cloud/helm/platform/components/operator/values.yaml
+++ b/deploy/cloud/helm/platform/components/operator/values.yaml
@@ -117,6 +117,15 @@ dynamo:
    sshKeygen:
      enabled: true

+  # DynamoGraphDeploymentRequest (DGDR) configuration
+  dgdr:
+    # Container image to use for profiling jobs (both online and offline/AIC)
+    # REQUIRED: Must be set to create DynamoGraphDeploymentRequests
+    # For development: Build and push the profiler image from the ai-dynamo repository
+    # Public image will be available in release 0.6.1
+    # Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
+    profilerImage: ""
+

 #imagePullSecrets: []
 kubernetesClusterDomain: cluster.local

--- a/deploy/cloud/helm/platform/values.yaml
+++ b/deploy/cloud/helm/platform/values.yaml
@@ -135,6 +135,15 @@ dynamo-operator:
        # -- Whether to enable SSH key generation for MPI Run
        enabled: true

+    # DynamoGraphDeploymentRequest (DGDR) configuration
+    dgdr:
+      # -- Container image to use for profiling jobs (both online and offline/AIC)
+      # REQUIRED: Must be set to create DynamoGraphDeploymentRequests
+      # For development: Build and push the profiler image from the ai-dynamo repository
+      # Public image will be available in release 0.6.1
+      # Example: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"
+      profilerImage: ""
+

 # Grove component - distributed inference orchestration
 grove:

--- a/deploy/cloud/operator/README.md
+++ b/deploy/cloud/operator/README.md
@@ -5,10 +5,19 @@ A Kubernetes Operator to manage all Dynamo pipelines using custom resources.

 ## Overview

-This operator automates the deployment and lifecycle management of `DynamoGraphDeployment` resources in Kubernetes clusters.
+This operator automates the deployment and lifecycle management of Dynamo resources in Kubernetes clusters:
+
+- **DynamoGraphDeploymentRequest (DGDR)** - Simplified SLA-driven deployment interface
+- **DynamoGraphDeployment (DGD)** - Direct deployment configuration

 Built with [Kubebuilder](https://book.kubebuilder.io/), it follows Kubernetes best practices and supports declarative configuration through CustomResourceDefinitions (CRDs).

+### Custom Resources
+
+- **DynamoGraphDeploymentRequest**: High-level interface for SLA-driven configuration generation. Automatically handles profiling and generates an optimized DGD spec based on your performance requirements.
+- **DynamoGraphDeployment**: Lower-level interface for direct deployment configuration with full control over all parameters.
+
+
 ## Developer guide

 ### Pre-requisites

--- a/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
+++ b/deploy/cloud/operator/api/v1alpha1/dynamographdeploymentrequest_types.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API group.
+
+This package defines the DynamoGraphDeploymentRequest (DGDR) custom resource, which provides
+a high-level, SLA-driven interface for deploying machine learning models on Dynamo.
+*/
+package v1alpha1
+
+import (
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	runtime "k8s.io/apimachinery/pkg/runtime"
+)
+
+// EDIT THIS FILE!  THIS IS SCAFFOLDING FOR YOU TO OWN!
+// NOTE: json tags are required.  Any new fields you add must have json tags for the fields to be serialized.
+
+// SLASpec defines Service Level Agreement targets for model profiling and deployment.
+// These targets guide the profiling process to find optimal deployment configurations
+// that meet the specified performance requirements.
+type SLASpec struct {
+	// ITL is the target Inter-Token Latency in milliseconds.
+	// This represents the maximum time allowed between consecutive tokens in the output.
+	// Must be at least 1; a zero or negative latency target is not meaningful.
+	// +kubebuilder:default=10
+	// +kubebuilder:validation:Minimum=1
+	// +optional
+	ITL int `json:"itl,omitempty"`
+
+	// TTFT is the target Time To First Token in milliseconds.
+	// This represents the maximum time allowed from request submission to receiving the first token.
+	// Must be at least 1; a zero or negative latency target is not meaningful.
+	// +kubebuilder:default=50
+	// +kubebuilder:validation:Minimum=1
+	// +optional
+	TTFT int `json:"ttft,omitempty"`
+
+	// ISL is the Input Sequence Length for profiling.
+	// Defines the length of input sequences to use during profiling tests.
+	// +kubebuilder:default=3000
+	// +kubebuilder:validation:Minimum=1
+	// +optional
+	ISL int `json:"isl,omitempty"`
+
+	// OSL is the Output Sequence Length for profiling.
+	// Defines the expected length of output sequences to generate during profiling tests.
+	// +kubebuilder:default=500
+	// +kubebuilder:validation:Minimum=1
+	// +optional
+	OSL int `json:"osl,omitempty"`
+}
+
+// GPUSpec defines optional GPU type and resource specifications for profiling and deployment.
+// These constraints help narrow down the search space during profiling to find configurations
+// that fit within specified hardware bounds.
+// The schema markers on this type validate each GPU-count bound on its own (both must be
+// at least 1); they do not check that MinNumGPUsPerEngine <= MaxNumGPUsPerEngine.
+type GPUSpec struct {
+	// Type specifies the GPU type to target (e.g., "h200", "h100", "a100").
+	// If specified, profiling will focus on configurations optimized for this GPU type.
+	// +kubebuilder:validation:Optional
+	Type string `json:"type,omitempty"`
+
+	// MinNumGPUsPerEngine specifies the minimum number of GPUs per engine for profiling.
+	// The profiler will not consider configurations with fewer GPUs than this value.
+	// +kubebuilder:validation:Optional
+	// +kubebuilder:validation:Minimum=1
+	// +kubebuilder:default=1
+	MinNumGPUsPerEngine int `json:"minNumGPUsPerEngine,omitempty"`
+
+	// MaxNumGPUsPerEngine specifies the maximum number of GPUs per engine for profiling.
+	// The profiler will not consider configurations with more GPUs than this value.
+	// +kubebuilder:validation:Optional
+	// +kubebuilder:validation:Minimum=1
+	// +kubebuilder:default=8
+	MaxNumGPUsPerEngine int `json:"maxNumGPUsPerEngine,omitempty"`
+}
+
+// ConfigMapKeySelector selects a specific key from a ConfigMap.
+// Used to reference external configuration data stored in ConfigMaps.
+type ConfigMapKeySelector struct {
+	// Name of the ConfigMap containing the desired data.
+	// +kubebuilder:validation:Required
+	Name string `json:"name"`
+
+	// Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml".
+	// +kubebuilder:default=disagg.yaml
+	Key string `json:"key,omitempty"`
+}
+
+// ProfilingConfigSpec defines configuration for the profiling process.
+// Allows users to provide custom profiling parameters via ConfigMap references.
+type ProfilingConfigSpec struct {
+	// ConfigMapRef is a reference to a ConfigMap containing profiling configuration.
+	// The ConfigMap should contain a key (default: "disagg.yaml") with the configuration file.
+	// This configuration is used by both online and offline (AIC) profiling modes.
+	// +kubebuilder:validation:Optional
+	ConfigMapRef *ConfigMapKeySelector `json:"configMapRef,omitempty"`
+}
+
+// DeploymentOverridesSpec allows users to customize metadata for auto-created DynamoGraphDeployments.
+// When autoApply is enabled, these overrides are applied to the generated DGD resource.
+type DeploymentOverridesSpec struct {
+	// Name is the desired name for the created DynamoGraphDeployment.
+	// If not specified, defaults to the DGDR name.
+	// +kubebuilder:validation:Optional
+	Name string `json:"name,omitempty"`
+
+	// Namespace is the desired namespace for the created DynamoGraphDeployment.
+	// If not specified, defaults to the DGDR namespace.
+	// +kubebuilder:validation:Optional
+	Namespace string `json:"namespace,omitempty"`
+
+	// Labels are additional labels to add to the DynamoGraphDeployment metadata.
+	// These are merged with auto-generated labels from the profiling process.
+	// +kubebuilder:validation:Optional
+	Labels map[string]string `json:"labels,omitempty"`
+
+	// Annotations are additional annotations to add to the DynamoGraphDeployment metadata.
+	// +kubebuilder:validation:Optional
+	Annotations map[string]string `json:"annotations,omitempty"`
+}
+
+// DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.
+// This CRD serves as the primary interface for users to request model deployments with
+// specific performance constraints and resource requirements, enabling SLA-driven deployments.
+type DynamoGraphDeploymentRequestSpec struct {
+	// ModelName specifies the model to deploy (e.g., "meta/llama3-70b").
+	// This should be a valid model identifier that the profiler can resolve.
+	// +kubebuilder:validation:Required
+	ModelName string `json:"modelName"`
+
+	// Backend specifies the inference backend framework to use.
+	// Supported values are: "vllm", "sglang", "trtllm".
+	// +kubebuilder:validation:Enum=vllm;sglang;trtllm
+	// +kubebuilder:default=trtllm
+	Backend string `json:"backend,omitempty"`
+
+	// SLA defines the Service Level Agreement profiling targets.
+	// The profiler uses these targets to find an optimal deployment configuration.
+	// +kubebuilder:validation:Required
+	SLA SLASpec `json:"sla"`
+
+	// GPU defines optional GPU type and resource specifications.
+	// These constraints guide the profiler to find configurations within specified bounds.
+	// +kubebuilder:validation:Optional
+	GPU *GPUSpec `json:"gpu,omitempty"`
+
+	// Online indicates whether to use online profiler (true) or AI Configurator (false).
+	// Online profiling uses real deployments for accurate measurements (2-4 hours).
+	// Offline profiling uses AI Configurator for fast simulation-based profiling (20-30 seconds).
+	// +kubebuilder:default=false
+	Online bool `json:"online,omitempty"`
+
+	// AutoApply indicates whether to automatically create a DynamoGraphDeployment
+	// after profiling completes. If false, only the spec is generated and stored in status.
+	// Users can then manually create a DGD using the generated spec.
+	// +kubebuilder:default=false
+	AutoApply bool `json:"autoApply,omitempty"`
+
+	// DeploymentOverrides allows customizing metadata for the auto-created DGD.
+	// Only applicable when AutoApply is true.
+	// +kubebuilder:validation:Optional
+	DeploymentOverrides *DeploymentOverridesSpec `json:"deploymentOverrides,omitempty"`
+
+	// ProfilingConfig provides custom configuration for the profiling job.
+	// Applicable to both online and offline (AIC) profiling modes.
+	// +kubebuilder:validation:Optional
+	ProfilingConfig *ProfilingConfigSpec `json:"profilingConfig,omitempty"`
+}
+
+// DeploymentStatus tracks the state of an auto-created DynamoGraphDeployment.
+// This status is populated when autoApply is enabled and a DGD is created.
+type DeploymentStatus struct {
+	// Name is the name of the created DynamoGraphDeployment.
+	Name string `json:"name,omitempty"`
+
+	// Namespace is the namespace of the created DynamoGraphDeployment.
+	Namespace string `json:"namespace,omitempty"`
+
+	// State is the current state of the DynamoGraphDeployment.
+	// This value is mirrored from the DGD's status.state field.
+	State string `json:"state,omitempty"`
+
+	// Created indicates whether the DGD has been successfully created.
+	// Used to prevent recreation if the DGD is manually deleted by users.
+	Created bool `json:"created,omitempty"`
+}
+
+// DynamoGraphDeploymentRequestStatus represents the observed state of a DynamoGraphDeploymentRequest.
+// The controller updates this status as the DGDR progresses through its lifecycle.
+type DynamoGraphDeploymentRequestStatus struct {
+	// State is a high-level textual status of the deployment request lifecycle.
+	// Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
+	// Empty string ("") represents the initial state before initialization.
+	State string `json:"state,omitempty"`
+
+	// ObservedGeneration reflects the generation of the most recently observed spec.
+	// Used to detect spec changes and enforce immutability after profiling starts.
+	ObservedGeneration int64 `json:"observedGeneration,omitempty"`
+
+	// Conditions contains the latest observed conditions of the deployment request.
+	// Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.
+	// Conditions are merged by type on patch updates.
+	Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
+
+	// ProfilingResults contains a reference to the ConfigMap holding profiling data.
+	// Format: "configmap/<name>"
+	// +kubebuilder:validation:Optional
+	ProfilingResults string `json:"profilingResults,omitempty"`
+
+	// GeneratedDeployment contains the full generated DynamoGraphDeployment specification
+	// including metadata, based on profiling results. Users can extract this to create
+	// a DGD manually, or it's used automatically when autoApply is true.
+	// Stored as RawExtension to preserve all fields including metadata.
+	// +kubebuilder:validation:Optional
+	// +kubebuilder:pruning:PreserveUnknownFields
+	// +kubebuilder:validation:EmbeddedResource
+	GeneratedDeployment *runtime.RawExtension `json:"generatedDeployment,omitempty"`
+
+	// Deployment tracks the auto-created DGD when AutoApply is true.
+	// Contains name, namespace, state, and creation status of the managed DGD.
+	// +kubebuilder:validation:Optional
+	Deployment *DeploymentStatus `json:"deployment,omitempty"`
+}
+
+// DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
+// It serves as the primary interface for users to request model deployments with
+// specific performance and resource constraints, enabling SLA-driven deployments.
+//
+// Lifecycle:
+//  1. Initial → Pending: Validates spec and prepares for profiling
+//  2. Pending → Profiling: Creates and runs profiling job (online or AIC)
+//  3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
+//  4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
+//  5. Ready: Terminal state when DGD is operational or spec is available
+//  6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
+//
+// Status.State may also hold "Failed", which is not part of the progression above;
+// this type does not define when that state is entered.
+//
+// The spec becomes immutable once profiling starts. Users must delete and recreate
+// the DGDR to modify configuration after this point.
+//
+// +kubebuilder:object:root=true
+// +kubebuilder:subresource:status
+// +kubebuilder:resource:shortName=dgdr
+// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=`.spec.modelName`
+// +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend`
+// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.state`
+// +kubebuilder:printcolumn:name="DGD-State",type=string,JSONPath=`.status.deployment.state`
+// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
+type DynamoGraphDeploymentRequest struct {
+	metav1.TypeMeta   `json:",inline"`
+	metav1.ObjectMeta `json:"metadata,omitempty"`
+
+	// Spec defines the desired state for this deployment request.
+	Spec DynamoGraphDeploymentRequestSpec `json:"spec,omitempty"`
+
+	// Status reflects the current observed state of this deployment request.
+	Status DynamoGraphDeploymentRequestStatus `json:"status,omitempty"`
+}
+
+// SetState updates the State field in the DGDR status.
+// The value is stored exactly as given; it is not checked against the state
+// names documented on DynamoGraphDeploymentRequestStatus.State.
+func (s *DynamoGraphDeploymentRequest) SetState(state string) {
+	s.Status.State = state
+}
+
+// GetSpec returns the spec of this DGDR as a generic interface.
+// Implements a common interface used by controller utilities.
+// The result holds a DynamoGraphDeploymentRequestSpec by value, i.e. a shallow copy:
+// its pointer fields (GPU, DeploymentOverrides, ProfilingConfig) still refer to the
+// same underlying objects as the receiver's spec.
+func (s *DynamoGraphDeploymentRequest) GetSpec() any {
+	return s.Spec
+}
+
+// SetSpec replaces the spec of this DGDR with the value carried in spec.
+// It is part of the generic accessor interface used by controller utilities.
+//
+// spec must hold a DynamoGraphDeploymentRequestSpec by value (the same shape that
+// GetSpec returns). The type assertion is unchecked, so any other dynamic type,
+// including a pointer to the spec or a nil interface, causes a panic.
+func (s *DynamoGraphDeploymentRequest) SetSpec(spec any) {
+	typed := spec.(DynamoGraphDeploymentRequestSpec)
+	s.Spec = typed
+}
+
+// AddStatusCondition upserts a condition into the status, keyed by condition type.
+// When an entry with the same Type is already present it is overwritten in place;
+// otherwise the condition is appended to the end of the list.
+func (s *DynamoGraphDeploymentRequest) AddStatusCondition(condition metav1.Condition) {
+	conditions := s.Status.Conditions
+	for idx := range conditions {
+		if conditions[idx].Type != condition.Type {
+			continue
+		}
+		// Same type already recorded: overwrite the stored entry.
+		conditions[idx] = condition
+		return
+	}
+	// No entry of this type yet. Appending to a nil slice allocates it,
+	// so no explicit initialization is needed beforehand.
+	s.Status.Conditions = append(conditions, condition)
+}
+
+// DynamoGraphDeploymentRequestList contains a list of DynamoGraphDeploymentRequest resources.
+//
+// +kubebuilder:object:root=true
+type DynamoGraphDeploymentRequestList struct {
+	metav1.TypeMeta `json:",inline"`
+	metav1.ListMeta `json:"metadata,omitempty"`
+	Items           []DynamoGraphDeploymentRequest `json:"items"`
+}
+
+// init registers the DynamoGraphDeploymentRequest object and list types with the
+// package-level SchemeBuilder.
+func init() {
+	SchemeBuilder.Register(&DynamoGraphDeploymentRequest{}, &DynamoGraphDeploymentRequestList{})
+}
--- a/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go
+++ b/deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go
@@ -42,7 +42,7 @@ import (
 	"k8s.io/api/autoscaling/v2"
 	"k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	runtime "k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime"
 )

 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
@@ -114,6 +114,65 @@ func (in *BaseStatus) DeepCopy() *BaseStatus {
 	return out
 }

+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ConfigMapKeySelector) DeepCopyInto(out *ConfigMapKeySelector) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ConfigMapKeySelector.
+func (in *ConfigMapKeySelector) DeepCopy() *ConfigMapKeySelector {
+	if in == nil {
+		return nil
+	}
+	out := new(ConfigMapKeySelector)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *DeploymentOverridesSpec) DeepCopyInto(out *DeploymentOverridesSpec) {
+	*out = *in
+	if in.Labels != nil {
+		in, out := &in.Labels, &out.Labels
+		*out = make(map[string]string, len(*in))
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
+	if in.Annotations != nil {
+		in, out := &in.Annotations, &out.Annotations
+		*out = make(map[string]string, len(*in))
+		for key, val := range *in {
+			(*out)[key] = val
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeploymentOverridesSpec.
+func (in *DeploymentOverridesSpec) DeepCopy() *DeploymentOverridesSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(DeploymentOverridesSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *DeploymentStatus) DeepCopyInto(out *DeploymentStatus) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeploymentStatus.
+func (in *DeploymentStatus) DeepCopy() *DeploymentStatus {
+	if in == nil {
+		return nil
+	}
+	out := new(DeploymentStatus)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *DynamoComponentDeployment) DeepCopyInto(out *DynamoComponentDeployment) {
 	*out = *in
@@ -378,6 +437,128 @@ func (in *DynamoGraphDeploymentList) DeepCopyObject() runtime.Object {
 	return nil
 }

+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *DynamoGraphDeploymentRequest) DeepCopyInto(out *DynamoGraphDeploymentRequest) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ObjectMeta.DeepCopyInto(&out.ObjectMeta)
+	in.Spec.DeepCopyInto(&out.Spec)
+	in.Status.DeepCopyInto(&out.Status)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentRequest.
+func (in *DynamoGraphDeploymentRequest) DeepCopy() *DynamoGraphDeploymentRequest {
+	if in == nil {
+		return nil
+	}
+	out := new(DynamoGraphDeploymentRequest)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *DynamoGraphDeploymentRequest) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *DynamoGraphDeploymentRequestList) DeepCopyInto(out *DynamoGraphDeploymentRequestList) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	in.ListMeta.DeepCopyInto(&out.ListMeta)
+	if in.Items != nil {
+		in, out := &in.Items, &out.Items
+		*out = make([]DynamoGraphDeploymentRequest, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentRequestList.
+func (in *DynamoGraphDeploymentRequestList) DeepCopy() *DynamoGraphDeploymentRequestList {
+	if in == nil {
+		return nil
+	}
+	out := new(DynamoGraphDeploymentRequestList)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *DynamoGraphDeploymentRequestList) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *DynamoGraphDeploymentRequestSpec) DeepCopyInto(out *DynamoGraphDeploymentRequestSpec) {
+	*out = *in
+	out.SLA = in.SLA
+	if in.GPU != nil {
+		in, out := &in.GPU, &out.GPU
+		*out = new(GPUSpec)
+		**out = **in
+	}
+	if in.DeploymentOverrides != nil {
+		in, out := &in.DeploymentOverrides, &out.DeploymentOverrides
+		*out = new(DeploymentOverridesSpec)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.ProfilingConfig != nil {
+		in, out := &in.ProfilingConfig, &out.ProfilingConfig
+		*out = new(ProfilingConfigSpec)
+		(*in).DeepCopyInto(*out)
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentRequestSpec.
+func (in *DynamoGraphDeploymentRequestSpec) DeepCopy() *DynamoGraphDeploymentRequestSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(DynamoGraphDeploymentRequestSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *DynamoGraphDeploymentRequestStatus) DeepCopyInto(out *DynamoGraphDeploymentRequestStatus) {
+	*out = *in
+	if in.Conditions != nil {
+		in, out := &in.Conditions, &out.Conditions
+		*out = make([]metav1.Condition, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.GeneratedDeployment != nil {
+		in, out := &in.GeneratedDeployment, &out.GeneratedDeployment
+		*out = new(runtime.RawExtension)
+		(*in).DeepCopyInto(*out)
+	}
+	if in.Deployment != nil {
+		in, out := &in.Deployment, &out.Deployment
+		*out = new(DeploymentStatus)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoGraphDeploymentRequestStatus.
+func (in *DynamoGraphDeploymentRequestStatus) DeepCopy() *DynamoGraphDeploymentRequestStatus {
+	if in == nil {
+		return nil
+	}
+	out := new(DynamoGraphDeploymentRequestStatus)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *DynamoGraphDeploymentSpec) DeepCopyInto(out *DynamoGraphDeploymentSpec) {
 	*out = *in
@@ -445,6 +626,21 @@ func (in *DynamoGraphDeploymentStatus) DeepCopy() *DynamoGraphDeploymentStatus {
 	return out
 }

+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *GPUSpec) DeepCopyInto(out *GPUSpec) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUSpec.
+func (in *GPUSpec) DeepCopy() *GPUSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(GPUSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *IngressSpec) DeepCopyInto(out *IngressSpec) {
 	*out = *in
@@ -555,6 +751,41 @@ func (in *PVC) DeepCopy() *PVC {
 	return out
 }

+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *ProfilingConfigSpec) DeepCopyInto(out *ProfilingConfigSpec) {
+	*out = *in
+	if in.ConfigMapRef != nil {
+		in, out := &in.ConfigMapRef, &out.ConfigMapRef
+		*out = new(ConfigMapKeySelector)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProfilingConfigSpec.
+func (in *ProfilingConfigSpec) DeepCopy() *ProfilingConfigSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(ProfilingConfigSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *SLASpec) DeepCopyInto(out *SLASpec) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SLASpec.
+func (in *SLASpec) DeepCopy() *SLASpec {
+	if in == nil {
+		return nil
+	}
+	out := new(SLASpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *SharedMemorySpec) DeepCopyInto(out *SharedMemorySpec) {
 	*out = *in

--- a/deploy/cloud/operator/cmd/main.go
+++ b/deploy/cloud/operator/cmd/main.go
@@ -140,6 +140,8 @@ func main() {
 	var mpiRunSecretName string
 	var mpiRunSecretNamespace string
 	var plannerClusterRoleName string
+	var profilerImage string
+	var dgdrProfilingClusterRoleName string
 	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
 	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
 	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
@@ -180,6 +182,10 @@ func main() {
 		"Namespace where the MPI SSH secret is located (required)")
 	flag.StringVar(&plannerClusterRoleName, "planner-cluster-role-name", "",
 		"Name of the ClusterRole for planner (cluster-wide mode only)")
+	flag.StringVar(&profilerImage, "profiler-image", "",
+		"Container image to use for profiling jobs (both online and offline/AIC) (for DynamoGraphDeploymentRequest)")
+	flag.StringVar(&dgdrProfilingClusterRoleName, "dgdr-profiling-cluster-role-name", "",
+		"Name of the ClusterRole for DGDR profiling jobs (cluster-wide mode only)")
 	opts := zap.Options{
 		Development: true,
 	}
@@ -237,6 +243,7 @@ func main() {
 		},
 		RBAC: commonController.RBACConfig{
 			PlannerClusterRoleName:       plannerClusterRoleName,
+			DGDRProfilingClusterRoleName: dgdrProfilingClusterRoleName,
 		},
 	}

@@ -449,6 +456,17 @@ func main() {
 		setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeployment")
 		os.Exit(1)
 	}
+
+	if err = (&controller.DynamoGraphDeploymentRequestReconciler{
+		Client:        mgr.GetClient(),
+		Recorder:      mgr.GetEventRecorderFor("dynamographdeploymentrequest"),
+		ProfilerImage: profilerImage,
+		Config:        ctrlConfig,
+		RBACManager:   rbacManager,
+	}).SetupWithManager(mgr); err != nil {
+		setupLog.Error(err, "unable to create controller", "controller", "DynamoGraphDeploymentRequest")
+		os.Exit(1)
+	}
 	//+kubebuilder:scaffold:builder

 	if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {

--- a/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.16.4
+    helm.sh/resource-policy: keep
+  name: dynamographdeploymentrequests.nvidia.com
+spec:
+  group: nvidia.com
+  names:
+    kind: DynamoGraphDeploymentRequest
+    listKind: DynamoGraphDeploymentRequestList
+    plural: dynamographdeploymentrequests
+    shortNames:
+      - dgdr
+    singular: dynamographdeploymentrequest
+  scope: Namespaced
+  versions:
+    - additionalPrinterColumns:
+        - jsonPath: .spec.modelName
+          name: Model
+          type: string
+        - jsonPath: .spec.backend
+          name: Backend
+          type: string
+        - jsonPath: .status.state
+          name: State
+          type: string
+        - jsonPath: .status.deployment.state
+          name: DGD-State
+          type: string
+        - jsonPath: .metadata.creationTimestamp
+          name: Age
+          type: date
+      name: v1alpha1
+      schema:
+        openAPIV3Schema:
+          description: |-
+            DynamoGraphDeploymentRequest is the Schema for the dynamographdeploymentrequests API.
+            It serves as the primary interface for users to request model deployments with
+            specific performance and resource constraints, enabling SLA-driven deployments.
+
+            Lifecycle:
+             1. Initial → Pending: Validates spec and prepares for profiling
+             2. Pending → Profiling: Creates and runs profiling job (online or AIC)
+             3. Profiling → Ready/Deploying: Generates DGD spec after profiling completes
+             4. Deploying → Ready: When autoApply=true, monitors DGD until Ready
+             5. Ready: Terminal state when DGD is operational or spec is available
+             6. DeploymentDeleted: Terminal state when auto-created DGD is manually deleted
+
+            The spec becomes immutable once profiling starts. Users must delete and recreate
+            the DGDR to modify configuration after this point.
+          properties:
+            apiVersion:
+              description: |-
+                APIVersion defines the versioned schema of this representation of an object.
+                Servers should convert recognized schemas to the latest internal value, and
+                may reject unrecognized values.
+                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+              type: string
+            kind:
+              description: |-
+                Kind is a string value representing the REST resource this object represents.
+                Servers may infer this from the endpoint the client submits requests to.
+                Cannot be updated.
+                In CamelCase.
+                More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+              type: string
+            metadata:
+              type: object
+            spec:
+              description: Spec defines the desired state for this deployment request.
+              properties:
+                autoApply:
+                  default: false
+                  description: |-
+                    AutoApply indicates whether to automatically create a DynamoGraphDeployment
+                    after profiling completes. If false, only the spec is generated and stored in status.
+                    Users can then manually create a DGD using the generated spec.
+                  type: boolean
+                backend:
+                  default: trtllm
+                  description: |-
+                    Backend specifies the inference backend framework to use.
+                    Supported values are: "vllm", "sglang", "trtllm".
+                  enum:
+                    - vllm
+                    - sglang
+                    - trtllm
+                  type: string
+                deploymentOverrides:
+                  description: |-
+                    DeploymentOverrides allows customizing metadata for the auto-created DGD.
+                    Only applicable when AutoApply is true.
+                  properties:
+                    annotations:
+                      additionalProperties:
+                        type: string
+                      description: Annotations are additional annotations to add to the DynamoGraphDeployment metadata.
+                      type: object
+                    labels:
+                      additionalProperties:
+                        type: string
+                      description: |-
+                        Labels are additional labels to add to the DynamoGraphDeployment metadata.
+                        These are merged with auto-generated labels from the profiling process.
+                      type: object
+                    name:
+                      description: |-
+                        Name is the desired name for the created DynamoGraphDeployment.
+                        If not specified, defaults to the DGDR name.
+                      type: string
+                    namespace:
+                      description: |-
+                        Namespace is the desired namespace for the created DynamoGraphDeployment.
+                        If not specified, defaults to the DGDR namespace.
+                      type: string
+                  type: object
+                gpu:
+                  description: |-
+                    GPU defines optional GPU type and resource specifications.
+                    These constraints guide the profiler to find configurations within specified bounds.
+                  properties:
+                    maxNumGPUsPerEngine:
+                      default: 8
+                      description: |-
+                        MaxNumGPUsPerEngine specifies the maximum number of GPUs per engine for profiling.
+                        The profiler will not consider configurations with more GPUs than this value.
+                      minimum: 1
+                      type: integer
+                    minNumGPUsPerEngine:
+                      default: 1
+                      description: |-
+                        MinNumGPUsPerEngine specifies the minimum number of GPUs per engine for profiling.
+                        The profiler will not consider configurations with fewer GPUs than this value.
+                      minimum: 1
+                      type: integer
+                    type:
+                      description: |-
+                        Type specifies the GPU type to target (e.g., "h200", "h100", "a100").
+                        If specified, profiling will focus on configurations optimized for this GPU type.
+                      type: string
+                  type: object
+                modelName:
+                  description: |-
+                    ModelName specifies the model to deploy (e.g., "meta/llama3-70b").
+                    This should be a valid model identifier that the profiler can resolve.
+                  type: string
+                online:
+                  default: false
+                  description: |-
+                    Online indicates whether to use online profiler (true) or AI Configurator (false).
+                    Online profiling uses real deployments for accurate measurements (2-4 hours).
+                    Offline profiling uses AI Configurator for fast simulation-based profiling (20-30 seconds).
+                  type: boolean
+                profilingConfig:
+                  description: |-
+                    ProfilingConfig provides custom configuration for the profiling job.
+                    Applicable to both online and offline (AIC) profiling modes.
+                  properties:
+                    configMapRef:
+                      description: |-
+                        ConfigMapRef is a reference to a ConfigMap containing profiling configuration.
+                        The ConfigMap should contain a key (default: "disagg.yaml") with the configuration file.
+                        This configuration is used by both online and offline (AIC) profiling modes.
+                      properties:
+                        key:
+                          default: disagg.yaml
+                          description: Key in the ConfigMap to select. If not specified, defaults to "disagg.yaml".
+                          type: string
+                        name:
+                          description: Name of the ConfigMap containing the desired data.
+                          type: string
+                      required:
+                        - name
+                      type: object
+                  type: object
+                sla:
+                  description: |-
+                    SLA defines the Service Level Agreement profiling targets.
+                    The profiler uses these targets to find an optimal deployment configuration.
+                  properties:
+                    isl:
+                      default: 3000
+                      description: |-
+                        ISL is the Input Sequence Length for profiling.
+                        Defines the length of input sequences to use during profiling tests.
+                      minimum: 1
+                      type: integer
+                    itl:
+                      default: 10
+                      description: |-
+                        ITL is the target Inter-Token Latency in milliseconds.
+                        This represents the maximum time allowed between consecutive tokens in the output.
+                      type: integer
+                    osl:
+                      default: 500
+                      description: |-
+                        OSL is the Output Sequence Length for profiling.
+                        Defines the expected length of output sequences to generate during profiling tests.
+                      minimum: 1
+                      type: integer
+                    ttft:
+                      default: 50
+                      description: |-
+                        TTFT is the target Time To First Token in milliseconds.
+                        This represents the maximum time allowed from request submission to receiving the first token.
+                      type: integer
+                  type: object
+              required:
+                - modelName
+                - sla
+              type: object
+            status:
+              description: Status reflects the current observed state of this deployment request.
+              properties:
+                conditions:
+                  description: |-
+                    Conditions contains the latest observed conditions of the deployment request.
+                    Standard condition types include: Validation, Profiling, SpecGenerated, DeploymentReady.
+                    Conditions are merged by type on patch updates.
+                  items:
+                    description: Condition contains details for one aspect of the current state of this API Resource.
+                    properties:
+                      lastTransitionTime:
+                        description: |-
+                          lastTransitionTime is the last time the condition transitioned from one status to another.
+                          This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
+                        format: date-time
+                        type: string
+                      message:
+                        description: |-
+                          message is a human readable message indicating details about the transition.
+                          This may be an empty string.
+                        maxLength: 32768
+                        type: string
+                      observedGeneration:
+                        description: |-
+                          observedGeneration represents the .metadata.generation that the condition was set based upon.
+                          For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
+                          with respect to the current state of the instance.
+                        format: int64
+                        minimum: 0
+                        type: integer
+                      reason:
+                        description: |-
+                          reason contains a programmatic identifier indicating the reason for the condition's last transition.
+                          Producers of specific condition types may define expected values and meanings for this field,
+                          and whether the values are considered a guaranteed API.
+                          The value should be a CamelCase string.
+                          This field may not be empty.
+                        maxLength: 1024
+                        minLength: 1
+                        pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
+                        type: string
+                      status:
+                        description: status of the condition, one of True, False, Unknown.
+                        enum:
+                          - "True"
+                          - "False"
+                          - Unknown
+                        type: string
+                      type:
+                        description: type of condition in CamelCase or in foo.example.com/CamelCase.
+                        maxLength: 316
+                        pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
+                        type: string
+                    required:
+                      - lastTransitionTime
+                      - message
+                      - reason
+                      - status
+                      - type
+                    type: object
+                  type: array
+                deployment:
+                  description: |-
+                    Deployment tracks the auto-created DGD when AutoApply is true.
+                    Contains name, namespace, state, and creation status of the managed DGD.
+                  properties:
+                    created:
+                      description: |-
+                        Created indicates whether the DGD has been successfully created.
+                        Used to prevent recreation if the DGD is manually deleted by users.
+                      type: boolean
+                    name:
+                      description: Name is the name of the created DynamoGraphDeployment.
+                      type: string
+                    namespace:
+                      description: Namespace is the namespace of the created DynamoGraphDeployment.
+                      type: string
+                    state:
+                      description: |-
+                        State is the current state of the DynamoGraphDeployment.
+                        This value is mirrored from the DGD's status.state field.
+                      type: string
+                  type: object
+                generatedDeployment:
+                  description: |-
+                    GeneratedDeployment contains the full generated DynamoGraphDeployment specification
+                    including metadata, based on profiling results. Users can extract this to create
+                    a DGD manually, or it's used automatically when autoApply is true.
+                    Stored as RawExtension to preserve all fields including metadata.
+                  type: object
+                  x-kubernetes-embedded-resource: true
+                  x-kubernetes-preserve-unknown-fields: true
+                observedGeneration:
+                  description: |-
+                    ObservedGeneration reflects the generation of the most recently observed spec.
+                    Used to detect spec changes and enforce immutability after profiling starts.
+                  format: int64
+                  type: integer
+                profilingResults:
+                  description: |-
+                    ProfilingResults contains a reference to the ConfigMap holding profiling data.
+                    Format: "configmap/<name>"
+                  type: string
+                state:
+                  description: |-
+                    State is a high-level textual status of the deployment request lifecycle.
+                    Possible values: "", "Pending", "Profiling", "Deploying", "Ready", "DeploymentDeleted", "Failed"
+                    Empty string ("") represents the initial state before initialization.
+                  type: string
+              type: object
+          type: object
+      served: true
+      storage: true
+      subresources:
+        status: {}
--- a/deploy/cloud/operator/config/rbac/role.yaml
+++ b/deploy/cloud/operator/config/rbac/role.yaml
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
@@ -74,6 +73,18 @@ rules:
  - patch
  - update
  - watch
+- apiGroups:
+  - batch
+  resources:
+  - jobs
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
 - apiGroups:
  - coordination.k8s.io
  resources:
@@ -160,6 +171,7 @@ rules:
  - nvidia.com
  resources:
  - dynamocomponentdeployments
+  - dynamographdeploymentrequests
  - dynamographdeployments
  verbs:
  - create
@@ -173,6 +185,7 @@ rules:
  - nvidia.com
  resources:
  - dynamocomponentdeployments/finalizers
+  - dynamographdeploymentrequests/finalizers
  - dynamographdeployments/finalizers
  verbs:
  - update
@@ -180,6 +193,7 @@ rules:
  - nvidia.com
  resources:
  - dynamocomponentdeployments/status
+  - dynamographdeploymentrequests/status
  - dynamographdeployments/status
  verbs:
  - get

--- a/deploy/cloud/operator/config/samples/kustomization.yaml
+++ b/deploy/cloud/operator/config/samples/kustomization.yaml
@@ -18,4 +18,5 @@ resources:
 - nvidia.com_v1alpha1_dynamocomponentdeployment.yaml
 - nvidia.com_v1alpha1_dynamocomponent.yaml
 - nvidia.com_v1alpha1_dynamographdeployment.yaml
+- nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
 #+kubebuilder:scaffold:manifestskustomizesamples
--- a/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
+++ b/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeploymentRequest
+metadata:
+  name: example-llm-sla
+spec:
+  modelName: "meta/llama3-70b"
+  backend: trtllm # enum: [vllm, sglang, trtllm]; default is trtllm
+  sla: # SLA profiling targets (all fields optional with defaults)
+    itl: 10    # Inter-Token Latency target in milliseconds (default: 10)
+    ttft: 50   # Time To First Token target in milliseconds (default: 50)
+    isl: 3000  # Input Sequence Length (default: 3000)
+    osl: 500   # Output Sequence Length (default: 500)
+  gpu: # optional
+    type: h200_sxm
+    minNumGPUsPerEngine: 1  # default is 1
+    maxNumGPUsPerEngine: 8  # default is 8
+  online: false # true for online profiler, false for AIC profiler
+
+  # Optional: Automatically create DynamoGraphDeployment after profiling
+  autoApply: true  # default is false
+
+  # Optional: Override metadata for auto-created DGD (only used when autoApply: true)
+  # deploymentOverrides:
+  #   name: my-custom-dgd-name
+  #   namespace: production
+  #   labels:
+  #     team: ml-platform
+  #   annotations:
+  #     description: "Auto-generated from DGDR"
+
+  # Currently required for both online and offline/AIC profiling, but will be removed in the future
+  profilingConfig:
+    configMapRef:
+      name: my-profiling-config
+      key: disagg.yaml  # default is "disagg.yaml"
--- a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package controller
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"text/template"
+
+	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/meta"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/tools/record"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/builder"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/event"
+	"sigs.k8s.io/controller-runtime/pkg/handler"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/predicate"
+	"sigs.k8s.io/yaml"
+
+	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
+	commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
+)
+
+const (
+	// State constants: values compared against status.state by the reconciler (StateEmpty is the initial, not-yet-initialized state)
+	StateEmpty             = ""
+	StatePending           = "Pending"
+	StateProfiling         = "Profiling"
+	StateDeploying         = "Deploying"
+	StateReady             = "Ready"
+	StateDeploymentDeleted = "DeploymentDeleted"
+	StateFailed            = "Failed"
+
+	// Condition types (the same names are listed in the CRD description of status.conditions)
+	ConditionTypeValidation      = "Validation"
+	ConditionTypeProfiling       = "Profiling"
+	ConditionTypeSpecGenerated   = "SpecGenerated"
+	ConditionTypeDeploymentReady = "DeploymentReady"
+
+	// Event reasons: passed as the reason argument when recording Kubernetes events
+	EventReasonInitialized          = "Initialized"
+	EventReasonValidationFailed     = "ValidationFailed"
+	EventReasonProfilingJobCreated  = "ProfilingJobCreated"
+	EventReasonProfilingJobFailed   = "ProfilingJobFailed"
+	EventReasonAIConfiguratorFailed = "AIConfiguratorFailed"
+	EventReasonSpecGenerated        = "SpecGenerated"
+	EventReasonSpecChangeRejected   = "SpecChangeRejected"
+	EventReasonDeploymentCreated    = "DeploymentCreated"
+	EventReasonDeploymentReady      = "DeploymentReady"
+	EventReasonDeploymentDegraded   = "DeploymentDegraded"
+	EventReasonDeploymentDeleted    = "DeploymentDeleted"
+
+	// Label keys (LabelDGDRName and LabelManagedBy match the label keys hard-coded in sidecarScriptTemplate)
+	LabelApp           = "app"
+	LabelDGDR          = "dgdr"
+	LabelDGDRName      = "dgdr.nvidia.com/name"
+	LabelDGDRNamespace = "dgdr.nvidia.com/namespace"
+	LabelManagedBy     = "nvidia.com/managed-by"
+
+	// Label values
+	LabelValueDynamoProfiler = "dynamo-profiler"
+	LabelValueAICProfiler    = "aic-profiler"
+	LabelValueDynamoOperator = "dynamo-operator"
+
+	// Job naming: name prefixes, one per profiling mode (online vs. AIC)
+	JobNamePrefixOnline = "profile-online-"
+	JobNamePrefixAIC    = "profile-aic-"
+
+	// Container names
+	ContainerNameProfiler     = "profiler"
+	ContainerNameOutputCopier = "output-copier"
+
+	// ServiceAccount
+	ServiceAccountProfilingJob = "dgdr-profiling-job"
+
+	// ConfigMap naming: name prefix (presumably for the ConfigMap holding profiling output — confirm at the use site)
+	ConfigMapOutputPrefix = "dgdr-output-"
+
+	// Sidecar image. NOTE(review): ":latest" is a mutable tag, so the kubectl version is not pinned — consider a fixed tag or digest
+	SidecarImage = "bitnami/kubectl:latest"
+
+	// Volume names
+	VolumeNameProfilingConfig = "profiling-config"
+	VolumeNameProfilingOutput = "profiling-output"
+
+	// Volume paths and the file names expected inside them
+	ProfilingOutputPath = "/data"
+	ProfilingOutputFile = "config_with_planner.yaml"
+	ProfilingConfigPath = "/config"
+	ProfilingConfigFile = "disagg.yaml"
+
+	// Command line arguments (flag names; presumably passed to the profiler container — confirm at the use site)
+	ArgModel   = "--model"
+	ArgBackend = "--backend"
+	ArgTTFT    = "--ttft"
+	ArgITL     = "--itl"
+	ArgConfig  = "--config"
+
+	// Messages. Entries containing %s are fmt format strings. NOTE(review): the CamelCase entries (e.g. MessageJobCreationFailed) read like reason identifiers rather than sentences — confirm intended use
+	MessageInitialized               = "DGDR initialized successfully"
+	MessageProfilingJobCreated       = "Profiling job created"
+	MessageAICProfilingJobCreated    = "AIC profiling job created"
+	MessageProfilingInProgress       = "Profiling is in progress"
+	MessageSpecGenerated             = "DynamoGraphDeployment spec generated successfully"
+	MessageSpecAvailable             = "Generated spec is available in status.generatedDeployment"
+	MessageDeploymentCreated         = "DynamoGraphDeployment %s created successfully"
+	MessageDeploymentReady           = "DynamoGraphDeployment %s is ready"
+	MessageDeploymentDegraded        = "DynamoGraphDeployment %s degraded from Ready to %s"
+	MessageDeploymentDeleted         = "DGD %s was deleted. DGDR will not recreate it. Delete this DGDR and create a new one to redeploy."
+	MessageInvalidState              = "Invalid state"
+	MessageSpecChangeRejected        = "Cannot modify spec in state '%s'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead."
+	MessageJobCreationFailed         = "JobCreationFailed"
+	MessageDeploymentCreationFailed  = "DeploymentCreationFailed"
+	MessageResultsRetrievalFailed    = "ResultsRetrievalFailed"
+	MessageGenerationFailed          = "GenerationFailed"
+	MessageAIConfiguratorCheckFailed = "AIConfiguratorCheckFailed"
+	MessageProfilingCheckFailed      = "ProfilingCheckFailed"
+	MessageConfigMapNotFound         = "ConfigMap %s not found in namespace %s"
+	MessageConfigMapKeyNotFound      = "key %s not found in ConfigMap %s"
+
+	// Validation messages (ValidationErrorInvalidBackend is a format string taking the rejected backend value)
+	ValidationErrorModelNameRequired = "modelName is required"
+	ValidationErrorITLPositive       = "sla.itl must be positive"
+	ValidationErrorTTFTPositive      = "sla.ttft must be positive"
+	ValidationErrorInvalidBackend    = "invalid backend: %s (must be vllm, sglang, or trtllm)"
+
+	// Valid backend values (the same three values form the CRD enum for spec.backend)
+	BackendVLLM   = "vllm"
+	BackendSGLang = "sglang"
+	BackendTRTLLM = "trtllm"
+)
+
+// sidecarScriptTemplate is the shell script template ({{.Field}} placeholders) for the output copier sidecar: it polls until the output file exists, embeds it plus base64-encoded */interpolation/raw_data.npz files in a ConfigMap manifest, then runs kubectl apply. NOTE(review): the inner $(dirname "$dir") is unquoted, so names with whitespace would be word-split.
+const sidecarScriptTemplate = `
+set -e
+set -o pipefail
+while [ ! -f {{.OutputPath}}/{{.OutputFile}} ]; do sleep 2; done
+
+# Start building ConfigMap YAML with DGD spec
+cat >/tmp/cm.yaml <<EOF
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{.ConfigMapName}}
+  namespace: {{.Namespace}}
+  labels:
+    dgdr.nvidia.com/name: {{.DGDRName}}
+    nvidia.com/managed-by: dynamo-operator
+data:
+  {{.OutputFile}}: |
+EOF
+sed 's/^/    /' {{.OutputPath}}/{{.OutputFile}} >> /tmp/cm.yaml
+
+# Add profiling data directories to ConfigMap for long-term storage
+# Find all interpolation directories and add their raw_data.npz files
+for dir in {{.OutputPath}}/*/interpolation; do
+  if [ -d "$dir" ]; then
+    dirname=$(basename $(dirname "$dir"))
+    if [ -f "$dir/raw_data.npz" ]; then
+      echo "  ${dirname}_raw_data.npz: |" >> /tmp/cm.yaml
+      base64 "$dir/raw_data.npz" | sed 's/^/    /' >> /tmp/cm.yaml
+    fi
+  fi
+done
+
+kubectl apply -f /tmp/cm.yaml
+echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
+`
+
+// DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object. It embeds the controller-runtime client and carries the event recorder and the shared controller configuration.
+type DynamoGraphDeploymentRequestReconciler struct {
+	client.Client
+	Recorder record.EventRecorder
+	Config   commonController.Config
+
+	// ProfilerImage is the container image to use for profiling jobs (both online and offline/AIC); the operator sets it from its --profiler-image flag
+	ProfilerImage string
+	// RBACManager handles RBAC setup for profiling jobs
+	RBACManager RBACManager
+}
+
+// RBACManager is the interface this controller uses for managing RBAC resources. Its single method takes a target namespace, a ServiceAccount name and a ClusterRole name (presumably ensuring the ServiceAccount exists and is bound to that role — confirm against the implementation).
+type RBACManager interface {
+	EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
+}
+
+// GetRecorder implements the commonController.Reconciler interface by returning the reconciler's event recorder.
+func (r *DynamoGraphDeploymentRequestReconciler) GetRecorder() record.EventRecorder {
+	return r.Recorder
+}
+
+// FinalizeResource implements the commonController.Finalizer interface.
+// It calls cleanupProfilingResources for dgdr and, if that call fails, logs
+// the failure and returns the error unchanged; otherwise it logs success and
+// returns nil.
+func (r *DynamoGraphDeploymentRequestReconciler) FinalizeResource(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
+	l := log.FromContext(ctx)
+	l.Info("Finalizing DGDR", "name", dgdr.Name)
+
+	cleanupErr := r.cleanupProfilingResources(ctx, dgdr)
+	if cleanupErr == nil {
+		l.Info("DGDR finalized successfully", "name", dgdr.Name)
+		return nil
+	}
+
+	// Cleanup failed: record it here and hand the same error back to the caller.
+	l.Error(cleanupErr, "Failed to cleanup profiling resources")
+	return cleanupErr
+}
+
+// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentrequests,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentrequests/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeploymentrequests/finalizers,verbs=update
+// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
+// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch
+
+// Reconcile drives a DynamoGraphDeploymentRequest through its state machine.
+//
+// Order of operations:
+//  1. Load the DGDR; a missing object is treated as already deleted.
+//  2. Let the shared finalizer helper handle deletion.
+//  3. Reject spec edits (generation drift) once the DGDR is in a state whose spec
+//     is considered immutable, emitting a warning event and leaving status as is.
+//  4. Dispatch to the handler that matches the current status state.
+func (r *DynamoGraphDeploymentRequestReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+	logger.Info("Reconciling DynamoGraphDeploymentRequest", "name", req.Name, "namespace", req.Namespace)
+
+	// Load the DGDR named by the request.
+	dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}
+	err := r.Get(ctx, req.NamespacedName, dgdr)
+	if apierrors.IsNotFound(err) {
+		logger.Info("DGDR resource not found, ignoring since object must be deleted")
+		return ctrl.Result{}, nil
+	}
+	if err != nil {
+		logger.Error(err, "Failed to get DGDR")
+		return ctrl.Result{}, err
+	}
+
+	// Deletion path is delegated to the shared finalizer helper.
+	finalized, err := commonController.HandleFinalizer(ctx, dgdr, r.Client, r)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+	if finalized {
+		// The object was deleted and its finalizer processed; nothing left to do.
+		return ctrl.Result{}, nil
+	}
+
+	// Immutability enforcement: a generation we have not observed means the spec
+	// was edited after we started processing it.
+	observed := dgdr.Status.ObservedGeneration
+	if observed > 0 && observed != dgdr.Generation {
+		switch dgdr.Status.State {
+		case StateProfiling, StateDeploying, StateReady, StateDeploymentDeleted:
+			logger.Info("Spec change detected in immutable state",
+				"state", dgdr.Status.State,
+				"observedGeneration", dgdr.Status.ObservedGeneration,
+				"currentGeneration", dgdr.Generation)
+
+			r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonSpecChangeRejected,
+				fmt.Sprintf(MessageSpecChangeRejected, dgdr.Status.State))
+
+			// observedGeneration is deliberately left untouched so that the edit
+			// keeps being rejected; the state does not change either.
+			return ctrl.Result{}, nil
+		}
+	}
+
+	// Dispatch on the current state.
+	switch dgdr.Status.State {
+	case StateEmpty:
+		return r.handleInitialState(ctx, dgdr)
+	case StatePending:
+		return r.handlePendingState(ctx, dgdr)
+	case StateProfiling:
+		return r.handleProfilingState(ctx, dgdr)
+	case StateDeploying:
+		return r.handleDeployingState(ctx, dgdr)
+	case StateReady:
+		return r.handleReadyState(ctx, dgdr)
+	case StateDeploymentDeleted:
+		return r.handleDeploymentDeletedState(ctx, dgdr)
+	case StateFailed:
+		return r.handleFailedState(ctx, dgdr)
+	default:
+		logger.Info("Unknown state", "state", dgdr.Status.State)
+		return r.updateStateAndRequeue(ctx, dgdr, StateFailed, MessageInvalidState)
+	}
+}
+
+// handleInitialState processes a DGDR that has no state yet.
+// A spec that fails validation moves the DGDR to Failed with a Validation
+// condition; otherwise the current generation is recorded as observed and the
+// DGDR moves to Pending.
+func (r *DynamoGraphDeploymentRequestReconciler) handleInitialState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+	logger.Info("Handling initial state", "name", dgdr.Name)
+
+	// Reject invalid specs up front.
+	err := r.validateSpec(ctx, dgdr)
+	if err != nil {
+		details := err.Error()
+		r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonValidationFailed, details)
+		return r.updateStateWithCondition(ctx, dgdr, StateFailed, ConditionTypeValidation, metav1.ConditionFalse, EventReasonValidationFailed, details)
+	}
+
+	// Remember which spec generation is being processed.
+	dgdr.Status.ObservedGeneration = dgdr.Generation
+
+	// Announce initialization and move on to Pending.
+	r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonInitialized, MessageInitialized)
+	return r.updateStateAndRequeue(ctx, dgdr, StatePending, MessageInitialized)
+}
+
+// handlePendingState kicks off profiling for a Pending DGDR.
+// If the profiling Job cannot be created the DGDR moves to Failed; otherwise an
+// event matching the profiling mode is recorded and the DGDR moves to Profiling
+// with the Profiling condition still False (work in progress).
+func (r *DynamoGraphDeploymentRequestReconciler) handlePendingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+	logger.Info("Handling pending state", "name", dgdr.Name)
+
+	// Launch the profiling Job (online or AIC).
+	if err := r.createProfilingJob(ctx, dgdr); err != nil {
+		cause := err.Error()
+		r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonProfilingJobFailed, cause)
+		return r.updateStateWithCondition(ctx, dgdr, StateFailed, ConditionTypeProfiling, metav1.ConditionFalse, MessageJobCreationFailed, cause)
+	}
+
+	// Pick the event text that matches the profiling mode.
+	createdMessage := MessageAICProfilingJobCreated
+	if dgdr.Spec.Online {
+		createdMessage = MessageProfilingJobCreated
+	}
+	r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonProfilingJobCreated, createdMessage)
+
+	// Enter Profiling; the condition stays False until the Job completes.
+	return r.updateStateWithCondition(ctx, dgdr, StateProfiling, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress)
+}
+
+// handleProfilingState monitors profiling progress and generates spec when complete.
+//
+// Flow:
+//   - any error from checkProfilingJobStatus moves the DGDR to Failed;
+//   - an unfinished Job returns without requeueing;
+//   - a finished Job marks the Profiling condition True, loads the generated DGD
+//     via generateDGDSpec (which persists status itself), and then moves the DGDR
+//     to Deploying (autoApply) or Ready.
+//
+// NOTE(review): checkProfilingJobStatus also returns plain r.Get errors, so a
+// transient API error while reading the Job is treated here exactly like a
+// failed Job and ends in the Failed state - confirm this is intended.
+func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+	logger.Info("Handling profiling state", "name", dgdr.Name)
+
+	// Check profiling job status (both online and offline/AIC run as Jobs)
+	// Note: We watch the Job via Owns(), so we'll be triggered automatically on Job changes
+	completed, err := r.checkProfilingJobStatus(ctx, dgdr)
+	if err != nil {
+		// NOTE(review): a Message* constant is passed as the event reason here - verify its value is a valid reason string.
+		r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageProfilingCheckFailed, err.Error())
+		// Job failed - transition to Failed state
+		return r.updateStateWithCondition(ctx, dgdr, StateFailed, ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", err.Error())
+	}
+
+	if !completed {
+		logger.Info("Profiling job still running", "name", dgdr.Name)
+		// Don't requeue - we'll be triggered when the Job completes/fails
+		return ctrl.Result{}, nil
+	}
+
+	// Mark profiling as completed successfully
+	// This only mutates the in-memory object; it is persisted by the next status update below.
+	meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
+		Type:               ConditionTypeProfiling,
+		Status:             metav1.ConditionTrue,
+		ObservedGeneration: dgdr.Generation,
+		Reason:             "ProfilingCompleted",
+		Message:            "Profiling job completed successfully",
+	})
+
+	// Retrieve profiling results and generate spec
+	if err := r.generateDGDSpec(ctx, dgdr); err != nil {
+		// NOTE(review): MessageGenerationFailed is used both as event reason and condition reason - verify its value.
+		r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageGenerationFailed, err.Error())
+		return r.updateStateWithCondition(ctx, dgdr, StateFailed, ConditionTypeSpecGenerated, metav1.ConditionFalse, MessageGenerationFailed, err.Error())
+	}
+
+	// Record spec generation event
+	r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonSpecGenerated, MessageSpecGenerated)
+
+	// If autoApply is enabled, transition to Deploying state
+	if dgdr.Spec.AutoApply {
+		logger.Info("AutoApply enabled, transitioning to Deploying state")
+		return r.updateStateWithCondition(ctx, dgdr, StateDeploying, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecGenerated)
+	}
+
+	// Otherwise, transition to Ready state
+	return r.updateStateWithCondition(ctx, dgdr, StateReady, ConditionTypeSpecGenerated, metav1.ConditionTrue, EventReasonSpecGenerated, MessageSpecAvailable)
+}
+
+// handleReadyState handles a DGDR in the Ready state.
+//
+// Without autoApply there is nothing to monitor and the call is a no-op. With
+// autoApply the referenced DynamoGraphDeployment is looked up:
+//   - not found: the DGDR is moved to DeploymentDeleted via handleDGDDeleted;
+//   - any other Get error is returned so the request is retried;
+//   - found: its state is mirrored into status.deployment, and if it is no
+//     longer "Ready" the DGDR goes back to Deploying with a warning event and a
+//     False DeploymentReady condition.
+//
+// The status is persisted at the end of every successful lookup.
+func (r *DynamoGraphDeploymentRequestReconciler) handleReadyState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+	logger.Info("DGDR is ready", "name", dgdr.Name)
+
+	// If autoApply is not enabled, nothing to monitor
+	if !dgdr.Spec.AutoApply {
+		return ctrl.Result{}, nil
+	}
+
+	// status.deployment is a pointer that is only populated by createDGD. Guard
+	// against it being absent (e.g. status edited or reset externally) instead of
+	// panicking on the dereference below.
+	if dgdr.Status.Deployment == nil {
+		logger.Info("Deployment status is not set, nothing to monitor", "name", dgdr.Name)
+		return ctrl.Result{}, nil
+	}
+
+	// Check if DGD still exists and monitor its status
+	dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
+	err := r.Get(ctx, types.NamespacedName{
+		Name:      dgdr.Status.Deployment.Name,
+		Namespace: dgdr.Status.Deployment.Namespace,
+	}, dgd)
+
+	if apierrors.IsNotFound(err) {
+		// DGD was deleted by user
+		return r.handleDGDDeleted(ctx, dgdr)
+	}
+
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+
+	// Update deployment status
+	dgdr.Status.Deployment.State = dgd.Status.State
+
+	// Check if DGD degraded from Ready
+	if dgd.Status.State != "Ready" {
+		logger.Info("DGD degraded, transitioning back to Deploying",
+			"dgdState", dgd.Status.State)
+
+		dgdr.Status.State = StateDeploying
+
+		r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDegraded,
+			fmt.Sprintf(MessageDeploymentDegraded, dgd.Name, dgd.Status.State))
+
+		meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
+			Type:    ConditionTypeDeploymentReady,
+			Status:  metav1.ConditionFalse,
+			Reason:  EventReasonDeploymentDegraded,
+			Message: fmt.Sprintf("Deployment degraded to %s", dgd.Status.State),
+		})
+	}
+
+	return ctrl.Result{}, r.Status().Update(ctx, dgdr)
+}
+
+// handleDeployingState creates the DynamoGraphDeployment for an autoApply DGDR
+// and then follows it until it reports "Ready".
+//
+// A DGDR without autoApply is moved straight to Ready. If no deployment has
+// been recorded as created yet, createDGD is invoked. Otherwise the DGD is
+// fetched: a missing DGD is handled by handleDGDDeleted, other Get errors are
+// returned, and the DGD state is mirrored into the DGDR status.
+func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+	logger.Info("Handling deploying state", "name", dgdr.Name)
+
+	if !dgdr.Spec.AutoApply {
+		// Deploying only makes sense with autoApply; fall through to Ready.
+		logger.Info("AutoApply not enabled, transitioning to Ready")
+		dgdr.Status.State = StateReady
+		return ctrl.Result{}, r.Status().Update(ctx, dgdr)
+	}
+
+	// First pass through this state: the DGD still has to be created.
+	deployment := dgdr.Status.Deployment
+	if deployment == nil || !deployment.Created {
+		return r.createDGD(ctx, dgdr)
+	}
+
+	// The DGD exists (or existed); look it up to track its state.
+	dgdKey := types.NamespacedName{
+		Name:      deployment.Name,
+		Namespace: deployment.Namespace,
+	}
+	dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
+	err := r.Get(ctx, dgdKey, dgd)
+	switch {
+	case apierrors.IsNotFound(err):
+		// The user removed the DGD.
+		return r.handleDGDDeleted(ctx, dgdr)
+	case err != nil:
+		return ctrl.Result{}, err
+	}
+
+	// Mirror the DGD state into our status.
+	deployment.State = dgd.Status.State
+
+	if dgd.Status.State == "Ready" {
+		logger.Info("DGD is Ready, transitioning to Ready state")
+		dgdr.Status.State = StateReady
+
+		readyMessage := fmt.Sprintf(MessageDeploymentReady, dgd.Name)
+		r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentReady, readyMessage)
+
+		meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
+			Type:    ConditionTypeDeploymentReady,
+			Status:  metav1.ConditionTrue,
+			Reason:  EventReasonDeploymentReady,
+			Message: fmt.Sprintf(MessageDeploymentReady, dgd.Name),
+		})
+	}
+
+	return ctrl.Result{}, r.Status().Update(ctx, dgdr)
+}
+
+// handleDeploymentDeletedState is a terminal state for when auto-created DGD is deleted.
+// Both arguments are ignored; the function always returns an empty result and a
+// nil error, so no requeue is requested from here.
+func (r *DynamoGraphDeploymentRequestReconciler) handleDeploymentDeletedState(_ context.Context, _ *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
+	// Terminal state - nothing to do
+	// User must delete this DGDR and create a new one to redeploy
+	return ctrl.Result{}, nil
+}
+
+// handleDGDDeleted records that the auto-created DGD no longer exists.
+// The DGDR moves to DeploymentDeleted, the tracked deployment state becomes
+// "Deleted", a warning event and a False DeploymentReady condition are written,
+// and the status is persisted. Callers must ensure status.deployment is set.
+func (r *DynamoGraphDeploymentRequestReconciler) handleDGDDeleted(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+	logger.Info("DGD was deleted by user, transitioning to DeploymentDeleted state")
+
+	deployment := dgdr.Status.Deployment
+	deployment.State = "Deleted"
+	dgdr.Status.State = StateDeploymentDeleted
+
+	deletedMessage := fmt.Sprintf(MessageDeploymentDeleted, deployment.Name)
+	r.Recorder.Event(dgdr, corev1.EventTypeWarning, EventReasonDeploymentDeleted, deletedMessage)
+
+	deletedCondition := metav1.Condition{
+		Type:    ConditionTypeDeploymentReady,
+		Status:  metav1.ConditionFalse,
+		Reason:  EventReasonDeploymentDeleted,
+		Message: "Deployment was deleted by user. Create a new DGDR to redeploy.",
+	}
+	meta.SetStatusCondition(&dgdr.Status.Conditions, deletedCondition)
+
+	return ctrl.Result{}, r.Status().Update(ctx, dgdr)
+}
+
+// createDGD creates a DynamoGraphDeployment from status.generatedDeployment.
+//
+// The generated DGD is taken from the RawExtension (decoded Object first, raw
+// bytes otherwise). Name and namespace default to the generated name and the
+// DGDR namespace and can be replaced through spec.deploymentOverrides. Labels
+// are layered as: generated labels, then the operator-managed tracking labels,
+// then override labels. Annotations are layered as: generated, then overrides.
+//
+// If the DGD already exists, only the DGDR status is updated to point at it.
+// Any other creation error is reported as an event and returned.
+func (r *DynamoGraphDeploymentRequestReconciler) createDGD(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	// The generated deployment must have been stored by the profiling phase.
+	generated := dgdr.Status.GeneratedDeployment
+	if generated == nil {
+		return ctrl.Result{}, fmt.Errorf("generatedDeployment is not set")
+	}
+
+	// A RawExtension carries either a decoded Object or raw serialized bytes.
+	var generatedDGD *nvidiacomv1alpha1.DynamoGraphDeployment
+	switch {
+	case generated.Object != nil:
+		typed, ok := generated.Object.(*nvidiacomv1alpha1.DynamoGraphDeployment)
+		if !ok {
+			return ctrl.Result{}, fmt.Errorf("generatedDeployment.Object is not a DynamoGraphDeployment")
+		}
+		generatedDGD = typed
+	case generated.Raw != nil:
+		generatedDGD = &nvidiacomv1alpha1.DynamoGraphDeployment{}
+		if err := yaml.Unmarshal(generated.Raw, generatedDGD); err != nil {
+			return ctrl.Result{}, fmt.Errorf("failed to unmarshal generated deployment: %w", err)
+		}
+	default:
+		return ctrl.Result{}, fmt.Errorf("generatedDeployment has neither Object nor Raw set")
+	}
+
+	overrides := dgdr.Spec.DeploymentOverrides
+
+	// Resolve the target name and namespace, letting overrides win when set.
+	dgdName, dgdNamespace := generatedDGD.Name, dgdr.Namespace
+	if overrides != nil {
+		if overrides.Name != "" {
+			dgdName = overrides.Name
+		}
+		if overrides.Namespace != "" {
+			dgdNamespace = overrides.Namespace
+		}
+	}
+
+	// Labels: generated first, then the managed tracking labels, then overrides.
+	labels := make(map[string]string)
+	for key, value := range generatedDGD.Labels {
+		labels[key] = value
+	}
+	labels[LabelDGDRName] = dgdr.Name
+	labels[LabelDGDRNamespace] = dgdr.Namespace
+	labels[LabelManagedBy] = LabelValueDynamoOperator
+	if overrides != nil {
+		for key, value := range overrides.Labels {
+			labels[key] = value
+		}
+	}
+
+	// Annotations: generated first, then overrides.
+	annotations := make(map[string]string)
+	for key, value := range generatedDGD.Annotations {
+		annotations[key] = value
+	}
+	if overrides != nil {
+		for key, value := range overrides.Annotations {
+			annotations[key] = value
+		}
+	}
+
+	// Assemble the object to create; only the spec is taken from the generated DGD.
+	dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:        dgdName,
+			Namespace:   dgdNamespace,
+			Labels:      labels,
+			Annotations: annotations,
+		},
+		Spec: generatedDGD.Spec,
+	}
+
+	// No owner reference is set on purpose: the DGD may be serving traffic and
+	// has to outlive the DGDR. The tracking labels carry the relationship instead.
+
+	// Status recorded on the DGDR both for a fresh and for a pre-existing DGD.
+	trackedDeployment := &nvidiacomv1alpha1.DeploymentStatus{
+		Name:      dgdName,
+		Namespace: dgdNamespace,
+		State:     "Pending",
+		Created:   true,
+	}
+
+	logger.Info("Creating DynamoGraphDeployment", "name", dgdName, "namespace", dgdNamespace)
+
+	createErr := r.Create(ctx, dgd)
+	if apierrors.IsAlreadyExists(createErr) {
+		// Nothing to create; just start tracking the existing DGD.
+		logger.Info("DGD already exists, updating status")
+		dgdr.Status.Deployment = trackedDeployment
+		return ctrl.Result{}, r.Status().Update(ctx, dgdr)
+	}
+	if createErr != nil {
+		r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageDeploymentCreationFailed, createErr.Error())
+		return ctrl.Result{}, createErr
+	}
+
+	// Creation succeeded: record it in status, as an event and as a condition.
+	dgdr.Status.Deployment = trackedDeployment
+
+	r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonDeploymentCreated,
+		fmt.Sprintf(MessageDeploymentCreated, dgdName))
+
+	meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
+		Type:    ConditionTypeDeploymentReady,
+		Status:  metav1.ConditionFalse,
+		Reason:  EventReasonDeploymentCreated,
+		Message: fmt.Sprintf("DGD %s created, waiting for Ready", dgdName),
+	})
+
+	logger.Info("DynamoGraphDeployment created successfully", "name", dgdName)
+
+	return ctrl.Result{}, r.Status().Update(ctx, dgdr)
+}
+
+// handleFailedState handles a DGDR that reached the Failed state.
+// It attempts a best-effort cleanup of profiling resources; a cleanup error is
+// only logged. No retry is scheduled and no error is returned.
+func (r *DynamoGraphDeploymentRequestReconciler) handleFailedState(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+	logger.Info("DGDR is in failed state", "name", dgdr.Name)
+
+	// Best effort: a failing cleanup must not turn into a reconcile error.
+	cleanupErr := r.cleanupProfilingResources(ctx, dgdr)
+	if cleanupErr != nil {
+		logger.Error(cleanupErr, "Failed to cleanup profiling resources")
+	}
+
+	// Retry logic could be added here if desired.
+	return ctrl.Result{}, nil
+}
+
+// getProfilingJobName returns the job name for a DGDR based on profiling mode
+func getProfilingJobName(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) string {
+	var jobNamePrefix string
+	if dgdr.Spec.Online {
+		jobNamePrefix = JobNamePrefixOnline
+	} else {
+		jobNamePrefix = JobNamePrefixAIC
+	}
+	return fmt.Sprintf("%s%s", jobNamePrefix, dgdr.Name)
+}
+
+// getOutputConfigMapName returns the ConfigMap name for profiling output:
+// ConfigMapOutputPrefix followed by the DGDR name. The same name is passed to
+// the sidecar script template and read back by generateDGDSpec.
+func getOutputConfigMapName(dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) string {
+	return fmt.Sprintf("%s%s", ConfigMapOutputPrefix, dgdr.Name)
+}
+
+// validateSpec validates the DGDR spec before any profiling work is started.
+//
+// It returns an error when:
+//   - Spec.ModelName is empty;
+//   - Spec.SLA.ITL or Spec.SLA.TTFT is not strictly positive;
+//   - Spec.Backend is non-empty and not one of the supported backends
+//     (an empty backend is accepted by this check);
+//   - Spec.ProfilingConfig.ConfigMapRef is set but the ConfigMap cannot be read
+//     from the DGDR namespace, or it lacks the referenced key.
+//
+// Errors from the API server other than NotFound are returned unchanged.
+func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
+	if dgdr.Spec.ModelName == "" {
+		return errors.New(ValidationErrorModelNameRequired)
+	}
+
+	if dgdr.Spec.SLA.ITL <= 0 {
+		return errors.New(ValidationErrorITLPositive)
+	}
+
+	if dgdr.Spec.SLA.TTFT <= 0 {
+		return errors.New(ValidationErrorTTFTPositive)
+	}
+
+	// Validate backend; an unset backend is tolerated here.
+	switch dgdr.Spec.Backend {
+	case "", BackendVLLM, BackendSGLang, BackendTRTLLM:
+	default:
+		return fmt.Errorf(ValidationErrorInvalidBackend, dgdr.Spec.Backend)
+	}
+
+	// Validate ConfigMap if provided (for both online and offline/AIC profiling)
+	if dgdr.Spec.ProfilingConfig == nil || dgdr.Spec.ProfilingConfig.ConfigMapRef == nil {
+		return nil
+	}
+	ref := dgdr.Spec.ProfilingConfig.ConfigMapRef
+
+	cm := &corev1.ConfigMap{}
+	err := r.Get(ctx, types.NamespacedName{
+		Name:      ref.Name,
+		Namespace: dgdr.Namespace,
+	}, cm)
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			return fmt.Errorf(MessageConfigMapNotFound, ref.Name, dgdr.Namespace)
+		}
+		return err
+	}
+
+	// Validate key exists. The default must be the same one createProfilingJob
+	// falls back to when it projects the ConfigMap into the profiler pod,
+	// otherwise validation could pass for a key that is never mounted.
+	key := ref.Key
+	if key == "" {
+		key = ProfilingConfigFile
+	}
+
+	if _, exists := cm.Data[key]; !exists {
+		return fmt.Errorf(MessageConfigMapKeyNotFound, key, cm.Name)
+	}
+
+	return nil
+}
+
+// createProfilingJob creates a Kubernetes Job for profiling using SyncResource.
+//
+// Steps:
+//  1. When Config.RestrictedNamespace is empty, ask the RBACManager to set up the
+//     profiling service account in the DGDR namespace.
+//  2. Fail early if no profiler image is configured on the reconciler.
+//  3. Build a Job with two containers - the profiler and an output-copier
+//     sidecar whose script is rendered from sidecarScriptTemplate - and hand it
+//     to commonController.SyncResource.
+//
+// The Job is named by getProfilingJobName and lives in the DGDR namespace.
+// It returns an error if RBAC setup fails, the image is missing, the sidecar
+// template cannot be rendered, or SyncResource fails.
+func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
+	logger := log.FromContext(ctx)
+
+	// Ensure profiling job RBAC exists in cluster-wide mode
+	if r.Config.RestrictedNamespace == "" {
+		if err := r.RBACManager.EnsureServiceAccountWithRBAC(
+			ctx,
+			dgdr.Namespace,
+			ServiceAccountProfilingJob,
+			r.Config.RBAC.DGDRProfilingClusterRoleName,
+		); err != nil {
+			logger.Error(err, "Failed to ensure profiling job RBAC")
+			return fmt.Errorf("failed to ensure profiling job RBAC: %w", err)
+		}
+	}
+
+	// Use ProfilerImage for both online and offline (AIC) profiling
+	imageName := r.ProfilerImage
+	if imageName == "" {
+		return fmt.Errorf("profiler image not configured: the operator's profilerImage must be set in the Helm chart values (dynamo-operator.dynamo.dgdr.profilerImage). The image must contain the ai-dynamo profiler (python -m benchmarks.profiler.profile_sla entrypoint). For development, build from the ai-dynamo repository Dockerfile and push to your registry. A public image will be available in release 0.6.1")
+	}
+
+	logger.Info("Using profiler image", "image", imageName, "online", dgdr.Spec.Online)
+
+	// Determine label based on profiling mode
+	var labelValue string
+	if dgdr.Spec.Online {
+		labelValue = LabelValueDynamoProfiler
+	} else {
+		labelValue = LabelValueAICProfiler
+	}
+
+	// Use SyncResource to create/update the job
+	// The closure below only builds the desired Job object; applying it is left to SyncResource.
+	modified, job, err := commonController.SyncResource(ctx, r, dgdr, func(ctx context.Context) (*batchv1.Job, bool, error) {
+		jobName := getProfilingJobName(dgdr)
+		outputConfigMapName := getOutputConfigMapName(dgdr)
+
+		// Build profiler container based on online vs offline (AIC) mode
+		var profilerArgs []string
+		var profilerEnv []corev1.EnvVar
+
+		// Common environment variables
+		// The token is read from key HF_TOKEN of a secret named "hf-token-secret".
+		// NOTE(review): the NATS/etcd addresses assume services named
+		// "<namespace>-nats" and "<namespace>-etcd" - confirm against the deployment.
+		profilerEnv = []corev1.EnvVar{
+			{
+				Name: "HUGGING_FACE_HUB_TOKEN",
+				ValueFrom: &corev1.EnvVarSource{
+					SecretKeyRef: &corev1.SecretKeySelector{
+						LocalObjectReference: corev1.LocalObjectReference{
+							Name: "hf-token-secret",
+						},
+						Key: "HF_TOKEN",
+					},
+				},
+			},
+			{
+				Name:  "NATS_SERVER",
+				Value: fmt.Sprintf("nats://%s-nats:4222", dgdr.Namespace),
+			},
+			{
+				Name:  "ETCD_ENDPOINTS",
+				Value: fmt.Sprintf("%s-etcd:2379", dgdr.Namespace),
+			},
+		}
+
+		// Build container with volume mounts
+		volumeMounts := []corev1.VolumeMount{
+			{
+				Name:      VolumeNameProfilingOutput,
+				MountPath: ProfilingOutputPath,
+			},
+		}
+
+		// Determine GPU range for profiling
+		// Defaults are 1 and 8; only strictly positive spec values replace them.
+		minGPUs := 1
+		maxGPUs := 8
+		if dgdr.Spec.GPU != nil {
+			if dgdr.Spec.GPU.MinNumGPUsPerEngine > 0 {
+				minGPUs = dgdr.Spec.GPU.MinNumGPUsPerEngine
+			}
+			if dgdr.Spec.GPU.MaxNumGPUsPerEngine > 0 {
+				maxGPUs = dgdr.Spec.GPU.MaxNumGPUsPerEngine
+			}
+		}
+
+		// Build common profiler args (shared by both online and offline modes)
+		profilerArgs = []string{
+			"--namespace", dgdr.Namespace,
+			"--backend", dgdr.Spec.Backend,
+			"--ttft", fmt.Sprintf("%d", dgdr.Spec.SLA.TTFT),
+			"--itl", fmt.Sprintf("%d", dgdr.Spec.SLA.ITL),
+			"--isl", fmt.Sprintf("%d", dgdr.Spec.SLA.ISL),
+			"--osl", fmt.Sprintf("%d", dgdr.Spec.SLA.OSL),
+			"--output-dir", ProfilingOutputPath,
+			"--min-num-gpus-per-engine", fmt.Sprintf("%d", minGPUs),
+			"--max-num-gpus-per-engine", fmt.Sprintf("%d", maxGPUs),
+		}
+
+		// Add mode-specific args
+		if !dgdr.Spec.Online {
+			// Offline (AIC) profiling: add AI Configurator args
+			profilerArgs = append(profilerArgs,
+				"--use-ai-configurator",
+				"--aic-model-name", dgdr.Spec.ModelName,
+				"--aic-backend-version", "0.20.0", // TODO: don't hardcode this
+			)
+
+			// Add AIC-specific GPU system type
+			if dgdr.Spec.GPU != nil && dgdr.Spec.GPU.Type != "" {
+				profilerArgs = append(profilerArgs, "--aic-system", dgdr.Spec.GPU.Type)
+			}
+		}
+
+		// Add config if provided (for both online and offline modes)
+		// The referenced ConfigMap is mounted read-only and passed via --config.
+		if dgdr.Spec.ProfilingConfig != nil && dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
+			profilerArgs = append(profilerArgs, "--config", fmt.Sprintf("%s/%s", ProfilingConfigPath, ProfilingConfigFile))
+			volumeMounts = append(volumeMounts, corev1.VolumeMount{
+				Name:      VolumeNameProfilingConfig,
+				MountPath: ProfilingConfigPath,
+				ReadOnly:  true,
+			})
+		}
+
+		// Only resource requests are set for the profiler; no limits are specified.
+		profilerContainer := corev1.Container{
+			Name:    ContainerNameProfiler,
+			Image:   imageName,
+			Command: []string{"python", "-m", "benchmarks.profiler.profile_sla"},
+			Args:    profilerArgs,
+			Resources: corev1.ResourceRequirements{
+				Requests: corev1.ResourceList{
+					corev1.ResourceCPU:    resource.MustParse("16"),
+					corev1.ResourceMemory: resource.MustParse("10Gi"),
+				},
+			},
+			Env:          profilerEnv,
+			VolumeMounts: volumeMounts,
+		}
+
+		// Generate sidecar script from template
+		tmpl, err := template.New("sidecar").Parse(sidecarScriptTemplate)
+		if err != nil {
+			return nil, false, fmt.Errorf("failed to parse sidecar script template: %w", err)
+		}
+
+		var scriptBuf bytes.Buffer
+		err = tmpl.Execute(&scriptBuf, map[string]string{
+			"OutputPath":    ProfilingOutputPath,
+			"OutputFile":    ProfilingOutputFile,
+			"ConfigMapName": outputConfigMapName,
+			"Namespace":     dgdr.Namespace,
+			"DGDRName":      dgdr.Name,
+		})
+		if err != nil {
+			return nil, false, fmt.Errorf("failed to execute sidecar script template: %w", err)
+		}
+
+		// The sidecar runs the rendered script through /bin/sh and sees the
+		// profiling output volume read-only.
+		sidecarContainer := corev1.Container{
+			Name:    ContainerNameOutputCopier,
+			Image:   SidecarImage,
+			Command: []string{"/bin/sh", "-c"},
+			Args:    []string{scriptBuf.String()},
+			VolumeMounts: []corev1.VolumeMount{{
+				Name:      VolumeNameProfilingOutput,
+				MountPath: ProfilingOutputPath,
+				ReadOnly:  true,
+			}},
+		}
+
+		// Build volumes - use dynamo-pvc for profiling output so data persists for the Planner
+		volumes := []corev1.Volume{{
+			Name: VolumeNameProfilingOutput,
+			VolumeSource: corev1.VolumeSource{
+				PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
+					ClaimName: "dynamo-pvc",
+				},
+			},
+		}}
+
+		// Add ConfigMap volume if provided (for both online and offline/AIC)
+		if dgdr.Spec.ProfilingConfig != nil && dgdr.Spec.ProfilingConfig.ConfigMapRef != nil {
+			// An unset key falls back to ProfilingConfigFile; whichever key is used
+			// is projected into the volume under the file name ProfilingConfigFile.
+			key := dgdr.Spec.ProfilingConfig.ConfigMapRef.Key
+			if key == "" {
+				key = ProfilingConfigFile
+			}
+
+			volumes = append(volumes, corev1.Volume{
+				Name: VolumeNameProfilingConfig,
+				VolumeSource: corev1.VolumeSource{
+					ConfigMap: &corev1.ConfigMapVolumeSource{
+						LocalObjectReference: corev1.LocalObjectReference{
+							Name: dgdr.Spec.ProfilingConfig.ConfigMapRef.Name,
+						},
+						Items: []corev1.KeyToPath{{
+							Key:  key,
+							Path: ProfilingConfigFile,
+						}},
+					},
+				},
+			})
+		}
+
+		// Limit retries to prevent infinite loop
+		backoffLimit := int32(3)
+
+		job := &batchv1.Job{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      jobName,
+				Namespace: dgdr.Namespace,
+				Labels: map[string]string{
+					LabelApp:       labelValue,
+					LabelDGDR:      dgdr.Name,
+					LabelManagedBy: LabelValueDynamoOperator,
+				},
+			},
+			Spec: batchv1.JobSpec{
+				BackoffLimit: &backoffLimit,
+				Template: corev1.PodTemplateSpec{
+					Spec: corev1.PodSpec{
+						ServiceAccountName: ServiceAccountProfilingJob,
+						RestartPolicy:      corev1.RestartPolicyNever,
+						Containers:         []corev1.Container{profilerContainer, sidecarContainer},
+						Volumes:            volumes,
+						ImagePullSecrets: []corev1.LocalObjectReference{
+							{Name: "nvcr-imagepullsecret"},
+						},
+					},
+				},
+			},
+		}
+
+		// NOTE(review): the meaning of the boolean returned alongside the Job is
+		// defined by commonController.SyncResource - confirm there.
+		return job, false, nil
+	})
+
+	if err != nil {
+		return err
+	}
+
+	// Only log when SyncResource reports that it changed something.
+	if modified {
+		if dgdr.Spec.Online {
+			logger.Info("Online profiling job created/updated", "job", job.Name)
+		} else {
+			logger.Info("Offline (AIC) profiling job created/updated", "job", job.Name)
+		}
+	}
+
+	return nil
+}
+
+// checkProfilingJobStatus reports whether the profiling Job has finished.
+//
+// It returns (true, nil) once the Job carries a true Complete condition,
+// (false, error) when it carries a true Failed condition or when the Job cannot
+// be fetched, and (false, nil) while neither condition is present. Conditions
+// are examined in the order the Job lists them.
+func (r *DynamoGraphDeploymentRequestReconciler) checkProfilingJobStatus(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) (bool, error) {
+	logger := log.FromContext(ctx)
+
+	jobName := getProfilingJobName(dgdr)
+	jobKey := types.NamespacedName{Name: jobName, Namespace: dgdr.Namespace}
+
+	job := &batchv1.Job{}
+	if err := r.Get(ctx, jobKey, job); err != nil {
+		return false, err
+	}
+
+	// Only conditions whose status is True are of interest.
+	for i := range job.Status.Conditions {
+		cond := &job.Status.Conditions[i]
+		if cond.Status != corev1.ConditionTrue {
+			continue
+		}
+		switch cond.Type {
+		case batchv1.JobComplete:
+			logger.Info("Profiling job completed", "job", jobName)
+			return true, nil
+		case batchv1.JobFailed:
+			return false, fmt.Errorf("profiling job failed: %s", cond.Message)
+		}
+	}
+
+	// Neither finished nor failed yet.
+	return false, nil
+}
+
+// generateDGDSpec loads the DGD produced by profiling (online or offline/AIC)
+// and stores it on the DGDR status.
+//
+// The YAML is read from key ProfilingOutputFile of the output ConfigMap in the
+// DGDR namespace, parsed into a DynamoGraphDeployment, and attached to
+// status.generatedDeployment as the decoded object. status.profilingResults is
+// set to "configmap/<name>" and the status is persisted before returning.
+// An error is returned if the ConfigMap or key is missing, the YAML does not
+// parse, or the status update fails.
+func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
+	logger := log.FromContext(ctx)
+	logger.Info("Generating DGD spec from profiling results", "name", dgdr.Name, "online", dgdr.Spec.Online)
+
+	// Fetch the ConfigMap that holds the generated spec.
+	outputConfigMapName := getOutputConfigMapName(dgdr)
+	cmKey := types.NamespacedName{
+		Name:      outputConfigMapName,
+		Namespace: dgdr.Namespace,
+	}
+	cm := &corev1.ConfigMap{}
+	if err := r.Get(ctx, cmKey, cm); err != nil {
+		if apierrors.IsNotFound(err) {
+			return fmt.Errorf("output ConfigMap %s not found - profiling may not have completed yet", outputConfigMapName)
+		}
+		return fmt.Errorf("failed to get output ConfigMap: %w", err)
+	}
+
+	// Pull the YAML document out of the ConfigMap data.
+	yamlContent, found := cm.Data[ProfilingOutputFile]
+	if !found {
+		return fmt.Errorf("key %s not found in ConfigMap %s", ProfilingOutputFile, outputConfigMapName)
+	}
+
+	logger.Info("Found profiling output in ConfigMap", "configMap", outputConfigMapName, "size", len(yamlContent))
+
+	// Parsing into the typed object both validates the document and exposes its name.
+	dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
+	if err := yaml.Unmarshal([]byte(yamlContent), dgd); err != nil {
+		return fmt.Errorf("failed to parse %s: %w", ProfilingOutputFile, err)
+	}
+
+	logger.Info("Parsed DGD from ConfigMap", "dgdName", dgd.Name)
+
+	// Attach the decoded object to the RawExtension; all parsed fields,
+	// metadata included, travel with it.
+	dgdr.Status.GeneratedDeployment = &runtime.RawExtension{Object: dgd}
+
+	// Point at where the profiling results came from.
+	dgdr.Status.ProfilingResults = fmt.Sprintf("configmap/%s", outputConfigMapName)
+
+	logger.Info("Successfully generated DGD from profiling output", "dgdName", dgd.Name)
+
+	return r.Status().Update(ctx, dgdr)
+}
+
+// cleanupProfilingResources cleans up profiling resources.
+// In its current form it performs no API calls: it only logs and always
+// returns nil, so callers never observe a cleanup error from it.
+func (r *DynamoGraphDeploymentRequestReconciler) cleanupProfilingResources(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest) error {
+	logger := log.FromContext(ctx)
+	logger.Info("Cleaning up profiling resources", "name", dgdr.Name)
+
+	// Cleanup behavior when DGDR is deleted:
+	// - Profiling Job: Automatically deleted via ownerReference (set by SyncResource)
+	// - Output ConfigMap: NOT deleted (no ownerReference) - contains valuable profiling data
+	// - Auto-created DGD: NOT deleted (no ownerReference) - may be serving traffic
+	//
+	// We use labels (LabelDGDRName) to track relationships without cascade delete.
+	// Users can manually clean up ConfigMaps and DGDs if needed using label selectors:
+	//   kubectl delete configmap -l dgdr.nvidia.com/name=<dgdr-name>
+	//   kubectl delete dynamographdeployment -l dgdr.nvidia.com/name=<dgdr-name>
+	//
+	// NOTE(review): the selector key above presumes LabelDGDRName equals
+	// "dgdr.nvidia.com/name" and that the output ConfigMap carries that label -
+	// confirm against the constants and the sidecar script.
+
+	logger.Info("Profiling job will be automatically deleted via ownerReference")
+	return nil
+}
+
+// updateStateAndRequeue sets the DGDR state, persists the status and asks for
+// an immediate requeue. The final string argument (a message) is accepted for
+// call-site symmetry but is not used. A failed status update is returned as is.
+func (r *DynamoGraphDeploymentRequestReconciler) updateStateAndRequeue(ctx context.Context, dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, state, _ string) (ctrl.Result, error) {
+	dgdr.Status.State = state
+
+	updateErr := r.Status().Update(ctx, dgdr)
+	if updateErr != nil {
+		return ctrl.Result{}, updateErr
+	}
+
+	return ctrl.Result{Requeue: true}, nil
+}
+
+// updateStateWithCondition sets the DGDR state, records a condition built from
+// the given type/status/reason/message (stamped with the current generation and
+// the current time), persists the status and asks for an immediate requeue.
+// A failed status update is returned without requeue.
+func (r *DynamoGraphDeploymentRequestReconciler) updateStateWithCondition(
+	ctx context.Context,
+	dgdr *nvidiacomv1alpha1.DynamoGraphDeploymentRequest,
+	state string,
+	conditionType string,
+	status metav1.ConditionStatus,
+	reason string,
+	message string,
+) (ctrl.Result, error) {
+	dgdr.Status.State = state
+
+	// Hand the condition to the DGDR helper, which adds or updates it.
+	dgdr.AddStatusCondition(metav1.Condition{
+		Type:               conditionType,
+		Status:             status,
+		ObservedGeneration: dgdr.Generation,
+		LastTransitionTime: metav1.Now(),
+		Reason:             reason,
+		Message:            message,
+	})
+
+	updateErr := r.Status().Update(ctx, dgdr)
+	if updateErr != nil {
+		return ctrl.Result{}, updateErr
+	}
+
+	return ctrl.Result{Requeue: true}, nil
+}
+
+// SetupWithManager registers the reconciler with mgr. Besides the primary
+// DynamoGraphDeploymentRequest resource, reconciliation is triggered by:
+//   - Jobs owned by a DGDR (matched via ownerReference), and
+//   - DynamoGraphDeployments that carry both the LabelDGDRName and LabelDGDRNamespace
+//     labels; these are mapped back to the DGDR the labels name (labels are used here
+//     instead of an ownerReference).
+//
+// Create events for both secondary resources are filtered out so that creating a Job or
+// a DGD from within a reconcile does not immediately schedule another one. Update, delete
+// and generic events are passed through.
+//
+// It returns whatever error the controller builder reports.
+func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	// Shared by both secondary watches: drop creates, keep everything else.
+	ignoreCreates := predicate.Funcs{
+		CreateFunc:  func(event.CreateEvent) bool { return false },
+		DeleteFunc:  func(event.DeleteEvent) bool { return true },
+		UpdateFunc:  func(event.UpdateEvent) bool { return true },
+		GenericFunc: func(event.GenericEvent) bool { return true },
+	}
+
+	// mapDGDToDGDR resolves the DGDR named by a DGD's labels. Labels are read through
+	// the client.Object interface, so there is no type assertion that could panic on an
+	// unexpected object type.
+	mapDGDToDGDR := func(_ context.Context, obj client.Object) []ctrl.Request {
+		labels := obj.GetLabels()
+		name, namespace := labels[LabelDGDRName], labels[LabelDGDRNamespace]
+		if name == "" || namespace == "" {
+			// Missing or blank labels cannot identify a DGDR; enqueue nothing rather
+			// than a request with an empty name or namespace.
+			return nil
+		}
+		return []ctrl.Request{{
+			NamespacedName: types.NamespacedName{
+				Name:      name,
+				Namespace: namespace,
+			},
+		}}
+	}
+
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}).
+		Owns(&batchv1.Job{}, builder.WithPredicates(ignoreCreates)). // Watch Jobs created by this controller (via ownerReference)
+		Watches(
+			&nvidiacomv1alpha1.DynamoGraphDeployment{},
+			handler.EnqueueRequestsFromMapFunc(mapDGDToDGDR),
+			builder.WithPredicates(ignoreCreates),
+		). // Watch DGDs created by this controller (via label)
+		Complete(r)
+}
--- a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
+++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package controller
+
+import (
+	"context"
+	"time"
+
+	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
+	commonController "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/tools/record"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+)
+
+// MockRBACManager is a test double for the reconciler's RBAC manager dependency.
+type MockRBACManager struct {
+	// EnsureServiceAccountWithRBACFunc, when set, handles calls to
+	// EnsureServiceAccountWithRBAC. When nil, those calls succeed without doing anything.
+	EnsureServiceAccountWithRBACFunc func(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
+}
+
+// EnsureServiceAccountWithRBAC forwards to EnsureServiceAccountWithRBACFunc if one is
+// configured and returns its result; otherwise it returns nil.
+func (m *MockRBACManager) EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error {
+	hook := m.EnsureServiceAccountWithRBACFunc
+	if hook == nil {
+		return nil
+	}
+	return hook(ctx, targetNamespace, serviceAccountName, clusterRoleName)
+}
+
+var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
+	// Polling parameters passed to Eventually by the specs below.
+	const (
+		timeout  = time.Second * 10
+		interval = time.Millisecond * 250
+	)
+
+	// Rebuilt before every spec by BeforeEach so no state leaks between specs.
+	var (
+		reconciler *DynamoGraphDeploymentRequestReconciler
+		recorder   *record.FakeRecorder
+	)
+
+	// Each spec drives Reconcile directly on a reconciler wired to the suite's shared
+	// k8sClient, a buffered fake event recorder and a no-op RBAC manager.
+	BeforeEach(func() {
+		// Buffer of 100 events; specs that inspect events read from recorder.Events.
+		recorder = record.NewFakeRecorder(100)
+		reconciler = &DynamoGraphDeploymentRequestReconciler{
+			Client:        k8sClient,
+			Recorder:      recorder,
+			ProfilerImage: "test-profiler:latest",
+			Config: commonController.Config{
+				RestrictedNamespace: "",
+				RBAC: commonController.RBACConfig{
+					DGDRProfilingClusterRoleName: "test-cluster-role",
+				},
+			},
+			// No hook is set, so EnsureServiceAccountWithRBAC always returns nil.
+			RBACManager: &MockRBACManager{},
+		}
+	})
+
+	Context("When reconciling initial DGDR", func() {
+		// Creates a valid DGDR, reconciles once and expects the state to become Pending
+		// with status.observedGeneration matching metadata.generation.
+		It("Should validate spec and transition to Pending", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-initial"
+			namespace := "default"
+			key := types.NamespacedName{Name: dgdrName, Namespace: namespace}
+
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA: nvidiacomv1alpha1.SLASpec{
+						TTFT: 100,
+						ITL:  1500,
+						ISL:  3000,
+						OSL:  5,
+					},
+					Online: true,
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer k8sClient.Delete(ctx, dgdr)
+
+			// First reconcile: Empty -> Pending
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Check status. The Get error is returned to Eventually so a failing read is
+			// retried and, if it persists, reported as the actual cause of the timeout.
+			Eventually(func() (string, error) {
+				var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+				if err := k8sClient.Get(ctx, key, &updated); err != nil {
+					return "", err
+				}
+				return updated.Status.State, nil
+			}, timeout, interval).Should(Equal(StatePending))
+
+			// Verify observedGeneration is set. The Get must be checked: comparing the
+			// fields of a zero-valued object would pass trivially (0 == 0).
+			var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+			Expect(k8sClient.Get(ctx, key, &updated)).Should(Succeed())
+			Expect(updated.Status.ObservedGeneration).Should(Equal(updated.Generation))
+		})
+
+		// Creates a DGDR without a modelName, reconciles once and expects the state to
+		// become Failed (the reconcile call itself must not return an error).
+		It("Should fail validation with missing modelName", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-invalid"
+			namespace := "default"
+			key := types.NamespacedName{Name: dgdrName, Namespace: namespace}
+
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					Backend: BackendVLLM,
+					SLA: nvidiacomv1alpha1.SLASpec{
+						TTFT: 100,
+						ITL:  1500,
+						ISL:  3000,
+						OSL:  5,
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer k8sClient.Delete(ctx, dgdr)
+
+			// Reconcile
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Check status transitions to Failed. Returning the Get error lets Eventually
+			// report a read failure instead of an unexplained empty state.
+			Eventually(func() (string, error) {
+				var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+				if err := k8sClient.Get(ctx, key, &updated); err != nil {
+					return "", err
+				}
+				return updated.Status.State, nil
+			}, timeout, interval).Should(Equal(StateFailed))
+		})
+	})
+
+	Context("When creating profiling job", func() {
+		// Reconciles an online DGDR twice and verifies the profiling Job that results:
+		// its labels, its two containers and its PVC-backed output volume.
+		It("Should create online profiling job", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-profiling-online"
+			namespace := "default"
+			request := reconcile.Request{
+				NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
+			}
+
+			// Create ConfigMap for profiling config
+			configMap := &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-config",
+					Namespace: namespace,
+				},
+				Data: map[string]string{
+					"disagg.yaml": "test: config",
+				},
+			}
+			Expect(k8sClient.Create(ctx, configMap)).Should(Succeed())
+			defer k8sClient.Delete(ctx, configMap)
+
+			// Create ServiceAccount
+			sa := &corev1.ServiceAccount{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      ServiceAccountProfilingJob,
+					Namespace: namespace,
+				},
+			}
+			Expect(k8sClient.Create(ctx, sa)).Should(Succeed())
+			defer k8sClient.Delete(ctx, sa)
+
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA: nvidiacomv1alpha1.SLASpec{
+						TTFT: 100,
+						ITL:  1500,
+						ISL:  3000,
+						OSL:  5,
+					},
+					Online: true,
+					ProfilingConfig: &nvidiacomv1alpha1.ProfilingConfigSpec{
+						ConfigMapRef: &nvidiacomv1alpha1.ConfigMapKeySelector{
+							Name: "test-config",
+							Key:  "disagg.yaml",
+						},
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer k8sClient.Delete(ctx, dgdr)
+
+			// Reconcile multiple times to move through states
+			_, err := reconciler.Reconcile(ctx, request)
+			Expect(err).NotTo(HaveOccurred())
+
+			// Second reconcile: Pending -> Profiling
+			_, err = reconciler.Reconcile(ctx, request)
+			Expect(err).NotTo(HaveOccurred())
+
+			// Verify profiling job was created. The Job fetched by the final successful
+			// poll is the one inspected below, so no unchecked second Get is needed.
+			jobKey := types.NamespacedName{Name: getProfilingJobName(dgdr), Namespace: namespace}
+			job := &batchv1.Job{}
+			Eventually(func() error {
+				return k8sClient.Get(ctx, jobKey, job)
+			}, timeout, interval).Should(Succeed())
+
+			// Clean up job. Deferred so the Job is removed even when an assertion below
+			// fails; deletion is best-effort, hence the discarded error.
+			defer func() { _ = k8sClient.Delete(ctx, job) }()
+
+			// Verify job has correct labels
+			Expect(job.Labels[LabelApp]).Should(Equal(LabelValueDynamoProfiler))
+			Expect(job.Labels[LabelDGDR]).Should(Equal(dgdrName))
+
+			// Verify job has the profiler and output-copier containers, in that order
+			Expect(job.Spec.Template.Spec.Containers).Should(HaveLen(2))
+			Expect(job.Spec.Template.Spec.Containers[0].Name).Should(Equal(ContainerNameProfiler))
+			Expect(job.Spec.Template.Spec.Containers[1].Name).Should(Equal(ContainerNameOutputCopier))
+
+			// Verify the profiling output volume is backed by the expected PVC
+			Expect(job.Spec.Template.Spec.Volumes).Should(ContainElement(
+				corev1.Volume{
+					Name: VolumeNameProfilingOutput,
+					VolumeSource: corev1.VolumeSource{
+						PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
+							ClaimName: "dynamo-pvc",
+						},
+					},
+				},
+			))
+		})
+
+		// Reconciles an offline DGDR twice and verifies that the resulting profiling Job
+		// carries the AIC profiler app label.
+		It("Should create offline (AIC) profiling job", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-profiling-aic"
+			namespace := "default"
+			request := reconcile.Request{
+				NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
+			}
+
+			// Create ServiceAccount. The error is deliberately ignored: the account may
+			// already exist in the shared test namespace.
+			sa := &corev1.ServiceAccount{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      ServiceAccountProfilingJob,
+					Namespace: namespace,
+				},
+			}
+			_ = k8sClient.Create(ctx, sa)
+			defer k8sClient.Delete(ctx, sa)
+
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "QWEN3_32B",
+					Backend:   BackendTRTLLM,
+					SLA: nvidiacomv1alpha1.SLASpec{
+						TTFT: 100,
+						ITL:  1500,
+						ISL:  3000,
+						OSL:  5,
+					},
+					Online: false, // Offline profiling
+					GPU: &nvidiacomv1alpha1.GPUSpec{
+						Type:                "h200_sxm",
+						MinNumGPUsPerEngine: 1,
+						MaxNumGPUsPerEngine: 8,
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer k8sClient.Delete(ctx, dgdr)
+
+			// Clean up the Job if one exists. Deferred so it also runs when an
+			// assertion below fails; both calls are best-effort.
+			jobKey := types.NamespacedName{Name: getProfilingJobName(dgdr), Namespace: namespace}
+			defer func() {
+				job := &batchv1.Job{}
+				if err := k8sClient.Get(ctx, jobKey, job); err == nil {
+					_ = k8sClient.Delete(ctx, job)
+				}
+			}()
+
+			// Reconcile
+			_, err := reconciler.Reconcile(ctx, request)
+			Expect(err).NotTo(HaveOccurred())
+
+			_, err = reconciler.Reconcile(ctx, request)
+			Expect(err).NotTo(HaveOccurred())
+
+			// Verify job was created with AIC label
+			Eventually(func() (string, error) {
+				job := &batchv1.Job{}
+				if err := k8sClient.Get(ctx, jobKey, job); err != nil {
+					return "", err
+				}
+				return job.Labels[LabelApp], nil
+			}, timeout, interval).Should(Equal(LabelValueAICProfiler))
+		})
+	})
+
+	Context("When profiling completes", func() {
+		// Starts from a DGDR already in the Profiling state with a completed Job and an
+		// output ConfigMap in place, reconciles once, and expects a generated deployment
+		// in status together with the Ready state.
+		It("Should generate DGD spec from ConfigMap", func() {
+			const (
+				dgdrName  = "test-dgdr-profiling-complete"
+				namespace = "default"
+			)
+			ctx := context.Background()
+			key := types.NamespacedName{Name: dgdrName, Namespace: namespace}
+
+			// completed yields a fresh "Job complete" condition list on every call.
+			completed := func() []batchv1.JobCondition {
+				return []batchv1.JobCondition{{
+					Type:   batchv1.JobComplete,
+					Status: corev1.ConditionTrue,
+				}}
+			}
+
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{Name: dgdrName, Namespace: namespace},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA: nvidiacomv1alpha1.SLASpec{
+						TTFT: 100,
+						ITL:  1500,
+						ISL:  3000,
+						OSL:  5,
+					},
+					Online: true,
+				},
+			}
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer k8sClient.Delete(ctx, dgdr)
+
+			// Move the DGDR to Profiling through the status subresource.
+			dgdr.Status.State = StateProfiling
+			Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
+
+			// Stand-in for the profiling Job, named the way the controller names it.
+			job := &batchv1.Job{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      getProfilingJobName(dgdr),
+					Namespace: namespace,
+				},
+				Spec: batchv1.JobSpec{
+					Template: corev1.PodTemplateSpec{
+						Spec: corev1.PodSpec{
+							Containers: []corev1.Container{{
+								Name:  "test",
+								Image: "test",
+							}},
+							RestartPolicy: corev1.RestartPolicyNever,
+						},
+					},
+				},
+				Status: batchv1.JobStatus{Conditions: completed()},
+			}
+			Expect(k8sClient.Create(ctx, job)).Should(Succeed())
+			defer k8sClient.Delete(ctx, job)
+
+			// Mark the Job complete through the status subresource.
+			job.Status.Conditions = completed()
+			Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+			// Profiling output: a DGD manifest stored under ProfilingOutputFile.
+			dgdYAML := `apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: test-dgd
+spec:
+  services:
+    Frontend:
+      replicas: 1`
+
+			cm := &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      getOutputConfigMapName(dgdr),
+					Namespace: namespace,
+				},
+				Data: map[string]string{ProfilingOutputFile: dgdYAML},
+			}
+			Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
+			defer k8sClient.Delete(ctx, cm)
+
+			// One reconcile processes the completed profiling run.
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
+			Expect(err).NotTo(HaveOccurred())
+
+			var got nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+			Expect(k8sClient.Get(ctx, key, &got)).Should(Succeed())
+
+			// The DGD spec must have been generated ...
+			Expect(got.Status.GeneratedDeployment).NotTo(BeNil())
+
+			// ... and the state must be Ready (since autoApply is false by default).
+			Expect(got.Status.State).Should(Equal(StateReady))
+		})
+	})
+
+	Context("When autoApply is enabled", func() {
+		// Starts from a DGDR (autoApply=true) already in the Profiling state with a
+		// completed Job and an output ConfigMap in place. The first reconcile must move
+		// it to Deploying; the second must create the DGD and record it in status.
+		It("Should create DGD after profiling", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-autoapply"
+			namespace := "default"
+			key := types.NamespacedName{Name: dgdrName, Namespace: namespace}
+			dgdKey := types.NamespacedName{Name: "test-dgd-auto", Namespace: namespace}
+
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA: nvidiacomv1alpha1.SLASpec{
+						TTFT: 100,
+						ITL:  1500,
+						ISL:  3000,
+						OSL:  5,
+					},
+					Online:    true,
+					AutoApply: true,
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer k8sClient.Delete(ctx, dgdr)
+
+			// Update status to Profiling using Status subresource
+			dgdr.Status.State = StateProfiling
+			Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
+
+			// Create completed profiling job
+			jobName := getProfilingJobName(dgdr)
+			job := &batchv1.Job{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      jobName,
+					Namespace: namespace,
+				},
+				Spec: batchv1.JobSpec{
+					Template: corev1.PodTemplateSpec{
+						Spec: corev1.PodSpec{
+							Containers: []corev1.Container{{
+								Name:  "test",
+								Image: "test",
+							}},
+							RestartPolicy: corev1.RestartPolicyNever,
+						},
+					},
+				},
+				Status: batchv1.JobStatus{
+					Conditions: []batchv1.JobCondition{{
+						Type:   batchv1.JobComplete,
+						Status: corev1.ConditionTrue,
+					}},
+				},
+			}
+			Expect(k8sClient.Create(ctx, job)).Should(Succeed())
+			defer k8sClient.Delete(ctx, job)
+
+			// Update job status to completed using Status subresource
+			job.Status.Conditions = []batchv1.JobCondition{{
+				Type:   batchv1.JobComplete,
+				Status: corev1.ConditionTrue,
+			}}
+			Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+			// Create output ConfigMap
+			dgdYAML := `apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: test-dgd-auto
+spec:
+  services:
+    Frontend:
+      replicas: 1`
+
+			outputConfigMapName := getOutputConfigMapName(dgdr)
+			cm := &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      outputConfigMapName,
+					Namespace: namespace,
+				},
+				Data: map[string]string{
+					ProfilingOutputFile: dgdYAML,
+				},
+			}
+			Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
+			defer k8sClient.Delete(ctx, cm)
+
+			// Reconcile to generate spec (transitions to Deploying because autoApply=true)
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Get updated DGDR and check state is Deploying
+			var updated nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+			Expect(k8sClient.Get(ctx, key, &updated)).Should(Succeed())
+			Expect(updated.Status.State).Should(Equal(StateDeploying))
+
+			// Reconcile again to create DGD
+			_, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Verify DGD was created
+			dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{}
+			Expect(k8sClient.Get(ctx, dgdKey, dgd)).Should(Succeed())
+
+			// Clean up DGD. Deferred so the DGD is removed even when an assertion below
+			// fails; deletion is best-effort, hence the discarded error.
+			defer func() { _ = k8sClient.Delete(ctx, dgd) }()
+
+			// Get final DGDR status. The read is checked so the assertions below never
+			// run against the stale object fetched before the second reconcile.
+			Expect(k8sClient.Get(ctx, key, &updated)).Should(Succeed())
+			Expect(updated.Status.Deployment).NotTo(BeNil())
+			Expect(updated.Status.Deployment.Created).Should(BeTrue())
+			Expect(updated.Status.Deployment.Name).Should(Equal("test-dgd-auto"))
+		})
+	})
+
+	Context("When enforcing spec immutability", func() {
+		// Puts a DGDR into the Profiling state, edits its spec, reconciles, and expects
+		// the change to be rejected: generation advances, observedGeneration and state
+		// stay put, and a SpecChangeRejected warning event is recorded.
+		//
+		// Every read and write against the API server is asserted, so a failed update
+		// surfaces where it happens rather than as a confusing assertion further down.
+		It("Should reject spec changes after profiling starts", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-immutable"
+			namespace := "default"
+			key := types.NamespacedName{Name: dgdrName, Namespace: namespace}
+
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA: nvidiacomv1alpha1.SLASpec{
+						TTFT: 100,
+						ITL:  1500,
+						ISL:  3000,
+						OSL:  5,
+					},
+					Online: true,
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer k8sClient.Delete(ctx, dgdr)
+
+			// Reconcile to initialize
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Get current generation
+			var current nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+			Expect(k8sClient.Get(ctx, key, &current)).Should(Succeed())
+			initialGeneration := current.Generation
+			observedGeneration := current.Status.ObservedGeneration
+
+			// Manually set state to Profiling to simulate in-progress profiling
+			current.Status.State = StateProfiling
+			Expect(k8sClient.Status().Update(ctx, &current)).Should(Succeed())
+
+			// Try to modify spec (re-read first to pick up the latest resourceVersion)
+			Expect(k8sClient.Get(ctx, key, &current)).Should(Succeed())
+			current.Spec.SLA.TTFT = 200
+			Expect(k8sClient.Update(ctx, &current)).Should(Succeed())
+
+			// Reconcile
+			_, err = reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Verify generation changed but observedGeneration stayed the same
+			Expect(k8sClient.Get(ctx, key, &current)).Should(Succeed())
+			Expect(current.Generation).Should(BeNumerically(">", initialGeneration))
+			Expect(current.Status.ObservedGeneration).Should(Equal(observedGeneration))
+			Expect(current.Status.State).Should(Equal(StateProfiling)) // State unchanged
+
+			// Verify event was recorded. Each poll consumes at most one buffered event
+			// without blocking; unrelated events are skipped on subsequent polls.
+			Eventually(func() bool {
+				select {
+				case event := <-recorder.Events:
+					return event == "Warning SpecChangeRejected Cannot modify spec in state 'Profiling'. DynamoGraphDeploymentRequest is immutable once profiling starts. Create a new resource with a different name instead."
+				default:
+					return false
+				}
+			}, timeout, interval).Should(BeTrue())
+		})
+	})
+
+	Context("When handling DGD deletion", func() {
+		// Gives a Ready DGDR a status that points at a DGD which does not exist, then
+		// reconciles and expects the state to become DeploymentDeleted.
+		It("Should transition to DeploymentDeleted state", func() {
+			const (
+				dgdrName  = "test-dgdr-dgd-deleted"
+				namespace = "default"
+			)
+			ctx := context.Background()
+			key := types.NamespacedName{Name: dgdrName, Namespace: namespace}
+
+			dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{Name: dgdrName, Namespace: namespace},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA: nvidiacomv1alpha1.SLASpec{
+						TTFT: 100,
+						ITL:  1500,
+						ISL:  3000,
+						OSL:  5,
+					},
+					Online:    true,
+					AutoApply: true,
+				},
+			}
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer k8sClient.Delete(ctx, dgdr)
+
+			// Through the status subresource, record a created deployment and mark the
+			// DGDR Ready. No DGD with this name is ever created in this spec.
+			dgdr.Status.Deployment = &nvidiacomv1alpha1.DeploymentStatus{
+				Name:      "test-dgd-to-delete",
+				Namespace: namespace,
+				Created:   true,
+				State:     "Ready",
+			}
+			dgdr.Status.State = StateReady
+			Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
+
+			// Reconcile while the referenced DGD is absent (simulating deletion).
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
+			Expect(err).NotTo(HaveOccurred())
+
+			// The DGDR must now report DeploymentDeleted.
+			var got nvidiacomv1alpha1.DynamoGraphDeploymentRequest
+			Expect(k8sClient.Get(ctx, key, &got)).Should(Succeed())
+			Expect(got.Status.State).Should(Equal(StateDeploymentDeleted))
+		})
+	})
+})
+
+// Unit specs for the resource-naming helpers; no API server interaction is involved.
+var _ = Describe("DGDR Helper Functions", func() {
+	// newDGDR returns a DGDR named "test-dgdr" whose only other populated field is
+	// Spec.Online.
+	newDGDR := func(online bool) *nvidiacomv1alpha1.DynamoGraphDeploymentRequest {
+		return &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
+			ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr"},
+			Spec:       nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{Online: online},
+		}
+	}
+
+	Context("getProfilingJobName", func() {
+		It("Should return correct job name for online profiling", func() {
+			name := getProfilingJobName(newDGDR(true))
+			Expect(name).Should(Equal("profile-online-test-dgdr"))
+		})
+
+		It("Should return correct job name for offline profiling", func() {
+			name := getProfilingJobName(newDGDR(false))
+			Expect(name).Should(Equal("profile-aic-test-dgdr"))
+		})
+	})
+
+	Context("getOutputConfigMapName", func() {
+		It("Should return correct ConfigMap name", func() {
+			// Online is left at its zero value, as when Spec is not set at all.
+			name := getOutputConfigMapName(newDGDR(false))
+			Expect(name).Should(Equal("dgdr-output-test-dgdr"))
+		})
+	})
+})
+
+// Specs for validateSpec, generated from a case table. Each case becomes its own It with
+// the description given in the table.
+var _ = Describe("DGDR Validation", func() {
+	var reconciler *DynamoGraphDeploymentRequestReconciler
+
+	BeforeEach(func() {
+		reconciler = &DynamoGraphDeploymentRequestReconciler{
+			Client: k8sClient,
+		}
+	})
+
+	Context("validateSpec", func() {
+		type validationCase struct {
+			name string
+			spec nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec
+			// wantErr is a substring the validation error must contain. An empty
+			// value means the spec is expected to validate without error.
+			wantErr string
+		}
+
+		cases := []validationCase{
+			{
+				name: "Should pass validation for valid spec",
+				spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA:       nvidiacomv1alpha1.SLASpec{TTFT: 100, ITL: 1500, ISL: 3000, OSL: 5},
+				},
+			},
+			{
+				name: "Should fail validation when modelName is empty",
+				spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					Backend: BackendVLLM,
+					SLA:     nvidiacomv1alpha1.SLASpec{TTFT: 100, ITL: 1500},
+				},
+				wantErr: "modelName",
+			},
+			{
+				name: "Should fail validation when TTFT is zero",
+				spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA:       nvidiacomv1alpha1.SLASpec{TTFT: 0, ITL: 1500, ISL: 3000, OSL: 500},
+				},
+				wantErr: "ttft",
+			},
+			{
+				name: "Should fail validation when TTFT is negative",
+				spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA:       nvidiacomv1alpha1.SLASpec{TTFT: -1, ITL: 1500},
+				},
+				wantErr: "ttft",
+			},
+			{
+				name: "Should fail validation when ITL is zero",
+				spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA:       nvidiacomv1alpha1.SLASpec{TTFT: 100, ITL: 0, ISL: 3000, OSL: 500},
+				},
+				wantErr: "itl",
+			},
+			{
+				name: "Should fail validation when ITL is negative",
+				spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   BackendVLLM,
+					SLA:       nvidiacomv1alpha1.SLASpec{TTFT: 100, ITL: -1},
+				},
+				wantErr: "itl",
+			},
+			{
+				name: "Should fail validation for invalid backend",
+				spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
+					ModelName: "test-model",
+					Backend:   "invalid-backend",
+					SLA:       nvidiacomv1alpha1.SLASpec{TTFT: 100, ITL: 1500},
+				},
+				wantErr: "invalid backend",
+			},
+		}
+
+		for _, tc := range cases {
+			tc := tc // per-iteration copy for the closure (needed before Go 1.22)
+			It(tc.name, func() {
+				ctx := context.Background()
+				dgdr := &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{Spec: tc.spec}
+
+				err := reconciler.validateSpec(ctx, dgdr)
+				if tc.wantErr == "" {
+					Expect(err).NotTo(HaveOccurred())
+					return
+				}
+				Expect(err).To(HaveOccurred())
+				Expect(err.Error()).Should(ContainSubstring(tc.wantErr))
+			})
+		}
+	})
+})
--- a/deploy/cloud/operator/internal/controller/suite_test.go
+++ b/deploy/cloud/operator/internal/controller/suite_test.go
@@ -80,7 +80,7 @@ var _ = BeforeSuite(func() {
 			filepath.Join(".", "testing", "run.ai"),
 			filepath.Join(".", "testing", "nvidia"),
 		},
-		ErrorIfCRDPathMissing: true,
+		ErrorIfCRDPathMissing: false,

 		// The BinaryAssetsDirectory is only required if you want to run the tests directly
 		// without call the makefile target test. If not informed it will look for the

--- a/deploy/cloud/operator/internal/controller_common/predicate.go
+++ b/deploy/cloud/operator/internal/controller_common/predicate.go
@@ -74,6 +74,8 @@ type Config struct {
 type RBACConfig struct {
 	// PlannerClusterRoleName is the name of the ClusterRole for planner (cluster-wide mode only)
 	PlannerClusterRoleName string
+	// DGDRProfilingClusterRoleName is the name of the ClusterRole for DGDR profiling jobs (cluster-wide mode only)
+	DGDRProfilingClusterRoleName string
 }

 type IngressConfig struct {