feat: surface status and errors for profiler in DGDR (#6855)

Signed-off-by: Jont828 <jt572@cornell.edu>

feat: surface status and errors for profiler in DGDR (#6855)
Signed-off-by: Jont828 <jt572@cornell.edu>
740130eb · Jonathan Tong · GitHub · 1114004e · 740130eb · 740130eb
Unverified Commit 740130eb authored Mar 20, 2026 by Jonathan Tong Committed by GitHub Mar 20, 2026
11 changed files
--- a/components/src/dynamo/profiler/profile_sla.py
+++ b/components/src/dynamo/profiler/profile_sla.py
@@ -36,6 +36,7 @@ from dynamo.profiler.utils.dgd_generation import assemble_final_config
 from dynamo.profiler.utils.dgdr_v1beta1_types import (
    BackendType,
    DynamoGraphDeploymentRequestSpec,
+    ProfilingPhase,
 )
 from dynamo.profiler.utils.dgdr_validate import (
    valid_dgdr_spec,
@@ -183,6 +184,14 @@ async def _execute_strategy(
                deployment_clients,
            )

+        ops.current_phase = ProfilingPhase.SelectingConfig
+        write_profiler_status(
+            ops.output_dir,
+            status=ProfilerStatus.RUNNING,
+            message="Filtering results and selecting cost-efficient configuration",
+            phase=ProfilingPhase.SelectingConfig,
+        )
+
        best_config_df = pick_result["best_config_df"]
        best_latencies = pick_result["best_latencies"]

@@ -244,6 +253,7 @@ def _write_final_output(ops: ProfilerOperationalConfig, final_config: Any) -> bo
                status=ProfilerStatus.FAILED,
                error=error_msg,
                message=error_msg,
+                phase=ProfilingPhase.GeneratingDGD,
            )
            return False
    else:
@@ -261,6 +271,7 @@ def _write_final_output(ops: ProfilerOperationalConfig, final_config: Any) -> bo
        outputs={
            "final_config": "final_config.yaml",
        },
+        phase=ProfilingPhase.Done,
    )
    return True

@@ -287,6 +298,7 @@ async def run_profile(
        ops.output_dir,
        status=ProfilerStatus.RUNNING,
        message="Profiler job started",
+        phase=ProfilingPhase.Initializing,
    )

    try:
@@ -312,6 +324,14 @@ async def run_profile(
        # then validate DGDR features based on AIC support
        validate_dgdr_dynamo_features(dgdr, aic_supported)

+        ops.current_phase = ProfilingPhase.SweepingPrefill
+        write_profiler_status(
+            ops.output_dir,
+            status=ProfilerStatus.RUNNING,
+            message="Sweeping parallelization strategies",
+            phase=ops.current_phase,
+        )
+
        (
            pick_result,
            best_prefill_config,
@@ -357,6 +377,13 @@ async def run_profile(
        chosen_exp = pick_result.get("chosen_exp", "")
        is_disagg_config = chosen_exp not in ("agg",) and bool(chosen_exp)
        if not ops.dry_run and dgd_config and needs_profile_data(dgdr):
+            ops.current_phase = ProfilingPhase.BuildingCurves
+            write_profiler_status(
+                ops.output_dir,
+                status=ProfilerStatus.RUNNING,
+                message="Building interpolation curves for planner integration",
+                phase=ops.current_phase,
+            )
            if not is_disagg_config:
                logger.info(
                    "Picked config is aggregated (chosen_exp=%r) — "
@@ -396,6 +423,13 @@ async def run_profile(
        # ---------------------------------------------------------------
        # Final DGD assembly
        # ---------------------------------------------------------------
+        ops.current_phase = ProfilingPhase.GeneratingDGD
+        write_profiler_status(
+            ops.output_dir,
+            status=ProfilerStatus.RUNNING,
+            message="Packaging data and generating final DGD YAML",
+            phase=ops.current_phase,
+        )
        final_config = assemble_final_config(
            dgdr, ops, dgd_config, best_prefill_config, best_decode_config
        )
@@ -431,6 +465,7 @@ async def run_profile(
            status=ProfilerStatus.FAILED,
            error=str(e),
            message=f"Profiler failed with exception: {type(e).__name__}",
+            phase=ops.current_phase,
        )
        raise
    finally:

--- a/components/src/dynamo/profiler/thorough.py
+++ b/components/src/dynamo/profiler/thorough.py
@@ -43,6 +43,7 @@ from dynamo.profiler.utils.config_modifiers.protocol import apply_dgd_overrides
 from dynamo.profiler.utils.dgdr_v1beta1_types import (
    DynamoGraphDeploymentRequestSpec,
    ModelCacheSpec,
+    ProfilingPhase,
 )
 from dynamo.profiler.utils.profile_common import (
    ProfilerOperationalConfig,
@@ -51,6 +52,7 @@ from dynamo.profiler.utils.profile_common import (
    inject_tolerations_into_dgd,
 )
 from dynamo.profiler.utils.profile_decode import get_num_request_range
+from dynamo.profiler.utils.profiler_status import ProfilerStatus, write_profiler_status

 logger = logging.getLogger(__name__)

@@ -68,7 +70,8 @@ async def _benchmark_prefill_candidates(
 ) -> pd.DataFrame:
    """Deploy each prefill candidate, measure TTFT, return prefill_df."""
    prefill_rows: list[dict] = []
-    for candidate in prefill_candidates:
+    total_prefill = len(prefill_candidates)
+    for idx, candidate in enumerate(prefill_candidates, 1):
        num_gpus = candidate.num_gpus
        label = make_parallel_label(
            candidate.tp,
@@ -88,7 +91,18 @@ async def _benchmark_prefill_candidates(
        model_name, model_path = config_modifier.get_model_name(candidate.dgd_config)
        frontend_port = config_modifier.get_port(candidate.dgd_config)

+        progress_msg = (
+            f"Benchmarking prefill candidate {idx}/{total_prefill}: "
+            f"{label} ({num_gpus} GPUs)"
+        )
        logger.info("Profiling prefill candidate %s with %d GPUs...", label, num_gpus)
+        ops.current_phase = ProfilingPhase.SweepingPrefill
+        write_profiler_status(
+            ops.output_dir,
+            status=ProfilerStatus.RUNNING,
+            message=progress_msg,
+            phase=ProfilingPhase.SweepingPrefill,
+        )

        client = DynamoDeploymentClient(
            namespace=ops.k8s_namespace,
@@ -158,7 +172,8 @@ async def _benchmark_decode_candidates(
 ) -> pd.DataFrame:
    """Deploy each decode candidate, sweep num_request, return decode_df."""
    decode_rows: list[dict] = []
-    for candidate in decode_candidates:
+    total_decode = len(decode_candidates)
+    for idx, candidate in enumerate(decode_candidates, 1):
        num_gpus = candidate.num_gpus
        label = make_parallel_label(
            candidate.tp,
@@ -178,7 +193,18 @@ async def _benchmark_decode_candidates(
        model_name, model_path = config_modifier.get_model_name(candidate.dgd_config)
        frontend_port = config_modifier.get_port(candidate.dgd_config)

+        progress_msg = (
+            f"Benchmarking decode candidate {idx}/{total_decode}: "
+            f"{label} ({num_gpus} GPUs)"
+        )
        logger.info("Profiling decode candidate %s with %d GPUs...", label, num_gpus)
+        ops.current_phase = ProfilingPhase.SweepingDecode
+        write_profiler_status(
+            ops.output_dir,
+            status=ProfilerStatus.RUNNING,
+            message=progress_msg,
+            phase=ProfilingPhase.SweepingDecode,
+        )

        client = DynamoDeploymentClient(
            namespace=ops.k8s_namespace,
@@ -377,6 +403,13 @@ async def run_thorough(
    config_modifier = CONFIG_MODIFIERS[backend]

    # --- Stage 2: Benchmarking ---
+    ops.current_phase = ProfilingPhase.SweepingPrefill
+    write_profiler_status(
+        ops.output_dir,
+        status=ProfilerStatus.RUNNING,
+        message="Sweeping parallelization strategies for prefill, measuring TTFT",
+        phase=ops.current_phase,
+    )
    prefill_df = await _benchmark_prefill_candidates(
        prefill_candidates,
        ops,
@@ -388,6 +421,13 @@ async def run_thorough(
        deployment_clients,
        config_modifier,
    )
+    ops.current_phase = ProfilingPhase.SweepingDecode
+    write_profiler_status(
+        ops.output_dir,
+        status=ProfilerStatus.RUNNING,
+        message="Sweeping parallelization strategies for decode, measuring ITL",
+        phase=ops.current_phase,
+    )
    decode_df = await _benchmark_decode_candidates(
        decode_candidates,
        ops,

--- a/components/src/dynamo/profiler/utils/profile_common.py
+++ b/components/src/dynamo/profiler/utils/profile_common.py
@@ -18,14 +18,17 @@
 import copy
 import logging
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field

 import pandas as pd

 from dynamo.profiler.utils.config_modifiers.parallelization_mapping import (
    PickedParallelConfig,
 )
-from dynamo.profiler.utils.dgdr_v1beta1_types import DynamoGraphDeploymentRequestSpec
+from dynamo.profiler.utils.dgdr_v1beta1_types import (
+    DynamoGraphDeploymentRequestSpec,
+    ProfilingPhase,
+)

 logger = logging.getLogger(__name__)

@@ -111,6 +114,7 @@ class ProfilerOperationalConfig:
    prefill_interpolation_granularity: int = DEFAULT_PREFILL_INTERPOLATION_GRANULARITY
    decode_interpolation_granularity: int = DEFAULT_DECODE_INTERPOLATION_GRANULARITY
    dry_run: bool = DEFAULT_DRY_RUN
+    current_phase: ProfilingPhase = field(default=ProfilingPhase.Initializing)


 # ---------------------------------------------------------------------------

--- a/components/src/dynamo/profiler/utils/profiler_status.py
+++ b/components/src/dynamo/profiler/utils/profiler_status.py
@@ -16,6 +16,8 @@ from typing import Any

 import yaml

+from dynamo.profiler.utils.dgdr_v1beta1_types import ProfilingPhase
+
 logger = logging.getLogger(__name__)


@@ -36,6 +38,7 @@ def write_profiler_status(
    message: str = "",
    error: str = "",
    outputs: dict | None = None,
+    phase: ProfilingPhase | None = None,
 ) -> None:
    """
    Write profiler status file.
@@ -46,6 +49,8 @@ def write_profiler_status(
        message: Optional status message
        error: Optional error message (for failed status)
        outputs: Optional dict of output files (for success status)
+        phase: Optional profiling sub-phase (e.g. ProfilingPhase value).
+               Relayed by the sidecar to the controller for kubectl visibility.
    """
    status_file = os.path.join(output_dir, STATUS_FILE_NAME)
    status_data: dict[str, Any] = {
@@ -58,6 +63,8 @@ def write_profiler_status(
        status_data["error"] = error
    if outputs:
        status_data["outputs"] = outputs
+    if phase:
+        status_data["phase"] = phase.value

    try:
        with open(status_file, "w") as f:

--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
@@ -480,11 +480,9 @@ spec:
          type: string
        - jsonPath: .status.conditions[?(@.type=="Succeeded")].reason
          name: Reason
-          priority: 1
          type: string
        - jsonPath: .status.conditions[?(@.type=="Succeeded")].message
          name: Message
-          priority: 1
          type: string
        - jsonPath: .status.dgdName
          name: DGD

--- a/deploy/operator/.gitignore
+++ b/deploy/operator/.gitignore
@@ -6,6 +6,7 @@
 *.so
 *.dylib
 bin/*
+tilt_bin/
 Dockerfile.cross

 #temp files
@@ -28,6 +29,9 @@ go.work
 .idea
 .vscode
 *.swp
+
+# Tilt local settings (personal overrides)
+tilt-settings.local.yaml
 *.swo
 *~


--- a/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
+++ b/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
@@ -499,8 +499,8 @@ type DynamoGraphDeploymentRequestStatus struct {
 // +kubebuilder:printcolumn:name="Backend",type=string,JSONPath=`.spec.backend`
 // +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase`
 // +kubebuilder:printcolumn:name="Profiling",type=string,JSONPath=`.status.profilingPhase`
-// +kubebuilder:printcolumn:name="Reason",type=string,JSONPath=`.status.conditions[?(@.type=="Succeeded")].reason`,priority=1
-// +kubebuilder:printcolumn:name="Message",type=string,JSONPath=`.status.conditions[?(@.type=="Succeeded")].message`,priority=1
+// +kubebuilder:printcolumn:name="Reason",type=string,JSONPath=`.status.conditions[?(@.type=="Succeeded")].reason`
+// +kubebuilder:printcolumn:name="Message",type=string,JSONPath=`.status.conditions[?(@.type=="Succeeded")].message`
 // +kubebuilder:printcolumn:name="DGD",type=string,JSONPath=`.status.dgdName`
 // +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp"
 type DynamoGraphDeploymentRequest struct {

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
@@ -480,11 +480,9 @@ spec:
          type: string
        - jsonPath: .status.conditions[?(@.type=="Succeeded")].reason
          name: Reason
-          priority: 1
          type: string
        - jsonPath: .status.conditions[?(@.type=="Succeeded")].message
          name: Message
-          priority: 1
          type: string
        - jsonPath: .status.dgdName
          name: DGD

--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
@@ -130,21 +130,61 @@ const (
 	MessageModelCachePVCNotFound     = "model cache PVC %s not found in namespace %s"
 )

-// shell script template for the output copier sidecar
+// shell script template for the output copier sidecar.
+//
+// The sidecar is a continuous poller that:
+//  1. During profiling: polls profiler_status.yaml every 10s, relays phase+message
+//     to the output ConfigMap so the controller can track sub-phase progress.
+//  2. After profiler terminates: writes the final profiling output (final_config.yaml
+//     + profiler_status.yaml) to the same ConfigMap, preserving the phase+message keys.
 const sidecarScriptTemplate = `
 set -e
 set -o pipefail

-# Wait for profiler container to terminate (no timeout - profiling can take hours)
-echo "Waiting for profiler to complete..."
+STATUS_FILE="{{.OutputPath}}/profiler_status.yaml"
+LAST_PHASE=""
 START_TIME=$(date +%s)
 LAST_PROGRESS_LOG=$START_TIME
 PROGRESS_INTERVAL=300

+# relay_phase: read phase+message from profiler_status.yaml and write to ConfigMap.
+# Only writes when the phase changes (debounce).
+relay_phase() {
+  if [ ! -f "$STATUS_FILE" ]; then
+    return
+  fi
+  PHASE=$(grep "^phase:" "$STATUS_FILE" 2>/dev/null | awk '{print $2}' | tr -d '"' | tr -d "'" || true)
+  MESSAGE=$(grep "^message:" "$STATUS_FILE" 2>/dev/null | sed 's/^message: *//' | tr -d '"' | tr -d "'" || true)
+  if [ -z "$PHASE" ] || [ "$PHASE" = "$LAST_PHASE" ]; then
+    return
+  fi
+  echo "Phase update: $PHASE - $MESSAGE"
+  cat >/tmp/progress.yaml <<PEOF
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{.ConfigMapName}}
+  namespace: {{.Namespace}}
+  labels:
+    dgdr.nvidia.com/name: {{.DGDRName}}
+    dgdr.nvidia.com/namespace: {{.Namespace}}
+    nvidia.com/managed-by: dynamo-operator
+data:
+  phase: "$PHASE"
+  message: "$MESSAGE"
+PEOF
+  kubectl apply -f /tmp/progress.yaml 2>/dev/null && LAST_PHASE="$PHASE" || echo "Warning: failed to update progress ConfigMap"
+}
+
+# Main loop: poll profiler_status.yaml and wait for profiler to terminate
+echo "Waiting for profiler to complete..."
 while true; do
  CURRENT_TIME=$(date +%s)
  ELAPSED=$((CURRENT_TIME - START_TIME))

+  # Relay phase updates to ConfigMap
+  relay_phase
+
  # Log progress every 5 minutes
  if [ $((CURRENT_TIME - LAST_PROGRESS_LOG)) -ge $PROGRESS_INTERVAL ]; then
    echo "Still waiting... ($(($ELAPSED / 60)) minutes elapsed)"
@@ -157,12 +197,14 @@ while true; do
    echo "Profiler terminated (ran for $(($ELAPSED / 60)) minutes)"
    break
  fi
-  sleep 5
+  sleep 10
 done

+# Final relay: pick up any last phase change written just before termination
+relay_phase
+
 # Check profiler status file (2 minute timeout)
 echo "Checking profiler status..."
-STATUS_FILE="{{.OutputPath}}/profiler_status.yaml"
 TIMEOUT=120
 CHECK_START=$(date +%s)

@@ -206,9 +248,13 @@ case "$STATUS" in
    ;;
 esac

-echo "Creating ConfigMap..."
+echo "Writing profiling output to ConfigMap..."
+
+# Read final phase+message to preserve them alongside the profiling output
+FINAL_PHASE=$(grep "^phase:" "$STATUS_FILE" 2>/dev/null | awk '{print $2}' | tr -d '"' | tr -d "'" || true)
+FINAL_MESSAGE=$(grep "^message:" "$STATUS_FILE" 2>/dev/null | sed 's/^message: *//' | tr -d '"' | tr -d "'" || true)

-# Start building ConfigMap YAML with DGD spec
+# Start building ConfigMap YAML with DGD spec + preserved phase/message
 cat >/tmp/cm.yaml <<EOF
 apiVersion: v1
 kind: ConfigMap
@@ -217,8 +263,11 @@ metadata:
  namespace: {{.Namespace}}
  labels:
    dgdr.nvidia.com/name: {{.DGDRName}}
+    dgdr.nvidia.com/namespace: {{.Namespace}}
    nvidia.com/managed-by: dynamo-operator
 data:
+  phase: "$FINAL_PHASE"
+  message: "$FINAL_MESSAGE"
  {{.OutputFile}}: |
 EOF
 sed 's/^/    /' {{.OutputPath}}/{{.OutputFile}} >> /tmp/cm.yaml
@@ -242,6 +291,44 @@ kubectl apply -f /tmp/cm.yaml
 echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
 `

+// profilingPhaseReason returns the condition Reason for a profiling sub-phase.
+// By design, the ProfilingPhase string values are identical to the Reason values
+// (e.g., ProfilingPhaseSweepingDecode = "SweepingDecode" = ProfilingReasonSweepingDecode).
+func profilingPhaseReason(phase nvidiacomv1beta1.ProfilingPhase) string {
+	if phase == nvidiacomv1beta1.ProfilingPhaseDone {
+		return nvidiacomv1beta1.ProfilingReasonCompleted
+	}
+
+	return string(phase)
+}
+
+// profilingPhaseFailureReason returns the condition Reason for a failed profiling sub-phase.
+// By convention, failure reasons are "<Phase>Failed" (e.g., "SweepingDecodeFailed").
+// An empty phase yields the generic "ProfilingFailed".
+func profilingPhaseFailureReason(phase nvidiacomv1beta1.ProfilingPhase) string {
+	if phase == "" {
+		return "ProfilingFailed"
+	}
+	return string(phase) + "Failed"
+}
+
+// validProfilingPhases is the set of phases the profiler sidecar may report.
+var validProfilingPhases = map[nvidiacomv1beta1.ProfilingPhase]struct{}{
+	nvidiacomv1beta1.ProfilingPhaseInitializing:    {},
+	nvidiacomv1beta1.ProfilingPhaseSweepingPrefill: {},
+	nvidiacomv1beta1.ProfilingPhaseSweepingDecode:  {},
+	nvidiacomv1beta1.ProfilingPhaseSelectingConfig: {},
+	nvidiacomv1beta1.ProfilingPhaseBuildingCurves:  {},
+	nvidiacomv1beta1.ProfilingPhaseGeneratingDGD:   {},
+	nvidiacomv1beta1.ProfilingPhaseDone:            {},
+}
+
+// isValidProfilingPhase returns true if phase is a recognized ProfilingPhase value.
+func isValidProfilingPhase(phase string) bool {
+	_, ok := validProfilingPhases[nvidiacomv1beta1.ProfilingPhase(phase)]
+	return ok
+}
+
 // DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object
 type DynamoGraphDeploymentRequestReconciler struct {
 	client.Client
@@ -389,9 +476,66 @@ func (r *DynamoGraphDeploymentRequestReconciler) handlePendingPhase(ctx context.
 		r.Recorder.Event(dgdr, corev1.EventTypeNormal, nvidiacomv1beta1.EventReasonProfilingJobCreated, MessageAICProfilingJobCreated)
 	}

-	// Update to Profiling phase — show DiscoveringHardware until the job is confirmed running.
+	// Update to Profiling phase — use Initializing reason to indicate the profiler is loading.
 	dgdr.SetProfilingPhase(nvidiacomv1beta1.ProfilingPhaseInitializing)
-	return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseProfiling, nvidiacomv1beta1.ConditionTypeProfiling, metav1.ConditionFalse, "DiscoveringHardware", MessageDiscoveringHardware)
+	return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseProfiling, nvidiacomv1beta1.ConditionTypeProfiling, metav1.ConditionFalse, nvidiacomv1beta1.ProfilingReasonInitializing, MessageDiscoveringHardware)
+}
+
+// updateProfilingSubPhase reads the output ConfigMap and updates status.profilingPhase
+// and the Profiling/Succeeded conditions. The sidecar continuously polls profiler_status.yaml
+// and writes phase+message to the output ConfigMap (dgdr-output-<name>). This function
+// reads those keys and copies them verbatim into the DGDR status.
+func (r *DynamoGraphDeploymentRequestReconciler) updateProfilingSubPhase(
+	ctx context.Context,
+	dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest,
+) error {
+	logger := log.FromContext(ctx)
+	outputCMName := getOutputConfigMapName(dgdr)
+
+	cm := &corev1.ConfigMap{}
+	if err := r.Get(ctx, types.NamespacedName{
+		Name: outputCMName, Namespace: dgdr.Namespace,
+	}, cm); err != nil {
+		return nil // No output ConfigMap yet — skip
+	}
+
+	phase, exists := cm.Data["phase"]
+	if !exists || phase == "" {
+		return nil
+	}
+
+	if !isValidProfilingPhase(phase) {
+		return fmt.Errorf("invalid profiling phase %q in ConfigMap %s", phase, outputCMName)
+	}
+
+	profilingPhase := nvidiacomv1beta1.ProfilingPhase(phase)
+	if dgdr.Status.ProfilingPhase == profilingPhase {
+		return nil // No change
+	}
+
+	logger.Info("Profiling sub-phase updated", "phase", phase)
+	dgdr.SetProfilingPhase(profilingPhase)
+
+	// Reason is derived from phase; message comes from the profiler via ConfigMap.
+	reason := profilingPhaseReason(profilingPhase)
+	message := cm.Data["message"] // written by profiler, relayed by sidecar
+
+	meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
+		Type:               nvidiacomv1beta1.ConditionTypeProfiling,
+		Status:             metav1.ConditionFalse,
+		ObservedGeneration: dgdr.Generation,
+		Reason:             reason,
+		Message:            message,
+	})
+	meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
+		Type:               nvidiacomv1beta1.ConditionTypeSucceeded,
+		Status:             metav1.ConditionFalse,
+		ObservedGeneration: dgdr.Generation,
+		Reason:             reason,
+		Message:            message,
+	})
+
+	return r.Status().Update(ctx, dgdr)
 }

 // handleProfilingPhase monitors profiling progress and generates spec when complete
@@ -399,21 +543,54 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingPhase(ctx contex
 	logger := log.FromContext(ctx)
 	logger.Info("Handling profiling phase", "name", dgdr.Name)

+	// Check for sub-phase updates from output ConfigMap (populated by sidecar poller)
+	if err := r.updateProfilingSubPhase(ctx, dgdr); err != nil {
+		return ctrl.Result{}, err
+	}
+
 	// Check profiling job status (both online and offline/AIC run as Jobs)
 	// Note: We watch the Job via Owns(), so we'll be triggered automatically on Job changes
 	completed, err := r.checkProfilingJobStatus(ctx, dgdr)
 	if err != nil {
 		r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageProfilingCheckFailed, err.Error())
-		// Job failed - clear profiling sub-phase and transition to Failed
-		dgdr.ClearProfilingPhase()
-		return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseFailed, nvidiacomv1beta1.ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingFailed", err.Error())
+		// Job failed - keep profilingPhase set so users can see where it died.
+		// profilingPhase is already current: set to Initializing on entry,
+		// then updated by updateProfilingSubPhase() above (reads output ConfigMap).
+		failureReason := "ProfilingFailed"
+		failureMessage := err.Error()
+		if dgdr.Status.ProfilingPhase != "" {
+			failureReason = profilingPhaseFailureReason(dgdr.Status.ProfilingPhase)
+		}
+
+		// Set phase and conditions directly so we can use sub-phase-specific failure
+		// reason on both Profiling and Succeeded conditions. (updatePhaseWithCondition
+		// would hardcode Succeeded reason to generic "Failed".)
+		dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseFailed
+		meta.SetStatusCondition(&dgdr.Status.Conditions, metav1.Condition{
+			Type:               nvidiacomv1beta1.ConditionTypeSucceeded,
+			Status:             metav1.ConditionFalse,
+			ObservedGeneration: dgdr.Generation,
+			Reason:             failureReason,
+			Message:            failureMessage,
+		})
+		dgdr.AddStatusCondition(metav1.Condition{
+			Type:               nvidiacomv1beta1.ConditionTypeProfiling,
+			Status:             metav1.ConditionFalse,
+			ObservedGeneration: dgdr.Generation,
+			Reason:             failureReason,
+			Message:            failureMessage,
+		})
+		if err := r.Status().Update(ctx, dgdr); err != nil {
+			return ctrl.Result{}, err
+		}
+		return ctrl.Result{Requeue: true}, nil
 	}

 	if !completed {
 		logger.Info("Profiling job still running", "name", dgdr.Name)
-		// Transition from DiscoveringHardware to ProfilingRunning once the job is confirmed active.
+		// Transition from Initializing to ProfilingRunning once the job is confirmed active.
 		cond := meta.FindStatusCondition(dgdr.Status.Conditions, nvidiacomv1beta1.ConditionTypeProfiling)
-		if cond != nil && cond.Reason == "DiscoveringHardware" {
+		if cond != nil && cond.Reason == nvidiacomv1beta1.ProfilingReasonInitializing {
 			return r.updatePhaseWithCondition(ctx, dgdr, nvidiacomv1beta1.DGDRPhaseProfiling, nvidiacomv1beta1.ConditionTypeProfiling, metav1.ConditionFalse, "ProfilingRunning", MessageProfilingInProgress)
 		}
 		// Don't requeue - we'll be triggered when the Job completes/fails
@@ -1784,6 +1961,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag
 			UpdateFunc:  func(de event.UpdateEvent) bool { return true },
 			GenericFunc: func(ge event.GenericEvent) bool { return true },
 		})). // Watch Jobs created by this controller (via ownerReference)
+		// Watch DGDs created by this controller (via label)
 		Watches(
 			&dgdv1alpha1.DynamoGraphDeployment{},
 			handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []ctrl.Request {
@@ -1809,7 +1987,41 @@ func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manag
 				GenericFunc: func(ge event.GenericEvent) bool { return true },
 			}),
 		).
-		// Watch DGDs created by this controller (via label)
+		// Watch output ConfigMaps for profiling sub-phase updates (via label)
+		Watches(
+			&corev1.ConfigMap{},
+			handler.EnqueueRequestsFromMapFunc(func(ctx context.Context, obj client.Object) []ctrl.Request {
+				// Only trigger for ConfigMaps with DGDR labels (written by the sidecar)
+				cm := obj.(*corev1.ConfigMap)
+				dgdrName, hasName := cm.Labels[nvidiacomv1beta1.LabelDGDRName]
+				dgdrNamespace, hasNamespace := cm.Labels[nvidiacomv1beta1.LabelDGDRNamespace]
+				if !hasName || !hasNamespace {
+					return nil
+				}
+				return []ctrl.Request{{
+					NamespacedName: types.NamespacedName{
+						Name:      dgdrName,
+						Namespace: dgdrNamespace,
+					},
+				}}
+			}),
+			builder.WithPredicates(predicate.Funcs{
+				CreateFunc: func(ce event.CreateEvent) bool {
+					labels := ce.Object.GetLabels()
+					_, hasName := labels[nvidiacomv1beta1.LabelDGDRName]
+					_, hasNamespace := labels[nvidiacomv1beta1.LabelDGDRNamespace]
+					return hasName && hasNamespace
+				},
+				UpdateFunc: func(ue event.UpdateEvent) bool {
+					labels := ue.ObjectNew.GetLabels()
+					_, hasName := labels[nvidiacomv1beta1.LabelDGDRName]
+					_, hasNamespace := labels[nvidiacomv1beta1.LabelDGDRNamespace]
+					return hasName && hasNamespace
+				},
+				DeleteFunc:  func(de event.DeleteEvent) bool { return false },
+				GenericFunc: func(ge event.GenericEvent) bool { return false },
+			}),
+		).
 		// Set the event filter to ignore resources handled by other controllers in namespace-restricted mode
 		WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config, r.RuntimeConfig)).
 		Complete(observability.NewObservedReconciler(r, consts.ResourceTypeDynamoGraphDeploymentRequest))

--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
@@ -1395,7 +1395,7 @@ spec:
 					Name: "gpu-worker-1",
 					Labels: map[string]string{
 						"nvidia.com/gpu.count":   "8",
-						"nvidia.com/gpu.product": "H100-SXM5-80GB",
+						"nvidia.com/gpu.product": "h100_sxm",
 						"nvidia.com/gpu.memory":  "81920",
 					},
 				},
@@ -1426,7 +1426,7 @@ spec:
 			mockGPU := &gpu.GPUInfo{
 				GPUsPerNode:   8,
 				VRAMPerGPU:    81920,
-				System:        "H100-SXM5-80GB",
+				System:        "h100_sxm",
 				NodesWithGPUs: 1,
 			}
 			cache := gpu.NewGPUDiscoveryCache()
@@ -1461,7 +1461,7 @@ spec:
 					Name: "gpu-worker-h100",
 					Labels: map[string]string{
 						"nvidia.com/gpu.count":   "8",
-						"nvidia.com/gpu.product": "H100-SXM5-80GB",
+						"nvidia.com/gpu.product": "h100_sxm",
 						"nvidia.com/gpu.memory":  "81920",
 					},
 				},
@@ -1520,7 +1520,7 @@ spec:
 					Name: "gpu-worker-autodiscovery",
 					Labels: map[string]string{
 						"nvidia.com/gpu.count":   "8",
-						"nvidia.com/gpu.product": "H100-SXM5-80GB",
+						"nvidia.com/gpu.product": "h100_sxm",
 						"nvidia.com/gpu.memory":  "81920",
 					},
 				},
@@ -1551,7 +1551,7 @@ spec:
 			mockGPU := &gpu.GPUInfo{
 				GPUsPerNode:   8,
 				VRAMPerGPU:    81920,
-				System:        "H100-SXM5-80GB",
+				System:        "h100_sxm",
 				NodesWithGPUs: 1,
 			}
 			cache := gpu.NewGPUDiscoveryCache()
@@ -1640,7 +1640,7 @@ spec:
 					Name: "gpu-worker-h100",
 					Labels: map[string]string{
 						"nvidia.com/gpu.count":   "8",
-						"nvidia.com/gpu.product": "H100-SXM5-80GB",
+						"nvidia.com/gpu.product": "h100_sxm",
 						"nvidia.com/gpu.memory":  "81920",
 					},
 				},
@@ -1675,7 +1675,7 @@ spec:
 			mockGPU := &gpu.GPUInfo{
 				GPUsPerNode:   8,
 				VRAMPerGPU:    81920,
-				System:        "H100-SXM5-80GB",
+				System:        "h100_sxm",
 				NodesWithGPUs: 1,
 			}
 			cache := gpu.NewGPUDiscoveryCache()
@@ -2335,3 +2335,554 @@ spec:
 		})
 	})
 })
+
+var _ = Describe("DGDR Profiling Phase Derivation Functions", func() {
+	Context("profilingPhaseReason", func() {
+		It("Should return phase string as reason (they are identical by design)", func() {
+			tests := []struct {
+				phase    nvidiacomv1beta1.ProfilingPhase
+				expected string
+			}{
+				{nvidiacomv1beta1.ProfilingPhaseInitializing, "Initializing"},
+				{nvidiacomv1beta1.ProfilingPhaseSweepingPrefill, "SweepingPrefill"},
+				{nvidiacomv1beta1.ProfilingPhaseSweepingDecode, "SweepingDecode"},
+				{nvidiacomv1beta1.ProfilingPhaseSelectingConfig, "SelectingConfig"},
+				{nvidiacomv1beta1.ProfilingPhaseBuildingCurves, "BuildingCurves"},
+				{nvidiacomv1beta1.ProfilingPhaseGeneratingDGD, "GeneratingDGD"},
+			}
+			for _, tt := range tests {
+				Expect(profilingPhaseReason(tt.phase)).Should(Equal(tt.expected))
+			}
+		})
+
+		It("Should return Completed for Done phase", func() {
+			Expect(profilingPhaseReason(nvidiacomv1beta1.ProfilingPhaseDone)).Should(Equal(nvidiacomv1beta1.ProfilingReasonCompleted))
+		})
+
+		It("Should pass through unrecognized phases as-is", func() {
+			Expect(profilingPhaseReason(nvidiacomv1beta1.ProfilingPhase("CustomPhase"))).Should(Equal("CustomPhase"))
+		})
+	})
+
+	Context("profilingPhaseFailureReason", func() {
+		It("Should derive failure reason as phase + Failed", func() {
+			tests := []struct {
+				phase    nvidiacomv1beta1.ProfilingPhase
+				expected string
+			}{
+				{nvidiacomv1beta1.ProfilingPhaseInitializing, "InitializingFailed"},
+				{nvidiacomv1beta1.ProfilingPhaseSweepingPrefill, "SweepingPrefillFailed"},
+				{nvidiacomv1beta1.ProfilingPhaseSweepingDecode, "SweepingDecodeFailed"},
+				{nvidiacomv1beta1.ProfilingPhaseSelectingConfig, "SelectingConfigFailed"},
+				{nvidiacomv1beta1.ProfilingPhaseBuildingCurves, "BuildingCurvesFailed"},
+				{nvidiacomv1beta1.ProfilingPhaseGeneratingDGD, "GeneratingDGDFailed"},
+				{nvidiacomv1beta1.ProfilingPhaseDone, "DoneFailed"},
+			}
+			for _, tt := range tests {
+				Expect(profilingPhaseFailureReason(tt.phase)).Should(Equal(tt.expected))
+			}
+		})
+
+		It("Should return generic ProfilingFailed for empty phase", func() {
+			Expect(profilingPhaseFailureReason(nvidiacomv1beta1.ProfilingPhase(""))).Should(Equal("ProfilingFailed"))
+		})
+	})
+})
+
+var _ = Describe("DGDR Output ConfigMap Naming", func() {
+	Context("getOutputConfigMapName", func() {
+		It("Should use ConfigMapOutputPrefix", func() {
+			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "my-deploy",
+				},
+			}
+			name := getOutputConfigMapName(dgdr)
+			Expect(name).Should(HavePrefix(ConfigMapOutputPrefix))
+			Expect(name).Should(Equal("dgdr-output-my-deploy"))
+		})
+	})
+})
+
+var _ = Describe("DGDR Profiling Failure Attribution", func() {
+	var (
+		reconciler *DynamoGraphDeploymentRequestReconciler
+		recorder   *record.FakeRecorder
+	)
+
+	BeforeEach(func() {
+		recorder = record.NewFakeRecorder(100)
+		reconciler = &DynamoGraphDeploymentRequestReconciler{
+			Client:    k8sClient,
+			APIReader: k8sClient,
+			Recorder:  recorder,
+			Config: &configv1alpha1.OperatorConfiguration{
+				Namespace: configv1alpha1.NamespaceConfiguration{
+					Restricted: "",
+				},
+			},
+			RuntimeConfig: &commonController.RuntimeConfig{},
+			RBACManager:   &MockRBACManager{},
+		}
+	})
+
+	Context("Profiling failure keeps profilingPhase", func() {
+		It("Should preserve profilingPhase and use sub-phase failure reason on job failure", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-keep-phase"
+			namespace := defaultNamespace
+
+			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					Image:   "test-profiler:latest",
+					Hardware: &nvidiacomv1beta1.HardwareSpec{
+						NumGPUsPerNode: ptr.To[int32](8),
+						GPUSKU:         "h100_sxm",
+						VRAMMB:         ptr.To(81920.0),
+						TotalGPUs:      ptr.To[int32](128),
+					},
+					SLA: &nvidiacomv1beta1.SLASpec{
+						TTFT: ptr.To(100.0),
+						ITL:  ptr.To(1500.0),
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+
+			// Set status to Profiling with SweepingDecode sub-phase
+			dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
+			dgdr.Status.ProfilingPhase = nvidiacomv1beta1.ProfilingPhaseSweepingDecode
+			Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
+
+			// Create failed job
+			jobName := getProfilingJobName(dgdr)
+			job := &batchv1.Job{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      jobName,
+					Namespace: namespace,
+				},
+				Spec: batchv1.JobSpec{
+					Template: corev1.PodTemplateSpec{
+						Spec: corev1.PodSpec{
+							Containers: []corev1.Container{{
+								Name:  ContainerNameProfiler,
+								Image: "test",
+							}},
+							RestartPolicy: corev1.RestartPolicyNever,
+						},
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, job)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, job) }()
+
+			// Update job status to failed
+			job.Status.Conditions = []batchv1.JobCondition{{
+				Type:    batchv1.JobFailed,
+				Status:  corev1.ConditionTrue,
+				Message: "BackoffLimitExceeded",
+			}}
+			Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+			// Reconcile
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Verify DGDR is in Failed phase with profilingPhase preserved
+			var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
+			Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseFailed))
+			Expect(updated.Status.ProfilingPhase).Should(Equal(nvidiacomv1beta1.ProfilingPhaseSweepingDecode))
+
+			// Verify Profiling condition has sub-phase-specific failure reason
+			profilingCond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeProfiling)
+			Expect(profilingCond).NotTo(BeNil())
+			Expect(profilingCond.Reason).Should(Equal(nvidiacomv1beta1.ProfilingReasonSweepingDecodeFailed))
+
+			// Verify Succeeded condition has sub-phase-specific failure reason
+			succeededCond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeSucceeded)
+			Expect(succeededCond).NotTo(BeNil())
+			Expect(succeededCond.Reason).Should(Equal(nvidiacomv1beta1.ProfilingReasonSweepingDecodeFailed))
+		})
+
+		It("Should use generic ProfilingFailed when no sub-phase info available", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-generic-fail"
+			namespace := defaultNamespace
+
+			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					Image:   "test-profiler:latest",
+					Hardware: &nvidiacomv1beta1.HardwareSpec{
+						NumGPUsPerNode: ptr.To[int32](8),
+						GPUSKU:         "h100_sxm",
+						VRAMMB:         ptr.To(81920.0),
+						TotalGPUs:      ptr.To[int32](128),
+					},
+					SLA: &nvidiacomv1beta1.SLASpec{
+						TTFT: ptr.To(100.0),
+						ITL:  ptr.To(1500.0),
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+
+			// Set status to Profiling with empty sub-phase
+			dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
+			dgdr.Status.ProfilingPhase = ""
+			Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
+
+			// Create failed job
+			jobName := getProfilingJobName(dgdr)
+			job := &batchv1.Job{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      jobName,
+					Namespace: namespace,
+				},
+				Spec: batchv1.JobSpec{
+					Template: corev1.PodTemplateSpec{
+						Spec: corev1.PodSpec{
+							Containers: []corev1.Container{{
+								Name:  ContainerNameProfiler,
+								Image: "test",
+							}},
+							RestartPolicy: corev1.RestartPolicyNever,
+						},
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, job)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, job) }()
+
+			job.Status.Conditions = []batchv1.JobCondition{{
+				Type:    batchv1.JobFailed,
+				Status:  corev1.ConditionTrue,
+				Message: "BackoffLimitExceeded",
+			}}
+			Expect(k8sClient.Status().Update(ctx, job)).Should(Succeed())
+
+			// Reconcile
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Verify generic ProfilingFailed is used
+			var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
+			Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseFailed))
+
+			profilingCond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeProfiling)
+			Expect(profilingCond).NotTo(BeNil())
+			Expect(profilingCond.Reason).Should(Equal("ProfilingFailed"))
+		})
+	})
+
+	Context("Profiling entry uses Initializing reason", func() {
+		It("Should use Initializing reason when entering Profiling phase", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-init-reason"
+			namespace := defaultNamespace
+
+			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					Image:   "test-profiler:latest",
+					Hardware: &nvidiacomv1beta1.HardwareSpec{
+						NumGPUsPerNode: ptr.To[int32](8),
+						GPUSKU:         "h100_sxm",
+						VRAMMB:         ptr.To(81920.0),
+						TotalGPUs:      ptr.To[int32](128),
+					},
+					SLA: &nvidiacomv1beta1.SLASpec{
+						TTFT: ptr.To(100.0),
+						ITL:  ptr.To(1500.0),
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+
+			// First reconcile: validation → Pending
+			_, err := reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Second reconcile: Pending → Profiling
+			_, err = reconciler.Reconcile(ctx, reconcile.Request{
+				NamespacedName: types.NamespacedName{Name: dgdrName, Namespace: namespace},
+			})
+			Expect(err).NotTo(HaveOccurred())
+
+			// Verify Profiling condition uses Initializing reason (not generic ProfilingRunning)
+			var updated nvidiacomv1beta1.DynamoGraphDeploymentRequest
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, &updated)).Should(Succeed())
+			Expect(updated.Status.Phase).Should(Equal(nvidiacomv1beta1.DGDRPhaseProfiling))
+
+			profilingCond := meta.FindStatusCondition(updated.Status.Conditions, nvidiacomv1beta1.ConditionTypeProfiling)
+			Expect(profilingCond).NotTo(BeNil())
+			Expect(profilingCond.Reason).Should(Equal(nvidiacomv1beta1.ProfilingReasonInitializing))
+
+			// Clean up job
+			jobName := getProfilingJobName(&updated)
+			job := &batchv1.Job{}
+			if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: namespace}, job); err == nil {
+				_ = k8sClient.Delete(ctx, job)
+			}
+		})
+	})
+
+	Context("updateProfilingSubPhase", func() {
+		It("Should update profilingPhase from output ConfigMap", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-subphase-update"
+			namespace := defaultNamespace
+
+			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					Image:   "test-profiler:latest",
+					Hardware: &nvidiacomv1beta1.HardwareSpec{
+						NumGPUsPerNode: ptr.To[int32](8),
+						GPUSKU:         "h100_sxm",
+						VRAMMB:         ptr.To(81920.0),
+						TotalGPUs:      ptr.To[int32](128),
+					},
+					SLA: &nvidiacomv1beta1.SLASpec{
+						TTFT: ptr.To(100.0),
+						ITL:  ptr.To(1500.0),
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+
+			// Set initial status
+			dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
+			dgdr.Status.ProfilingPhase = nvidiacomv1beta1.ProfilingPhaseInitializing
+			Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
+
+			// Create output ConfigMap with updated phase and message from profiler
+			cm := &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      getOutputConfigMapName(dgdr),
+					Namespace: namespace,
+				},
+				Data: map[string]string{
+					"phase":   "SweepingPrefill",
+					"message": "Sweeping TP=4 DEP=2, measuring TTFT",
+				},
+			}
+			Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, cm) }()
+
+			// Re-fetch to get latest resourceVersion
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, dgdr)).Should(Succeed())
+
+			// Call updateProfilingSubPhase
+			Expect(reconciler.updateProfilingSubPhase(ctx, dgdr)).Should(Succeed())
+
+			// Verify in-memory status was updated
+			Expect(dgdr.Status.ProfilingPhase).Should(Equal(nvidiacomv1beta1.ProfilingPhaseSweepingPrefill))
+
+			// Verify conditions: reason derived from phase, message from profiler
+			profilingCond := meta.FindStatusCondition(dgdr.Status.Conditions, nvidiacomv1beta1.ConditionTypeProfiling)
+			Expect(profilingCond).NotTo(BeNil())
+			Expect(profilingCond.Reason).Should(Equal("SweepingPrefill"))
+			Expect(profilingCond.Message).Should(Equal("Sweeping TP=4 DEP=2, measuring TTFT"))
+		})
+
+		It("Should be a no-op when no progress ConfigMap exists", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-no-cm"
+			namespace := defaultNamespace
+
+			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					Image:   "test-profiler:latest",
+					Hardware: &nvidiacomv1beta1.HardwareSpec{
+						NumGPUsPerNode: ptr.To[int32](8),
+						GPUSKU:         "h100_sxm",
+						VRAMMB:         ptr.To(81920.0),
+						TotalGPUs:      ptr.To[int32](128),
+					},
+					SLA: &nvidiacomv1beta1.SLASpec{
+						TTFT: ptr.To(100.0),
+						ITL:  ptr.To(1500.0),
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+
+			dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
+			dgdr.Status.ProfilingPhase = nvidiacomv1beta1.ProfilingPhaseInitializing
+			Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
+
+			// Re-fetch
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, dgdr)).Should(Succeed())
+
+			// Call updateProfilingSubPhase — should not change anything
+			Expect(reconciler.updateProfilingSubPhase(ctx, dgdr)).Should(Succeed())
+
+			// ProfilingPhase should remain Initializing
+			Expect(dgdr.Status.ProfilingPhase).Should(Equal(nvidiacomv1beta1.ProfilingPhaseInitializing))
+		})
+
+		It("Should skip update when phase has not changed", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-same-phase"
+			namespace := defaultNamespace
+
+			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					Image:   "test-profiler:latest",
+					Hardware: &nvidiacomv1beta1.HardwareSpec{
+						NumGPUsPerNode: ptr.To[int32](8),
+						GPUSKU:         "h100_sxm",
+						VRAMMB:         ptr.To(81920.0),
+						TotalGPUs:      ptr.To[int32](128),
+					},
+					SLA: &nvidiacomv1beta1.SLASpec{
+						TTFT: ptr.To(100.0),
+						ITL:  ptr.To(1500.0),
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+
+			dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
+			dgdr.Status.ProfilingPhase = nvidiacomv1beta1.ProfilingPhaseSweepingPrefill
+			Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
+
+			// Create output ConfigMap with same phase as status
+			cm := &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      getOutputConfigMapName(dgdr),
+					Namespace: namespace,
+				},
+				Data: map[string]string{
+					"phase":   "SweepingPrefill",
+					"message": "Sweeping TP=4 DEP=2, measuring TTFT",
+				},
+			}
+			Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, cm) }()
+
+			// Re-fetch
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, dgdr)).Should(Succeed())
+
+			// Call updateProfilingSubPhase — should not update since phase hasn't changed
+			Expect(reconciler.updateProfilingSubPhase(ctx, dgdr)).Should(Succeed())
+
+			// Should still be SweepingPrefill
+			Expect(dgdr.Status.ProfilingPhase).Should(Equal(nvidiacomv1beta1.ProfilingPhaseSweepingPrefill))
+		})
+
+		It("Should return error for invalid phase value in ConfigMap", func() {
+			ctx := context.Background()
+			dgdrName := "test-dgdr-invalid-phase"
+			namespace := defaultNamespace
+
+			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      dgdrName,
+					Namespace: namespace,
+				},
+				Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
+					Model:   "test-model",
+					Backend: "vllm",
+					Image:   "test-profiler:latest",
+					Hardware: &nvidiacomv1beta1.HardwareSpec{
+						NumGPUsPerNode: ptr.To[int32](8),
+						GPUSKU:         "h100_sxm",
+						VRAMMB:         ptr.To(81920.0),
+						TotalGPUs:      ptr.To[int32](128),
+					},
+					SLA: &nvidiacomv1beta1.SLASpec{
+						TTFT: ptr.To(100.0),
+						ITL:  ptr.To(1500.0),
+					},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, dgdr)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, dgdr) }()
+
+			dgdr.Status.Phase = nvidiacomv1beta1.DGDRPhaseProfiling
+			dgdr.Status.ProfilingPhase = nvidiacomv1beta1.ProfilingPhaseInitializing
+			Expect(k8sClient.Status().Update(ctx, dgdr)).Should(Succeed())
+
+			// Create output ConfigMap with invalid phase
+			cm := &corev1.ConfigMap{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      getOutputConfigMapName(dgdr),
+					Namespace: namespace,
+				},
+				Data: map[string]string{
+					"phase":   "BogusPhase",
+					"message": "this should not be accepted",
+				},
+			}
+			Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
+			defer func() { _ = k8sClient.Delete(ctx, cm) }()
+
+			// Re-fetch
+			Expect(k8sClient.Get(ctx, types.NamespacedName{Name: dgdrName, Namespace: namespace}, dgdr)).Should(Succeed())
+
+			err := reconciler.updateProfilingSubPhase(ctx, dgdr)
+			Expect(err).Should(HaveOccurred())
+			Expect(err.Error()).Should(ContainSubstring("invalid profiling phase"))
+			Expect(err.Error()).Should(ContainSubstring("BogusPhase"))
+
+			// profilingPhase should remain unchanged
+			Expect(dgdr.Status.ProfilingPhase).Should(Equal(nvidiacomv1beta1.ProfilingPhaseInitializing))
+		})
+	})
+})
--- a/deploy/utils/dynamo_deployment.py
+++ b/deploy/utils/dynamo_deployment.py
@@ -240,10 +240,11 @@ class DynamoDeploymentClient:
            self.deployment_spec is not None
        ), "Failed to load deployment specification"

-        # Extract component names
-        self.components = [
-            svc.lower() for svc in self.deployment_spec["spec"]["services"].keys()
-        ]
+        # Extract component names (original case for label queries, lowercase for directories)
+        self._original_components = list(
+            self.deployment_spec["spec"]["services"].keys()
+        )
+        self.components = [svc.lower() for svc in self._original_components]

        # Ensure name and namespace are set correctly
        self.deployment_spec["metadata"]["name"] = self.deployment_name
@@ -450,14 +451,17 @@ class DynamoDeploymentClient:
        base_dir = self.base_log_dir / self.deployment_name
        base_dir.mkdir(parents=True, exist_ok=True)

-        for component in self.components:
+        for component, original_name in zip(self.components, self._original_components):
            component_dir = base_dir / component
            component_dir.mkdir(exist_ok=True)

-            # List pods for this component using the selector label
-            # nvidia.com/selector: deployment-name-component
+            # Use DGD name + component name labels which are consistent across
+            # both Grove (PodCliqueSet) and non-Grove (DCD) deployment pathways.
+            # The previous nvidia.com/selector label includes a worker hash suffix
+            # on the DCD pathway, causing a mismatch with the expected base name.
            label_selector = (
-                f"nvidia.com/selector={self.deployment_name}-{component.lower()}"
+                f"nvidia.com/dynamo-graph-deployment-name={self.deployment_name},"
+                f"nvidia.com/dynamo-component={original_name}"
            )

            pods = await self.core_api.list_namespaced_pod(