feat: remove PVC logic from profiler planner (#4210)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: hongkuanz <hongkuanz@nvidia.com>

feat: remove PVC logic from profiler planner (#4210)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com> Signed-off-by: hongkuanz <hongkuanz@nvidia.com> Co-authored-by: hongkuanz <hongkuanz@nvidia.com>
fa6a7f94 · hhzhang16 · GitHub · 6f708832 · fa6a7f94 · fa6a7f94
Unverified Commit fa6a7f94 authored Nov 10, 2025 by hhzhang16 Committed by GitHub Nov 10, 2025
10 changed files
--- a/benchmarks/profiler/utils/config.py
+++ b/benchmarks/profiler/utils/config.py
@@ -35,11 +35,6 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)


-class VolumeMount(BaseModel):
-    name: str = "dynamo-pvc"
-    mountPoint: str = "/data"
-
-
 class Container(BaseModel):
    image: Optional[str] = None
    workingDir: Optional[str] = None
@@ -71,15 +66,8 @@ class Services(BaseModel):
    model_config = {"extra": "allow"}


-class PVCConfig(BaseModel):
-    name: str = "dynamo-pvc"
-    create: Optional[bool] = False
-    model_config = {"extra": "allow"}
-
-
 class Spec(BaseModel):
    services: dict[str, Service]
-    pvcs: Optional[list[PVCConfig]] = None
    model_config = {"extra": "allow"}


@@ -99,11 +87,15 @@ class MultinodeConfig(BaseModel):


 class DgdPlannerServiceConfig(BaseModel):
+    """Planner service configuration.
+
+    The planner reads profiling data from a ConfigMap (planner-profile-data)
+    that the profiler creates and mounts automatically, so it has no PVC dependencies.
+    """
+
    dynamoNamespace: str = "dynamo"  # placeholder
    componentType: str = "planner"
    replicas: int = 1
-    # Do not attach PVC; we'll mount a ConfigMap for planner data instead.
-    volumeMounts: list[VolumeMount] = []
    extraPodSpec: PodSpec = PodSpec(
        mainContainer=Container(
            image="my-registry/dynamo-runtime:my-tag",  # placeholder

--- a/benchmarks/profiler/utils/dgd_generation.py
+++ b/benchmarks/profiler/utils/dgd_generation.py
@@ -206,7 +206,7 @@ def generate_dgd_config_with_planner(
        mc_mounts.append(
            {
                "name": "planner-profile-data",
-                "mountPoint": cm_mount_path,
+                "mountPath": cm_mount_path,
                "readOnly": True,
            }
        )

--- a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller.go
@@ -202,17 +202,8 @@ data:
 EOF
 sed 's/^/    /' {{.OutputPath}}/{{.OutputFile}} >> /tmp/cm.yaml

-# Add profiling data directories to ConfigMap for long-term storage
-# Find all interpolation directories and add their raw_data.npz files
-for dir in {{.OutputPath}}/*/interpolation; do
-  if [ -d "$dir" ]; then
-    dirname=$(basename $(dirname "$dir"))
-    if [ -f "$dir/raw_data.npz" ]; then
-      echo "  ${dirname}_raw_data.npz: |" >> /tmp/cm.yaml
-      base64 "$dir/raw_data.npz" | sed 's/^/    /' >> /tmp/cm.yaml
-    fi
-  fi
-done
+# Note: The profiler already includes the profiling data (raw_data.npz converted
+# to JSON) in the generated DGD YAML as a separate ConfigMap, so it is not added here.

 kubectl apply -f /tmp/cm.yaml
 echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
@@ -405,6 +396,19 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleProfilingState(ctx contex
 	// Record spec generation event
 	r.Recorder.Event(dgdr, corev1.EventTypeNormal, EventReasonSpecGenerated, MessageSpecGenerated)

+	// Create additional resources (ConfigMaps) immediately after profiling
+	// This ensures that the `planner-profile-data` ConfigMap is available for both auto and manual deployment
+	targetNamespace := dgdr.Namespace
+	if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.Namespace != "" {
+		targetNamespace = dgdr.Spec.DeploymentOverrides.Namespace
+	}
+	if err := r.createAdditionalResources(ctx, dgdr, targetNamespace); err != nil {
+		logger.Error(err, "Failed to create additional resources after profiling")
+		// Don't fail the DGDR, just log the error - ConfigMaps can be created manually
+		r.Recorder.Event(dgdr, corev1.EventTypeWarning, "ConfigMapCreationFailed",
+			fmt.Sprintf("Failed to create ConfigMaps from profiling output: %v", err))
+	}
+
 	// If autoApply is enabled, transition to Deploying state
 	if dgdr.Spec.AutoApply {
 		logger.Info("AutoApply enabled, transitioning to Deploying state")
@@ -479,20 +483,6 @@ func (r *DynamoGraphDeploymentRequestReconciler) handleDeployingState(ctx contex

 	// Check if we need to create DGD
 	if dgdr.Status.Deployment == nil || !dgdr.Status.Deployment.Created {
-		// Determine target namespace for deployment
-		targetNamespace := dgdr.Namespace
-		if dgdr.Spec.DeploymentOverrides != nil && dgdr.Spec.DeploymentOverrides.Namespace != "" {
-			targetNamespace = dgdr.Spec.DeploymentOverrides.Namespace
-		}
-
-		// Deploy additional resources (ConfigMaps) from the profiling output first
-		if err := r.createAdditionalResources(ctx, dgdr, targetNamespace); err != nil {
-			logger.Error(err, "Failed to create additional resources")
-			r.Recorder.Event(dgdr, corev1.EventTypeWarning, MessageDeploymentCreationFailed,
-				fmt.Sprintf("Failed to create additional resources: %v", err))
-			return ctrl.Result{}, err
-		}
-
 		return r.createDGD(ctx, dgdr)
 	}

@@ -1094,13 +1084,12 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
 			}},
 		}

-		// Build volumes - use dynamo-pvc for profiling output so data persists for the Planner
+		// Build volumes - use emptyDir for profiling output
+		// The sidecar saves all needed data to ConfigMaps, so persistence is not needed
 		volumes := []corev1.Volume{{
 			Name: VolumeNameProfilingOutput,
 			VolumeSource: corev1.VolumeSource{
-				PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
-					ClaimName: "dynamo-pvc",
-				},
+				EmptyDir: &corev1.EmptyDirVolumeSource{},
 			},
 		}}


--- a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
+++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
@@ -291,14 +291,12 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 			Expect(job.Spec.Template.Spec.Containers[0].Name).Should(Equal(ContainerNameProfiler))
 			Expect(job.Spec.Template.Spec.Containers[1].Name).Should(Equal(ContainerNameOutputCopier))

-			// Verify PVC volume mount
+			// Verify emptyDir volume (not PVC)
 			Expect(job.Spec.Template.Spec.Volumes).Should(ContainElement(
 				corev1.Volume{
 					Name: VolumeNameProfilingOutput,
 					VolumeSource: corev1.VolumeSource{
-						PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
-							ClaimName: "dynamo-pvc",
-						},
+						EmptyDir: &corev1.EmptyDirVolumeSource{},
 					},
 				},
 			))

--- a/deploy/utils/README.md
+++ b/deploy/utils/README.md
@@ -17,7 +17,7 @@ This includes:

 - `setup_benchmarking_resources.sh` — Sets up benchmarking and profiling resources in your existing Dynamo namespace
 - `manifests/`
-  - `pvc.yaml` — PVC `dynamo-pvc` for storing profiler results and configurations
+  - `pvc.yaml` — PVC `dynamo-pvc`
  - `pvc-access-pod.yaml` — short‑lived pod for copying profiler results from the PVC
 - `kubernetes.py` — helper used by tooling to apply/read resources (e.g., access pod for PVC access)
 - `dynamo_deployment.py` — utilities for working with DynamoGraphDeployment resources
@@ -108,15 +108,21 @@ kubectl cp $NAMESPACE/pvc-access-pod:/data/results ./benchmarks/results
 kubectl cp $NAMESPACE/pvc-access-pod:/data/results/benchmark-name ./benchmarks/results/benchmark-name
 ```

-**Download profiling results (optional, for local inspection):**
+**Inspect profiling results (optional):**

 ```bash
-# Optional: Download profiling data for local analysis
-# The planner reads directly from the PVC, so this is only needed for inspection
-kubectl cp $NAMESPACE/pvc-access-pod:/data ./profiling_data
+# View the generated DGD configuration from profiling
+kubectl get configmap dgdr-output-<dgdr-name> -n $NAMESPACE -o yaml
+
+# View the planner profiling data (JSON format)
+kubectl get configmap planner-profile-data -n $NAMESPACE -o yaml
 ```

-> **Note on Profiling Results**: When using DGDR (DynamoGraphDeploymentRequest) for SLA-driven profiling, profiling data is stored in `/data/` on the PVC. The planner component reads this data directly from the PVC, so downloading is **optional** - only needed if you want to inspect the profiling results locally (e.g., view performance plots, check configurations).
+> **Note on Profiling Results**: When using DGDR (DynamoGraphDeploymentRequest) for SLA-driven profiling, profiling data is automatically stored in ConfigMaps:
+> - `dgdr-output-<dgdr-name>`: Contains the generated DynamoGraphDeployment YAML
+> - `planner-profile-data`: Contains profiling performance data in JSON format for the planner
+>
+> The planner component reads this data directly from the mounted ConfigMap, so no PVC is needed.

 #### Cleanup Access Pod

@@ -131,7 +137,6 @@ kubectl delete pod pvc-access-pod -n $NAMESPACE
 **Common path patterns in the PVC:**
 - `/data/configs/` - Configuration files (DGD manifests)
 - `/data/results/` - Benchmark results (for download after benchmarking jobs)
-- `/data/` - Profiling data (used directly by planner, typically not downloaded)
 - `/data/benchmarking/` - Benchmarking artifacts

 #### Next Steps

--- a/docs/planner/sla_planner_quickstart.md
+++ b/docs/planner/sla_planner_quickstart.md
@@ -58,7 +58,6 @@ The Dynamo Operator watches for DGDRs and automatically:
 Before creating a DGDR, ensure:
 - **Dynamo platform installed** with the operator running (see [Installation Guide](/docs/kubernetes/installation_guide.md))
 - **[kube-prometheus-stack](/docs/kubernetes/observability/metrics.md) installed and running** (required for SLA planner)
-- **Profiling PVC created** (see [Benchmarking Resource Setup](/deploy/utils/README.md#benchmarking-resource-setup#BenchmarkingResourceSetup))
 - **Image pull secrets configured** if using private registries (typically `nvcr-imagepullsecret` for NVIDIA images)
 - **Sufficient GPU resources** available in your cluster for profiling
 - **Runtime images available** that contain both profiler and runtime components
@@ -360,41 +359,42 @@ spec:
 Then manually extract and apply the generated DGD:

 ```bash
-# Extract generated config
-kubectl get dgdr sla-aic -n $NAMESPACE -o jsonpath='{.status.generatedConfig}' > my-dgd.yaml
+# Extract generated DGD from DGDR status
+kubectl get dgdr sla-aic -n $NAMESPACE -o jsonpath='{.status.generatedDeployment}' | kubectl apply -n $NAMESPACE -f -

-# Review and modify if needed
-vi my-dgd.yaml
+# Or save to file first for review/modification
+kubectl get dgdr sla-aic -n $NAMESPACE -o jsonpath='{.status.generatedDeployment}' > my-dgd.yaml

-# Deploy manually
+vi my-dgd.yaml
 kubectl apply -f my-dgd.yaml -n $NAMESPACE
 ```

-The generated DGD includes optimized configurations and the SLA planner component.
+The generated DGD includes optimized configurations and the SLA planner component. The required `planner-profile-data` ConfigMap is automatically created when profiling completes, so the DGD will deploy successfully.

 #### Option 2: Use Standalone Planner Templates (Advanced)

 For advanced use cases, you can manually deploy using the standalone planner templates in `examples/backends/*/deploy/disagg_planner.yaml`:

 ```bash
-# After profiling completes, profiling data is stored on the PVC at /data
+# After profiling completes, profiling data is automatically stored in ConfigMaps

-# OPTIONAL: Download profiling results for local inspection
-# Create access pod (skip this step if access pod is already running)
-kubectl apply -f deploy/utils/manifests/pvc-access-pod.yaml -n $NAMESPACE
-kubectl wait --for=condition=Ready pod/pvc-access-pod -n $NAMESPACE --timeout=60s
+# OPTIONAL: Inspect profiling results stored in ConfigMaps
+# View the generated DGD configuration
+kubectl get configmap dgdr-output-<dgdr-name> -n $NAMESPACE -o yaml

-# Download the data
-kubectl cp $NAMESPACE/pvc-access-pod:/data ./profiling_data
+# View the planner profiling data (JSON format)
+kubectl get configmap planner-profile-data -n $NAMESPACE -o yaml

-# Cleanup
-kubectl delete pod pvc-access-pod -n $NAMESPACE
+# Update the PROMETHEUS_ENDPOINT environment variable in the planner template
+# to match your cluster's Prometheus service location (see comments in the template)

 # Update backend planner manifest as needed, then deploy
 kubectl apply -f examples/backends/<backend>/deploy/disagg_planner.yaml -n $NAMESPACE
 ```

 > **Note**: The standalone templates are provided as examples and may need customization for your model and requirements. The DGDR-generated configuration (Option 1) is recommended as it's automatically tuned to your profiling results and SLA targets.
+>
+> **Important - Prometheus Configuration**: The planner queries Prometheus to get frontend request metrics for scaling decisions. If you see errors like "Failed to resolve prometheus service", ensure the `PROMETHEUS_ENDPOINT` environment variable in your planner configuration correctly points to your Prometheus service. See the comments in the example templates for details.

 ### Relationship to DynamoGraphDeployment (DGD)


--- a/examples/backends/sglang/deploy/disagg_planner.yaml
+++ b/examples/backends/sglang/deploy/disagg_planner.yaml
@@ -6,9 +6,6 @@ kind: DynamoGraphDeployment
 metadata:
  name: sglang-disagg-planner
 spec:
-  pvcs:
-    - name: dynamo-pvc
-      create: false # Must be pre-created before deployment and SLA profiler must have been run
  services:
    Frontend:
      dynamoNamespace: dynamo
@@ -22,9 +19,6 @@ spec:
      envFromSecret: hf-token-secret
      componentType: planner
      replicas: 1
-      volumeMounts:
-        - name: dynamo-pvc
-          mountPoint: /data
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
@@ -37,7 +31,17 @@ spec:
            - --environment=kubernetes
            - --backend=sglang
            - --adjustment-interval=60
-            - --profile-results-dir=/data
+            - --profile-results-dir=/workspace/profiling_results
+          volumeMounts:
+            - name: planner-profile-data
+              mountPath: /workspace/profiling_results
+              readOnly: true
+        volumes:
+          - name: planner-profile-data
+            configMap:
+              # Must be created by the profiler before this deployment is applied
+              # See docs/planner/sla_planner_quickstart.md for more details
+              name: planner-profile-data
    decode:
      dynamoNamespace: dynamo
      envFromSecret: hf-token-secret

--- a/examples/backends/trtllm/deploy/disagg_planner.yaml
+++ b/examples/backends/trtllm/deploy/disagg_planner.yaml
@@ -6,9 +6,6 @@ kind: DynamoGraphDeployment
 metadata:
  name: trtllm-disagg-planner
 spec:
-  pvcs:
-    - name: dynamo-pvc
-      create: false
  services:
    Frontend:
      dynamoNamespace: trtllm-disagg-planner
@@ -39,9 +36,6 @@ spec:
      envFromSecret: hf-token-secret
      componentType: planner
      replicas: 1
-      volumeMounts:
-        - name: dynamo-pvc # Must be pre-created before deployment and SLA profiler must have been run
-          mountPoint: /data
      extraPodSpec:
        mainContainer:
          image: my-registry/trtllm-runtime:my-tag
@@ -57,8 +51,18 @@ spec:
            - --environment=kubernetes
            - --backend=trtllm
            - --adjustment-interval=60
-            - --profile-results-dir=/data
+            - --profile-results-dir=/workspace/profiling_results
            - --prometheus-port=9085
+          volumeMounts:
+            - name: planner-profile-data
+              mountPath: /workspace/profiling_results
+              readOnly: true
+        volumes:
+          - name: planner-profile-data
+            configMap:
+              # Must be created by the profiler before this deployment is applied
+              # See docs/planner/sla_planner_quickstart.md for more details
+              name: planner-profile-data
    TRTLLMDecodeWorker:
      dynamoNamespace: trtllm-disagg-planner
      envFromSecret: hf-token-secret

--- a/examples/backends/vllm/deploy/README.md
+++ b/examples/backends/vllm/deploy/README.md
@@ -99,7 +99,7 @@ We have public images available on [NGC Catalog](https://catalog.ngc.nvidia.com/

 ### Pre-Deployment Profiling (SLA Planner Only)

-If using the SLA Planner deployment (`disagg_planner.yaml`), follow the [pre-deployment profiling guide](../../../../docs/benchmarks/sla_driven_profiling.md) to run pre-deployment profiling. The results will be saved to the `dynamo-pvc` PVC and queried by the SLA Planner.
+If using the SLA Planner deployment (`disagg_planner.yaml`), follow the [pre-deployment profiling guide](../../../../docs/benchmarks/sla_driven_profiling.md) to run pre-deployment profiling.

 ## Usage


--- a/examples/backends/vllm/deploy/disagg_planner.yaml
+++ b/examples/backends/vllm/deploy/disagg_planner.yaml
@@ -6,9 +6,6 @@ kind: DynamoGraphDeployment
 metadata:
  name: vllm-disagg-planner
 spec:
-  pvcs:
-    - name: dynamo-pvc
-      create: false # Must be pre-created before deployment and SLA profiler must have been run
  services:
    Frontend:
      dynamoNamespace: vllm-disagg-planner
@@ -21,9 +18,6 @@ spec:
      dynamoNamespace: vllm-disagg-planner
      componentType: planner
      replicas: 1
-      volumeMounts:
-        - name: dynamo-pvc
-          mountPoint: /data
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
@@ -36,7 +30,17 @@ spec:
            - --environment=kubernetes
            - --backend=vllm
            - --adjustment-interval=60
-            - --profile-results-dir=/data
+            - --profile-results-dir=/workspace/profiling_results
+          volumeMounts:
+            - name: planner-profile-data
+              mountPath: /workspace/profiling_results
+              readOnly: true
+        volumes:
+          - name: planner-profile-data
+            configMap:
+              # Must be created by the profiler before this deployment is applied
+              # See docs/planner/sla_planner_quickstart.md for more details
+              name: planner-profile-data
    VllmDecodeWorker:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret