fix: profiler sidecar and other SLA-driven autodeployment fixes (#3932)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

fix: profiler sidecar and other SLA-driven autodeployment fixes (#3932)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
a7b703bd · hhzhang16 · GitHub · 3998fdcb · a7b703bd · a7b703bd
Unverified Commit a7b703bd authored Oct 28, 2025 by hhzhang16 Committed by GitHub Oct 29, 2025
7 changed files
--- a/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
@@ -12,7 +12,7 @@ spec:

  # ProfilingConfig maps directly to the profile_sla.py config format
  profilingConfig:
-    profilerImage: "nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-540.5"
+    profilerImage: "nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-554.0"
    config:
      # Sweep/profiling configuration
      sweep:
@@ -31,8 +31,7 @@ spec:

  # Deployment overrides for the auto-created DGD
  deploymentOverrides:
-    workersImage: "nvcr.io/nvidian/dynamo-dev/trtllm-runtime:dep-540.5"
+    workersImage: "nvcr.io/nvidian/dynamo-dev/trtllm-runtime:dep-554.0"

  # Automatically create DynamoGraphDeployment after profiling
  autoApply: true
-
--- a/benchmarks/profiler/deploy/profile_sla_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_dgdr.yaml
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
-# DynamoGraphDeploymentRequest for standard online profiling
-# Converted from profile_sla_job.yaml
+# DynamoGraphDeploymentRequest for online profiling (actual deployment testing)
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeploymentRequest
 metadata:
@@ -13,12 +12,11 @@ spec:

  # ProfilingConfig maps directly to the profile_sla.py config format
  profilingConfig:
-    profilerImage: "nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-540.5"
+    profilerImage: "nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-554.0"
    config:
      # Sweep/profiling configuration
      sweep:
-        skip_existing_results: true
-        # Standard online profiling (not using AI Configurator)
+        # Online profiling mode (real deployment testing)
        use_ai_configurator: false

      # SLA targets for profiling
@@ -30,8 +28,7 @@ spec:

  # Deployment overrides for the auto-created DGD
  deploymentOverrides:
-    workersImage: "nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-540.5"
+    workersImage: "nvcr.io/nvidian/dynamo-dev/vllm-runtime:dep-554.0"

  # Automatically create DynamoGraphDeployment after profiling
  autoApply: true
-
--- a/benchmarks/profiler/utils/search_space_autogen.py
+++ b/benchmarks/profiler/utils/search_space_autogen.py
@@ -44,6 +44,10 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:

        logger.info(f"Updating model in DGD config file to {args.model}")
        config = config_modifier.update_model(config, args.model)
+        if args.dgd_image:
+            logger.info(f"Updating DGD image to {args.dgd_image}")
+            config = config_modifier.update_image(config, args.dgd_image)
+
        config_fn = f"{args.output_dir}/disagg_config.yaml"
        logger.info(f"Saving generated disagg DGD config for profiling to {config_fn}")
        os.makedirs(args.output_dir, exist_ok=True)

--- a/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
+++ b/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
@@ -48,7 +48,6 @@ spec:

      # Sweep/profiling configuration
      sweep:
-        skip_existing_results: true  # Skip configurations that already have results
        prefill_interpolation_granularity: 16  # Samples for TTFT interpolation
        decode_interpolation_granularity: 6  # Samples for ITL interpolation


--- a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller.go
@@ -159,7 +159,24 @@ const (
 const sidecarScriptTemplate = `
 set -e
 set -o pipefail
+# Wait for the profiler container to complete, not just for the file to exist
+# This ensures we capture the final config, not intermediate results
+echo "Waiting for profiler to complete..."
+while true; do
+  # Check whether the profiler container has terminated (the "terminated" state covers both Completed and Error exits)
+  # Query this pod's container state via kubectl; a failed query yields an empty STATUS and the loop retries
+  STATUS=$(kubectl get pod $HOSTNAME -n {{.Namespace}} -o jsonpath='{.status.containerStatuses[?(@.name=="profiler")].state}' 2>/dev/null || echo "")
+  if echo "$STATUS" | grep -q "terminated"; then
+    echo "Profiler container has terminated"
+    break
+  fi
+  sleep 5
+done
+
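+# Now wait for the output file to exist (NOTE(review): no timeout; this blocks forever if the profiler terminated without writing the file)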
+echo "Waiting for output file {{.OutputPath}}/{{.OutputFile}}..."
 while [ ! -f {{.OutputPath}}/{{.OutputFile}} ]; do sleep 2; done
+echo "Output file found, creating ConfigMap..."

 # Start building ConfigMap YAML with DGD spec
 cat >/tmp/cm.yaml <<EOF

--- a/docs/benchmarks/sla_driven_profiling.md
+++ b/docs/benchmarks/sla_driven_profiling.md
@@ -345,7 +345,6 @@ spec:

      sweep:
        use_ai_configurator: false
-        skip_existing_results: false

  deploymentOverrides:
    workersImage: "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1"

--- a/docs/planner/sla_planner_quickstart.md
+++ b/docs/planner/sla_planner_quickstart.md
@@ -324,7 +324,6 @@ profilingConfig:

    # Profiling sweep settings (optional)
    sweep:
-      skip_existing_results: false
      force_rerun: false
 ```