fix(operator): use ConfigMap for vLLM multinode wait-for-leader script (#7954)

317b9614 · Julien Mancuso · GitHub · 4cdc49c2 · 317b9614 · 317b9614
Unverified Commit 317b9614 authored Apr 07, 2026 by Julien Mancuso Committed by GitHub Apr 07, 2026
5 changed files
--- a/deploy/operator/internal/controller/dynamographdeployment_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeployment_controller.go
@@ -326,6 +326,13 @@ func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context
 		return ReconcileResult{}, fmt.Errorf("failed to reconcile EPP resources: %w", err)
 	}

+	// Reconcile the wait-for-leader ConfigMap for multinode deployments (only vLLM mp workers mount it)
+	err = r.reconcileWaitLeaderConfigMap(ctx, dynamoDeployment)
+	if err != nil {
+		logger.Error(err, "Failed to reconcile wait-leader ConfigMap")
+		return ReconcileResult{}, fmt.Errorf("failed to reconcile wait-leader ConfigMap: %w", err)
+	}
+
 	// Determine if any service is multinode
 	hasMultinode := dynamoDeployment.HasAnyMultinodeService()

@@ -1582,6 +1589,21 @@ func (r *DynamoGraphDeploymentReconciler) reconcileEPPResources(ctx context.Cont
 	return nil
 }

+// reconcileWaitLeaderConfigMap syncs the ConfigMap holding the wait-for-leader
+// Python script whenever the DGD has at least one multinode service, and does
+// nothing otherwise. Only vLLM mp worker pods mount it (via UpdatePodSpec).
+func (r *DynamoGraphDeploymentReconciler) reconcileWaitLeaderConfigMap(ctx context.Context, dgd *nvidiacomv1alpha1.DynamoGraphDeployment) error {
+	if !dgd.HasAnyMultinodeService() {
+		return nil // no multinode service: nothing to sync
+	}
+	desired := dynamo.GenerateWaitLeaderConfigMap(dgd.Name, dgd.Namespace)
+	generate := func(ctx context.Context) (*corev1.ConfigMap, bool, error) {
+		return desired, false, nil
+	}
+	_, _, err := commoncontroller.SyncResource(ctx, r, dgd, generate)
+	return err
+}
+
 func (r *DynamoGraphDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error {
 	// for now doing nothing
 	return nil

--- a/deploy/operator/internal/discovery/resource.go
+++ b/deploy/operator/internal/discovery/resource.go
@@ -55,7 +55,7 @@ func GetK8sDiscoveryRole(dgdName string, namespace string) *rbacv1.Role {
 		Rules: []rbacv1.PolicyRule{
 			{
 				APIGroups: []string{apiGroupCore},
-				Resources: []string{"endpoints"},
+				Resources: []string{"endpoints", "pods"},
 				Verbs:     []string{"get", "list", "watch"},
 			},
 			{

--- a/deploy/operator/internal/dynamo/backend_vllm.go
+++ b/deploy/operator/internal/dynamo/backend_vllm.go
@@ -2,6 +2,7 @@ package dynamo

 import (
 	"fmt"
+	"regexp"
 	"strconv"
 	"strings"

@@ -9,6 +10,7 @@ import (
 	commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/featuregate"
 	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 )

@@ -20,7 +22,9 @@ const (
 	dataParallelSizeFlag     = "--data-parallel-size"
 )

-type VLLMBackend struct{}
+type VLLMBackend struct {
+	ParentGraphDeploymentName string
+}

 func (b *VLLMBackend) UpdateContainer(container *corev1.Container, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentSharedSpec, serviceName string, multinodeDeployer MultinodeDeployer) {
 	isMultinode := numberOfNodes > 1
@@ -78,44 +82,162 @@ func (b *VLLMBackend) UpdateContainer(container *corev1.Container, numberOfNodes
 	}
 }

-func (b *VLLMBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentSharedSpec, serviceName string, multinodeDeployer MultinodeDeployer) {
-	if numberOfNodes <= 1 || role != RoleWorker || !shouldUseMpBackend(component.Annotations) {
-		return
-	}
+const (
+	waitLeaderConfigMapSuffix = "wait-leader-script" // joined to the DGD name with "-" to form the ConfigMap name
+	waitLeaderScriptKey       = "wait-for-leader.py" // ConfigMap data key; also the script file name under the mount path
+	waitLeaderVolumeName      = "wait-leader-script" // pod volume backed by the ConfigMap
+	waitLeaderMountPath       = "/scripts"           // where the init container mounts that volume
+)

-	if len(podSpec.Containers) == 0 {
-		return
-	}
+// WaitLeaderScript is the Python script that verifies leader pod health via
+// the K8s API before attempting a TCP connection. It reads LEADER_HOST and
+// LEADER_PORT from environment variables so the script content is generic.
+const WaitLeaderScript = `import socket, time, json, ssl, urllib.request, os

-	leaderHostname := multinodeDeployer.GetLeaderHostname(serviceName)
-	mainImage := podSpec.Containers[0].Image
+SA = "/var/run/secrets/kubernetes.io/serviceaccount"
+host = os.environ["LEADER_HOST"]
+port = int(os.environ["LEADER_PORT"])
+
+def _k8s_ctx():
+    return ssl.create_default_context(cafile=f"{SA}/ca.crt")
+
+def _k8s_headers():
+    token = open(f"{SA}/token").read()
+    return {"Authorization": f"Bearer {token}"}
+
+def _k8s_api():
+    ns = open(f"{SA}/namespace").read()
+    return f"https://kubernetes.default.svc/api/v1/namespaces/{ns}/pods"
+
+def leader_pod_is_healthy():
+    try:
+        ip = socket.gethostbyname(host)
+    except socket.gaierror:
+        return False, "DNS resolution failed", None, None
+    try:
+        req = urllib.request.Request(
+            f"{_k8s_api()}?fieldSelector=status.podIP={ip}",
+            headers=_k8s_headers(),
+        )
+        resp = json.loads(urllib.request.urlopen(req, context=_k8s_ctx(), timeout=5).read())
+        pods = resp.get("items", [])
+        if not pods:
+            return False, f"no pod found with IP {ip}", None, ip
+        pod = pods[0]
+        name = pod["metadata"].get("name", "unknown")
+        uid = pod["metadata"].get("uid", "unknown")
+        phase = pod.get("status", {}).get("phase")
+        deletion_ts = pod["metadata"].get("deletionTimestamp")
+        info = f"ip={ip} pod={name} uid={uid} phase={phase} deletionTimestamp={deletion_ts}"
+        if deletion_ts:
+            return False, f"pod {name} is terminating", info, ip
+        if phase != "Running":
+            return False, f"pod {name} phase is {phase}", info, ip
+        return True, "", info, ip
+    except Exception as e:
+        # Fall back to TCP-only when the API is unavailable (e.g. 403 no RBAC)
+        return True, f"K8s API unavailable ({e}), falling back to TCP", f"ip={ip}", ip

-	waitScript := fmt.Sprintf(`import socket, time
-host, port = "%s", %s
 print(f"Waiting for leader master port at {host}:{port}...", flush=True)
+time.sleep(5)
 start = time.monotonic()
 last_status = start
 last_err = ""
 while True:
-    try:
-        s = socket.create_connection((host, port), timeout=2)
-        s.close()
-        elapsed = time.monotonic() - start
-        print(f"Leader master port ready (waited {elapsed:.1f}s)", flush=True)
-        break
-    except Exception as e:
-        last_err = f"{type(e).__name__}: {e}"
+    healthy, reason, pod_info, leader_ip = leader_pod_is_healthy()
+    if healthy:
+        try:
+            s = socket.create_connection((leader_ip, port), timeout=2)
+            s.close()
+            elapsed = time.monotonic() - start
+            print(f"Leader master port ready (waited {elapsed:.1f}s) [{pod_info}]", flush=True)
+            break
+        except Exception as e:
+            last_err = f"tcp: {type(e).__name__}: {e} [{pod_info}]"
+    else:
+        last_err = f"{reason} [{pod_info}]" if pod_info else reason
    now = time.monotonic()
    if now - last_status >= 30:
-        print(f"Still waiting for {host}:{port}... ({now - start:.0f}s elapsed, last error: {last_err})", flush=True)
+        print(f"Still waiting for {host}:{port}... ({now - start:.0f}s elapsed, last: {last_err})", flush=True)
        last_status = now
-    time.sleep(2)
-`, leaderHostname, commonconsts.VLLMMpMasterPort)
+    time.sleep(5)
+`

+// k8sVarPattern matches Kubernetes $(VAR) env-var expansion syntax.
+var k8sVarPattern = regexp.MustCompile(`\$\((\w+)\)`)
+
+// k8sToShellVarSyntax rewrites Kubernetes $(VAR) references as shell ${VAR} so a
+// shell can expand them at runtime; plain $VAR (e.g. from LWS) is left as-is.
+// The template escapes the literal "$" as "$$" and names the group as ${1}.
+func k8sToShellVarSyntax(s string) string {
+	return k8sVarPattern.ReplaceAllString(s, `$${${1}}`)
+}
+
+// GetWaitLeaderConfigMapName derives the wait-for-leader ConfigMap name from the DGD name.
+func GetWaitLeaderConfigMapName(dgdName string) string {
+	return dgdName + "-" + waitLeaderConfigMapSuffix
+}
+
+// GenerateWaitLeaderConfigMap builds the ConfigMap that carries the
+// wait-for-leader Python script for one DGD. The object is named via
+// GetWaitLeaderConfigMapName, labelled with the DGD name, and stores the
+// script under waitLeaderScriptKey. No owner reference is set here.
+func GenerateWaitLeaderConfigMap(dgdName, namespace string) *corev1.ConfigMap {
+	labels := map[string]string{
+		commonconsts.KubeLabelDynamoGraphDeploymentName: dgdName,
+	}
+	meta := metav1.ObjectMeta{
+		Name:      GetWaitLeaderConfigMapName(dgdName),
+		Namespace: namespace,
+		Labels:    labels,
+	}
+	data := map[string]string{waitLeaderScriptKey: WaitLeaderScript}
+	return &corev1.ConfigMap{ObjectMeta: meta, Data: data}
+}
+
+func (b *VLLMBackend) UpdatePodSpec(podSpec *corev1.PodSpec, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentSharedSpec, serviceName string, multinodeDeployer MultinodeDeployer) {
+	if numberOfNodes <= 1 || role != RoleWorker || !shouldUseMpBackend(component.Annotations) {
+		return
+	}
+
+	if len(podSpec.Containers) == 0 || b.ParentGraphDeploymentName == "" {
+		return
+	}
+
+	leaderHostname := multinodeDeployer.GetLeaderHostname(serviceName)
+	mainImage := podSpec.Containers[0].Image
+	cmName := GetWaitLeaderConfigMapName(b.ParentGraphDeploymentName)
+
+	podSpec.Volumes = append(podSpec.Volumes, corev1.Volume{
+		Name: waitLeaderVolumeName,
+		VolumeSource: corev1.VolumeSource{
+			ConfigMap: &corev1.ConfigMapVolumeSource{
+				LocalObjectReference: corev1.LocalObjectReference{
+					Name: cmName,
+				},
+			},
+		},
+	})
+
+	// Use sh -c so the shell expands variable references at runtime.
+	// Grove/LWS env vars are appended to init containers AFTER our env
+	// vars, so Kubernetes $(VAR) expansion (which is order-dependent)
+	// cannot resolve them. The shell sees all env vars regardless of
+	// definition order.
+	shellHostname := k8sToShellVarSyntax(leaderHostname)
 	initContainer := corev1.Container{
-		Name:    "wait-for-leader-mp",
-		Image:   mainImage,
-		Command: []string{"python3", "-c", waitScript},
+		Name:  "wait-for-leader-mp",
+		Image: mainImage,
+		Command: []string{"sh", "-c", fmt.Sprintf(
+			`export LEADER_HOST="%s" LEADER_PORT="%s" && exec python3 %s/%s`,
+			shellHostname, commonconsts.VLLMMpMasterPort, waitLeaderMountPath, waitLeaderScriptKey)},
+		VolumeMounts: []corev1.VolumeMount{
+			{
+				Name:      waitLeaderVolumeName,
+				MountPath: waitLeaderMountPath,
+				ReadOnly:  true,
+			},
+		},
 	}

 	podSpec.InitContainers = append(podSpec.InitContainers, initContainer)

--- a/deploy/operator/internal/dynamo/backend_vllm_test.go
+++ b/deploy/operator/internal/dynamo/backend_vllm_test.go
@@ -560,20 +560,18 @@ func TestUpdateVLLMMultinodeArgs(t *testing.T) {
 }

 func TestVLLMBackend_UpdatePodSpec(t *testing.T) {
-	backend := &VLLMBackend{}
+	backend := &VLLMBackend{ParentGraphDeploymentName: "test-dgd"}

 	tests := []struct {
-		name                    string
-		numberOfNodes           int32
-		role                    Role
-		component               *v1alpha1.DynamoComponentDeploymentSharedSpec
-		multinodeDeployer       MultinodeDeployer
-		initialPodSpec          *corev1.PodSpec
-		expectInitContainer     bool
-		expectedInitName        string
-		expectedInitImage       string
-		expectedInitCommandLen  int
-		expectWaitScriptContent string
+		name                string
+		numberOfNodes       int32
+		role                Role
+		component           *v1alpha1.DynamoComponentDeploymentSharedSpec
+		multinodeDeployer   MultinodeDeployer
+		initialPodSpec      *corev1.PodSpec
+		expectInitContainer bool
+		expectedInitImage   string
+		expectedLeaderHost  string
 	}{
 		{
 			name:          "mp worker with Grove deployer injects init container",
@@ -590,11 +588,9 @@ func TestVLLMBackend_UpdatePodSpec(t *testing.T) {
 					{Name: "main", Image: "vllm:latest"},
 				},
 			},
-			expectInitContainer:     true,
-			expectedInitName:        "wait-for-leader-mp",
-			expectedInitImage:       "vllm:latest",
-			expectedInitCommandLen:  3,
-			expectWaitScriptContent: "$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE)",
+			expectInitContainer: true,
+			expectedInitImage:   "vllm:latest",
+			expectedLeaderHost:  "${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}",
 		},
 		{
 			name:          "mp worker with LWS deployer injects init container",
@@ -611,11 +607,9 @@ func TestVLLMBackend_UpdatePodSpec(t *testing.T) {
 					{Name: "main", Image: "vllm:v2"},
 				},
 			},
-			expectInitContainer:     true,
-			expectedInitName:        "wait-for-leader-mp",
-			expectedInitImage:       "vllm:v2",
-			expectedInitCommandLen:  3,
-			expectWaitScriptContent: "$LWS_LEADER_ADDRESS",
+			expectInitContainer: true,
+			expectedInitImage:   "vllm:v2",
+			expectedLeaderHost:  "$LWS_LEADER_ADDRESS",
 		},
 		{
 			name:          "mp leader does not inject init container",
@@ -682,11 +676,9 @@ func TestVLLMBackend_UpdatePodSpec(t *testing.T) {
 					{Name: "main", Image: "vllm:latest"},
 				},
 			},
-			expectInitContainer:     true,
-			expectedInitName:        "wait-for-leader-mp",
-			expectedInitImage:       "vllm:latest",
-			expectedInitCommandLen:  3,
-			expectWaitScriptContent: "$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-test-service-ldr-0.$(GROVE_HEADLESS_SERVICE)",
+			expectInitContainer: true,
+			expectedInitImage:   "vllm:latest",
+			expectedLeaderHost:  "${GROVE_PCSG_NAME}-${GROVE_PCSG_INDEX}-test-service-ldr-0.${GROVE_HEADLESS_SERVICE}",
 		},
 	}

@@ -695,27 +687,66 @@ func TestVLLMBackend_UpdatePodSpec(t *testing.T) {
 			g := gomega.NewGomegaWithT(t)

 			initialInitCount := len(tt.initialPodSpec.InitContainers)
+			initialVolCount := len(tt.initialPodSpec.Volumes)
 			backend.UpdatePodSpec(tt.initialPodSpec, tt.numberOfNodes, tt.role, tt.component, "test-service", tt.multinodeDeployer)

 			if tt.expectInitContainer {
-				g.Expect(len(tt.initialPodSpec.InitContainers)).To(gomega.Equal(initialInitCount + 1))
+				g.Expect(tt.initialPodSpec.InitContainers).To(gomega.HaveLen(initialInitCount + 1))
+				g.Expect(tt.initialPodSpec.Volumes).To(gomega.HaveLen(initialVolCount + 1))

 				injected := tt.initialPodSpec.InitContainers[len(tt.initialPodSpec.InitContainers)-1]
-				g.Expect(injected.Name).To(gomega.Equal(tt.expectedInitName))
+				g.Expect(injected.Name).To(gomega.Equal("wait-for-leader-mp"))
 				g.Expect(injected.Image).To(gomega.Equal(tt.expectedInitImage))
-				g.Expect(len(injected.Command)).To(gomega.Equal(tt.expectedInitCommandLen))
-				g.Expect(injected.Command[0]).To(gomega.Equal("python3"))
-				g.Expect(injected.Command[1]).To(gomega.Equal("-c"))
-				g.Expect(injected.Command[2]).To(gomega.ContainSubstring(tt.expectWaitScriptContent))
-				g.Expect(injected.Command[2]).To(gomega.ContainSubstring("socket.create_connection"))
-				g.Expect(injected.Command[2]).To(gomega.ContainSubstring(commonconsts.VLLMMpMasterPort))
+
+				expectedCmd := fmt.Sprintf(
+					`export LEADER_HOST="%s" LEADER_PORT="%s" && exec python3 /scripts/wait-for-leader.py`,
+					tt.expectedLeaderHost, commonconsts.VLLMMpMasterPort)
+				g.Expect(injected.Command).To(gomega.Equal([]string{"sh", "-c", expectedCmd}))
+				g.Expect(injected.Env).To(gomega.BeEmpty())
+
+				g.Expect(injected.VolumeMounts).To(gomega.HaveLen(1))
+				g.Expect(injected.VolumeMounts[0].Name).To(gomega.Equal("wait-leader-script"))
+				g.Expect(injected.VolumeMounts[0].MountPath).To(gomega.Equal("/scripts"))
+				g.Expect(injected.VolumeMounts[0].ReadOnly).To(gomega.BeTrue())
+
+				vol := tt.initialPodSpec.Volumes[len(tt.initialPodSpec.Volumes)-1]
+				g.Expect(vol.Name).To(gomega.Equal("wait-leader-script"))
+				g.Expect(vol.ConfigMap).ToNot(gomega.BeNil())
+				g.Expect(vol.ConfigMap.Name).To(gomega.Equal("test-dgd-wait-leader-script"))
 			} else {
-				g.Expect(len(tt.initialPodSpec.InitContainers)).To(gomega.Equal(initialInitCount))
+				g.Expect(tt.initialPodSpec.InitContainers).To(gomega.HaveLen(initialInitCount))
+				g.Expect(tt.initialPodSpec.Volumes).To(gomega.HaveLen(initialVolCount))
 			}
 		})
 	}
 }

+func TestGenerateWaitLeaderConfigMap(t *testing.T) {
+	g := gomega.NewGomegaWithT(t)
+	const scriptKey = "wait-for-leader.py"
+
+	got := GenerateWaitLeaderConfigMap("my-dgd", "my-ns")
+	g.Expect(got.Name).To(gomega.Equal("my-dgd-wait-leader-script"))
+	g.Expect(got.Namespace).To(gomega.Equal("my-ns"))
+	g.Expect(got.Labels).To(gomega.HaveKeyWithValue(commonconsts.KubeLabelDynamoGraphDeploymentName, "my-dgd"))
+	g.Expect(got.Data).To(gomega.HaveKey(scriptKey))
+
+	// Fragments the script must contain: env inputs, pod health check, TCP probe, poll interval.
+	for _, fragment := range []string{
+		`os.environ["LEADER_HOST"]`, `os.environ["LEADER_PORT"]`,
+		"leader_pod_is_healthy", "kubernetes.default.svc",
+		"fieldSelector=status.podIP=", "deletionTimestamp",
+		"socket.create_connection", "time.sleep(5)",
+	} {
+		g.Expect(got.Data[scriptKey]).To(gomega.ContainSubstring(fragment))
+	}
+}
+
+func TestGetWaitLeaderConfigMapName(t *testing.T) {
+	name := GetWaitLeaderConfigMapName("foo")
+	gomega.NewGomegaWithT(t).Expect(name).To(gomega.Equal("foo-wait-leader-script"))
+}
+
 func TestShouldUseMpBackend(t *testing.T) {
 	// Version-based gate behavior is tested in featuregate.TestOperatorOriginFeatureGate_IsEnabled.
 	// These tests focus on the explicit override logic and its interaction with the feature gate.

--- a/deploy/operator/internal/dynamo/graph.go
+++ b/deploy/operator/internal/dynamo/graph.go
@@ -895,12 +895,12 @@ type MultinodeDeployer interface {
 }

 // BackendFactory creates backend instances based on the framework type
-func BackendFactory(backendFramework BackendFramework, operatorConfig *configv1alpha1.OperatorConfiguration) Backend {
+func BackendFactory(backendFramework BackendFramework, operatorConfig *configv1alpha1.OperatorConfiguration, parentGraphDeploymentName string) Backend {
 	switch backendFramework {
 	case BackendFrameworkSGLang:
 		return &SGLangBackend{}
 	case BackendFrameworkVLLM:
-		return &VLLMBackend{}
+		return &VLLMBackend{ParentGraphDeploymentName: parentGraphDeploymentName}
 	case BackendFrameworkTRTLLM:
 		return &TRTLLMBackend{
 			MpiRunSecretName: operatorConfig.MPI.SSHSecretName,
@@ -1121,7 +1121,7 @@ func GenerateBasePodSpec(
 	if multinodeDeployer == nil {
 		return nil, fmt.Errorf("unsupported multinode deployment type: %s", multinodeDeploymentType)
 	}
-	backend := BackendFactory(backendFramework, operatorConfig)
+	backend := BackendFactory(backendFramework, operatorConfig, parentGraphDeploymentName)
 	if backend == nil {
 		return nil, fmt.Errorf("unsupported backend framework: %s", backendFramework)
 	}