feat: add inter-pod GMS (#7777)

a48672f5 · Julien Mancuso · GitHub · 0d635418 · a48672f5 · a48672f5
Unverified Commit a48672f5 authored Apr 23, 2026 by Julien Mancuso Committed by GitHub Apr 23, 2026
20 changed files
--- a/deploy/operator/internal/controller/dynamographdeployment_controller_test.go
+++ b/deploy/operator/internal/controller/dynamographdeployment_controller_test.go
@@ -770,7 +770,9 @@ func Test_reconcileGroveResources(t *testing.T) {
 		name                   string
 		dgdSpec                v1alpha1.DynamoGraphDeploymentSpec
 		existingGroveResources []client.Object
+		draEnabled             bool
 		wantReconcileResult    ReconcileResult
+		wantErrSubstring       string
 	}{
 		{
 			name: "singular frontend service with 2 replicas - creates a PodClique with 2 replicas - ready",
@@ -1038,6 +1040,25 @@ func Test_reconcileGroveResources(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "inter-pod GMS failover requires DRA - returns clear error when DRA is disabled",
+			dgdSpec: v1alpha1.DynamoGraphDeploymentSpec{
+				BackendFramework: "vllm",
+				Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
+					"decode": {
+						ComponentType: string(commonconsts.ComponentTypeDecode),
+						Replicas:      ptr.To(int32(1)),
+						Failover: &v1alpha1.FailoverSpec{
+							Enabled:    true,
+							Mode:       v1alpha1.GMSModeInterPod,
+							NumShadows: 1,
+						},
+					},
+				},
+			},
+			draEnabled:       false,
+			wantErrSubstring: "requires DRA",
+		},
 	}

 	for _, tt := range tests {
@@ -1073,7 +1094,7 @@ func Test_reconcileGroveResources(t *testing.T) {
 				Client:        fakeKubeClient,
 				Recorder:      recorder,
 				Config:        &configv1alpha1.OperatorConfiguration{},
-				RuntimeConfig: &controller_common.RuntimeConfig{},
+				RuntimeConfig: &controller_common.RuntimeConfig{DRAEnabled: tt.draEnabled},
 				ScaleClient:   &mockScaleClient{},
 				DockerSecretRetriever: &mockDockerSecretRetriever{
 					GetSecretsFunc: func(namespace, imageName string) ([]string, error) {
@@ -1083,6 +1104,11 @@ func Test_reconcileGroveResources(t *testing.T) {
 			}

 			result, err := reconciler.reconcileGroveResources(ctx, dgd, nil, nil)
+			if tt.wantErrSubstring != "" {
+				g.Expect(err).To(gomega.HaveOccurred())
+				g.Expect(err.Error()).To(gomega.ContainSubstring(tt.wantErrSubstring))
+				return
+			}
 			g.Expect(err).NotTo(gomega.HaveOccurred())

 			g.Expect(result).To(gomega.Equal(tt.wantReconcileResult))

--- a/deploy/operator/internal/controller/failover_cascade_controller.go
+++ b/deploy/operator/internal/controller/failover_cascade_controller.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package controller
+
+import (
+	"context"
+	"fmt"
+
+	commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/client-go/tools/record"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/builder"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/event"
+	"sigs.k8s.io/controller-runtime/pkg/handler"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/predicate"
+)
+
+// Grove labels that together uniquely identify an "engine group" — the set of
+// pods (one per rank in multi-node, or a single pod in single-node) that share
+// the same pod index within a PCSG replica. When any one of them terminates,
+// the whole group must be torn down so Grove can recreate it as a healthy unit.
+const (
+	groveLabelPCSG             = "grove.io/podcliquescalinggroup"
+	groveLabelPCSGReplicaIndex = "grove.io/podcliquescalinggroup-replica-index"
+	groveLabelPodIndex         = "grove.io/podclique-pod-index"
+)
+
+// FailoverCascadeReconciler watches GMS failover pods (restartPolicy: Never)
+// and cascade-deletes all pods in the same engine group when any member
+// reaches a terminal phase (Failed or Succeeded). This ensures broken
+// distributed inference groups are restarted cleanly by Grove.
+//
+// Background: GMS (GPU Memory Service) pods run with restartPolicy: Never so
+// that Kubernetes does not attempt to restart them in-place — a partial
+// restart would leave the distributed inference group in an inconsistent
+// state. Instead, this controller detects the terminal pod and deletes the
+// entire group.  Grove then sees the missing pods and recreates the whole
+// group from scratch.
+//
+// An engine group is identified by three Grove labels:
+//   - grove.io/podcliquescalinggroup              (PCSG name)
+//   - grove.io/podcliquescalinggroup-replica-index (PCSG replica — which copy of the group)
+//   - grove.io/podclique-pod-index                (pod index within the clique)
+//
+// Only pods carrying the dynamo failover engine-group-member label are
+// considered; see failoverCascadePredicate().
+type FailoverCascadeReconciler struct {
+	// Client is embedded so Reconcile can call Get and DeleteAllOf directly
+	// on the reconciler.
+	client.Client
+	// Recorder emits the "FailoverCascade" warning event on the pod that
+	// triggered a cascade.
+	Recorder record.EventRecorder
+}
+
+// NewFailoverCascadeReconciler builds a FailoverCascadeReconciler that uses c
+// for API access and recorder for the events emitted after a cascade.
+func NewFailoverCascadeReconciler(c client.Client, recorder record.EventRecorder) *FailoverCascadeReconciler {
+	reconciler := new(FailoverCascadeReconciler)
+	reconciler.Client = c
+	reconciler.Recorder = recorder
+	return reconciler
+}
+
+// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;delete;deletecollection
+
+// Reconcile is called whenever a failover-eligible pod transitions to a
+// terminal phase (see failoverCascadePredicate).
+//
+// It re-reads the pod, re-checks the preconditions (terminal phase, not
+// already being deleted, engine-group-member label still present, all three
+// Grove labels set) and then force-deletes every pod in the pod's namespace
+// that carries the same engine-group labels. The triggering pod matches that
+// selector too, so it is removed together with its siblings.
+//
+// Every skip path returns an empty Result and a nil error. An error is
+// returned only when Get fails with something other than NotFound or when
+// DeleteAllOf fails.
+//
+// DeleteAllOf is idempotent, so concurrent reconciles for multiple pods in the
+// same engine group are harmless — the first deletes the group and subsequent
+// calls are no-ops.
+func (r *FailoverCascadeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx)
+
+	var pod corev1.Pod
+	if err := r.Get(ctx, req.NamespacedName, &pod); err != nil {
+		// The pod is already gone (for example removed by an earlier
+		// cascade); there is nothing left to do for this request.
+		if errors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, err
+	}
+
+	// Re-check the phase on the freshly read object rather than trusting the
+	// event that enqueued the request.
+	if !isTerminalPhase(pod.Status.Phase) {
+		return ctrl.Result{}, nil
+	}
+
+	// Between predicate evaluation and reconcile execution, another reconcile
+	// may have already cascade-deleted this pod. The pod still exists in the
+	// API server but is marked for deletion — skip it.
+	if pod.DeletionTimestamp != nil {
+		return ctrl.Result{}, nil
+	}
+
+	// Defensive re-check of the engine-group-member label: the predicate
+	// already filters on it at the informer layer, but labels can be removed
+	// between predicate evaluation and reconcile. We never want to cascade-
+	// delete a pod that has been explicitly unlabeled (e.g. an operator
+	// manually quarantining a pod).
+	if pod.Labels[commonconsts.KubeLabelDynamoFailoverEngineGroupMember] != commonconsts.KubeLabelValueTrue {
+		return ctrl.Result{}, nil
+	}
+
+	// All three Grove labels are required to identify the engine group; with
+	// any of them missing the selector below would be ambiguous, so bail out.
+	pcsg := pod.Labels[groveLabelPCSG]
+	pcsgReplica := pod.Labels[groveLabelPCSGReplicaIndex]
+	podIndex := pod.Labels[groveLabelPodIndex]
+	if pcsg == "" || pcsgReplica == "" || podIndex == "" {
+		logger.Info("failover pod missing Grove labels, skipping cascade",
+			"pod", pod.Name,
+			groveLabelPCSG, pcsg,
+			groveLabelPCSGReplicaIndex, pcsgReplica,
+			groveLabelPodIndex, podIndex,
+		)
+		return ctrl.Result{}, nil
+	}
+
+	// The selector includes the engine-group-member label so that pods which
+	// share the Grove labels but were never opted in are left alone.
+	groupLabels := client.MatchingLabels{
+		commonconsts.KubeLabelDynamoFailoverEngineGroupMember: commonconsts.KubeLabelValueTrue,
+		groveLabelPCSG:             pcsg,
+		groveLabelPCSGReplicaIndex: pcsgReplica,
+		groveLabelPodIndex:         podIndex,
+	}
+
+	// Force delete (grace=0) intentionally: the distributed inference group is
+	// already broken when we get here, so giving the surviving engines a SIGTERM
+	// window only delays Grove's recreation of the cohort and risks leaving
+	// half-torn-down NCCL/CUDA IPC state and stale UDS sockets on the shared
+	// hostPath. We deliberately skip preStop hooks and the graceful shutdown
+	// window; do NOT soften this to a positive grace period.
+	if err := r.DeleteAllOf(ctx, &corev1.Pod{}, client.InNamespace(pod.Namespace), groupLabels, client.GracePeriodSeconds(0)); err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to cascade-delete engine group: %w", err)
+	}
+
+	logger.Info("cascade-deleted engine group",
+		"trigger", pod.Name,
+		"pcsg", pcsg,
+		"pcsgReplica", pcsgReplica,
+		"podIndex", podIndex,
+	)
+	// The event is recorded against the triggering pod, using the copy read
+	// before the delete, so it names the pod and phase that caused the cascade.
+	r.Recorder.Eventf(&pod, corev1.EventTypeWarning, "FailoverCascade",
+		"Pod %s terminated (phase=%s); cascade-deleted engine group (pcsg=%s, replica=%s, index=%s)",
+		pod.Name, pod.Status.Phase, pcsg, pcsgReplica, podIndex,
+	)
+
+	return ctrl.Result{}, nil
+}
+
+// SetupWithManager wires the reconciler into mgr as the controller named
+// "gms-failover-cascade". It watches every Pod (there is no owner filter) and
+// relies on failoverCascadePredicate to drop everything except
+// failover-eligible phase transitions. EnqueueRequestForObject makes the
+// reconcile key the pod's own namespace/name rather than a parent resource.
+func (r *FailoverCascadeReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	podEventFilter := builder.WithPredicates(failoverCascadePredicate())
+
+	cascadeController := ctrl.NewControllerManagedBy(mgr).Named("gms-failover-cascade")
+	cascadeController = cascadeController.Watches(&corev1.Pod{}, &handler.EnqueueRequestForObject{}, podEventFilter)
+
+	return cascadeController.Complete(r)
+}
+
+// isTerminalPhase reports whether phase is Failed or Succeeded, the two
+// phases this controller treats as terminal.
+func isTerminalPhase(phase corev1.PodPhase) bool {
+	switch phase {
+	case corev1.PodFailed, corev1.PodSucceeded:
+		return true
+	default:
+		return false
+	}
+}
+
+// failoverCascadePredicate keeps the reconcile queue minimal by filtering
+// events at the informer level, before they ever reach Reconcile().
+//
+// It accepts only pods carrying the dynamo failover engine-group-member label
+// that are not already being deleted, and only when they reach a terminal
+// phase:
+//
+//   - CreateFunc: handles the edge case where the informer's initial list-watch
+//     delivers a pod that is already Failed/Succeeded (e.g. the informer cache
+//     started after the pod transitioned, so no Update event was observed).
+//     Without this, such pods would be silently ignored and their engine group
+//     would never be cascade-deleted. Pods that already have a
+//     deletionTimestamp are filtered out here as well; Reconcile would skip
+//     them anyway, so enqueueing them is wasted work.
+//
+//   - UpdateFunc: the primary path — fires when a Running/Pending pod
+//     transitions to Failed/Succeeded.  Pods that already have a
+//     deletionTimestamp are filtered out to avoid acting on pods that are
+//     being terminated by an ongoing cascade or DGD deletion.
+//
+//   - DeleteFunc / GenericFunc: always suppressed — pod deletions are the
+//     *result* of our cascade, not triggers for one.
+func failoverCascadePredicate() predicate.Predicate {
+	// candidatePod applies the checks shared by the Create and Update paths:
+	// the object must carry the engine-group-member label, must not be marked
+	// for deletion (which also avoids reacting to our own cascade-delete), and
+	// must actually be a Pod.
+	candidatePod := func(obj client.Object) (*corev1.Pod, bool) {
+		if obj.GetLabels()[commonconsts.KubeLabelDynamoFailoverEngineGroupMember] != commonconsts.KubeLabelValueTrue {
+			return nil, false
+		}
+		if obj.GetDeletionTimestamp() != nil {
+			return nil, false
+		}
+		pod, ok := obj.(*corev1.Pod)
+		return pod, ok
+	}
+
+	return predicate.Funcs{
+		CreateFunc: func(e event.CreateEvent) bool {
+			pod, ok := candidatePod(e.Object)
+			if !ok {
+				return false
+			}
+			return isTerminalPhase(pod.Status.Phase)
+		},
+		DeleteFunc: func(e event.DeleteEvent) bool {
+			return false
+		},
+		GenericFunc: func(e event.GenericEvent) bool {
+			return false
+		},
+		UpdateFunc: func(e event.UpdateEvent) bool {
+			newPod, ok := candidatePod(e.ObjectNew)
+			if !ok {
+				return false
+			}
+			oldPod, ok := e.ObjectOld.(*corev1.Pod)
+			if !ok {
+				return false
+			}
+			// Only trigger on actual phase transitions to avoid processing
+			// the same pod twice (e.g. a metadata update on an already-Failed pod).
+			return !isTerminalPhase(oldPod.Status.Phase) && isTerminalPhase(newPod.Status.Phase)
+		},
+	}
+}
--- a/deploy/operator/internal/controller/failover_cascade_controller_test.go
+++ b/deploy/operator/internal/controller/failover_cascade_controller_test.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package controller
+
+import (
+	"context"
+	"testing"
+
+	commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/tools/record"
+	"k8s.io/utils/ptr"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+const (
+	cascadeTestNamespace = "test-ns"
+	cascadeTestPCSG      = "my-pcsg"
+)
+
+// newFailoverPod returns a pod in cascadeTestNamespace that carries the
+// engine-group-member label plus the three Grove labels (PCSG fixed to
+// cascadeTestPCSG, replica and pod index taken from the arguments) and whose
+// status reports the given phase.
+func newFailoverPod(name string, phase corev1.PodPhase, replicaIdx, podIdx string) *corev1.Pod {
+	pod := &corev1.Pod{}
+	pod.Name = name
+	pod.Namespace = cascadeTestNamespace
+	pod.Labels = map[string]string{
+		commonconsts.KubeLabelDynamoFailoverEngineGroupMember: commonconsts.KubeLabelValueTrue,
+		groveLabelPCSG:             cascadeTestPCSG,
+		groveLabelPCSGReplicaIndex: replicaIdx,
+		groveLabelPodIndex:         podIdx,
+	}
+	pod.Status.Phase = phase
+	return pod
+}
+
+// newCascadeReconciler returns a FailoverCascadeReconciler backed by a fake
+// client pre-loaded with objs, together with that client so tests can inspect
+// what is left after Reconcile. Events go to a fake recorder with a buffer of
+// 16.
+//
+// It panics if the core/v1 types cannot be registered in the scheme: that is
+// a broken test setup, and continuing would only surface later as a confusing
+// "no kind registered" failure.
+func newCascadeReconciler(objs ...client.Object) (*FailoverCascadeReconciler, client.Client) {
+	scheme := runtime.NewScheme()
+	if err := corev1.AddToScheme(scheme); err != nil {
+		panic(err)
+	}
+
+	c := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithStatusSubresource(&corev1.Pod{}).
+		WithObjects(objs...).
+		Build()
+
+	return NewFailoverCascadeReconciler(c, record.NewFakeRecorder(16)), c
+}
+
+func TestFailoverCascade_FailedPodDeletesEntireGroup(t *testing.T) {
+
+	failedPod := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
+	sibling1 := newFailoverPod("gms-0-0", corev1.PodRunning, "0", "0")
+	sibling2 := newFailoverPod("wkr-1-0", corev1.PodRunning, "0", "0")
+
+	r, c := newCascadeReconciler(failedPod, sibling1, sibling2)
+
+	result, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, result)
+
+	var remaining corev1.PodList
+	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
+	assert.Empty(t, remaining.Items, "all pods in the engine group should be deleted")
+}
+
+func TestFailoverCascade_SucceededPodDeletesEntireGroup(t *testing.T) {
+
+	succeededPod := newFailoverPod("ldr-0", corev1.PodSucceeded, "0", "0")
+	sibling := newFailoverPod("gms-0-0", corev1.PodRunning, "0", "0")
+
+	r, c := newCascadeReconciler(succeededPod, sibling)
+
+	result, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, result)
+
+	var remaining corev1.PodList
+	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
+	assert.Empty(t, remaining.Items, "succeeded pod should also trigger cascade")
+}
+
+func TestFailoverCascade_DifferentGroupUnaffected(t *testing.T) {
+
+	failedPod := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
+	differentGroup := newFailoverPod("ldr-1", corev1.PodRunning, "0", "1")
+
+	r, c := newCascadeReconciler(failedPod, differentGroup)
+
+	_, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+
+	var remaining corev1.PodList
+	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
+	assert.Len(t, remaining.Items, 1, "only the different engine group pod should remain")
+	assert.Equal(t, "ldr-1", remaining.Items[0].Name)
+}
+
+func TestFailoverCascade_MultipleFailedPodsAllDeleted(t *testing.T) {
+
+	failedPod := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
+	alsoFailed := newFailoverPod("wkr-1-0", corev1.PodFailed, "0", "0")
+	running := newFailoverPod("gms-0-0", corev1.PodRunning, "0", "0")
+
+	r, c := newCascadeReconciler(failedPod, alsoFailed, running)
+
+	_, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+
+	var remaining corev1.PodList
+	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
+	assert.Empty(t, remaining.Items, "all pods in the engine group should be deleted")
+}
+
+func TestFailoverCascade_PodWithoutLabelIgnored(t *testing.T) {
+
+	unlabeled := &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "random-pod",
+			Namespace: cascadeTestNamespace,
+		},
+		Status: corev1.PodStatus{Phase: corev1.PodFailed},
+	}
+
+	r, _ := newCascadeReconciler(unlabeled)
+
+	result, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "random-pod", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, result)
+}
+
+func TestFailoverCascade_NonFailedPodIsNoop(t *testing.T) {
+
+	runningPod := newFailoverPod("ldr-0", corev1.PodRunning, "0", "0")
+	sibling := newFailoverPod("gms-0-0", corev1.PodRunning, "0", "0")
+
+	r, c := newCascadeReconciler(runningPod, sibling)
+
+	_, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+
+	var remaining corev1.PodList
+	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
+	assert.Len(t, remaining.Items, 2, "running pod should not trigger cascade")
+}
+
+func TestFailoverCascade_NotFoundPodIsNoop(t *testing.T) {
+	r, _ := newCascadeReconciler()
+
+	result, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "gone", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, result)
+}
+
+func TestFailoverCascade_MissingGroveLabelsIsNoop(t *testing.T) {
+
+	pod := &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "partial-labels",
+			Namespace: cascadeTestNamespace,
+			Labels: map[string]string{
+				commonconsts.KubeLabelDynamoFailoverEngineGroupMember: commonconsts.KubeLabelValueTrue,
+				groveLabelPCSG: "my-pcsg",
+			},
+		},
+		Status: corev1.PodStatus{Phase: corev1.PodFailed},
+	}
+
+	r, _ := newCascadeReconciler(pod)
+
+	result, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "partial-labels", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, result)
+}
+
+func TestFailoverCascade_DifferentPCSGReplicaUnaffected(t *testing.T) {
+
+	failedPod := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
+	differentReplica := newFailoverPod("ldr-r1-0", corev1.PodRunning, "1", "0")
+
+	r, c := newCascadeReconciler(failedPod, differentReplica)
+
+	_, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+
+	var remaining corev1.PodList
+	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
+	assert.Len(t, remaining.Items, 1, "only the different PCSG replica pod should remain")
+	assert.Equal(t, "ldr-r1-0", remaining.Items[0].Name)
+}
+
+func TestFailoverCascade_DeletingPodIsSkipped(t *testing.T) {
+
+	now := metav1.Now()
+
+	failedPod := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
+	failedPod.DeletionTimestamp = &now
+	failedPod.DeletionGracePeriodSeconds = ptr.To(int64(0))
+	failedPod.Finalizers = []string{"test-finalizer"}
+	sibling := newFailoverPod("gms-0-0", corev1.PodRunning, "0", "0")
+
+	r, c := newCascadeReconciler(failedPod, sibling)
+
+	result, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+	assert.Equal(t, ctrl.Result{}, result)
+
+	var remaining corev1.PodList
+	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
+	assert.Len(t, remaining.Items, 2, "already-deleting pod should not trigger a cascade")
+}
+
+func TestFailoverCascade_ConcurrentReconcileIsIdempotent(t *testing.T) {
+
+	pod1 := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
+	pod2 := newFailoverPod("wkr-1-0", corev1.PodFailed, "0", "0")
+
+	r, c := newCascadeReconciler(pod1, pod2)
+
+	_, err := r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+
+	// Second reconcile for the other pod — it's already gone (NotFound).
+	_, err = r.Reconcile(context.Background(), ctrl.Request{
+		NamespacedName: types.NamespacedName{Name: "wkr-1-0", Namespace: cascadeTestNamespace},
+	})
+	require.NoError(t, err)
+
+	var remaining corev1.PodList
+	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
+	assert.Empty(t, remaining.Items)
+}
--- a/deploy/operator/internal/dra/dra.go
+++ b/deploy/operator/internal/dra/dra.go
@@ -25,7 +25,11 @@ const (
 	// ClaimName is the pod-level DRA ResourceClaim name for shared GPU access.
 	ClaimName = "intrapod-shared-gpu"

-	defaultDeviceClassName = "gpu.nvidia.com"
+	// DefaultDeviceClassName is the default DRA DeviceClass name used when a
+	// component does not specify an explicit gpuType. It matches the
+	// DeviceClass that ships with the NVIDIA DRA Driver and is the single
+	// source of truth for this string across the operator.
+	DefaultDeviceClassName = "gpu.nvidia.com"
 )

 // ApplyClaim replaces the first container's nvidia.com/gpu resources with a
@@ -120,7 +124,7 @@ func GenerateResourceClaimTemplate(
 	}

 	if deviceClassName == "" {
-		deviceClassName = defaultDeviceClassName
+		deviceClassName = DefaultDeviceClassName
 	}

 	if cl != nil {

--- a/deploy/operator/internal/dra/dra_test.go
+++ b/deploy/operator/internal/dra/dra_test.go
@@ -100,7 +100,7 @@ func TestGenerateResourceClaimTemplate_Enabled(t *testing.T) {
 	assert.Equal(t, "myapp-worker-gpu", tmpl.Name)
 	require.Len(t, tmpl.Spec.Spec.Devices.Requests, 1)
 	req := tmpl.Spec.Spec.Devices.Requests[0]
-	assert.Equal(t, defaultDeviceClassName, req.Exactly.DeviceClassName)
+	assert.Equal(t, DefaultDeviceClassName, req.Exactly.DeviceClassName)
 	assert.Equal(t, int64(4), req.Exactly.Count)
 }


--- a/deploy/operator/internal/dynamo/backend_vllm.go
+++ b/deploy/operator/internal/dynamo/backend_vllm.go
@@ -29,6 +29,25 @@ type VLLMBackend struct {
 }

 func (b *VLLMBackend) UpdateContainer(container *corev1.Container, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentSharedSpec, serviceName string, multinodeDeployer MultinodeDeployer) {
+	// The inter-pod GMS layout (with or without failover) requires the engine
+	// to load weights from the dedicated GMS weight-server pod rather than
+	// from disk. --load-format gms and DYN_VLLM_GMS_SHADOW_MODE activate the
+	// vLLM-side GMS client path and apply to both standalone inter-pod GMS
+	// and inter-pod GMS + failover; the "shadow mode" name is a vLLM upstream
+	// naming convention, not a statement about whether shadow pods are
+	// present.
+	if component.IsInterPodGMSEnabled() {
+		if !containerHasArg(container, "--load-format", "gms") {
+			injectFlagsIntoContainerCommand(container, "--load-format gms", false, "vllm")
+		}
+		// DYN_VLLM_GMS_SHADOW_MODE is a vLLM-engine-specific switch (activates
+		// the vLLM-side GMS client path for shadow weight loading). It is
+		// injected here — in the vLLM backend — rather than in the backend-
+		// agnostic GMS helpers so non-vLLM backends do not inherit a stray,
+		// meaningless env var if/when inter-pod GMS is extended to them.
+		container.Env = append(container.Env, corev1.EnvVar{Name: "DYN_VLLM_GMS_SHADOW_MODE", Value: "true"})
+	}
+
 	isMultinode := numberOfNodes > 1

 	if isMultinode {

--- a/deploy/operator/internal/dynamo/backend_vllm_test.go
+++ b/deploy/operator/internal/dynamo/backend_vllm_test.go
@@ -980,3 +980,73 @@ func TestShouldUseMpBackend(t *testing.T) {
 		})
 	}
 }
+
+// TestVLLMBackend_UpdateContainer_InterPodGMS asserts that when the inter-pod
+// GMS layout is enabled (gpuMemoryService.mode=interPod, with or without
+// failover), the vLLM backend is the one responsible for injecting both the
+// --load-format=gms flag and the DYN_VLLM_GMS_SHADOW_MODE env var. These are
+// vLLM-runtime switches and must live in the backend adapter, not in the
+// backend-agnostic GMS helpers (see gmsEngineEnvVars).
+func TestVLLMBackend_UpdateContainer_InterPodGMS(t *testing.T) {
+	backend := &VLLMBackend{}
+	component := &v1alpha1.DynamoComponentDeploymentSharedSpec{
+		GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{
+			Enabled: true,
+			Mode:    v1alpha1.GMSModeInterPod,
+		},
+		Failover: &v1alpha1.FailoverSpec{
+			Enabled: true,
+			Mode:    v1alpha1.GMSModeInterPod,
+		},
+	}
+	container := &corev1.Container{
+		Command: []string{"python3"},
+		Args:    []string{"-m", "dynamo.vllm"},
+	}
+
+	backend.UpdateContainer(container, 1, RoleMain, component, "svc", &GroveMultinodeDeployer{})
+
+	// --load-format gms must be present after the update. On failure, print
+	// both Command and Args with %q so the exact tokens (including any empty
+	// ones) are visible.
+	if !containerHasArg(container, "--load-format", "gms") {
+		t.Errorf("expected --load-format gms to be injected; got command=%q args=%q", container.Command, container.Args)
+	}
+
+	// DYN_VLLM_GMS_SHADOW_MODE must be set exactly once.
+	count := 0
+	for _, e := range container.Env {
+		if e.Name == "DYN_VLLM_GMS_SHADOW_MODE" {
+			count++
+			if e.Value != "true" {
+				t.Errorf("DYN_VLLM_GMS_SHADOW_MODE value = %q, want %q", e.Value, "true")
+			}
+		}
+	}
+	if count != 1 {
+		t.Errorf("DYN_VLLM_GMS_SHADOW_MODE env var count = %d, want 1", count)
+	}
+}
+
+// TestVLLMBackend_UpdateContainer_NoInterPodGMS asserts the complementary
+// invariant: when inter-pod GMS is NOT enabled (here: an empty component
+// spec), the vLLM backend must not inject the GMS-specific env var (it is
+// meaningless outside the inter-pod layout).
+func TestVLLMBackend_UpdateContainer_NoInterPodGMS(t *testing.T) {
+	backend := &VLLMBackend{}
+	component := &v1alpha1.DynamoComponentDeploymentSharedSpec{}
+	container := &corev1.Container{
+		Command: []string{"python3"},
+		Args:    []string{"-m", "dynamo.vllm"},
+	}
+
+	backend.UpdateContainer(container, 1, RoleMain, component, "svc", &GroveMultinodeDeployer{})
+
+	// Any occurrence of the env var is a failure; the value is irrelevant.
+	for _, e := range container.Env {
+		if e.Name == "DYN_VLLM_GMS_SHADOW_MODE" {
+			t.Errorf("DYN_VLLM_GMS_SHADOW_MODE must not be injected when inter-pod GMS is disabled")
+		}
+	}
+}
--- a/deploy/operator/internal/dynamo/component_common.go
+++ b/deploy/operator/internal/dynamo/component_common.go
@@ -93,6 +93,10 @@ func (b *BaseComponentDefaults) getCommonContainer(context ComponentContext) cor
 		},
 	}
 	container.Env = []corev1.EnvVar{
+		{
+			Name:  "CONTAINER_NAME",
+			Value: commonconsts.MainContainerName,
+		},
 		{
 			Name:  commonconsts.DynamoNamespaceEnvVar,
 			Value: context.DynamoNamespace,
@@ -144,10 +148,9 @@ func (b *BaseComponentDefaults) getCommonContainer(context ComponentContext) cor
 	}

 	if context.Discovery.Mode == configv1alpha1.KubeDiscoveryModeContainer {
-		container.Env = append(container.Env, corev1.EnvVar{
-			Name:  "CONTAINER_NAME",
-			Value: container.Name,
-		})
+		// CONTAINER_NAME is already injected unconditionally above with
+		// MainContainerName (which equals container.Name here); do not append
+		// it again or we end up with two env entries of the same name.
 		container.Env = append(container.Env, corev1.EnvVar{
 			Name:  "DYN_KUBE_DISCOVERY_MODE",
 			Value: string(configv1alpha1.KubeDiscoveryModeContainer),

--- a/deploy/operator/internal/dynamo/component_planner_test.go
+++ b/deploy/operator/internal/dynamo/component_planner_test.go
@@ -82,6 +82,7 @@ func TestPlannerDefaults_GetBaseContainer(t *testing.T) {
 					FailureThreshold: 720,
 				},
 				Env: []corev1.EnvVar{
+					{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
 					{Name: commonconsts.DynamoNamespaceEnvVar, Value: "dynamo-namespace"},
 					{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypePlanner},
 					{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "name"},

--- a/deploy/operator/internal/dynamo/failover.go
+++ b/deploy/operator/internal/dynamo/failover.go
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
 */

 package dynamo
@@ -9,22 +21,389 @@ import (
 	"fmt"
 	"path/filepath"
 	"strconv"
+	"strings"

 	"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
 	commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
+	"github.com/ai-dynamo/dynamo/deploy/operator/internal/dra"
 	gmsruntime "github.com/ai-dynamo/dynamo/deploy/operator/internal/gms"
+	grovev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
 	corev1 "k8s.io/api/core/v1"
+	resourcev1 "k8s.io/api/resource/v1"
 	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/utils/ptr"
 )

-var failoverLockFile = filepath.Join(gmsruntime.SharedMountPath, "failover.lock")
+// ──────────────────────────────────────────────────────────────────────────────
+// Inter-pod GMS failover (Mode: interPod)
+//
+// A dedicated GMS weight server pod is created per rank. Engine pods share GPU
+// memory via DRA ResourceClaims and a hostPath volume for UDS sockets.
+// ──────────────────────────────────────────────────────────────────────────────
+
+const (
+	gmsSharedVolumeName = "gms-shared"
+	gmsHostPathBase     = "/run/gms"
+	gmsSharedMountPath  = "/run/gms/shared"
+	gmsFailoverLockFile = "failover.lock"
+	gmsPermFixInitName  = "fix-gms-perms"
+)
+
+// gmsWrapperScript renders the bash script used as the GMS weight server
+// container's entrypoint. The script:
+//
+//   - removes gms_*.sock files left in the shared mount by a previous run;
+//   - starts the GMS server module (gmsruntime.ServerModule) in the background
+//     and blocks on it with `wait -n`;
+//   - traps SIGTERM/SIGINT and routes them through cleanup, which signals the
+//     whole process group;
+//   - exits with the server's own exit status (rc stays 1 until the server has
+//     actually exited), so the container exitCode in the Pod status reflects
+//     the real failure mode rather than always being 1.
+//
+// Per the GMS server's contract (not visible in this file), the server
+// auto-discovers DRA-allocated GPUs and exposes "weights" and "kv_cache" UDS
+// sockets per device.
+func gmsWrapperScript() string {
+	const scriptTemplate = `rm -f %s/gms_*.sock
+rc=1
+cleanup() { kill -- -$$ 2>/dev/null; exit "$rc"; }
+trap cleanup SIGTERM SIGINT
+python3 -m %s &
+echo "Started GMS server pid=$!"
+wait -n
+rc=$?
+echo "GMS server exited (code=$rc), shutting down"
+cleanup`
+	return fmt.Sprintf(scriptTemplate, gmsSharedMountPath, gmsruntime.ServerModule)
+}
+
+// gmsStartupProbeCommand builds the exec startup-probe command for the GMS
+// server container. The probe succeeds once at least 2*gpuCount gms_*.sock
+// files exist in the shared mount, i.e. two sockets (weights and kv_cache)
+// for every allocated GPU.
+func gmsStartupProbeCommand(gpuCount int) []string {
+	expectedSockets := 2 * gpuCount
+	check := fmt.Sprintf("test $(ls %s/gms_*.sock 2>/dev/null | wc -l) -ge %d", gmsSharedMountPath, expectedSockets)
+	return []string{"sh", "-c", check}
+}
+
+// applyGMSSharedResources wires up everything a GMS weight server pod and an
+// engine pod have in common. It mutates both podSpec and the container c:
+// the nvidia.com/gpu limits/requests are dropped (DRA handles allocation),
+// the GPU toleration is added, the rank-isolated hostPath shared volume is
+// mounted into c, and the permission-fix init container is appended to the
+// pod's init containers (reusing c's image).
+func applyGMSSharedResources(podSpec *corev1.PodSpec, c *corev1.Container, rank int32) {
+	// GPUs come from DRA claims, so the extended-resource entries go away and
+	// the toleration keeps the pod schedulable on GPU nodes.
+	removeGPUFromLimits(c)
+	addGPUToleration(podSpec)
+
+	sharedVol, sharedMount := gmsSharedVolume(rank)
+	podSpec.Volumes = append(podSpec.Volumes, sharedVol)
+	c.VolumeMounts = append(c.VolumeMounts, sharedMount)
+
+	permFix := gmsPermFixInitContainer(rank, c.Image)
+	podSpec.InitContainers = append(podSpec.InitContainers, permFix)
+}
+
+// gmsWeightServerPodSpec derives a GMS weight server pod spec from a base
+// engine pod spec. The base is deep-copied and never mutated. On the copy,
+// the first container is switched to the GMS wrapper script, loses its
+// liveness/readiness probes, and gets a startup probe that waits for the
+// expected number of GMS UDS sockets. A base without containers is returned
+// as an unmodified copy.
+//
+// RestartPolicy is deliberately not set here, so the pod inherits the base /
+// Grove default (Always). A GMS server process holds only local state: GPU
+// allocations (via DRA, which outlive the container), hostPath UDS sockets
+// (recreated by gmsWrapperScript on startup), and in-memory weight buffers
+// (re-sharded on reconnection by the engine clients). An in-place kubelet
+// restart is therefore a fast and correct recovery path.
+//
+// In the standalone inter-pod GMS layout the paired engine pod follows the
+// same policy (a restarted engine re-imports IPC handles from the
+// still-running GMS server). In the inter-pod GMS failover layout,
+// augmentEngineForGMS overrides the engine's RestartPolicy to Never so the
+// cohort can only be recovered via FailoverCascadeReconciler; see the comment
+// on that function.
+func gmsWeightServerPodSpec(basePodSpec *corev1.PodSpec, rank int32, gpuCount int) *corev1.PodSpec {
+	gmsPod := basePodSpec.DeepCopy()
+	if len(gmsPod.Containers) == 0 {
+		return gmsPod
+	}
+	server := &gmsPod.Containers[0]
+
+	// Replace the engine entrypoint with the GMS wrapper.
+	server.Command = []string{"bash", "-c"}
+	server.Args = []string{gmsWrapperScript()}
+	server.Env = append(server.Env, corev1.EnvVar{
+		Name:  gmsruntime.EnvSocketDir,
+		Value: gmsSharedMountPath,
+	})
+
+	// Only a startup probe applies: it polls for the sockets every 2s and
+	// gives up after 150 failures (2s * 150 = 5 min).
+	startup := &corev1.Probe{
+		PeriodSeconds:    2,
+		TimeoutSeconds:   2,
+		FailureThreshold: 150,
+	}
+	startup.Exec = &corev1.ExecAction{Command: gmsStartupProbeCommand(gpuCount)}
+	server.StartupProbe = startup
+	server.LivenessProbe = nil
+	server.ReadinessProbe = nil
+
+	applyGMSSharedResources(gmsPod, server, rank)
+
+	return gmsPod
+}
+
+// gmsEngineEnvVars lists the backend-agnostic environment variables that
+// engine pods receive when GMS failover is enabled. Backend-specific switches
+// (e.g. the vLLM DYN_VLLM_GMS_SHADOW_MODE flag) are not part of this list;
+// they are injected by the backend's UpdateContainer path so non-vLLM
+// backends do not inherit stray env vars.
+func gmsEngineEnvVars() []corev1.EnvVar {
+	// ENGINE_ID is populated through the Downward API from the Grove
+	// pod-index label rather than being a static value.
+	engineID := corev1.EnvVar{
+		Name: "ENGINE_ID",
+		ValueFrom: &corev1.EnvVarSource{
+			FieldRef: &corev1.ObjectFieldSelector{
+				FieldPath: "metadata.labels['grove.io/podclique-pod-index']",
+			},
+		},
+	}
+	lockPath := gmsSharedMountPath + "/" + gmsFailoverLockFile
+
+	return []corev1.EnvVar{
+		engineID,
+		{Name: gmsruntime.EnvSocketDir, Value: gmsSharedMountPath},
+		{Name: "FAILOVER_LOCK_PATH", Value: lockPath},
+		{Name: "DYN_SYSTEM_STARTING_HEALTH_STATUS", Value: "notready"},
+	}
+}
+
+// augmentEngineForGMS adapts an engine pod spec, in place, to the inter-pod
+// GMS layout. The first container receives the GMS engine env vars and loses
+// DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS; the pod gets the shared volume, the
+// GPU toleration, no nvidia.com/gpu limits, and an appended init container
+// that fixes the hostPath directory permissions. A pod spec without
+// containers is left untouched.
+//
+// RestartPolicy is the single asymmetry between the two layouts:
+//
+//   - Standalone inter-pod GMS (isInterPodFailover=false): RestartPolicy is
+//     not touched (inherits Always), matching the GMS weight-server pod. A
+//     crashed engine is restarted in place by kubelet; the GMS server keeps
+//     running and the new engine container reconnects to the existing UDS
+//     sockets and re-imports CUDA IPC handles during --load-format gms
+//     startup. There is no cohort state to protect because there is no
+//     cohort, only one engine paired with one GMS server per rank.
+//
+//   - Inter-pod GMS failover (isInterPodFailover=true): RestartPolicy is
+//     forced to Never. Engine pods in a failover cohort hold distributed
+//     state that cannot survive an in-place container restart: active NCCL
+//     collectives, torch.distributed TCPStore membership, and primary/shadow
+//     coordination via the failover lock file and DYN_VLLM_GMS_SHADOW_MODE.
+//     An in-place restart leaves the cohort half torn down and blocks
+//     recovery. The intended recovery path is for the pod to exit,
+//     FailoverCascadeReconciler (see failover_cascade_controller.go) to
+//     force-delete the full engine group based on the
+//     KubeLabelDynamoFailoverEngineGroupMember label, and Grove to recreate
+//     the cohort from scratch. That label is applied in graph.go only when
+//     isInterPodFailover is true, so forcing Never in the standalone case
+//     would strand engine pods in Failed state with nothing listening to
+//     force-delete them.
+func augmentEngineForGMS(podSpec *corev1.PodSpec, rank int32, isInterPodFailover bool) {
+	if len(podSpec.Containers) == 0 {
+		return
+	}
+	engine := &podSpec.Containers[0]
+
+	engine.Env = append(engine.Env, gmsEngineEnvVars()...)
+	removeEnvVar(engine, "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS")
+	applyGMSSharedResources(podSpec, engine, rank)
+
+	if !isInterPodFailover {
+		// Standalone layout: keep whatever RestartPolicy the pod already has.
+		return
+	}
+	podSpec.RestartPolicy = corev1.RestartPolicyNever
+}
+
+// gmsSharedVolume builds the hostPath volume rooted at gmsHostPathBase and the
+// matching mount at gmsSharedMountPath. The mount's subPathExpr combines the
+// PCSG name, the PCSG index and the rank, so every PCSG replica and every
+// rank gets its own subdirectory of the shared host directory.
+func gmsSharedVolume(rank int32) (corev1.Volume, corev1.VolumeMount) {
+	subPath := fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)/rank-%d", rank)
+
+	hostPath := &corev1.HostPathVolumeSource{
+		Path: gmsHostPathBase,
+		Type: ptr.To(corev1.HostPathDirectoryOrCreate),
+	}
+
+	sharedVol := corev1.Volume{
+		Name:         gmsSharedVolumeName,
+		VolumeSource: corev1.VolumeSource{HostPath: hostPath},
+	}
+	sharedMount := corev1.VolumeMount{
+		Name:        gmsSharedVolumeName,
+		MountPath:   gmsSharedMountPath,
+		SubPathExpr: subPath,
+	}
+	return sharedVol, sharedMount
+}
+
+// gmsPermFixInitContainer builds the init container that runs `chmod 1777` on
+// the shared mount so the non-root application user can write UDS sockets and
+// lock files there. It runs as root with the given image and mounts the
+// shared volume with the same subPathExpr as the main container, so kubelet
+// creates the isolated subdirectory before the chmod runs.
+func gmsPermFixInitContainer(rank int32, image string) corev1.Container {
+	_, sharedMount := gmsSharedVolume(rank)
+
+	// uid 0 is required to chmod the hostPath mount on behalf of the non-root
+	// engine/server processes. RunAsNonRoot=false is spelled out explicitly
+	// so that cluster-wide baseline/restricted PodSecurity policies and some
+	// pod-level SecurityContext defaults do not silently reject this init
+	// container on admission.
+	rootContext := &corev1.SecurityContext{
+		RunAsUser:    ptr.To[int64](0),
+		RunAsNonRoot: ptr.To(false),
+	}
+
+	initContainer := corev1.Container{
+		Name:            gmsPermFixInitName,
+		Image:           image,
+		SecurityContext: rootContext,
+	}
+	initContainer.Command = []string{"sh", "-c", fmt.Sprintf("chmod 1777 %s", gmsSharedMountPath)}
+	initContainer.VolumeMounts = []corev1.VolumeMount{sharedMount}
+	return initContainer
+}
+
+// removeGPUFromLimits drops the nvidia.com/gpu entry from both the limits and
+// the requests of the container, because DRA handles GPU allocation for GMS
+// pods. Missing (nil) resource lists are left as they are.
+func removeGPUFromLimits(c *corev1.Container) {
+	const gpuResource corev1.ResourceName = "nvidia.com/gpu"
+	for _, list := range []corev1.ResourceList{c.Resources.Limits, c.Resources.Requests} {
+		delete(list, gpuResource)
+	}
+}
+
+// addGPUToleration ensures pods without explicit GPU limits still get
+// scheduled on GPU nodes by tolerating the nvidia.com/gpu NoSchedule taint
+// regardless of the taint's value (operator Exists).
+//
+// The call is idempotent: nothing is added when the pod already carries an
+// identical key/operator/effect toleration. A toleration with the same key
+// and effect but a different operator (e.g. Equal, which only matches one
+// specific taint value) does not make this one redundant, so the Exists
+// toleration is still appended in that case.
+func addGPUToleration(podSpec *corev1.PodSpec) {
+	toleration := corev1.Toleration{
+		Key:      "nvidia.com/gpu",
+		Operator: corev1.TolerationOpExists,
+		Effect:   corev1.TaintEffectNoSchedule,
+	}
+	for _, t := range podSpec.Tolerations {
+		if t.Key == toleration.Key && t.Operator == toleration.Operator && t.Effect == toleration.Effect {
+			return
+		}
+	}
+	podSpec.Tolerations = append(podSpec.Tolerations, toleration)
+}
+
+// removeEnvVar deletes every env var called name from the container while
+// keeping the remaining entries in their original order. The filtering is
+// done in place, reusing the existing backing array of c.Env.
+func removeEnvVar(c *corev1.Container, name string) {
+	kept := 0
+	for i := range c.Env {
+		if c.Env[i].Name == name {
+			continue
+		}
+		c.Env[kept] = c.Env[i]
+		kept++
+	}
+	c.Env = c.Env[:kept]
+}
+
+// getGPUCount reads the GPU count from the component's resource limits. It
+// returns 0 when resources or limits are absent, or when the GPU field is
+// empty or not a base-10 integer that fits in 32 bits.
+func getGPUCount(resources *v1alpha1.Resources) int32 {
+	if resources == nil || resources.Limits == nil {
+		return 0
+	}
+	// An empty string fails to parse, which covers the "unset" case as well.
+	gpus, err := strconv.ParseInt(resources.Limits.GPU, 10, 32)
+	if err != nil {
+		return 0
+	}
+	return int32(gpus)
+}
+
+// getDeviceClassName picks the DRA device class name for a component: the
+// gpuType from its resource limits when one is set, otherwise
+// dra.DefaultDeviceClassName (the default device class shipped with the
+// NVIDIA DRA driver). The default literal is intentionally not repeated
+// here; the dra package is its single source of truth.
+func getDeviceClassName(resources *v1alpha1.Resources) string {
+	if resources == nil || resources.Limits == nil || resources.Limits.GPUType == "" {
+		return dra.DefaultDeviceClassName
+	}
+	return resources.Limits.GPUType
+}
+
+// gmsRCTName derives the deterministic ResourceClaimTemplate name for a
+// service and rank, in the form "<serviceName>-gpu-rank-<rank>".
+func gmsRCTName(serviceName string, rank int32) string {
+	return serviceName + "-gpu-rank-" + strconv.Itoa(int(rank))
+}
+
+// gmsResourceClaimTemplateConfigs produces one PCS-level
+// ResourceClaimTemplateConfig per distinct rank found in roles, in order of
+// first appearance. All templates request the same devices (device class and
+// exact GPU count taken from resources) but carry a per-rank name, so that
+// each rank's GMS + engine pods get their own ResourceClaim.
+func gmsResourceClaimTemplateConfigs(serviceName string, resources *v1alpha1.Resources, roles []ServiceRole) []grovev1alpha1.ResourceClaimTemplateConfig {
+	deviceClass := getDeviceClassName(resources)
+	gpuCount := int64(getGPUCount(resources))
+
+	visited := make(map[int32]struct{}, len(roles))
+	out := make([]grovev1alpha1.ResourceClaimTemplateConfig, 0, len(roles))
+	for _, role := range roles {
+		// Several roles (GMS server, engine) share a rank; emit it once.
+		if _, dup := visited[role.Rank]; dup {
+			continue
+		}
+		visited[role.Rank] = struct{}{}
+
+		request := resourcev1.DeviceRequest{
+			Name: "gpu",
+			Exactly: &resourcev1.ExactDeviceRequest{
+				DeviceClassName: deviceClass,
+				AllocationMode:  resourcev1.DeviceAllocationModeExactCount,
+				Count:           gpuCount,
+			},
+		}
+		cfg := grovev1alpha1.ResourceClaimTemplateConfig{Name: gmsRCTName(serviceName, role.Rank)}
+		cfg.TemplateSpec.Spec.Devices.Requests = []resourcev1.DeviceRequest{request}
+		out = append(out, cfg)
+	}
+	return out
+}
+
+// gmsResourceSharingEntries produces one PCSG-level ResourceSharingSpec per
+// distinct rank found in roles, in order of first appearance. Every entry
+// references that rank's ResourceClaimTemplate with PerReplica scope, and its
+// filter lists the lower-cased names of exactly the roles sharing the rank
+// (the GMS clique and the engine clique), which keeps GPUs isolated between
+// ranks.
+func gmsResourceSharingEntries(serviceName string, roles []ServiceRole) []grovev1alpha1.PCSGResourceSharingSpec {
+	cliquesByRank := map[int32][]string{}
+	var ranks []int32 // first-seen order, since map iteration is unordered
+
+	for _, role := range roles {
+		if _, known := cliquesByRank[role.Rank]; !known {
+			ranks = append(ranks, role.Rank)
+		}
+		cliquesByRank[role.Rank] = append(cliquesByRank[role.Rank], strings.ToLower(role.Name))
+	}
+
+	entries := make([]grovev1alpha1.PCSGResourceSharingSpec, 0, len(ranks))
+	for _, rank := range ranks {
+		sharing := grovev1alpha1.ResourceSharingSpec{
+			Name:  gmsRCTName(serviceName, rank),
+			Scope: grovev1alpha1.ResourceSharingScopePerReplica,
+		}
+		entries = append(entries, grovev1alpha1.PCSGResourceSharingSpec{
+			ResourceSharingSpec: sharing,
+			Filter: &grovev1alpha1.PCSGResourceSharingFilter{
+				ChildCliqueNames: cliquesByRank[rank],
+			},
+		})
+	}
+	return entries
+}
+
+// ──────────────────────────────────────────────────────────────────────────────
+// Intra-pod GMS failover (Mode: intraPod)
+//
+// The main container is cloned into two engine containers (active + standby)
+// within the same pod. GPU access is shared via DRA and a GMS sidecar
+// injects weights via the shared emptyDir volume.
+// ──────────────────────────────────────────────────────────────────────────────
+
+// intraPodFailoverLockFile is the lock file path used by engine containers to
+// coordinate active/standby election within the same pod.
+var intraPodFailoverLockFile = filepath.Join(gmsruntime.SharedMountPath, "failover.lock")

 const (
 	failoverEngineCount = 2
 )

+// isFailoverEnabled returns true only for intra-pod failover mode, where the
+// main container is cloned into active + standby containers within the same pod.
+// Inter-pod failover (Mode=interPod) is handled separately via expandRolesForService
+// and generatePodSpecForRole — it does not use container cloning.
 func isFailoverEnabled(component *v1alpha1.DynamoComponentDeploymentSharedSpec) bool {
-	return component.Failover != nil && component.Failover.Enabled
+	return component.Failover != nil && component.Failover.Enabled &&
+		component.Failover.Mode == v1alpha1.GMSModeIntraPod
 }

 // buildFailoverPod clones the main container into two engine containers (active + standby).
@@ -95,11 +474,10 @@ func buildEngineContainer(base corev1.Container, engineID int, systemPort int) c
 		}
 	}

-	containerName := fmt.Sprintf("engine-%d", engineID)
 	failoverEnvs := []corev1.EnvVar{
 		{Name: "ENGINE_ID", Value: strconv.Itoa(engineID)},
-		{Name: "CONTAINER_NAME", Value: containerName},
-		{Name: "FAILOVER_LOCK_PATH", Value: failoverLockFile},
+		{Name: "CONTAINER_NAME", Value: engine.Name},
+		{Name: "FAILOVER_LOCK_PATH", Value: intraPodFailoverLockFile},
 		{Name: "DYN_SYSTEM_STARTING_HEALTH_STATUS", Value: "notready"},
 		{Name: "DYN_SYSTEM_PORT", Value: strconv.Itoa(systemPort)},
 		{Name: "DYN_SYSTEM_ENABLED", Value: "true"},

--- a/deploy/operator/internal/dynamo/failover_test.go
+++ b/deploy/operator/internal/dynamo/failover_test.go
 /*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
 */

 package dynamo
@@ -14,16 +26,435 @@ import (
 	commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/dra"
 	"github.com/ai-dynamo/dynamo/deploy/operator/internal/gms"
+	grovev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	corev1 "k8s.io/api/core/v1"
+	k8sresource "k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/util/intstr"
 )

-// failoverPodSpec returns a pod spec that has already been transformed by
+// ──────────────────────────────────────────────────────────────────────────────
+// Inter-pod GMS failover tests
+// ──────────────────────────────────────────────────────────────────────────────
+
+// TestGmsWeightServerPodSpec checks the full transformation of an engine pod
+// spec into a GMS weight server pod spec: entrypoint, probes, GPU limits,
+// toleration, shared volume, socket-dir env var and the perm-fix init
+// container. It also checks that the base pod spec is not mutated.
+func TestGmsWeightServerPodSpec(t *testing.T) {
+	base := &corev1.PodSpec{
+		Containers: []corev1.Container{{
+			Name:    "engine",
+			Command: []string{"python3", "-m", "vllm.entrypoints.openai.api_server"},
+			Args:    []string{"--model", "meta-llama/Llama-3-8B"},
+			LivenessProbe: &corev1.Probe{
+				ProbeHandler: corev1.ProbeHandler{
+					HTTPGet: &corev1.HTTPGetAction{Path: "/health"},
+				},
+			},
+			ReadinessProbe: &corev1.Probe{
+				ProbeHandler: corev1.ProbeHandler{
+					HTTPGet: &corev1.HTTPGetAction{Path: "/ready"},
+				},
+			},
+			Resources: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":      k8sresource.MustParse("8"),
+					corev1.ResourceMemory: k8sresource.MustParse("64Gi"),
+				},
+			},
+		}},
+	}
+
+	result := gmsWeightServerPodSpec(base, 0, 8)
+
+	require.Len(t, result.Containers, 1)
+	c := result.Containers[0]
+
+	assert.Equal(t, []string{"bash", "-c"}, c.Command, "should use bash")
+	require.Len(t, c.Args, 1)
+	assert.Contains(t, c.Args[0], gms.ServerModule, "should run gpu_memory_service.cli.server")
+	assert.Nil(t, c.LivenessProbe, "liveness probe should be nil")
+	assert.Nil(t, c.ReadinessProbe, "readiness probe should be nil")
+	// require (not assert) so a missing probe fails the test instead of
+	// panicking on the dereference below.
+	require.NotNil(t, c.StartupProbe, "startup probe should be set")
+	require.NotNil(t, c.StartupProbe.Exec, "startup probe should be an exec probe")
+	assert.Equal(t, gmsStartupProbeCommand(8), c.StartupProbe.Exec.Command)
+
+	assert.NotContains(t, c.Resources.Limits, corev1.ResourceName("nvidia.com/gpu"), "GPU should be stripped")
+	assert.Contains(t, c.Resources.Limits, corev1.ResourceMemory, "non-GPU limits should remain")
+
+	assert.True(t, hasToleration(result, "nvidia.com/gpu"), "should have GPU toleration")
+	assert.True(t, hasVolume(result, gmsSharedVolumeName), "should have shared volume")
+	assert.True(t, hasVolumeMount(c, gmsSharedMountPath), "should have shared volume mount")
+	assert.True(t, hasEnvVar(c, gms.EnvSocketDir, gmsSharedMountPath), "should set GMS_SOCKET_DIR")
+
+	require.Len(t, result.InitContainers, 1, "should have perm-fix init container")
+	initC := result.InitContainers[0]
+	assert.Equal(t, gmsPermFixInitName, initC.Name)
+	assert.Equal(t, c.Image, initC.Image, "init container should reuse the service image")
+	require.NotNil(t, initC.SecurityContext)
+	require.NotNil(t, initC.SecurityContext.RunAsUser, "init container should pin its UID")
+	assert.Equal(t, int64(0), *initC.SecurityContext.RunAsUser)
+
+	// Verify original is not mutated
+	assert.Len(t, base.Containers[0].Command, 3, "original command should be unchanged")
+}
+
+// TestGmsWeightServerPodSpec_EmptyContainers checks that a base pod spec
+// without containers yields a result that has no containers either.
+func TestGmsWeightServerPodSpec_EmptyContainers(t *testing.T) {
+	got := gmsWeightServerPodSpec(&corev1.PodSpec{}, 0, 1)
+	assert.Empty(t, got.Containers)
+}
+
+// TestGmsWeightServerPodSpec_SubPathExpr checks that the shared-volume mount
+// of the GMS container carries a subPathExpr ending in the requested rank.
+func TestGmsWeightServerPodSpec_SubPathExpr(t *testing.T) {
+	base := &corev1.PodSpec{
+		Containers: []corev1.Container{{Name: "engine"}},
+	}
+
+	for _, rank := range []int32{0, 3} {
+		rank := rank
+		t.Run(fmt.Sprintf("rank %d", rank), func(t *testing.T) {
+			spec := gmsWeightServerPodSpec(base, rank, 4)
+			mount := findVolumeMount(spec.Containers[0], gmsSharedMountPath)
+			require.NotNil(t, mount, "GMS container should mount shared volume")
+			want := fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)/rank-%d", rank)
+			assert.Equal(t, want, mount.SubPathExpr)
+		})
+	}
+}
+
+// TestAugmentEngineForGMS checks the in-place transformation of an engine pod
+// spec for the inter-pod GMS failover layout: injected and removed env vars,
+// stripped GPU limits, toleration, shared volume, perm-fix init container and
+// the forced RestartPolicyNever.
+func TestAugmentEngineForGMS(t *testing.T) {
+	podSpec := &corev1.PodSpec{
+		Containers: []corev1.Container{{
+			Name: "engine",
+			Env: []corev1.EnvVar{
+				{Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "true"},
+				{Name: "KEEP_ME", Value: "yes"},
+			},
+			Resources: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu": k8sresource.MustParse("4"),
+				},
+			},
+		}},
+	}
+
+	augmentEngineForGMS(podSpec, 1, true)
+	c := podSpec.Containers[0]
+
+	assert.True(t, hasEnvVar(c, "ENGINE_ID", ""), "ENGINE_ID should be set (via Downward API)")
+	assert.True(t, hasEnvVar(c, gms.EnvSocketDir, gmsSharedMountPath))
+	assert.True(t, hasEnvVar(c, "FAILOVER_LOCK_PATH", gmsSharedMountPath+"/"+gmsFailoverLockFile))
+	// DYN_VLLM_GMS_SHADOW_MODE is backend-specific and is injected by
+	// VLLMBackend.UpdateContainer, not by augmentEngineForGMS. See
+	// TestVLLMBackend_UpdateContainer_InterPodGMS in backend_vllm_test.go.
+	assert.False(t, hasEnvVar(c, "DYN_VLLM_GMS_SHADOW_MODE", "true"),
+		"vLLM-specific env var must not leak into backend-agnostic GMS helpers")
+	assert.True(t, hasEnvVar(c, "DYN_SYSTEM_STARTING_HEALTH_STATUS", "notready"))
+	assert.True(t, hasEnvVar(c, "KEEP_ME", "yes"), "unrelated env vars should be preserved")
+
+	for _, e := range c.Env {
+		assert.NotEqual(t, "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", e.Name, "should be removed")
+	}
+
+	assert.NotContains(t, c.Resources.Limits, corev1.ResourceName("nvidia.com/gpu"))
+	assert.True(t, hasToleration(podSpec, "nvidia.com/gpu"))
+	assert.True(t, hasVolume(podSpec, gmsSharedVolumeName))
+
+	require.Len(t, podSpec.InitContainers, 1, "should have perm-fix init container")
+	initC := podSpec.InitContainers[0]
+	assert.Equal(t, gmsPermFixInitName, initC.Name)
+	assert.Equal(t, c.Image, initC.Image, "init container should reuse the service image")
+	require.NotNil(t, initC.SecurityContext)
+	// Guard the pointer so a missing UID fails the test instead of panicking.
+	require.NotNil(t, initC.SecurityContext.RunAsUser, "init container should pin its UID")
+	assert.Equal(t, int64(0), *initC.SecurityContext.RunAsUser)
+	initMount := findVolumeMount(initC, gmsSharedMountPath)
+	require.NotNil(t, initMount, "init container should mount shared volume")
+	assert.Equal(t, "$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)/rank-1", initMount.SubPathExpr)
+
+	assert.Equal(t, corev1.RestartPolicyNever, podSpec.RestartPolicy,
+		"inter-pod failover engines must be RestartPolicyNever so the "+
+			"FailoverCascadeReconciler is the sole recovery path")
+}
+
+// TestAugmentEngineForGMS_StandaloneDoesNotForceRestartNever locks in the
+// standalone inter-pod GMS behavior: augmentEngineForGMS must leave the
+// engine pod's RestartPolicy untouched when isInterPodFailover is false.
+//
+// Rationale: the cascade-group label is only applied when isInterPodFailover
+// is true (see graph.go:GenerateGrovePodCliqueSet), so forcing Never in
+// standalone mode would strand a crashed engine in Failed state with nothing
+// listening to force-delete the PCSG replica. With the default (Always)
+// kubelet restarts the engine in place, matching the paired GMS weight-server
+// pod, and the restarted engine reconnects to the still-running GMS server
+// over UDS during --load-format gms startup.
+func TestAugmentEngineForGMS_StandaloneDoesNotForceRestartNever(t *testing.T) {
+	gpuLimits := corev1.ResourceList{
+		"nvidia.com/gpu": k8sresource.MustParse("4"),
+	}
+	podSpec := &corev1.PodSpec{
+		Containers: []corev1.Container{{
+			Name:      "engine",
+			Resources: corev1.ResourceRequirements{Limits: gpuLimits},
+		}},
+	}
+
+	augmentEngineForGMS(podSpec, 0, false)
+	engine := podSpec.Containers[0]
+
+	assert.Equal(t, corev1.RestartPolicy(""), podSpec.RestartPolicy,
+		"standalone inter-pod GMS engine must not have RestartPolicy overridden; "+
+			"kubelet restart is the correct recovery path")
+
+	// The GMS wiring itself is still expected in standalone mode.
+	assert.True(t, hasVolume(podSpec, gmsSharedVolumeName),
+		"standalone engine still needs the shared hostPath for UDS sockets")
+	assert.True(t, hasEnvVar(engine, gms.EnvSocketDir, gmsSharedMountPath),
+		"standalone engine still needs the socket-dir env var to reach the GMS server")
+}
+
+// TestAugmentEngineForGMS_EmptyContainers checks that a pod spec without
+// containers is handled without panicking and stays container-less.
+func TestAugmentEngineForGMS_EmptyContainers(t *testing.T) {
+	var empty corev1.PodSpec
+	augmentEngineForGMS(&empty, 0, true)
+	assert.Empty(t, empty.Containers)
+}
+
+// TestRemoveGPUFromLimits checks that nvidia.com/gpu is removed from both
+// limits and requests while other limits are kept.
+func TestRemoveGPUFromLimits(t *testing.T) {
+	const gpu = corev1.ResourceName("nvidia.com/gpu")
+
+	container := &corev1.Container{}
+	container.Resources.Limits = corev1.ResourceList{
+		"nvidia.com/gpu":      k8sresource.MustParse("8"),
+		corev1.ResourceMemory: k8sresource.MustParse("64Gi"),
+	}
+	container.Resources.Requests = corev1.ResourceList{
+		"nvidia.com/gpu": k8sresource.MustParse("8"),
+	}
+
+	removeGPUFromLimits(container)
+
+	assert.NotContains(t, container.Resources.Limits, gpu)
+	assert.Contains(t, container.Resources.Limits, corev1.ResourceMemory)
+	assert.NotContains(t, container.Resources.Requests, gpu)
+}
+
+// TestAddGPUToleration_Idempotent checks that calling addGPUToleration twice
+// leaves exactly one nvidia.com/gpu toleration on the pod.
+func TestAddGPUToleration_Idempotent(t *testing.T) {
+	podSpec := &corev1.PodSpec{}
+	for call := 0; call < 2; call++ {
+		addGPUToleration(podSpec)
+	}
+
+	var matches int
+	for i := range podSpec.Tolerations {
+		if podSpec.Tolerations[i].Key == "nvidia.com/gpu" {
+			matches++
+		}
+	}
+	assert.Equal(t, 1, matches, "toleration should be added only once")
+}
+
+// TestRemoveEnvVar checks that every occurrence of the named env var is
+// removed and that the remaining entries keep their relative order.
+func TestRemoveEnvVar(t *testing.T) {
+	c := &corev1.Container{
+		Env: []corev1.EnvVar{
+			{Name: "A", Value: "1"},
+			{Name: "REMOVE_ME", Value: "x"},
+			{Name: "B", Value: "2"},
+			{Name: "REMOVE_ME", Value: "y"},
+		},
+	}
+
+	removeEnvVar(c, "REMOVE_ME")
+	// require (not assert) so a wrong length fails the test instead of
+	// panicking on the index expressions below.
+	require.Len(t, c.Env, 2)
+	assert.Equal(t, "A", c.Env[0].Name)
+	assert.Equal(t, "B", c.Env[1].Name)
+}
+
+// TestGetGPUCount covers getGPUCount for missing, empty, valid and
+// non-numeric GPU limit values.
+func TestGetGPUCount(t *testing.T) {
+	withGPU := func(gpu string) *v1alpha1.Resources {
+		return &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{GPU: gpu}}
+	}
+
+	cases := []struct {
+		name      string
+		resources *v1alpha1.Resources
+		want      int32
+	}{
+		{name: "nil resources", resources: nil, want: 0},
+		{name: "nil limits", resources: &v1alpha1.Resources{}, want: 0},
+		{name: "empty gpu string", resources: withGPU(""), want: 0},
+		{name: "valid gpu count", resources: withGPU("8"), want: 8},
+		{name: "invalid gpu string", resources: withGPU("abc"), want: 0},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			assert.Equal(t, tc.want, getGPUCount(tc.resources))
+		})
+	}
+}
+
+// TestGetDeviceClassName covers the fallback to the default device class and
+// the pass-through of a custom gpuType.
+func TestGetDeviceClassName(t *testing.T) {
+	cases := []struct {
+		name      string
+		resources *v1alpha1.Resources
+		want      string
+	}{
+		{name: "nil resources", resources: nil, want: "gpu.nvidia.com"},
+		{name: "nil limits", resources: &v1alpha1.Resources{}, want: "gpu.nvidia.com"},
+		{
+			name:      "empty gpuType",
+			resources: &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{}},
+			want:      "gpu.nvidia.com",
+		},
+		{
+			name:      "custom gpuType",
+			resources: &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{GPUType: "gpu.nvidia.com/h100"}},
+			want:      "gpu.nvidia.com/h100",
+		},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			assert.Equal(t, tc.want, getDeviceClassName(tc.resources))
+		})
+	}
+}
+
+// TestGmsEngineEnvVars checks which env vars gmsEngineEnvVars returns and
+// that ENGINE_ID is sourced from the Grove pod-index label via the Downward
+// API.
+func TestGmsEngineEnvVars(t *testing.T) {
+	envs := gmsEngineEnvVars()
+
+	names := make(map[string]bool)
+	for _, e := range envs {
+		names[e.Name] = true
+	}
+
+	assert.True(t, names["ENGINE_ID"])
+	assert.True(t, names[gms.EnvSocketDir])
+	assert.True(t, names["FAILOVER_LOCK_PATH"])
+	assert.True(t, names["DYN_SYSTEM_STARTING_HEALTH_STATUS"])
+	// DYN_VLLM_GMS_SHADOW_MODE is backend-specific and is injected by
+	// VLLMBackend.UpdateContainer, not by gmsEngineEnvVars. See
+	// TestVLLMBackend_UpdateContainer_InterPodGMS in backend_vllm_test.go.
+	assert.False(t, names["DYN_VLLM_GMS_SHADOW_MODE"],
+		"vLLM-specific env var must not leak into backend-agnostic GMS helpers")
+
+	for _, e := range envs {
+		if e.Name == "ENGINE_ID" {
+			// require (not assert) so a missing source fails the test
+			// instead of panicking on the dereferences below.
+			require.NotNil(t, e.ValueFrom, "ENGINE_ID should use Downward API")
+			require.NotNil(t, e.ValueFrom.FieldRef)
+			assert.Contains(t, e.ValueFrom.FieldRef.FieldPath, "grove.io/podclique-pod-index")
+		}
+	}
+}
+
+// TestGroveMultinodeDeployer_GMS checks how GroveMultinodeDeployer reports the
+// node rank and the host names with and without IsInterPodGMS.
+func TestGroveMultinodeDeployer_GMS(t *testing.T) {
+	t.Run("GetNodeRank returns static rank for GMS", func(t *testing.T) {
+		d := &GroveMultinodeDeployer{IsInterPodGMS: true, Rank: 2}
+		rank, isShellExpr := d.GetNodeRank()
+		assert.Equal(t, "2", rank)
+		assert.False(t, isShellExpr, "GMS rank should be static, not a shell expression")
+	})
+
+	t.Run("GetNodeRank returns shell expr for non-GMS", func(t *testing.T) {
+		d := &GroveMultinodeDeployer{IsInterPodGMS: false}
+		rank, isShellExpr := d.GetNodeRank()
+		assert.Contains(t, rank, "GROVE_PCLQ_POD_INDEX")
+		assert.True(t, isShellExpr)
+	})
+
+	t.Run("GetHostNames for GMS multinode", func(t *testing.T) {
+		d := &GroveMultinodeDeployer{IsInterPodGMS: true, Rank: 0}
+		hostnames := d.GetHostNames("svc", 3)
+		// require (not assert) so a short slice fails the subtest instead
+		// of panicking on the index expressions below.
+		require.Len(t, hostnames, 3)
+		assert.Contains(t, hostnames[0], "ldr-$(GROVE_PCLQ_POD_INDEX)")
+		assert.Contains(t, hostnames[1], "wkr-1-$(GROVE_PCLQ_POD_INDEX)")
+		assert.Contains(t, hostnames[2], "wkr-2-$(GROVE_PCLQ_POD_INDEX)")
+	})
+
+	t.Run("GetHostNames for non-GMS multinode", func(t *testing.T) {
+		d := &GroveMultinodeDeployer{IsInterPodGMS: false}
+		hostnames := d.GetHostNames("svc", 3)
+		require.Len(t, hostnames, 3)
+		assert.Contains(t, hostnames[0], "ldr")
+		assert.Contains(t, hostnames[1], "wkr-0")
+		assert.Contains(t, hostnames[2], "wkr-1")
+	})
+}
+
+// TestGmsRCTName checks the "<service>-gpu-rank-<rank>" naming scheme.
+func TestGmsRCTName(t *testing.T) {
+	cases := []struct {
+		service string
+		rank    int32
+		want    string
+	}{
+		{service: "my-svc", rank: 0, want: "my-svc-gpu-rank-0"},
+		{service: "llama", rank: 2, want: "llama-gpu-rank-2"},
+	}
+	for _, tc := range cases {
+		assert.Equal(t, tc.want, gmsRCTName(tc.service, tc.rank))
+	}
+}
+
+// TestGmsResourceClaimTemplateConfigs_SingleNode checks that two roles on the
+// same rank collapse into a single ResourceClaimTemplate carrying the
+// configured device class and GPU count.
+func TestGmsResourceClaimTemplateConfigs_SingleNode(t *testing.T) {
+	resources := &v1alpha1.Resources{
+		Limits: &v1alpha1.ResourceItem{GPU: "8", GPUType: "gpu.nvidia.com/h100"},
+	}
+	roles := []ServiceRole{
+		{Name: "svc-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
+		{Name: "svc", Role: RoleMain, Rank: 0, Replicas: 2},
+	}
+
+	configs := gmsResourceClaimTemplateConfigs("svc", resources, roles)
+
+	require.Len(t, configs, 1)
+	assert.Equal(t, "svc-gpu-rank-0", configs[0].Name)
+
+	// Guard the slice so a missing request fails the test instead of
+	// panicking on the index expression below.
+	require.Len(t, configs[0].TemplateSpec.Spec.Devices.Requests, 1)
+	req := configs[0].TemplateSpec.Spec.Devices.Requests[0]
+	require.NotNil(t, req.Exactly)
+	assert.Equal(t, "gpu.nvidia.com/h100", req.Exactly.DeviceClassName)
+	assert.Equal(t, int64(8), req.Exactly.Count)
+}
+
+// TestGmsResourceClaimTemplateConfigs_Multinode checks that one
+// ResourceClaimTemplate is produced per rank, in rank order of first
+// appearance, and that the default device class is used when no gpuType is
+// configured.
+func TestGmsResourceClaimTemplateConfigs_Multinode(t *testing.T) {
+	resources := &v1alpha1.Resources{
+		Limits: &v1alpha1.ResourceItem{GPU: "4"},
+	}
+	roles := []ServiceRole{
+		{Name: "svc-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
+		{Name: "svc-ldr", Role: RoleLeader, Rank: 0, Replicas: 3},
+		{Name: "svc-gms-1", Role: RoleGMS, Rank: 1, Replicas: 1},
+		{Name: "svc-wkr-1", Role: RoleWorker, Rank: 1, Replicas: 3},
+	}
+
+	configs := gmsResourceClaimTemplateConfigs("svc", resources, roles)
+
+	require.Len(t, configs, 2)
+	assert.Equal(t, "svc-gpu-rank-0", configs[0].Name)
+	assert.Equal(t, "svc-gpu-rank-1", configs[1].Name)
+
+	// Guard the slice so a missing request fails the test instead of
+	// panicking on the index expression below.
+	require.Len(t, configs[1].TemplateSpec.Spec.Devices.Requests, 1)
+	req := configs[1].TemplateSpec.Spec.Devices.Requests[0]
+	require.NotNil(t, req.Exactly)
+	assert.Equal(t, "gpu.nvidia.com", req.Exactly.DeviceClassName)
+	assert.Equal(t, int64(4), req.Exactly.Count)
+}
+
+// TestGmsResourceSharingEntries_SingleNode checks that the GMS and main roles
+// of rank 0 end up in a single PerReplica sharing entry.
+func TestGmsResourceSharingEntries_SingleNode(t *testing.T) {
+	entries := gmsResourceSharingEntries("svc", []ServiceRole{
+		{Name: "svc-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
+		{Name: "svc", Role: RoleMain, Rank: 0, Replicas: 2},
+	})
+
+	require.Len(t, entries, 1)
+	entry := entries[0]
+	assert.Equal(t, "svc-gpu-rank-0", entry.Name)
+	assert.Equal(t, grovev1alpha1.ResourceSharingScopePerReplica, entry.Scope)
+	require.NotNil(t, entry.Filter)
+	assert.Equal(t, []string{"svc-gms-0", "svc"}, entry.Filter.ChildCliqueNames)
+}
+
+// TestGmsResourceSharingEntries_Multinode checks that each rank gets its own
+// PerReplica sharing entry whose filter lists only that rank's cliques.
+func TestGmsResourceSharingEntries_Multinode(t *testing.T) {
+	entries := gmsResourceSharingEntries("svc", []ServiceRole{
+		{Name: "svc-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
+		{Name: "svc-ldr", Role: RoleLeader, Rank: 0, Replicas: 3},
+		{Name: "svc-gms-1", Role: RoleGMS, Rank: 1, Replicas: 1},
+		{Name: "svc-wkr-1", Role: RoleWorker, Rank: 1, Replicas: 3},
+	})
+
+	require.Len(t, entries, 2)
+
+	rank0 := entries[0]
+	assert.Equal(t, "svc-gpu-rank-0", rank0.Name)
+	assert.Equal(t, grovev1alpha1.ResourceSharingScopePerReplica, rank0.Scope)
+	require.NotNil(t, rank0.Filter)
+	assert.Equal(t, []string{"svc-gms-0", "svc-ldr"}, rank0.Filter.ChildCliqueNames)
+
+	rank1 := entries[1]
+	assert.Equal(t, "svc-gpu-rank-1", rank1.Name)
+	assert.Equal(t, grovev1alpha1.ResourceSharingScopePerReplica, rank1.Scope)
+	require.NotNil(t, rank1.Filter)
+	assert.Equal(t, []string{"svc-gms-1", "svc-wkr-1"}, rank1.Filter.ChildCliqueNames)
+}
+
+// ──────────────────────────────────────────────────────────────────────────────
+// Intra-pod failover tests
+// ──────────────────────────────────────────────────────────────────────────────
+
+// intraPodFailoverPodSpec returns a pod spec that has already been transformed by
 // applyGPUMemoryService (DRA claims, shared volume, TMPDIR set), including
 // a frontend sidecar to verify sidecar preservation.
-func failoverPodSpec() corev1.PodSpec {
+func intraPodFailoverPodSpec() corev1.PodSpec {
 	httpPort := intstr.FromString("system")
 	return corev1.PodSpec{
 		Containers: []corev1.Container{
@@ -72,10 +503,8 @@ func failoverPodSpec() corev1.PodSpec {
 	}
 }

-// --- buildFailoverPod ---
-
 func TestBuildFailoverPod_TwoEnginesPlusSidecar(t *testing.T) {
-	ps := failoverPodSpec()
+	ps := intraPodFailoverPodSpec()
 	err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
 	require.NoError(t, err)

@@ -94,14 +523,14 @@ func TestBuildFailoverPod_EmptyContainers(t *testing.T) {
 }

 func TestBuildFailoverPod_RejectsNonVLLM(t *testing.T) {
-	ps := failoverPodSpec()
+	ps := intraPodFailoverPodSpec()
 	err := buildFailoverPod(&ps, 1, BackendFrameworkSGLang)
 	require.Error(t, err)
 	assert.Contains(t, err.Error(), "currently supported only for vLLM")
 }

 func TestBuildFailoverPod_EngineEnvVars(t *testing.T) {
-	ps := failoverPodSpec()
+	ps := intraPodFailoverPodSpec()
 	err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
 	require.NoError(t, err)

@@ -110,8 +539,7 @@ func TestBuildFailoverPod_EngineEnvVars(t *testing.T) {
 		env := envToMap(engine.Env)
 		assert.Equal(t, strconv.Itoa(i), env["ENGINE_ID"], "engine-%d ENGINE_ID", i)
 		assert.Equal(t, fmt.Sprintf("engine-%d", i), env["CONTAINER_NAME"], "engine-%d CONTAINER_NAME", i)
-		assert.Equal(t, failoverLockFile, env["FAILOVER_LOCK_PATH"], "engine-%d FAILOVER_LOCK_PATH", i)
-		assert.Equal(t, "true", env["DYN_VLLM_GMS_SHADOW_MODE"], "engine-%d shadow mode", i)
+		assert.Equal(t, intraPodFailoverLockFile, env["FAILOVER_LOCK_PATH"], "engine-%d FAILOVER_LOCK_PATH", i)
 		assert.Equal(t, "notready", env["DYN_SYSTEM_STARTING_HEALTH_STATUS"], "engine-%d starting health", i)
 		assert.Equal(t, "true", env["DYN_SYSTEM_ENABLED"], "engine-%d system enabled", i)

@@ -124,7 +552,7 @@ func TestBuildFailoverPod_EngineEnvVars(t *testing.T) {
 }

 func TestBuildFailoverPod_StaggeredPorts(t *testing.T) {
-	ps := failoverPodSpec()
+	ps := intraPodFailoverPodSpec()
 	err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
 	require.NoError(t, err)

@@ -139,7 +567,7 @@ func TestBuildFailoverPod_StaggeredPorts(t *testing.T) {
 }

 func TestBuildFailoverPod_ProbesRetargetedToNamedPort(t *testing.T) {
-	ps := failoverPodSpec()
+	ps := intraPodFailoverPodSpec()
 	err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
 	require.NoError(t, err)

@@ -159,7 +587,7 @@ func TestBuildFailoverPod_ProbesRetargetedToNamedPort(t *testing.T) {
 }

 func TestBuildFailoverPod_PreservesDRAClaim(t *testing.T) {
-	ps := failoverPodSpec()
+	ps := intraPodFailoverPodSpec()
 	err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
 	require.NoError(t, err)

@@ -171,7 +599,7 @@ func TestBuildFailoverPod_PreservesDRAClaim(t *testing.T) {
 }

 func TestBuildFailoverPod_PreservesDiscoveryBackend(t *testing.T) {
-	ps := failoverPodSpec()
+	ps := intraPodFailoverPodSpec()
 	err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
 	require.NoError(t, err)

@@ -182,7 +610,7 @@ func TestBuildFailoverPod_PreservesDiscoveryBackend(t *testing.T) {
 }

 func TestBuildFailoverPod_MultinodeNNODES(t *testing.T) {
-	ps := failoverPodSpec()
+	ps := intraPodFailoverPodSpec()
 	err := buildFailoverPod(&ps, 4, BackendFrameworkVLLM)
 	require.NoError(t, err)

@@ -193,7 +621,7 @@ func TestBuildFailoverPod_MultinodeNNODES(t *testing.T) {
 }

 func TestBuildFailoverPod_SingleNodeNoNNODES(t *testing.T) {
-	ps := failoverPodSpec()
+	ps := intraPodFailoverPodSpec()
 	err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
 	require.NoError(t, err)

@@ -204,18 +632,70 @@ func TestBuildFailoverPod_SingleNodeNoNNODES(t *testing.T) {
 	}
 }

-// --- isFailoverEnabled ---
-
 func TestIsFailoverEnabled(t *testing.T) {
 	assert.True(t, isFailoverEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{
-		Failover: &v1alpha1.FailoverSpec{Enabled: true},
+		Failover: &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeIntraPod},
 	}))
 	assert.False(t, isFailoverEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{
-		Failover: &v1alpha1.FailoverSpec{Enabled: false},
+		Failover: &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
+	}), "inter-pod mode must not trigger intra-pod container cloning")
+	assert.False(t, isFailoverEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{
+		Failover: &v1alpha1.FailoverSpec{Enabled: false, Mode: v1alpha1.GMSModeIntraPod},
 	}))
 	assert.False(t, isFailoverEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{}))
 }

+// ──────────────────────────────────────────────────────────────────────────────
+// Helpers
+// ──────────────────────────────────────────────────────────────────────────────
+
+func hasToleration(podSpec *corev1.PodSpec, key string) bool {
+	for _, t := range podSpec.Tolerations {
+		if t.Key == key {
+			return true
+		}
+	}
+	return false
+}
+
+func hasVolume(podSpec *corev1.PodSpec, name string) bool {
+	for _, v := range podSpec.Volumes {
+		if v.Name == name {
+			return true
+		}
+	}
+	return false
+}
+
+func hasVolumeMount(c corev1.Container, mountPath string) bool {
+	for _, m := range c.VolumeMounts {
+		if m.MountPath == mountPath {
+			return true
+		}
+	}
+	return false
+}
+
+func findVolumeMount(c corev1.Container, mountPath string) *corev1.VolumeMount {
+	for i := range c.VolumeMounts {
+		if c.VolumeMounts[i].MountPath == mountPath {
+			return &c.VolumeMounts[i]
+		}
+	}
+	return nil
+}
+
+func hasEnvVar(c corev1.Container, name, value string) bool {
+	for _, e := range c.Env {
+		if e.Name == name {
+			if value == "" || e.Value == value {
+				return true
+			}
+		}
+	}
+	return false
+}
+
 func envToMap(envs []corev1.EnvVar) map[string]string {
 	m := make(map[string]string, len(envs))
 	for _, e := range envs {

--- a/deploy/operator/internal/dynamo/graph.go
+++ b/deploy/operator/internal/dynamo/graph.go
@@ -522,30 +522,33 @@ func resolveImagePullSecrets(retriever SecretsRetriever, namespace, image string
 }

 // applyCliqueStartupDependencies configures StartsAfter dependencies for cliques in a PodCliqueSet
-// based on the backend framework and multinode deployment patterns.
+// based on the backend framework, multinode deployment patterns, and the
+// inter-pod GMS layout.
 //
 // Rules:
-// - For VLLM and SGLang: worker cliques start after leader clique
-// - For TRTLLM: leader clique starts after worker cliques
-// - Only applies to multinode deployments (numberOfNodes > 1)
+//   - For TRTLLM multinode: leader clique starts after worker cliques
+//   - For inter-pod GMS: engine PCLQs start after their corresponding GMS PCLQ
+//     (per rank). This applies both to the standalone inter-pod layout and to
+//     the inter-pod layout with failover; the ordering reflects that engines
+//     load weights from the weight-server pod regardless of whether shadows are
+//     present.
 //   - Sets the PodCliqueSet StartupType to Explicit if any dependencies are configured
 func applyCliqueStartupDependencies(
 	gangSet *grovev1alpha1.PodCliqueSet,
 	roles []ServiceRole,
 	backendFramework BackendFramework,
 	numberOfNodes int32,
+	isInterPodGMS bool,
 ) {
-	// enabled for TRTLLM multinode deployments only
-	// TODO: reactivate for all backends when we have a better way to handle the readiness probe for the leader.
-	enabled := backendFramework == BackendFrameworkTRTLLM && numberOfNodes > 1
-
-	if !enabled {
-		return // No dependencies for single-node deployments
+	enabledMultinode := backendFramework == BackendFrameworkTRTLLM && numberOfNodes > 1
+	if !enabledMultinode && !isInterPodGMS {
+		return
 	}

-	// Build maps of leader and worker clique names
 	var leaderCliqueName string
 	var workerCliqueNames []string
+	// For GMS: map rank -> GMS clique name
+	gmsCliqueByRank := map[int32]string{}

 	for _, r := range roles {
 		cliqueName := strings.ToLower(r.Name)
@@ -554,30 +557,49 @@ func applyCliqueStartupDependencies(
 			leaderCliqueName = cliqueName
 		case RoleWorker:
 			workerCliqueNames = append(workerCliqueNames, cliqueName)
+		case RoleGMS:
+			gmsCliqueByRank[r.Rank] = cliqueName
 		}
 	}

-	// Apply dependencies to cliques
 	hasDependencies := false
 	for _, clique := range gangSet.Spec.Template.Cliques {
-		// Find the corresponding role for this clique
 		var cliqueRole Role
+		var cliqueRank int32
+		found := false
 		for _, r := range roles {
 			if strings.ToLower(r.Name) == clique.Name {
 				cliqueRole = r.Role
+				cliqueRank = r.Rank
+				found = true
 				break
 			}
 		}
+		if !found {
+			continue
+		}
+
+		var startsAfter []string
+
+		// GMS dependencies: engine PCLQs start after their rank's GMS PCLQ
+		if isInterPodGMS && cliqueRole != RoleGMS {
+			if gmsName, ok := gmsCliqueByRank[cliqueRank]; ok {
+				startsAfter = append(startsAfter, gmsName)
+			}
+		}
+
+		// Existing multinode dependencies
+		if enabledMultinode {
+			multiDeps := getCliqueStartupDependencies(cliqueRole, backendFramework, leaderCliqueName, workerCliqueNames)
+			startsAfter = append(startsAfter, multiDeps...)
+		}

-		// Determine dependencies for this clique
-		startsAfter := getCliqueStartupDependencies(cliqueRole, backendFramework, leaderCliqueName, workerCliqueNames)
 		if len(startsAfter) > 0 {
 			clique.Spec.StartsAfter = startsAfter
 			hasDependencies = true
 		}
 	}

-	// Set explicit startup type if we have any dependencies
 	if hasDependencies {
 		explicitStartupType := grovev1alpha1.CliqueStartupTypeExplicit
 		gangSet.Spec.Template.StartupType = &explicitStartupType
@@ -660,7 +682,7 @@ func GenerateComponentService(params ComponentServiceParams) (*corev1.Service, e
 		labels[k] = v
 	}
 	if params.IsK8sDiscovery {
-		labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = "kubernetes"
+		labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = commonconsts.DiscoveryBackendKubernetes
 		labels[commonconsts.KubeLabelDynamoDiscoveryEnabled] = commonconsts.KubeLabelValueTrue
 	}

@@ -822,28 +844,116 @@ const (
 	RoleWorker     Role = "worker"
 	RoleMain       Role = "main"
 	RoleCheckpoint Role = "checkpoint"
+	RoleGMS        Role = "gms"
 )

-// Update ServiceRole struct for expandRolesForService
-
+// ServiceRole describes one PodClique (PCLQ) to be materialised for a
+// service. A single DynamoComponentDeploymentSharedSpec can expand into
+// multiple ServiceRoles depending on the deployment topology:
+//
+//   - single-node, no GMS: 1 role (RoleMain)
+//   - multinode, no GMS:    2 roles (RoleLeader + RoleWorker)
+//   - single-node, inter-pod GMS: 1 engine PCLQ (replicated) + 1 RoleGMS
+//     weight-server PCLQ
+//   - multinode, inter-pod GMS: N engine PCLQs (one per rank, replicated)
+//     plus N RoleGMS weight-server PCLQs (one per rank)
+//
+// The fields carry the information buildCliqueForRole needs to produce a
+// concrete PodCliqueTemplateSpec:
+//
+//   - Name: PCLQ name suffix used for Grove resource naming and hostname
+//     derivation.
+//   - Role:     the pod's semantic role (main/leader/worker/gms). Drives
+//     backend-specific wiring (e.g. --load-format, --node-rank, discovery
+//     labels).
+//   - Replicas: the PCLQ replica count. For inter-pod GMS engine cliques
+//     this is the number of engine pods per rank (primary + NumShadows
+//     shadows), while RoleGMS cliques always use 1; for non-GMS roles it is
+//     typically 1 (the PCSG-level serviceReplicas controls horizontal scaling).
+//   - Rank:     static node rank (0 = leader/main, 1..N-1 = workers).
+//     Non-trivial for inter-pod GMS because each rank becomes its own
+//     PCLQ and shares a pod index across shadows; for non-GMS multinode
+//     pods the rank is derived dynamically from GROVE_PCLQ_POD_INDEX.
 type ServiceRole struct {
 	Name     string
 	Role     Role
 	Replicas int32
+	Rank     int32 // node rank: 0 = leader/main, 1..N-1 = workers
 }

-// Update expandRolesForService to use Role
-func expandRolesForService(serviceName string, serviceReplicas *int32, numberOfNodes int32) []ServiceRole {
-	var roles []ServiceRole
-	if numberOfNodes > 1 {
-		roles = append(roles, ServiceRole{Name: serviceName + "-" + commonconsts.GroveRoleSuffixLeader, Role: RoleLeader, Replicas: 1})
-		roles = append(roles, ServiceRole{Name: serviceName + "-" + commonconsts.GroveRoleSuffixWorker, Role: RoleWorker, Replicas: numberOfNodes - 1})
-	} else {
+// expandRolesForService turns a service's (numberOfNodes,
+// gpuMemoryService.mode, failover.mode, replicas) tuple into the concrete
+// list of ServiceRole entries the rest of the Grove rendering pipeline
+// iterates over. It is the single place that decides how many PodCliques a
+// service produces and what each PCLQ looks like (name, role, replicas,
+// static rank).
+//
+// The inter-pod GMS branch is selected by IsInterPodGMSEnabled() (layout)
+// rather than IsInterPodFailoverEnabled() (hot-spares): both the standalone
+// inter-pod layout (1 engine pod + 1 weight-server pod per rank) and the
+// inter-pod layout with failover (primary + N shadows + 1 weight-server pod
+// per rank) use the same PCLQ topology, differing only in the per-rank engine
+// clique's Replicas (derived from GetTotalEnginePods).
+//
+// Callers that iterate "engine roles" must still gate on
+// IsInterPodGMSEnabled() — this function emits the GMS weight-server PCLQ
+// as a regular ServiceRole, not as a separate concept.
+func expandRolesForService(serviceName string, serviceReplicas *int32, numberOfNodes int32, component *v1alpha1.DynamoComponentDeploymentSharedSpec) []ServiceRole {
+	isInterPodGMS := component.IsInterPodGMSEnabled()
+	isMultinode := numberOfNodes > 1
+
+	switch {
+	case isMultinode && isInterPodGMS:
+		return expandMultinodeGMSRoles(serviceName, numberOfNodes, component.GetTotalEnginePods())
+	case isMultinode:
+		return expandMultinodeRoles(serviceName, numberOfNodes)
+	case isInterPodGMS:
+		return expandSingleNodeGMSRoles(serviceName, component.GetTotalEnginePods())
+	default:
+		return expandSingleNodeRoles(serviceName, serviceReplicas)
+	}
+}
+
+func expandSingleNodeRoles(serviceName string, serviceReplicas *int32) []ServiceRole {
 	replicas := int32(1)
 	if serviceReplicas != nil {
 		replicas = *serviceReplicas
 	}
-		roles = append(roles, ServiceRole{Name: serviceName, Role: RoleMain, Replicas: replicas})
+	return []ServiceRole{
+		{Name: serviceName, Role: RoleMain, Replicas: replicas},
+	}
+}
+
+func expandMultinodeRoles(serviceName string, numberOfNodes int32) []ServiceRole {
+	return []ServiceRole{
+		{Name: serviceName + "-" + commonconsts.GroveRoleSuffixLeader, Role: RoleLeader, Replicas: 1},
+		{Name: serviceName + "-" + commonconsts.GroveRoleSuffixWorker, Role: RoleWorker, Replicas: numberOfNodes - 1},
+	}
+}
+
+func expandSingleNodeGMSRoles(serviceName string, totalEnginePods int32) []ServiceRole {
+	return []ServiceRole{
+		{Name: fmt.Sprintf("%s-%s-0", serviceName, commonconsts.GroveRoleSuffixGMS), Role: RoleGMS, Replicas: 1, Rank: 0},
+		{Name: serviceName, Role: RoleMain, Replicas: totalEnginePods, Rank: 0},
+	}
+}
+
+func expandMultinodeGMSRoles(serviceName string, numberOfNodes int32, totalEnginePods int32) []ServiceRole {
+	roles := make([]ServiceRole, 0, numberOfNodes*2)
+	for rank := int32(0); rank < numberOfNodes; rank++ {
+		gmsName := fmt.Sprintf("%s-%s-%d", serviceName, commonconsts.GroveRoleSuffixGMS, rank)
+		roles = append(roles, ServiceRole{Name: gmsName, Role: RoleGMS, Replicas: 1, Rank: rank})
+
+		var engineName string
+		var engineRole Role
+		if rank == 0 {
+			engineName = serviceName + "-" + commonconsts.GroveRoleSuffixLeader
+			engineRole = RoleLeader
+		} else {
+			engineName = fmt.Sprintf("%s-%s-%d", serviceName, commonconsts.GroveRoleSuffixWorker, rank)
+			engineRole = RoleWorker
+		}
+		roles = append(roles, ServiceRole{Name: engineName, Role: engineRole, Replicas: totalEnginePods, Rank: rank})
 	}
 	return roles
 }
@@ -1002,6 +1112,7 @@ func GenerateBasePodSpec(
 	multinodeDeploymentType commonconsts.MultinodeDeploymentType,
 	serviceName string,
 	checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by ResolveCheckpointForService)
+	deployerOverride MultinodeDeployer, // Optional: overrides factory-created deployer when non-nil
 ) (*corev1.PodSpec, error) {
 	// Start with base container generated per component type
 	componentContext := generateComponentContext(component, parentGraphDeploymentName, namespace, numberOfNodes, NewDiscoveryContext(operatorConfig.Discovery.Backend, component.Annotations))
@@ -1119,10 +1230,13 @@ func GenerateBasePodSpec(
 		})
 	}
 	// Apply backend-specific container modifications
-	multinodeDeployer := MultinodeDeployerFactory(multinodeDeploymentType)
+	multinodeDeployer := deployerOverride
+	if multinodeDeployer == nil {
+		multinodeDeployer = MultinodeDeployerFactory(multinodeDeploymentType)
 		if multinodeDeployer == nil {
 			return nil, fmt.Errorf("unsupported multinode deployment type: %s", multinodeDeploymentType)
 		}
+	}
 	backend := BackendFactory(backendFramework, operatorConfig, parentGraphDeploymentName)
 	if backend == nil {
 		return nil, fmt.Errorf("unsupported backend framework: %s", backendFramework)
@@ -1184,8 +1298,17 @@ func GenerateBasePodSpec(
 		}
 	}

-	// GMS: replace nvidia.com/gpu with a shared DRA claim and add the server sidecar.
-	if component.GPUMemoryService != nil && component.GPUMemoryService.Enabled {
+	// Intra-pod GMS: replace nvidia.com/gpu with a shared DRA claim and add the server
+	// sidecar directly into this pod.
+	//
+	// Inter-pod GMS (gpuMemoryService.mode=interPod, with or without failover)
+	// must be skipped here — that layout wires DRA claims and the GMS server
+	// on a dedicated weight-server pod at the PCSG level (see
+	// GenerateGrovePodCliqueSet → gmsWeightServerPodSpec); re-applying the
+	// claim and injecting a sidecar here would produce a double-wired engine
+	// pod (stray GMS sidecar, conflicting claim).
+	if component.GPUMemoryService != nil && component.GPUMemoryService.Enabled &&
+		!component.IsInterPodGMSEnabled() {
 		claimTemplateName := dra.ResourceClaimTemplateName(parentGraphDeploymentName, serviceName)
 		if err := dra.ApplyClaim(&podSpec, claimTemplateName); err != nil {
 			return nil, fmt.Errorf("failed to apply DRA claim for GMS: %w", err)
@@ -1283,7 +1406,8 @@ func generateFrontendSidecar(
 	return container, nil
 }

-// GeneratePodSpecForComponent creates a PodSpec for Grove deployments (simplified wrapper)
+// GeneratePodSpecForComponent creates a PodSpec for Grove deployments (simplified wrapper).
+// deployerOverride, when non-nil, overrides the default MultinodeDeployer from the factory.
 func GeneratePodSpecForComponent(
 	component *v1alpha1.DynamoComponentDeploymentSharedSpec,
 	backendFramework BackendFramework,
@@ -1294,7 +1418,8 @@ func GeneratePodSpecForComponent(
 	operatorConfig *configv1alpha1.OperatorConfiguration,
 	multinodeDeploymentType commonconsts.MultinodeDeploymentType,
 	serviceName string,
-	checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info
+	checkpointInfo *checkpoint.CheckpointInfo,
+	deployerOverride MultinodeDeployer,
 ) (*corev1.PodSpec, error) {
 	if len(dynamoDeployment.Spec.Envs) > 0 {
 		component.Envs = MergeEnvs(dynamoDeployment.Spec.Envs, component.Envs)
@@ -1303,7 +1428,7 @@ func GeneratePodSpecForComponent(
 	propagateDGDAnnotations(dynamoDeployment.GetAnnotations(), component)
 	propagateDGDSpecMetadata(dynamoDeployment.Spec.Annotations, dynamoDeployment.Spec.Labels, component)

-	podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, operatorConfig, multinodeDeploymentType, serviceName, checkpointInfo)
+	podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, operatorConfig, multinodeDeploymentType, serviceName, checkpointInfo, deployerOverride)
 	if err != nil {
 		return nil, err
 	}
@@ -1359,6 +1484,150 @@ func propagateDGDSpecMetadata(annotations, labels map[string]string, component *
 }

 // GenerateGrovePodCliqueSet generates a Grove PodCliqueSet for the given deployment, supporting both single-node and multinode cases.
+
+// cliqueParams groups the read-only context needed to build a single PodClique template
+// from a ServiceRole; all fields come from the enclosing GenerateGrovePodCliqueSet loop iteration.
+type cliqueParams struct {
+	r                          ServiceRole
+	component                  *v1alpha1.DynamoComponentDeploymentSharedSpec
+	backendFramework           BackendFramework
+	secretsRetriever           SecretsRetriever
+	dynamoDeployment           *v1alpha1.DynamoGraphDeployment
+	numberOfNodes              int32
+	operatorConfig             *configv1alpha1.OperatorConfiguration
+	runtimeConfig              *controller_common.RuntimeConfig
+	serviceName                string
+	checkpointInfo             *checkpoint.CheckpointInfo
+	isMultinode                bool
+	usesPCSG                   bool
+	isInterPodGMS              bool
+	isInterPodFailover         bool
+	discoveryBackend           configv1alpha1.DiscoveryBackend
+	discoveryContext           DiscoveryContext
+	restartState               *RestartState
+	existingRestartAnnotations map[string]string
+	validatedQueueName         string
+	kubeClient                 ctrlclient.Reader
+	ctx                        context.Context
+}
+
+// buildCliqueForRole generates a single PodCliqueTemplateSpec for the given role,
+// injecting labels, annotations, checkpoint config, and scheduler settings.
+func buildCliqueForRole(p cliqueParams) (*grovev1alpha1.PodCliqueTemplateSpec, error) {
+	podSpec, err := generatePodSpecForRole(
+		p.r, p.component, p.backendFramework, p.secretsRetriever,
+		p.dynamoDeployment, p.numberOfNodes, p.operatorConfig, p.serviceName, p.checkpointInfo,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("failed to generate podSpec for role %s: %w", p.r.Name, err)
+	}
+
+	if p.operatorConfig.Checkpoint.Enabled {
+		if err := checkpoint.InjectCheckpointIntoPodSpec(
+			p.ctx, p.kubeClient, p.dynamoDeployment.Namespace, podSpec, p.checkpointInfo,
+		); err != nil {
+			return nil, fmt.Errorf("failed to inject checkpoint config for role %s: %w", p.r.Name, err)
+		}
+	}
+
+	// minAvailable controls Grove gang-scheduling: the clique is only
+	// considered available when at least this many replicas are Ready.
+	//
+	// The invariant we want is "minAvailable = Replicas unless the clique
+	// has redundant replicas". Concretely:
+	//
+	//   - Plain multinode (no inter-pod GMS failover): the worker clique
+	//     collapses non-leader ranks into a single clique with
+	//     Replicas = numberOfNodes - 1 and those pods are NCCL peers of each
+	//     other — losing any one breaks the collective, so all replicas
+	//     must be Ready. Standalone inter-pod GMS on multinode also lands
+	//     here but has Replicas = 1 per PCLQ (primary only, no shadows), so
+	//     the same rule evaluates to minAvailable = 1 without a special case.
+	//
+	//   - Inter-pod GMS failover (single- or multinode): within each rank
+	//     Replicas = primary + shadows and shadows ARE redundant hot spares
+	//     — requiring every shadow to be Ready would defeat failover, so
+	//     the clique stays at minAvailable = 1.
+	//
+	//   - Single-node clique (no multinode, with or without intra-pod
+	//     failover or standalone inter-pod GMS): Replicas is at most 1 or a
+	//     small DP fanout under the outer PCSG where the replicas are
+	//     independent of each other; minAvailable = 1 is correct.
+	//
+	// The two-line rule below captures all of the above: take the baseline
+	// of 1, then lift it to Replicas only on plain multinode without
+	// inter-pod failover (the only layout that combines >1 replicas per
+	// clique with no redundancy between them).
+	minAvailable := int32(1)
+	if p.isMultinode && !p.isInterPodFailover {
+		minAvailable = p.r.Replicas
+	}
+
+	clique := &grovev1alpha1.PodCliqueTemplateSpec{
+		Name: strings.ToLower(p.r.Name),
+		Spec: grovev1alpha1.PodCliqueSpec{
+			RoleName:     strings.ToLower(p.r.Name),
+			Replicas:     p.r.Replicas,
+			MinAvailable: ptr.To(minAvailable),
+			PodSpec:      *podSpec,
+		},
+	}
+
+	if !p.usesPCSG {
+		clique.TopologyConstraint = toGroveTopologyConstraint(p.component.TopologyConstraint)
+	}
+
+	labels, err := generateLabels(p.component, p.dynamoDeployment, p.serviceName, p.discoveryContext)
+	if err != nil {
+		return nil, fmt.Errorf("failed to generate labels: %w", err)
+	}
+	clique.Labels = labels
+	if p.isInterPodFailover && p.r.Role != RoleGMS {
+		clique.Labels[commonconsts.KubeLabelDynamoFailoverEngineGroupMember] = commonconsts.KubeLabelValueTrue
+	}
+	// Strip discovery labels from RoleGMS pods. generateLabels applies them
+	// unconditionally to every role for container-mode Pod reflector filtering
+	// (see #8067), but GMS weight-server pods run gpu_memory_service.cli.server
+	// — not the dynamo runtime — and never register a DynamoWorkerMetadata CR.
+	// Leaving the labels on them would make the Rust discovery daemon include
+	// them in its reflector store for no purpose and wake its debounce loop on
+	// every GMS restart/fast-kill event.
+	if p.r.Role == RoleGMS {
+		delete(clique.Labels, commonconsts.KubeLabelDynamoDiscoveryBackend)
+		delete(clique.Labels, commonconsts.KubeLabelDynamoDiscoveryEnabled)
+	}
+
+	annotations, err := generateAnnotations(p.component)
+	if err != nil {
+		return nil, fmt.Errorf("failed to generate annotations: %w", err)
+	}
+	checkpoint.ApplyRestorePodMetadata(labels, annotations, p.checkpointInfo)
+	annotations = applyRestartAnnotation(annotations, p.serviceName, p.restartState, p.existingRestartAnnotations)
+	clique.Annotations = annotations
+
+	injectKaiSchedulerIfEnabled(clique, p.runtimeConfig, p.validatedQueueName)
+	return clique, nil
+}
+
+// applyRestartAnnotation adds the restart annotation to the map if needed,
+// creating the map when it is nil.
+func applyRestartAnnotation(annotations map[string]string, serviceName string, restartState *RestartState, existingRestartAnnotations map[string]string) map[string]string {
+	if restartState.ShouldAnnotateService(serviceName) {
+		if annotations == nil {
+			annotations = make(map[string]string)
+		}
+		annotations[commonconsts.RestartAnnotation] = restartState.Timestamp
+	} else if existingRestartAnnotations != nil {
+		if existingTimestamp, ok := existingRestartAnnotations[serviceName]; ok {
+			if annotations == nil {
+				annotations = make(map[string]string)
+			}
+			annotations[commonconsts.RestartAnnotation] = existingTimestamp
+		}
+	}
+	return annotations
+}
+
 func GenerateGrovePodCliqueSet(
 	ctx context.Context,
 	dynamoDeployment *v1alpha1.DynamoGraphDeployment,
@@ -1402,7 +1671,16 @@ func GenerateGrovePodCliqueSet(
 	discoveryContext := NewDiscoveryContext(operatorConfig.Discovery.Backend, dynamoDeployment.Annotations)

 	var scalingGroups []grovev1alpha1.PodCliqueScalingGroupConfig
-	for serviceName, component := range dynamoDeployment.Spec.Services {
+	var resourceClaimTemplates []grovev1alpha1.ResourceClaimTemplateConfig
+
+	sortedServiceNames := make([]string, 0, len(dynamoDeployment.Spec.Services))
+	for name := range dynamoDeployment.Spec.Services {
+		sortedServiceNames = append(sortedServiceNames, name)
+	}
+	sort.Strings(sortedServiceNames)
+
+	for _, serviceName := range sortedServiceNames {
+		component := dynamoDeployment.Spec.Services[serviceName]
 		dynamoNamespace := GetDynamoNamespace(dynamoDeployment, component)
 		component.DynamoNamespace = &dynamoNamespace
 		// Determine backend framework using hybrid approach
@@ -1426,113 +1704,121 @@ func GenerateGrovePodCliqueSet(

 		numberOfNodes := component.GetNumberOfNodes()
 		isMultinode := numberOfNodes > 1
-		roles := expandRolesForService(serviceName, component.Replicas, numberOfNodes)
+		isInterPodGMS := component.IsInterPodGMSEnabled()
+		isInterPodFailover := component.IsInterPodFailoverEnabled()
+		usesPCSG := isMultinode || isInterPodGMS
+		roles := expandRolesForService(serviceName, component.Replicas, numberOfNodes, component)
 		var cliqueNames []string

 		for _, r := range roles {
-			podSpec, err := GeneratePodSpecForComponent(
-				component,
-				backendFramework,
-				secretsRetriever,
-				dynamoDeployment,
-				r.Role,
-				numberOfNodes,
-				operatorConfig,
-				commonconsts.MultinodeDeploymentTypeGrove,
-				serviceName,
-				checkpointInfo,
-			)
+			clique, err := buildCliqueForRole(cliqueParams{
+				r:                          r,
+				component:                  component,
+				backendFramework:           backendFramework,
+				secretsRetriever:           secretsRetriever,
+				dynamoDeployment:           dynamoDeployment,
+				numberOfNodes:              numberOfNodes,
+				operatorConfig:             operatorConfig,
+				runtimeConfig:              runtimeConfig,
+				serviceName:                serviceName,
+				checkpointInfo:             checkpointInfo,
+				isMultinode:                isMultinode,
+				usesPCSG:                   usesPCSG,
+				isInterPodGMS:              isInterPodGMS,
+				isInterPodFailover:         isInterPodFailover,
+				discoveryBackend:           discoveryBackend,
+				discoveryContext:           discoveryContext,
+				restartState:               restartState,
+				existingRestartAnnotations: existingRestartAnnotations,
+				validatedQueueName:         validatedQueueName,
+				kubeClient:                 kubeClient,
+				ctx:                        ctx,
+			})
 			if err != nil {
-				return nil, fmt.Errorf("failed to generate podSpec for role %s: %w", r.Name, err)
-			}
-
-			if operatorConfig.Checkpoint.Enabled {
-				if err := checkpoint.InjectCheckpointIntoPodSpec(
-					ctx,
-					kubeClient,
-					dynamoDeployment.Namespace,
-					podSpec,
-					checkpointInfo,
-				); err != nil {
-					return nil, fmt.Errorf("failed to inject checkpoint config for role %s: %w", r.Name, err)
+				return nil, err
 			}
+			gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, clique)
+			cliqueNames = append(cliqueNames, strings.ToLower(r.Name))
 		}

-			minAvailable := int32(1)
-			if isMultinode {
-				minAvailable = r.Replicas
-			}
+		applyCliqueStartupDependencies(gangSet, roles, backendFramework, numberOfNodes, isInterPodGMS)

-			clique := &grovev1alpha1.PodCliqueTemplateSpec{
-				Name: strings.ToLower(r.Name),
-				Spec: grovev1alpha1.PodCliqueSpec{
-					RoleName:     strings.ToLower(r.Name),
-					Replicas:     r.Replicas,
-					MinAvailable: ptr.To(minAvailable),
-					PodSpec:      *podSpec,
-				},
+		if isInterPodGMS {
+			resourceClaimTemplates = append(resourceClaimTemplates, gmsResourceClaimTemplateConfigs(serviceName, component.Resources, roles)...)
 		}

-			// For single-node services, set topology constraint directly on the clique.
-			// For multinode services, the constraint goes on the PCSG instead;
-			// child cliques inherit from PCSG and should NOT have explicit constraints.
-			if !isMultinode {
-				clique.TopologyConstraint = toGroveTopologyConstraint(component.TopologyConstraint)
+		if usesPCSG {
+			pcsg := grovev1alpha1.PodCliqueScalingGroupConfig{
+				Name:               strings.ToLower(serviceName),
+				CliqueNames:        cliqueNames,
+				Replicas:           component.Replicas,
+				MinAvailable:       ptr.To(int32(1)),
+				TopologyConstraint: toGroveTopologyConstraint(component.TopologyConstraint),
 			}
-			labels, err := generateLabels(component, dynamoDeployment, serviceName, discoveryContext)
-			if err != nil {
-				return nil, fmt.Errorf("failed to generate labels: %w", err)
+			if isInterPodGMS {
+				pcsg.ResourceSharing = gmsResourceSharingEntries(serviceName, roles)
 			}
-			clique.Labels = labels
-			annotations, err := generateAnnotations(component)
-			if err != nil {
-				return nil, fmt.Errorf("failed to generate annotations: %w", err)
+			scalingGroups = append(scalingGroups, pcsg)
 		}
-			checkpoint.ApplyRestorePodMetadata(labels, annotations, checkpointInfo)
-
-			// Apply restart annotation if this service should be restarted.
-			// For services not in the current restart order, preserve their existing annotation
-			// to avoid triggering unwanted rollouts when a new restart begins.
-			if restartState.ShouldAnnotateService(serviceName) {
-				if annotations == nil {
-					annotations = make(map[string]string)
 	}
-				annotations[commonconsts.RestartAnnotation] = restartState.Timestamp
-			} else if existingRestartAnnotations != nil {
-				if existingTimestamp, ok := existingRestartAnnotations[serviceName]; ok {
-					if annotations == nil {
-						annotations = make(map[string]string)
-					}
-					annotations[commonconsts.RestartAnnotation] = existingTimestamp
+	if len(scalingGroups) > 0 {
+		gangSet.Spec.Template.PodCliqueScalingGroupConfigs = scalingGroups
 	}
+	if len(resourceClaimTemplates) > 0 {
+		gangSet.Spec.Template.ResourceClaimTemplates = resourceClaimTemplates
 	}
-			clique.Annotations = annotations

-			// Inject kai-scheduler settings if enabled
-			injectKaiSchedulerIfEnabled(clique, runtimeConfig, validatedQueueName)
+	return gangSet, nil
+}

-			gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, clique)
-			cliqueNames = append(cliqueNames, strings.ToLower(r.Name))
+// generatePodSpecForRole builds the pod spec for a single role, handling GMS
+// weight server pods and GMS engine pods differently from regular pods.
+func generatePodSpecForRole(
+	r ServiceRole,
+	component *v1alpha1.DynamoComponentDeploymentSharedSpec,
+	backendFramework BackendFramework,
+	secretsRetriever SecretsRetriever,
+	dynamoDeployment *v1alpha1.DynamoGraphDeployment,
+	numberOfNodes int32,
+	operatorConfig *configv1alpha1.OperatorConfiguration,
+	serviceName string,
+	checkpointInfo *checkpoint.CheckpointInfo,
+) (*corev1.PodSpec, error) {
+	isInterPodGMS := component.IsInterPodGMSEnabled()
+
+	if r.Role == RoleGMS {
+		// GMS weight server: generate a base engine spec then transform it
+		basePodSpec, err := GeneratePodSpecForComponent(
+			component, backendFramework, secretsRetriever, dynamoDeployment,
+			RoleMain, 1, operatorConfig,
+			commonconsts.MultinodeDeploymentTypeGrove, serviceName, checkpointInfo, nil,
+		)
+		if err != nil {
+			return nil, fmt.Errorf("failed to generate base podSpec for GMS: %w", err)
+		}
+		return gmsWeightServerPodSpec(basePodSpec, r.Rank, int(getGPUCount(component.Resources))), nil
 	}

-		// Apply startup dependencies for this service
-		applyCliqueStartupDependencies(gangSet, roles, backendFramework, numberOfNodes)
-
-		if isMultinode {
-			scalingGroups = append(scalingGroups, grovev1alpha1.PodCliqueScalingGroupConfig{
-				Name:               strings.ToLower(serviceName),
-				CliqueNames:        cliqueNames,
-				Replicas:           component.Replicas,
-				MinAvailable:       ptr.To(int32(1)),
-				TopologyConstraint: toGroveTopologyConstraint(component.TopologyConstraint),
-			})
+	// Engine pod (or non-GMS pod): optionally use a rank-aware deployer for multinode inter-pod GMS
+	var deployer MultinodeDeployer
+	if isInterPodGMS && numberOfNodes > 1 {
+		deployer = &GroveMultinodeDeployer{IsInterPodGMS: true, Rank: r.Rank}
 	}
+
+	podSpec, err := GeneratePodSpecForComponent(
+		component, backendFramework, secretsRetriever, dynamoDeployment,
+		r.Role, numberOfNodes, operatorConfig,
+		commonconsts.MultinodeDeploymentTypeGrove, serviceName, checkpointInfo, deployer,
+	)
+	if err != nil {
+		return nil, err
 	}
-	if len(scalingGroups) > 0 {
-		gangSet.Spec.Template.PodCliqueScalingGroupConfigs = scalingGroups
+
+	if isInterPodGMS {
+		augmentEngineForGMS(podSpec, r.Rank, component.IsInterPodFailoverEnabled())
 	}

-	return gangSet, nil
+	return podSpec, nil
 }

 func generateLabels(
@@ -1579,9 +1865,17 @@ func generateLabels(
 	if workerHash := component.Labels[commonconsts.KubeLabelDynamoWorkerHash]; workerHash != "" {
 		labels[commonconsts.KubeLabelDynamoWorkerHash] = workerHash
 	}
-	// Discovery labels on pod template — needed for Pod reflector filtering in container mode
+	// Discovery labels on pod template — needed for Pod reflector filtering in
+	// container mode (see lib/runtime/src/discovery/kube/daemon.rs). Applied to
+	// every role by default because any role may host the dynamo runtime — for
+	// example, multinode vLLM workers in data-parallel hybrid-lb mode run their
+	// own API server (see RoleWorker branch in injectDataParallelLaunchFlags).
+	// Callers that render non-dynamo pods (specifically the RoleGMS weight
+	// server, which runs gpu_memory_service.cli.server and never registers a
+	// DynamoWorkerMetadata CR) are responsible for stripping these labels after
+	// the fact — see buildCliqueForRole.
 	if discovery.Backend == configv1alpha1.DiscoveryBackendKubernetes {
-		labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = "kubernetes"
+		labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = commonconsts.DiscoveryBackendKubernetes
 		labels[commonconsts.KubeLabelDynamoDiscoveryEnabled] = commonconsts.KubeLabelValueTrue
 	}
 	return labels, nil
@@ -1783,6 +2077,7 @@ func GenerateBasePodSpecForController(
 		multinodeDeploymentType,
 		serviceName,
 		checkpointInfo,
+		nil, // use default deployer
 	)
 	if err != nil {
 		return nil, err

--- a/deploy/operator/internal/dynamo/graph_test.go
+++ b/deploy/operator/internal/dynamo/graph_test.go
@@ -1491,6 +1491,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 													},
 												},
 												Env: []corev1.EnvVar{
+													{
+														Name:  "CONTAINER_NAME",
+														Value: commonconsts.MainContainerName,
+													},
 													{
 														Name:  "DYN_HTTP_PORT",
 														Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
@@ -1697,6 +1701,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 													FailureThreshold: 720,
 												},
 												Env: []corev1.EnvVar{
+													{
+														Name:  "CONTAINER_NAME",
+														Value: commonconsts.MainContainerName,
+													},
 													{
 														Name:  "DYNAMO_POD_GANG_SET_REPLICAS",
 														Value: "1",
@@ -2092,6 +2100,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 													},
 												},
 												Env: []corev1.EnvVar{
+													{
+														Name:  "CONTAINER_NAME",
+														Value: commonconsts.MainContainerName,
+													},
 													{
 														Name:  "DYNAMO_POD_GANG_SET_REPLICAS",
 														Value: "1",
@@ -2302,6 +2314,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 													},
 												},
 												Env: []corev1.EnvVar{
+													{
+														Name:  "CONTAINER_NAME",
+														Value: commonconsts.MainContainerName,
+													},
 													{
 														Name:  "DYNAMO_POD_GANG_SET_REPLICAS",
 														Value: "1",
@@ -2489,6 +2505,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 													},
 												},
 												Env: []corev1.EnvVar{
+													{
+														Name:  "CONTAINER_NAME",
+														Value: commonconsts.MainContainerName,
+													},
 													{
 														Name:  "DYN_HTTP_PORT",
 														Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
@@ -2686,6 +2706,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 													FailureThreshold: 720,
 												},
 												Env: []corev1.EnvVar{
+													{
+														Name:  "CONTAINER_NAME",
+														Value: commonconsts.MainContainerName,
+													},
 													{
 														Name:  "DYNAMO_POD_GANG_SET_REPLICAS",
 														Value: "1",
@@ -3103,6 +3127,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 													},
 												},
 												Env: []corev1.EnvVar{
+													{
+														Name:  "CONTAINER_NAME",
+														Value: commonconsts.MainContainerName,
+													},
 													{
 														Name:  "DYNAMO_POD_GANG_SET_REPLICAS",
 														Value: "1",
@@ -3300,6 +3328,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 													},
 												},
 												Env: []corev1.EnvVar{
+													{
+														Name:  "CONTAINER_NAME",
+														Value: commonconsts.MainContainerName,
+													},
 													{
 														Name:  "DYNAMO_POD_GANG_SET_REPLICAS",
 														Value: "1",
@@ -3487,6 +3519,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 													},
 												},
 												Env: []corev1.EnvVar{
+													{
+														Name:  "CONTAINER_NAME",
+														Value: commonconsts.MainContainerName,
+													},
 													{
 														Name:  "DYN_HTTP_PORT",
 														Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
@@ -3684,6 +3720,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 													FailureThreshold: 720,
 												},
 												Env: []corev1.EnvVar{
+													{
+														Name:  "CONTAINER_NAME",
+														Value: commonconsts.MainContainerName,
+													},
 													{
 														Name:  "DYNAMO_POD_GANG_SET_REPLICAS",
 														Value: "1",
@@ -4013,6 +4053,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"worker",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if tt.expectError {
@@ -4171,6 +4212,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"worker",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if tt.expectError {
@@ -4258,6 +4300,7 @@ func TestGeneratePodSpecForComponent_UnsupportedBackend(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"worker",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if tt.expectError {
@@ -4283,6 +4326,7 @@ func TestExpandRolesForService(t *testing.T) {
 		serviceName     string
 		numberOfNodes   int32
 		serviceReplicas *int32
+		component       *v1alpha1.DynamoComponentDeploymentSharedSpec
 		expected        []ServiceRole
 	}{
 		{
@@ -4338,11 +4382,99 @@ func TestExpandRolesForService(t *testing.T) {
 				{Name: "test-service", Role: RoleMain, Replicas: 0},
 			},
 		},
+		{
+			name:          "single-node GMS with 1 shadow",
+			serviceName:   "svc",
+			numberOfNodes: 1,
+			component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
+				GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
+				Failover:         &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod, NumShadows: 1},
+			},
+			expected: []ServiceRole{
+				{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
+				{Name: "svc", Role: RoleMain, Replicas: 2, Rank: 0},
+			},
+		},
+		{
+			name:          "single-node GMS with 3 shadows",
+			serviceName:   "svc",
+			numberOfNodes: 1,
+			component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
+				GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
+				Failover:         &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod, NumShadows: 3},
+			},
+			expected: []ServiceRole{
+				{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
+				{Name: "svc", Role: RoleMain, Replicas: 4, Rank: 0},
+			},
+		},
+		{
+			name:          "single-node standalone inter-pod GMS (no failover)",
+			serviceName:   "svc",
+			numberOfNodes: 1,
+			component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
+				GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
+			},
+			expected: []ServiceRole{
+				{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
+				{Name: "svc", Role: RoleMain, Replicas: 1, Rank: 0},
+			},
+		},
+		{
+			name:          "multinode GMS 2 nodes 1 shadow",
+			serviceName:   "svc",
+			numberOfNodes: 2,
+			component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
+				GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
+				Failover:         &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod, NumShadows: 1},
+			},
+			expected: []ServiceRole{
+				{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
+				{Name: "svc-ldr", Role: RoleLeader, Replicas: 2, Rank: 0},
+				{Name: "svc-gms-1", Role: RoleGMS, Replicas: 1, Rank: 1},
+				{Name: "svc-wkr-1", Role: RoleWorker, Replicas: 2, Rank: 1},
+			},
+		},
+		{
+			name:          "multinode GMS 3 nodes 2 shadows",
+			serviceName:   "svc",
+			numberOfNodes: 3,
+			component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
+				GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
+				Failover:         &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod, NumShadows: 2},
+			},
+			expected: []ServiceRole{
+				{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
+				{Name: "svc-ldr", Role: RoleLeader, Replicas: 3, Rank: 0},
+				{Name: "svc-gms-1", Role: RoleGMS, Replicas: 1, Rank: 1},
+				{Name: "svc-wkr-1", Role: RoleWorker, Replicas: 3, Rank: 1},
+				{Name: "svc-gms-2", Role: RoleGMS, Replicas: 1, Rank: 2},
+				{Name: "svc-wkr-2", Role: RoleWorker, Replicas: 3, Rank: 2},
+			},
+		},
+		{
+			name:          "multinode standalone inter-pod GMS (no failover)",
+			serviceName:   "svc",
+			numberOfNodes: 2,
+			component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
+				GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
+			},
+			expected: []ServiceRole{
+				{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
+				{Name: "svc-ldr", Role: RoleLeader, Replicas: 1, Rank: 0},
+				{Name: "svc-gms-1", Role: RoleGMS, Replicas: 1, Rank: 1},
+				{Name: "svc-wkr-1", Role: RoleWorker, Replicas: 1, Rank: 1},
+			},
+		},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result := expandRolesForService(tt.serviceName, tt.serviceReplicas, tt.numberOfNodes)
+			component := tt.component
+			if component == nil {
+				component = &v1alpha1.DynamoComponentDeploymentSharedSpec{}
+			}
+			result := expandRolesForService(tt.serviceName, tt.serviceReplicas, tt.numberOfNodes, component)
 			if !reflect.DeepEqual(result, tt.expected) {
 				t.Errorf("expandRolesForService() = %v, want %v", result, tt.expected)
 			}
@@ -4802,8 +4934,8 @@ func TestApplyCliqueStartupDependencies(t *testing.T) {
 				gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, clique)
 			}

-			// Apply dependencies
-			applyCliqueStartupDependencies(gangSet, tt.roles, tt.backendFramework, tt.numberOfNodes)
+			// Apply dependencies (non-GMS)
+			applyCliqueStartupDependencies(gangSet, tt.roles, tt.backendFramework, tt.numberOfNodes, false)

 			// Verify StartupType
 			if tt.expectStartupType {
@@ -4832,6 +4964,80 @@ func TestApplyCliqueStartupDependencies(t *testing.T) {
 	}
 }

+func TestApplyCliqueStartupDependencies_GMS(t *testing.T) {
+	t.Run("gms_single_node_engine_starts_after_gms", func(t *testing.T) {
+		gmsRoles := []ServiceRole{
+			{Name: "svc-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
+			{Name: "svc", Role: RoleMain, Rank: 0, Replicas: 2},
+		}
+		gangSet := &grovev1alpha1.PodCliqueSet{
+			Spec: grovev1alpha1.PodCliqueSetSpec{
+				Template: grovev1alpha1.PodCliqueSetTemplateSpec{
+					Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{
+						{Name: "svc-gms-0", Spec: grovev1alpha1.PodCliqueSpec{RoleName: "svc-gms-0", Replicas: 1}},
+						{Name: "svc", Spec: grovev1alpha1.PodCliqueSpec{RoleName: "svc", Replicas: 2}},
+					},
+				},
+			},
+		}
+
+		applyCliqueStartupDependencies(gangSet, gmsRoles, BackendFrameworkVLLM, 1, true)
+
+		if gangSet.Spec.Template.StartupType == nil || *gangSet.Spec.Template.StartupType != grovev1alpha1.CliqueStartupTypeExplicit {
+			t.Fatal("expected CliqueStartupTypeExplicit")
+		}
+		for _, c := range gangSet.Spec.Template.Cliques {
+			switch c.Name {
+			case "svc-gms-0":
+				if c.Spec.StartsAfter != nil {
+					t.Errorf("GMS clique should have no startsAfter, got %v", c.Spec.StartsAfter)
+				}
+			case "svc":
+				if !reflect.DeepEqual(c.Spec.StartsAfter, []string{"svc-gms-0"}) {
+					t.Errorf("engine clique startsAfter = %v, want [svc-gms-0]", c.Spec.StartsAfter)
+				}
+			}
+		}
+	})
+
+	t.Run("gms_does_not_leak_startsAfter_to_unrelated_cliques", func(t *testing.T) {
+		gmsRoles := []ServiceRole{
+			{Name: "engine-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
+			{Name: "engine", Role: RoleMain, Rank: 0, Replicas: 2},
+		}
+		gangSet := &grovev1alpha1.PodCliqueSet{
+			Spec: grovev1alpha1.PodCliqueSetSpec{
+				Template: grovev1alpha1.PodCliqueSetTemplateSpec{
+					Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{
+						{Name: "frontend", Spec: grovev1alpha1.PodCliqueSpec{RoleName: "frontend", Replicas: 1}},
+						{Name: "engine-gms-0", Spec: grovev1alpha1.PodCliqueSpec{RoleName: "engine-gms-0", Replicas: 1}},
+						{Name: "engine", Spec: grovev1alpha1.PodCliqueSpec{RoleName: "engine", Replicas: 2}},
+					},
+				},
+			},
+		}
+
+		applyCliqueStartupDependencies(gangSet, gmsRoles, BackendFrameworkVLLM, 1, true)
+
+		for _, c := range gangSet.Spec.Template.Cliques {
+			switch c.Name {
+			case "frontend":
+				if c.Spec.StartsAfter != nil {
+					t.Errorf("frontend clique should have no startsAfter, got %v", c.Spec.StartsAfter)
+				}
+			case "engine-gms-0":
+				if c.Spec.StartsAfter != nil {
+					t.Errorf("GMS clique should have no startsAfter, got %v", c.Spec.StartsAfter)
+				}
+			case "engine":
+				if !reflect.DeepEqual(c.Spec.StartsAfter, []string{"engine-gms-0"}) {
+					t.Errorf("engine clique startsAfter = %v, want [engine-gms-0]", c.Spec.StartsAfter)
+				}
+			}
+		}
+	})
+}
+
 func TestGetCliqueStartupDependencies(t *testing.T) {
 	tests := []struct {
 		name              string
@@ -5064,6 +5270,7 @@ func TestGenerateBasePodSpec_Frontend(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if (err != nil) != tt.wantErr {
@@ -5140,6 +5347,7 @@ func TestGenerateBasePodSpec_PlannerServiceAccount(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if err != nil {
@@ -5263,6 +5471,7 @@ func TestGenerateBasePodSpec_DisableImagePullSecretDiscovery(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if err != nil {
@@ -5369,6 +5578,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)
 			if !assert.NoError(t, err) {
 				return
@@ -5421,7 +5631,9 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
 						Args:    []string{"-m", "dynamo.worker"},
 						Env: []corev1.EnvVar{
 							{Name: "ANOTHER_COMPONENTENV", Value: "true"},
-							{Name: "ANOTHER_CONTAINER_ENV", Value: "true"}, {Name: commonconsts.DynamoComponentEnvVar, Value: "worker"},
+							{Name: "ANOTHER_CONTAINER_ENV", Value: "true"},
+							{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
+							{Name: commonconsts.DynamoComponentEnvVar, Value: "worker"},
 							{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
 							{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
 							{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-deployment"},
@@ -5537,6 +5749,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if err != nil {
@@ -5634,6 +5847,7 @@ func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if tt.expectError {
@@ -5870,6 +6084,7 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if tt.expectError {
@@ -6082,6 +6297,7 @@ func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T)
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if tt.expectError {
@@ -6268,6 +6484,7 @@ func TestGenerateBasePodSpec_SecurityContext(t *testing.T) {
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
 				nil, // No checkpoint info in tests
+				nil, // Use default deployer
 			)

 			if err != nil {
@@ -6954,6 +7171,134 @@ func TestGenerateLabels_ReassertsRestoreIdentityLabelsAfterMetadataMerge(t *test
 	assert.Equal(t, "workerhash", labels[commonconsts.KubeLabelDynamoWorkerHash])
 }

+// TestGenerateGrovePodCliqueSet_GMSPodsDoNotCarryDiscoveryLabels pins the
+// contract that inter-pod GMS weight-server cliques (RoleGMS) do NOT carry
+// the kubernetes discovery labels, while engine cliques (RoleMain / RoleLeader
+// / RoleWorker) do — the latter matches the behavior introduced by
+// #8067 "per-container kube discovery for multi-engine pods". The Rust
+// discovery daemon (lib/runtime/src/discovery/kube/daemon.rs) uses these
+// labels as a reflector filter; GMS pods run gpu_memory_service.cli.server,
+// not the dynamo runtime, and never register a DynamoWorkerMetadata CR, so
+// they must be excluded to avoid reflector-store bloat and spurious wake-ups.
+func TestGenerateGrovePodCliqueSet_GMSPodsDoNotCarryDiscoveryLabels(t *testing.T) {
+	dgd := &v1alpha1.DynamoGraphDeployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-dgd",
+			Namespace: "test-ns",
+		},
+		Spec: v1alpha1.DynamoGraphDeploymentSpec{
+			BackendFramework: "vllm",
+			Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
+				"decode": {
+					ComponentType: commonconsts.ComponentTypeDecode,
+					Replicas:      ptr.To(int32(1)),
+					Resources: &v1alpha1.Resources{
+						Limits: &v1alpha1.ResourceItem{GPU: "1"},
+					},
+					GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{
+						Enabled: true,
+						Mode:    v1alpha1.GMSModeInterPod,
+					},
+					Failover: &v1alpha1.FailoverSpec{
+						Enabled:    true,
+						Mode:       v1alpha1.GMSModeInterPod,
+						NumShadows: 1,
+					},
+				},
+			},
+		},
+	}
+
+	controllerConfig := &configv1alpha1.OperatorConfiguration{
+		Discovery: configv1alpha1.DiscoveryConfiguration{Backend: "kubernetes"},
+		Infrastructure: configv1alpha1.InfrastructureConfiguration{
+			ETCDAddress: "etcd-address",
+			NATSAddress: "nats-address",
+		},
+	}
+
+	got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, controllerConfig, &controller_common.RuntimeConfig{DRAEnabled: true}, nil, nil, nil, nil, nil)
+	require.NoError(t, err)
+	require.NotNil(t, got)
+
+	var sawGMS, sawEngine bool
+	for _, clique := range got.Spec.Template.Cliques {
+		_, hasBackend := clique.Labels[commonconsts.KubeLabelDynamoDiscoveryBackend]
+		_, hasEnabled := clique.Labels[commonconsts.KubeLabelDynamoDiscoveryEnabled]
+		if strings.Contains(clique.Name, "gms") {
+			sawGMS = true
+			assert.False(t, hasBackend, "GMS clique %q must not carry KubeLabelDynamoDiscoveryBackend", clique.Name)
+			assert.False(t, hasEnabled, "GMS clique %q must not carry KubeLabelDynamoDiscoveryEnabled", clique.Name)
+		} else {
+			sawEngine = true
+			assert.True(t, hasBackend, "engine clique %q must carry KubeLabelDynamoDiscoveryBackend (#8067 contract)", clique.Name)
+			assert.True(t, hasEnabled, "engine clique %q must carry KubeLabelDynamoDiscoveryEnabled (#8067 contract)", clique.Name)
+		}
+	}
+	assert.True(t, sawGMS, "test setup should produce at least one GMS clique")
+	assert.True(t, sawEngine, "test setup should produce at least one engine clique")
+}
+
+// TestGenerateGrovePodCliqueSet_MinAvailable_FailoverShadowsAreRedundant pins
+// the contract that per-rank engine cliques in an inter-pod failover cohort
+// use MinAvailable=1 even when multinode (numberOfNodes > 1). Replicas here
+// represent (primary + shadows) AT THAT RANK — redundant hot spares of each
+// other, NOT NCCL peers. Gang-scheduling them (MinAvailable = Replicas) would
+// require every shadow at every rank to be Ready before Grove considered the
+// clique available, which defeats failover. See the minAvailable comment in
+// renderClique for the full rationale.
+func TestGenerateGrovePodCliqueSet_MinAvailable_FailoverShadowsAreRedundant(t *testing.T) {
+	const numberOfNodes int32 = 2
+	const numShadows int32 = 1
+	const totalEnginePods = numShadows + 1 // primary + shadows per rank
+
+	dgd := &v1alpha1.DynamoGraphDeployment{
+		ObjectMeta: metav1.ObjectMeta{Name: "test-dgd", Namespace: "test-ns"},
+		Spec: v1alpha1.DynamoGraphDeploymentSpec{
+			BackendFramework: "vllm",
+			Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
+				"decode": {
+					ComponentType:    commonconsts.ComponentTypeDecode,
+					Replicas:         ptr.To(int32(1)),
+					Multinode:        &v1alpha1.MultinodeSpec{NodeCount: numberOfNodes},
+					Resources:        &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{GPU: "1"}},
+					GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
+					Failover:         &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod, NumShadows: numShadows},
+				},
+			},
+		},
+	}
+
+	got, err := GenerateGrovePodCliqueSet(
+		context.Background(),
+		dgd,
+		&configv1alpha1.OperatorConfiguration{
+			Discovery:      configv1alpha1.DiscoveryConfiguration{Backend: "kubernetes"},
+			Infrastructure: configv1alpha1.InfrastructureConfiguration{ETCDAddress: "etcd-address", NATSAddress: "nats-address"},
+		},
+		&controller_common.RuntimeConfig{DRAEnabled: true},
+		nil, nil, nil, nil, nil,
+	)
+	require.NoError(t, err)
+	require.NotNil(t, got)
+
+	var sawEngineClique bool
+	for _, clique := range got.Spec.Template.Cliques {
+		require.NotNil(t, clique.Spec.MinAvailable, "clique %q has nil MinAvailable", clique.Name)
+		if strings.Contains(clique.Name, "gms") {
+			assert.EqualValues(t, 1, *clique.Spec.MinAvailable, "GMS clique %q MinAvailable", clique.Name)
+			assert.EqualValues(t, 1, clique.Spec.Replicas, "GMS clique %q Replicas", clique.Name)
+			continue
+		}
+		sawEngineClique = true
+		assert.EqualValues(t, totalEnginePods, clique.Spec.Replicas,
+			"multinode failover engine clique %q Replicas should be primary+shadows=%d", clique.Name, totalEnginePods)
+		assert.EqualValues(t, 1, *clique.Spec.MinAvailable,
+			"multinode failover engine clique %q MinAvailable must be 1 (shadows are redundant hot spares, NOT NCCL peers)", clique.Name)
+	}
+	assert.True(t, sawEngineClique, "test setup should produce at least one engine (non-GMS) clique")
+}
+
 func TestIsWorkerComponent(t *testing.T) {
 	workers := []string{commonconsts.ComponentTypeWorker, commonconsts.ComponentTypePrefill, commonconsts.ComponentTypeDecode}
 	nonWorkers := []string{commonconsts.ComponentTypeFrontend, commonconsts.ComponentTypePlanner, commonconsts.ComponentTypeEPP, "custom", ""}
@@ -7235,7 +7580,8 @@ func TestGenerateBasePodSpec_FrontendSidecar(t *testing.T) {
 				controllerConfig,
 				commonconsts.MultinodeDeploymentTypeGrove,
 				"test-service",
-				nil,
+				nil, // checkpointInfo
+				nil, // deployerOverride
 			)

 			if (err != nil) != tt.wantErr {

--- a/deploy/operator/internal/dynamo/grove.go
+++ b/deploy/operator/internal/dynamo/grove.go
@@ -23,31 +23,55 @@ import (

 type GroveMultinodeDeployer struct {
 	MultinodeDeployer
+	// IsInterPodGMS is true when this deployer produces pod specs for an
+	// engine PCLQ that uses the inter-pod GMS *layout* (one engine pod per
+	// rank, per shadow, with a dedicated GMS weight server pod). It is a
+	// layout/topology flag — not a failover policy flag — and governs how
+	// hostnames, node ranks, and per-pod wiring are computed. Callers set it
+	// for multinode services whenever inter-pod GMS is enabled, with or
+	// without failover, so the deployer must not assume failover is on.
+	IsInterPodGMS bool
+	Rank          int32 // explicit node rank (used when IsInterPodGMS is true)
 }

 func (d *GroveMultinodeDeployer) GetLeaderHostname(serviceName string) string {
-	return fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-0.$(GROVE_HEADLESS_SERVICE)", strings.ToLower(serviceName), commonconsts.GroveRoleSuffixLeader)
+	if d.IsInterPodGMS {
+		// GMS: each PCLQ has multiple replicas; pods at the same index across
+		// ranks form a communication group, so use the dynamic pod index.
+		return fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-$(GROVE_PCLQ_POD_INDEX).$(GROVE_HEADLESS_SERVICE)",
+			strings.ToLower(serviceName), commonconsts.GroveRoleSuffixLeader)
+	}
+	return fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-0.$(GROVE_HEADLESS_SERVICE)",
+		strings.ToLower(serviceName), commonconsts.GroveRoleSuffixLeader)
 }

 func (d *GroveMultinodeDeployer) GetNodeRank() (string, bool) {
-	// This requires shell expansion for arithmetic expression
+	if d.IsInterPodGMS {
+		return fmt.Sprintf("%d", d.Rank), false
+	}
 	return "$((GROVE_PCLQ_POD_INDEX + 1))", true
 }

 func (d *GroveMultinodeDeployer) NeedsDNSWait() bool {
-	// Grove doesn't need DNS wait - it handles startup coordination differently
 	return false
 }

 func (d *GroveMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes int32) []string {
 	hostnames := make([]string, 0, numberOfNodes)
-	leaderHostname := d.GetLeaderHostname(serviceName)
-	hostnames = append(hostnames, leaderHostname)
-	// Add worker hostnames
+	hostnames = append(hostnames, d.GetLeaderHostname(serviceName))
+
+	if d.IsInterPodGMS {
+		for rank := int32(1); rank < numberOfNodes; rank++ {
+			hostname := fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-%d-$(GROVE_PCLQ_POD_INDEX).$(GROVE_HEADLESS_SERVICE)",
+				strings.ToLower(serviceName), commonconsts.GroveRoleSuffixWorker, rank)
+			hostnames = append(hostnames, hostname)
+		}
+	} else {
 		for i := int32(0); i < numberOfNodes-1; i++ {
-		workerHostname := fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-%d.$(GROVE_HEADLESS_SERVICE)",
+			hostname := fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-%d.$(GROVE_HEADLESS_SERVICE)",
 				strings.ToLower(serviceName), commonconsts.GroveRoleSuffixWorker, i)
-		hostnames = append(hostnames, workerHostname)
+			hostnames = append(hostnames, hostname)
+		}
 	}
 	return hostnames
 }
@@ -63,18 +87,16 @@ func GetComponentReadinessAndServiceReplicaStatuses(ctx context.Context, client
 	serviceStatuses := make(map[string]v1alpha1.ServiceReplicaStatus, len(dgd.Spec.Services))

 	for serviceName, component := range dgd.Spec.Services {
-		isMultinode := component.GetNumberOfNodes() > 1
+		usesPCSG := component.GetNumberOfNodes() > 1 || component.IsInterPodGMSEnabled()
 		resourceName := fmt.Sprintf("%s-0-%s", dgd.Name, strings.ToLower(serviceName))

-		if isMultinode {
-			// Check PodCliqueScalingGroup: spec.replicas == status.availableReplicas
+		if usesPCSG {
 			ok, reason, serviceStatus := CheckPCSGReady(ctx, client, resourceName, dgd.Namespace, logger)
 			serviceStatuses[serviceName] = serviceStatus
 			if !ok {
 				notReadyComponents = append(notReadyComponents, fmt.Sprintf("pcsg/%s: %s", resourceName, reason))
 			}
 		} else {
-			// Check PodClique: spec.replicas == status.readyReplicas
 			ok, reason, serviceStatus := CheckPodCliqueReady(ctx, client, resourceName, dgd.Namespace, logger)
 			serviceStatuses[serviceName] = serviceStatus
 			if !ok {

--- a/deploy/operator/internal/dynamo/utils.go
+++ b/deploy/operator/internal/dynamo/utils.go
@@ -54,6 +54,69 @@ func shellQuoteForBashC(s string) string {
 	return s
 }
 
+// containerHasArg reports whether the container's Args already carry the
+// given flag/value pair. It is used to make flag injection idempotent.
+//
+// Three spellings are recognized:
+//   - two adjacent Args entries: "flag", "value"
+//   - "flag=value" as an entry of its own or embedded in a shell string
+//   - "flag value" embedded in a shell string
+//
+// Embedded matches must be delimited on both sides (see hasDelimitedToken),
+// so looking for "--port 80" does not report true for "--port 8080", and
+// looking for "--tp=1" does not report true for "--tp=16".
+// A nil container never matches.
+func containerHasArg(container *corev1.Container, flag, value string) bool {
+	if container == nil {
+		return false
+	}
+	spaced := flag + " " + value
+	assigned := flag + "=" + value
+	for i, arg := range container.Args {
+		if arg == flag && i+1 < len(container.Args) && container.Args[i+1] == value {
+			return true
+		}
+		if hasDelimitedToken(arg, spaced) || hasDelimitedToken(arg, assigned) {
+			return true
+		}
+	}
+	return false
+}
+
+// hasDelimitedToken reports whether token occurs in s with a boundary on both
+// sides: the start/end of s, or an isArgDelimiter byte. A plain substring
+// search would also accept occurrences that are only a prefix or suffix of a
+// longer word.
+func hasDelimitedToken(s, token string) bool {
+	if token == "" {
+		return false
+	}
+	for from := 0; from < len(s); {
+		rel := strings.Index(s[from:], token)
+		if rel < 0 {
+			return false
+		}
+		start := from + rel
+		end := start + len(token)
+		if (start == 0 || isArgDelimiter(s[start-1])) && (end == len(s) || isArgDelimiter(s[end])) {
+			return true
+		}
+		// Not a standalone occurrence; resume the search one byte further on.
+		from = start + 1
+	}
+	return false
+}
+
+// isArgDelimiter reports whether c separates words in a shell command string:
+// whitespace, quotes, or a command separator.
+func isArgDelimiter(c byte) bool {
+	switch c {
+	case ' ', '\t', '\n', '\'', '"', ';', '&', '|':
+		return true
+	}
+	return false
+}
+
 func injectFlagsIntoContainerCommand(container *corev1.Container, flags string, needsShell bool, framework string) {
 	if len(container.Command) > 0 && isPythonCommand(container.Command[0]) {
 		// Direct python command case

--- a/deploy/operator/internal/webhook/validation/dynamographdeployment.go
+++ b/deploy/operator/internal/webhook/validation/dynamographdeployment.go
@@ -43,6 +43,11 @@ const (
 	// Pod names follow formats like: <pcs-name>-<pcs-index>-<pcsg-name>-<pcsg-index>-<pclq-name>-<random>
 	// The random string and hyphens consume additional characters, leaving 45 for the resource names.
 	maxCombinedResourceNameLength = 45
+
+	// backendFrameworkVLLM is the spec.backendFramework value that identifies
+	// a vLLM deployment. Duplicated here (instead of importing from
+	// internal/dynamo) to avoid a webhook -> dynamo import cycle.
+	backendFrameworkVLLM = "vllm"
 )

 // DynamoGraphDeploymentValidator validates DynamoGraphDeployment resources.
@@ -50,21 +55,24 @@ const (
 type DynamoGraphDeploymentValidator struct {
 	deployment   *nvidiacomv1alpha1.DynamoGraphDeployment
 	mgr          ctrl.Manager // Optional: for API group detection via discovery client
+	groveEnabled bool
 }

 // NewDynamoGraphDeploymentValidator creates a new validator for DynamoGraphDeployment.
-func NewDynamoGraphDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoGraphDeployment) *DynamoGraphDeploymentValidator {
+// groveEnabled should reflect the operator's runtime config (global.grove.enabled).
+func NewDynamoGraphDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoGraphDeployment, groveEnabled bool) *DynamoGraphDeploymentValidator {
 	return &DynamoGraphDeploymentValidator{
 		deployment:   deployment,
-		mgr:        nil,
+		groveEnabled: groveEnabled,
 	}
 }

 // NewDynamoGraphDeploymentValidatorWithManager creates a validator with a manager for API group detection.
-func NewDynamoGraphDeploymentValidatorWithManager(deployment *nvidiacomv1alpha1.DynamoGraphDeployment, mgr ctrl.Manager) *DynamoGraphDeploymentValidator {
+func NewDynamoGraphDeploymentValidatorWithManager(deployment *nvidiacomv1alpha1.DynamoGraphDeployment, mgr ctrl.Manager, groveEnabled bool) *DynamoGraphDeploymentValidator {
 	return &DynamoGraphDeploymentValidator{
 		deployment:   deployment,
 		mgr:          mgr,
+		groveEnabled: groveEnabled,
 	}
 }

@@ -176,6 +184,44 @@ func (v *DynamoGraphDeploymentValidator) validateImmutableFields(old *nvidiacomv
 		}
 	}

+	// Validate inter-pod GMS layout and failover immutability.
+	//
+	// Flipping the inter-pod GMS layout or toggling failover within an
+	// inter-pod layout both change the PodClique topology (weight-server PCLQ,
+	// per-rank engine PCLQs, shadow PCLQs, DRA ResourceClaimTemplates), which
+	// Grove cannot transform in place. Force the user to delete and recreate.
+	for serviceName, newService := range v.deployment.Spec.Services {
+		oldService, exists := old.Spec.Services[serviceName]
+		if !exists {
+			continue
+		}
+		oldInterPodGMS := oldService.IsInterPodGMSEnabled()
+		newInterPodGMS := newService.IsInterPodGMSEnabled()
+		if oldInterPodGMS != newInterPodGMS {
+			errs = append(errs, fmt.Errorf(
+				"spec.services[%s].gpuMemoryService.mode: the inter-pod GMS layout cannot be toggled after creation; "+
+					"delete and recreate the DynamoGraphDeployment",
+				serviceName,
+			))
+		}
+		oldInterPodFailover := oldService.IsInterPodFailoverEnabled()
+		newInterPodFailover := newService.IsInterPodFailoverEnabled()
+		if oldInterPodFailover != newInterPodFailover {
+			errs = append(errs, fmt.Errorf(
+				"spec.services[%s].failover: inter-pod GMS failover cannot be toggled after creation; "+
+					"delete and recreate the DynamoGraphDeployment",
+				serviceName,
+			))
+		}
+		if oldInterPodFailover && newInterPodFailover && oldService.Failover.NumShadows != newService.Failover.NumShadows {
+			errs = append(errs, fmt.Errorf(
+				"spec.services[%s].failover.numShadows is immutable for inter-pod GMS failover; "+
+					"delete and recreate the DynamoGraphDeployment to change it",
+				serviceName,
+			))
+		}
+	}
+
 	// Validate topology constraint immutability
 	if err := v.validateTopologyConstraintImmutability(old); err != nil {
 		errs = append(errs, err)
@@ -279,6 +325,41 @@ func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv
 // validateService validates a single service configuration using SharedSpecValidator.
 // Returns warnings and error.
 func (v *DynamoGraphDeploymentValidator) validateService(ctx context.Context, serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) (admission.Warnings, error) {
+	// The inter-pod GMS layout (with or without failover) requires the Grove
+	// pathway: the weight-server pod, per-rank PCLQs, and DRA ResourceClaim
+	// templates are all wired at the PodCliqueScalingGroup level, which only
+	// the Grove renderer produces.
+	if service.IsInterPodGMSEnabled() && !v.isGrovePathway() {
+		if !v.groveEnabled {
+			return nil, fmt.Errorf(
+				"spec.services[%s]: gpuMemoryService.mode=%q requires the Grove pathway, but Grove is disabled at the operator level (global.grove.enabled=false)",
+				serviceName, nvidiacomv1alpha1.GMSModeInterPod)
+		}
+		return nil, fmt.Errorf(
+			"spec.services[%s]: gpuMemoryService.mode=%q requires the Grove pathway; remove or unset the %q annotation (currently %q)",
+			serviceName, nvidiacomv1alpha1.GMSModeInterPod,
+			consts.KubeAnnotationEnableGrove, v.deployment.Annotations[consts.KubeAnnotationEnableGrove])
+	}
+
+	// The inter-pod GMS layout is currently implemented only for vLLM (the
+	// engine relies on vLLM-specific runtime hooks like --load-format gms and
+	// DYN_VLLM_GMS_SHADOW_MODE that activate the GMS client path). Fail fast
+	// at admission rather than producing a broken deployment when another or
+	// no backend is configured — an empty BackendFramework means the operator
+	// cannot confirm the engine speaks vLLM, which is a hard prerequisite for
+	// inter-pod GMS (both standalone and with failover).
+	if service.IsInterPodGMSEnabled() &&
+		v.deployment.Spec.BackendFramework != backendFrameworkVLLM {
+		detected := v.deployment.Spec.BackendFramework
+		if detected == "" {
+			detected = "<unset>"
+		}
+		return nil, fmt.Errorf(
+			"spec.services[%s]: the inter-pod GMS layout (gpuMemoryService.mode=%q) is currently supported only for vLLM (detected: %s); "+
+				"set spec.backendFramework=%q",
+			serviceName, nvidiacomv1alpha1.GMSModeInterPod, detected, backendFrameworkVLLM)
+	}
+
 	// Validate service name length constraints for Grove PodCliqueSet naming
 	// Only validate when Grove pathway may be in use
 	if v.isGrovePathway() {
@@ -318,44 +399,69 @@ func (v *DynamoGraphDeploymentValidator) validateServiceNameLength(serviceName s
 	dgdName := v.deployment.Name
 	lowerServiceName := strings.ToLower(serviceName)

-	// Check if this is a multinode service
 	isMultinode := service.GetNumberOfNodes() > 1
+	isInterPodGMS := service.IsInterPodGMSEnabled()
+
+	// Determine the longest PodClique name that will be generated.
+	// Grove validates: len(PCS name) + len(PCSG name) + len(PCLQ name) <= 45
+	var longestPCLQName string
+	var pcsgName string

+	switch {
+	case isInterPodGMS:
+		// Inter-pod GMS services always get a PCSG named after the service.
+		// Longest PCLQ name is "serviceName-gms-0" (len + 6) or, when multinode, "serviceName-wkr-<maxRank>".
+		pcsgName = lowerServiceName
+		gmsName := fmt.Sprintf("%s-%s-0", lowerServiceName, consts.GroveRoleSuffixGMS)
+		longestPCLQName = gmsName
 		if isMultinode {
-		// For multinode: PodCliqueSet name + PodCliqueScalingGroup name + PodClique name (with leader suffix)
-		// The PodClique name is serviceName + "-ldr" (using GroveRoleSuffixLeader)
-		leaderPodCliqueName := lowerServiceName + "-" + consts.GroveRoleSuffixLeader
-		combinedLength := len(dgdName) + len(lowerServiceName) + len(leaderPodCliqueName)
+			// For high node counts, "svc-wkr-NN" can be longer than "svc-gms-0"
+			maxRank := service.GetNumberOfNodes() - 1
+			workerName := fmt.Sprintf("%s-%s-%d", lowerServiceName, consts.GroveRoleSuffixWorker, maxRank)
+			if len(workerName) > len(longestPCLQName) {
+				longestPCLQName = workerName
+			}
+		}
+
+	case isMultinode:
+		pcsgName = lowerServiceName
+		longestPCLQName = lowerServiceName + "-" + consts.GroveRoleSuffixLeader

+	default:
+		// Single-node non-GMS: no PCSG, only PCS + PCLQ
+		combinedLength := len(dgdName) + len(lowerServiceName)
 		if combinedLength > maxCombinedResourceNameLength {
 			return fmt.Errorf("spec.services[%s]: combined resource name length %d exceeds %d-character limit required for pod naming. "+
 				"Consider shortening the DynamoGraphDeployment name '%s' (length %d) or service name '%s' (length %d). "+
-				"For multinode services, the combined length of DGD name + service name + service name with role suffix (e.g., '%s-ldr') must not exceed %d characters",
+				"The combined length of DGD name + service name must not exceed %d characters",
 				serviceName, combinedLength, maxCombinedResourceNameLength,
 				dgdName, len(dgdName), serviceName, len(serviceName),
-				lowerServiceName, maxCombinedResourceNameLength)
+				maxCombinedResourceNameLength)
+		}
+		return nil
 	}
-	} else {
-		// For single-node: PodCliqueSet name + PodClique name
-		combinedLength := len(dgdName) + len(lowerServiceName)

+	// For services with PCSG: PCS name + PCSG name + longest PCLQ name
+	combinedLength := len(dgdName) + len(pcsgName) + len(longestPCLQName)
 	if combinedLength > maxCombinedResourceNameLength {
 		return fmt.Errorf("spec.services[%s]: combined resource name length %d exceeds %d-character limit required for pod naming. "+
 			"Consider shortening the DynamoGraphDeployment name '%s' (length %d) or service name '%s' (length %d). "+
-				"The combined length of DGD name + service name must not exceed %d characters",
+			"The combined length of DGD name + PCSG name + longest PodClique name ('%s') must not exceed %d characters",
 			serviceName, combinedLength, maxCombinedResourceNameLength,
 			dgdName, len(dgdName), serviceName, len(serviceName),
-				maxCombinedResourceNameLength)
-		}
+			longestPCLQName, maxCombinedResourceNameLength)
 	}

 	return nil
 }

 // isGrovePathway determines if Grove pathway may be used for this deployment.
-// Grove is used when the nvidia.com/enable-grove annotation is NOT explicitly set to "false".
-// This is a conservative check - if Grove might be used, we validate the name length constraints.
+// Grove requires both operator-level enablement (global.grove.enabled) and the
+// per-DGD annotation not being explicitly set to "false" (case-insensitive).
 func (v *DynamoGraphDeploymentValidator) isGrovePathway() bool {
+	if !v.groveEnabled {
+		return false
+	}
 	return v.deployment.Annotations == nil ||
 		strings.ToLower(v.deployment.Annotations[consts.KubeAnnotationEnableGrove]) != consts.KubeLabelValueFalse
 }
@@ -797,18 +903,22 @@ func (v *DynamoGraphDeploymentValidator) validateNoRestartDuringRollingUpdate(ol
 }

 // validateFailoverRequiresDiscoveryMode checks that when any service has
-// failover enabled, the DGD carries the nvidia.com/dynamo-kube-discovery-mode
-// annotation set to "container". Failover pods produce multiple engine
-// containers that each need their own discovery identity.
+// intra-pod failover enabled, the DGD carries the nvidia.com/dynamo-kube-discovery-mode
+// annotation set to "container". Intra-pod failover produces multiple engine
+// containers within the same pod that each need their own discovery identity.
+// Inter-pod failover uses separate pods, so the annotation is not required.
 func (v *DynamoGraphDeploymentValidator) validateFailoverRequiresDiscoveryMode() error {
-	hasFailover := false
+	hasIntraPodFailover := false
 	for _, svc := range v.deployment.Spec.Services {
-		if svc != nil && svc.Failover != nil && svc.Failover.Enabled {
-			hasFailover = true
+		if svc == nil || svc.Failover == nil || !svc.Failover.Enabled {
+			continue
+		}
+		if svc.Failover.Mode == nvidiacomv1alpha1.GMSModeIntraPod {
+			hasIntraPodFailover = true
 			break
 		}
 	}
-	if !hasFailover {
+	if !hasIntraPodFailover {
 		return nil
 	}


--- a/deploy/operator/internal/webhook/validation/dynamographdeployment_handler.go
+++ b/deploy/operator/internal/webhook/validation/dynamographdeployment_handler.go
@@ -43,15 +43,18 @@ const (
 type DynamoGraphDeploymentHandler struct {
 	mgr               manager.Manager
 	operatorPrincipal string
+	groveEnabled      bool
 }

 // NewDynamoGraphDeploymentHandler creates a new handler for DynamoGraphDeployment Webhook.
 // operatorPrincipal is the full Kubernetes SA username of the operator, used to authorize
 // replica changes on scaling-adapter-enabled services (#7656).
-func NewDynamoGraphDeploymentHandler(mgr manager.Manager, operatorPrincipal string) *DynamoGraphDeploymentHandler {
+// groveEnabled reflects the operator's runtime config (global.grove.enabled).
+func NewDynamoGraphDeploymentHandler(mgr manager.Manager, operatorPrincipal string, groveEnabled bool) *DynamoGraphDeploymentHandler {
 	return &DynamoGraphDeploymentHandler{
 		mgr:               mgr,
 		operatorPrincipal: operatorPrincipal,
+		groveEnabled:      groveEnabled,
 	}
 }

@@ -67,7 +70,7 @@ func (h *DynamoGraphDeploymentHandler) ValidateCreate(ctx context.Context, obj r
 	logger.Info("validate create", "name", deployment.Name, "namespace", deployment.Namespace)

 	// Create validator with manager for API group detection and perform validation
-	validator := NewDynamoGraphDeploymentValidatorWithManager(deployment, h.mgr)
+	validator := NewDynamoGraphDeploymentValidatorWithManager(deployment, h.mgr, h.groveEnabled)
 	return validator.Validate(ctx)
 }

@@ -94,7 +97,7 @@ func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldOb
 	}

 	// Create validator with manager for API group detection and perform validation.
-	validator := NewDynamoGraphDeploymentValidatorWithManager(newDeployment, h.mgr)
+	validator := NewDynamoGraphDeploymentValidatorWithManager(newDeployment, h.mgr, h.groveEnabled)
 	warnings, err := validator.Validate(ctx)
 	if err != nil {
 		return warnings, err

--- a/deploy/operator/internal/webhook/validation/dynamographdeployment_test.go
+++ b/deploy/operator/internal/webhook/validation/dynamographdeployment_test.go
@@ -42,6 +42,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 	tests := []struct {
 		name         string
 		deployment   *nvidiacomv1alpha1.DynamoGraphDeployment
+		groveEnabled bool
 		wantErr      bool
 		errMsg       string
 		errContains  bool
@@ -511,6 +512,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 		// Service name length validation tests
 		{
 			name:         "service name too long for single-node deployment",
+			groveEnabled: true,
 			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "verylongdynamographdeploymentname",
@@ -528,6 +530,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 		},
 		{
 			name:         "service name too long for multinode deployment",
+			groveEnabled: true,
 			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "vllm-agg",
@@ -549,6 +552,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 		},
 		{
 			name:         "valid service name length for single-node",
+			groveEnabled: true,
 			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "dgd",
@@ -564,6 +568,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 		},
 		{
 			name:         "valid service name length for multinode",
+			groveEnabled: true,
 			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "dgd",
@@ -583,6 +588,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 		},
 		{
 			name:         "boundary case - exactly at 45 char limit for single-node",
+			groveEnabled: true,
 			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
 				ObjectMeta: metav1.ObjectMeta{
 					// DGD name (3 chars) + service name (42 chars) = 45 chars (exactly at limit)
@@ -600,6 +606,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 		},
 		{
 			name:         "boundary case - one char over limit for single-node",
+			groveEnabled: true,
 			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
 				ObjectMeta: metav1.ObjectMeta{
 					// DGD name (3 chars) + service name (43 chars) = 46 chars (over limit)
@@ -620,6 +627,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 		// Grove disabled tests - service name length validation should be skipped
 		{
 			name:         "long service name allowed when Grove disabled via annotation",
+			groveEnabled: true,
 			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "verylongdynamographdeploymentname",
@@ -638,6 +646,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 		},
 		{
 			name:         "long multinode service name allowed when Grove disabled via annotation",
+			groveEnabled: true,
 			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "vllm-agg",
@@ -660,6 +669,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 		},
 		{
 			name:         "Grove annotation case insensitive - FALSE",
+			groveEnabled: true,
 			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
 				ObjectMeta: metav1.ObjectMeta{
 					Name:      "verylongdynamographdeploymentname",
@@ -676,6 +686,280 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 			},
 			wantErr: false,
 		},
+		// Inter-pod GMS and GMS failover validation test cases
+		{
+			name:         "valid GMS failover single-node with GPU",
+			groveEnabled: true,
+			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-gms",
+					Namespace: "default",
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Failover: &nvidiacomv1alpha1.FailoverSpec{
+								Enabled:    true,
+								Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+								NumShadows: 1,
+							},
+							Resources: &nvidiacomv1alpha1.Resources{
+								Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
+							},
+						},
+					},
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name:         "valid standalone inter-pod GMS (no failover) single-node with GPU",
+			groveEnabled: true,
+			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-gms-standalone",
+					Namespace: "default",
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Resources: &nvidiacomv1alpha1.Resources{
+								Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
+							},
+						},
+					},
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name:         "GMS failover without GPU",
+			groveEnabled: true,
+			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-gms",
+					Namespace: "default",
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Failover: &nvidiacomv1alpha1.FailoverSpec{
+								Enabled:    true,
+								Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+								NumShadows: 1,
+							},
+						},
+					},
+				},
+			},
+			wantErr:     true,
+			errContains: true,
+			// validateGPUMemoryService fires first when the inter-pod layout
+			// is declared without any GPU resources.
+			errMsg: "requires resources.limits.gpu",
+		},
+		{
+			name:         "inter-pod GMS on frontend component rejected",
+			groveEnabled: true,
+			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-gms",
+					Namespace: "default",
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"fe": {
+							ComponentType: "frontend",
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Resources: &nvidiacomv1alpha1.Resources{
+								Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "1"},
+							},
+						},
+					},
+				},
+			},
+			wantErr:     true,
+			errContains: true,
+			errMsg:      "GPU memory service is only supported for worker components",
+		},
+		{
+			name:         "GMS failover requires Grove pathway - annotation disabled",
+			groveEnabled: true,
+			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-gms",
+					Namespace: "default",
+					Annotations: map[string]string{
+						consts.KubeAnnotationEnableGrove: "false",
+					},
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Failover: &nvidiacomv1alpha1.FailoverSpec{
+								Enabled:    true,
+								Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+								NumShadows: 1,
+							},
+							Resources: &nvidiacomv1alpha1.Resources{
+								Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
+							},
+						},
+					},
+				},
+			},
+			wantErr:     true,
+			errContains: true,
+			errMsg:      "requires the Grove pathway",
+		},
+		{
+			name:         "GMS failover requires Grove pathway - operator grove disabled",
+			groveEnabled: false,
+			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-gms",
+					Namespace: "default",
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Failover: &nvidiacomv1alpha1.FailoverSpec{
+								Enabled:    true,
+								Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+								NumShadows: 1,
+							},
+							Resources: &nvidiacomv1alpha1.Resources{
+								Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
+							},
+						},
+					},
+				},
+			},
+			wantErr:     true,
+			errContains: true,
+			errMsg:      "requires the Grove pathway",
+		},
+		{
+			name:         "inter-pod GMS rejected on non-vLLM backend",
+			groveEnabled: true,
+			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-gms",
+					Namespace: "default",
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "sglang",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Failover: &nvidiacomv1alpha1.FailoverSpec{
+								Enabled:    true,
+								Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+								NumShadows: 1,
+							},
+							Resources: &nvidiacomv1alpha1.Resources{
+								Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
+							},
+						},
+					},
+				},
+			},
+			wantErr:     true,
+			errContains: true,
+			errMsg:      "currently supported only for vLLM",
+		},
+		{
+			name:         "inter-pod GMS rejected when backendFramework is unset",
+			groveEnabled: true,
+			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-gms",
+					Namespace: "default",
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					// BackendFramework intentionally left empty — the
+					// inter-pod gate must fail closed rather than silently
+					// accept a deployment whose engine may not speak vLLM.
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Failover: &nvidiacomv1alpha1.FailoverSpec{
+								Enabled:    true,
+								Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+								NumShadows: 1,
+							},
+							Resources: &nvidiacomv1alpha1.Resources{
+								Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
+							},
+						},
+					},
+				},
+			},
+			wantErr:     true,
+			errContains: true,
+			errMsg:      "currently supported only for vLLM",
+		},
+		{
+			name: "GMS failover disabled is valid without GPU",
+			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-gms",
+					Namespace: "default",
+				},
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							Failover: &nvidiacomv1alpha1.FailoverSpec{
+								Enabled: false,
+							},
+						},
+					},
+				},
+			},
+			wantErr: false,
+		},
 		// Annotation validation test cases
 		{
 			name: "valid annotation vllm-distributed-executor-backend=mp",
@@ -1245,7 +1529,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			validator := NewDynamoGraphDeploymentValidator(tt.deployment)
+			validator := NewDynamoGraphDeploymentValidator(tt.deployment, tt.groveEnabled)
 			_, err := validator.Validate(context.Background())

 			if (err != nil) != tt.wantErr {
@@ -1928,11 +2212,119 @@ func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) {
 			},
 			wantErr: false,
 		},
+		{
+			name: "toggling GMS failover is immutable",
+			oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+						},
+					},
+				},
+			},
+			newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Failover: &nvidiacomv1alpha1.FailoverSpec{
+								Enabled:    true,
+								Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+								NumShadows: 1,
+							},
+						},
+					},
+				},
+			},
+			wantErr: true,
+			errMsg:  "failover cannot be toggled after creation",
+		},
+		{
+			name: "toggling inter-pod GMS layout is immutable",
+			oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {},
+					},
+				},
+			},
+			newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+						},
+					},
+				},
+			},
+			wantErr: true,
+			errMsg:  "inter-pod GMS layout cannot be toggled after creation",
+		},
+		{
+			name: "changing numShadows is immutable",
+			oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Failover: &nvidiacomv1alpha1.FailoverSpec{
+								Enabled:    true,
+								Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+								NumShadows: 1,
+							},
+						},
+					},
+				},
+			},
+			newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
+				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
+					BackendFramework: "vllm",
+					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+						"worker": {
+							ComponentType: consts.ComponentTypeWorker,
+							GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+								Enabled: true,
+								Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+							},
+							Failover: &nvidiacomv1alpha1.FailoverSpec{
+								Enabled:    true,
+								Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+								NumShadows: 3,
+							},
+						},
+					},
+				},
+			},
+			wantErr: true,
+			errMsg:  "failover.numShadows is immutable",
+		},
 	}

 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			validator := NewDynamoGraphDeploymentValidator(tt.newDeployment)
+			validator := NewDynamoGraphDeploymentValidator(tt.newDeployment, true)
 			// Pass nil userInfo and empty operatorPrincipal - these tests don't modify replicas, so it's safe
 			warnings, err := validator.ValidateUpdate(tt.oldDeployment, nil, "")


--- a/deploy/operator/internal/webhook/validation/shared.go
+++ b/deploy/operator/internal/webhook/validation/shared.go
@@ -19,6 +19,7 @@ package validation

 import (
 	"context"
+	"errors"
 	"fmt"
 	"strconv"
 	"strings"
@@ -129,12 +130,12 @@ func (v *SharedSpecValidator) Validate(ctx context.Context) (admission.Warnings,
 		return nil, err
 	}

-	// Validate GPU memory service configuration
+	// Validate GPU memory service configuration (intra-pod and inter-pod layouts)
 	if err := v.validateGPUMemoryService(); err != nil {
 		return nil, err
 	}

-	// Validate failover configuration
+	// Validate GMS failover constraints
 	if err := v.validateFailover(); err != nil {
 		return nil, err
 	}
@@ -266,50 +267,128 @@ func (v *SharedSpecValidator) validateFrontendSidecar() error {
 	return nil
 }

-// validateFailover validates the failover configuration for a service.
-// Structural checks only — DRA/DeviceClass availability is checked by the controller
-// at reconcile time (same pattern as Grove orchestrator availability).
+// parseGPUCount returns the GPU quantity declared in r as an int. A non-empty
+// Limits.GPU takes precedence; otherwise Requests.GPU is used. A nil r, or
+// one with no GPU value in either place, yields (0, nil). A value that
+// strconv.Atoi cannot parse yields an error wrapping the Atoi failure.
+func parseGPUCount(r *nvidiacomv1alpha1.Resources) (int, error) {
+	var raw string
+	if r != nil && r.Limits != nil && r.Limits.GPU != "" {
+		raw = r.Limits.GPU
+	} else if r != nil && r.Requests != nil {
+		raw = r.Requests.GPU
+	}
+	if raw == "" {
+		return 0, nil
+	}
+	count, err := strconv.Atoi(raw)
+	if err != nil {
+		return 0, fmt.Errorf("invalid value %q: %w", raw, err)
+	}
+	return count, nil
+}
+
+// validateFailover validates GMS failover configuration constraints.
+//
+// Nothing is checked unless failover.enabled is true. An unset failover.mode
+// is validated as intraPod (NOTE(review): this mirrors how an unset
+// gpuMemoryService.mode is handled below; confirm against the API default).
+// Each mode requires gpuMemoryService.enabled and a compatible
+// gpuMemoryService.mode; interPod also requires numShadows >= 1, at least one
+// GPU and a supported componentType. Violations are combined via errors.Join.
 func (v *SharedSpecValidator) validateFailover() error {
 	if v.spec.Failover == nil || !v.spec.Failover.Enabled {
+		// When failover.enabled is false the sub-fields (mode, numShadows)
+		// are dormant configuration and the render path ignores them
+		// (GetNumShadows returns 0). We deliberately do not validate them
+		// here so users can stage a failover config before flipping
+		// enabled=true — matching the K8s convention that fields on a
+		// disabled feature are not constrained.
 		return nil
 	}

-	// Failover requires GPU memory service
+	var errs []error
+
+	// intraPod, or unset mode: require gpuMemoryService.enabled and a matching gpuMemoryService.mode.
+	if v.spec.Failover.Mode == "" || v.spec.Failover.Mode == nvidiacomv1alpha1.GMSModeIntraPod {
 		if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled {
-		return fmt.Errorf(
-			"%s.failover: failover requires gpuMemoryService.enabled to be true",
-			v.fieldPath)
+			errs = append(errs, fmt.Errorf(
+				"%s.failover: intraPod failover requires gpuMemoryService.enabled to be true",
+				v.fieldPath))
+		} else if v.spec.GPUMemoryService.Mode != "" &&
+			v.spec.GPUMemoryService.Mode != nvidiacomv1alpha1.GMSModeIntraPod {
+			errs = append(errs, fmt.Errorf(
+				"%s.failover: failover.mode %q must match gpuMemoryService.mode %q",
+				v.fieldPath, v.spec.Failover.Mode, v.spec.GPUMemoryService.Mode))
 		}

-	// Failover mode must match GMS mode when both are set
-	if v.spec.Failover.Mode != "" && v.spec.GPUMemoryService.Mode != "" &&
-		v.spec.Failover.Mode != v.spec.GPUMemoryService.Mode {
-		return fmt.Errorf(
-			"%s.failover: failover.mode %q must match gpuMemoryService.mode %q",
-			v.fieldPath, v.spec.Failover.Mode, v.spec.GPUMemoryService.Mode)
+		// intraPod is a fixed 1 primary + 1 shadow sidecar layout; numShadows
+		// is meaningless here and any value other than the implicit 1 is
+		// almost certainly a configuration error (user probably wanted
+		// mode=interPod).
+		if v.spec.Failover.NumShadows != 0 && v.spec.Failover.NumShadows != 1 {
+			errs = append(errs, fmt.Errorf(
+				"%s.failover.numShadows=%d is invalid for mode=%q: intraPod uses a fixed 1 primary + 1 shadow sidecar; "+
+					"use failover.mode=%q to configure numShadows",
+				v.fieldPath, v.spec.Failover.NumShadows, nvidiacomv1alpha1.GMSModeIntraPod, nvidiacomv1alpha1.GMSModeInterPod))
+		}
 	}

-	// interPod failover is not yet supported
+	// For inter-pod mode: require the inter-pod GMS layout (gpuMemoryService
+	// with mode=interPod) so failover hot-spares are added on top of an
+	// already-declared weight-server pod layout.
 	if v.spec.Failover.Mode == nvidiacomv1alpha1.GMSModeInterPod {
-		return fmt.Errorf(
-			"%s.failover: mode \"interPod\" is not yet supported",
-			v.fieldPath)
+		if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled {
+			errs = append(errs, fmt.Errorf(
+				"%s.failover: interPod failover requires gpuMemoryService.enabled=true and gpuMemoryService.mode=%q",
+				v.fieldPath, nvidiacomv1alpha1.GMSModeInterPod))
+		} else if v.spec.GPUMemoryService.Mode != nvidiacomv1alpha1.GMSModeInterPod {
+			// An unset gpuMemoryService.mode defaults to the intra-pod sidecar
+			// layout, which is incompatible with inter-pod failover; the user
+			// must set gpuMemoryService.mode=interPod explicitly.
+			detected := string(v.spec.GPUMemoryService.Mode)
+			if detected == "" {
+				detected = "<unset>"
+			}
+			errs = append(errs, fmt.Errorf(
+				"%s.failover: interPod failover requires gpuMemoryService.mode=%q (got %q)",
+				v.fieldPath, nvidiacomv1alpha1.GMSModeInterPod, detected))
 		}

-	return nil
+		if v.spec.Failover.NumShadows < 1 {
+			errs = append(errs, fmt.Errorf("%s.failover.numShadows must be >= 1", v.fieldPath))
+		}
+
+		gpuCount, err := parseGPUCount(v.spec.Resources)
+		if err != nil {
+			errs = append(errs, fmt.Errorf("%s.resources.limits.gpu: %w", v.fieldPath, err))
+		} else if gpuCount < 1 {
+			errs = append(errs, fmt.Errorf("%s: GMS failover requires at least 1 GPU in resources.limits.gpu", v.fieldPath))
+		}
+
+		switch v.spec.ComponentType {
+		case consts.ComponentTypeEPP, consts.ComponentTypeFrontend, consts.ComponentTypePlanner:
+			errs = append(errs, fmt.Errorf("%s: GMS failover is not supported for componentType %q", v.fieldPath, v.spec.ComponentType))
+		}
+	}
+
+	return errors.Join(errs...)
 }

+// validateGPUMemoryService validates gpuMemoryService constraints.
+//
+// gpuMemoryService declares the GMS layout (intra-pod sidecar vs. inter-pod
+// dedicated weight-server pod) and may be enabled independently of failover:
+// the intra-pod layout gives the engine a GMS sidecar in the same pod, and
+// the inter-pod layout gives it a dedicated weight-server pod paired with one
+// engine pod. Failover adds shadow engine pods on top of the declared layout
+// (see validateFailover); it is not the sole way to request the inter-pod
+// layout.
 func (v *SharedSpecValidator) validateGPUMemoryService() error {
 	if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled {
 		return nil
 	}

-	if v.spec.GPUMemoryService.Mode == nvidiacomv1alpha1.GMSModeInterPod {
-		return fmt.Errorf(
-			"%s.gpuMemoryService: mode \"interPod\" is not yet supported",
-			v.fieldPath)
-	}
-
 	isWorker := v.spec.ComponentType == consts.ComponentTypeWorker ||
 		v.spec.ComponentType == consts.ComponentTypePrefill ||
 		v.spec.ComponentType == consts.ComponentTypeDecode
@@ -319,27 +398,7 @@ func (v *SharedSpecValidator) validateGPUMemoryService() error {
 			v.fieldPath)
 	}

-	if v.spec.Resources == nil {
-		return fmt.Errorf(
-			"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
-			v.fieldPath)
-	}
-
-	gpuStr := ""
-	switch {
-	case v.spec.Resources.Limits != nil && v.spec.Resources.Limits.GPU != "":
-		gpuStr = v.spec.Resources.Limits.GPU
-	case v.spec.Resources.Requests != nil && v.spec.Resources.Requests.GPU != "":
-		gpuStr = v.spec.Resources.Requests.GPU
-	}
-
-	if gpuStr == "" {
-		return fmt.Errorf(
-			"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
-			v.fieldPath)
-	}
-
-	gpuCount, err := strconv.Atoi(gpuStr)
+	gpuCount, err := parseGPUCount(v.spec.Resources)
 	if err != nil || gpuCount < 1 {
 		return fmt.Errorf(
 			"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",

--- a/deploy/operator/internal/webhook/validation/shared_test.go
+++ b/deploy/operator/internal/webhook/validation/shared_test.go
@@ -396,6 +396,195 @@ func TestSharedSpecValidator_Validate_Warnings(t *testing.T) {
 	}
 }

+// TestSharedSpecValidator_Failover_ModeConstraints is a table-driven check of
+// how SharedSpecValidator.Validate treats combinations of gpuMemoryService
+// and failover settings on a worker component with one GPU:
+//
+//  1. gpuMemoryService alone (no failover) is accepted with mode=interPod,
+//     mode=intraPod, or mode left unset.
+//  2. failover.mode=interPod without gpuMemoryService is rejected, and the
+//     error mentions "gpuMemoryService.enabled=true".
+//  3. failover.mode=interPod with gpuMemoryService.mode=intraPod is
+//     rejected, and the error mentions "requires gpuMemoryService.mode".
+//  4. failover.mode=interPod with gpuMemoryService.mode=interPod and
+//     numShadows=1 is accepted.
+//  5. failover.mode=intraPod is accepted with numShadows=1 and rejected with
+//     numShadows=2 (the error mentions "numShadows").
+//  6. With failover.enabled=false, a populated numShadows is accepted: the
+//     sub-fields of a disabled failover block are not validated.
+func TestSharedSpecValidator_Failover_ModeConstraints(t *testing.T) {
+	oneGPU := &nvidiacomv1alpha1.Resources{
+		Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "1"},
+	}
+
+	cases := []struct {
+		name        string
+		spec        *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
+		expectErr   bool
+		errContains string
+	}{
+		{
+			name: "standalone inter-pod GMS (no failover) is accepted",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				ComponentType: consts.ComponentTypeWorker,
+				Resources:     oneGPU,
+				GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+					Enabled: true,
+					Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+				},
+			},
+			expectErr: false,
+		},
+		{
+			name: "sidecar gpuMemoryService mode=intraPod is accepted",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				ComponentType: consts.ComponentTypeWorker,
+				Resources:     oneGPU,
+				GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+					Enabled: true,
+					Mode:    nvidiacomv1alpha1.GMSModeIntraPod,
+				},
+			},
+			expectErr: false,
+		},
+		{
+			name: "sidecar gpuMemoryService mode unset is accepted",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				ComponentType: consts.ComponentTypeWorker,
+				Resources:     oneGPU,
+				GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+					Enabled: true,
+				},
+			},
+			expectErr: false,
+		},
+		{
+			name: "inter-pod failover requires gpuMemoryService.enabled",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				ComponentType: consts.ComponentTypeWorker,
+				Resources:     oneGPU,
+				Failover: &nvidiacomv1alpha1.FailoverSpec{
+					Enabled:    true,
+					Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+					NumShadows: 1,
+				},
+			},
+			expectErr:   true,
+			errContains: "gpuMemoryService.enabled=true",
+		},
+		{
+			name: "inter-pod failover requires gpuMemoryService.mode=interPod",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				ComponentType: consts.ComponentTypeWorker,
+				Resources:     oneGPU,
+				GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+					Enabled: true,
+					Mode:    nvidiacomv1alpha1.GMSModeIntraPod,
+				},
+				Failover: &nvidiacomv1alpha1.FailoverSpec{
+					Enabled:    true,
+					Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+					NumShadows: 1,
+				},
+			},
+			expectErr:   true,
+			errContains: "requires gpuMemoryService.mode",
+		},
+		{
+			name: "inter-pod failover with matching gpuMemoryService.mode=interPod is accepted",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				ComponentType: consts.ComponentTypeWorker,
+				Resources:     oneGPU,
+				GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+					Enabled: true,
+					Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+				},
+				Failover: &nvidiacomv1alpha1.FailoverSpec{
+					Enabled:    true,
+					Mode:       nvidiacomv1alpha1.GMSModeInterPod,
+					NumShadows: 1,
+				},
+			},
+			expectErr: false,
+		},
+		{
+			// failover.enabled=false leaves numShadows as inert configuration:
+			// this case pins that Validate accepts it unchanged, so a spec
+			// can carry failover settings before the feature is switched
+			// on.
+			name: "numShadows with failover.enabled=false is accepted (dormant config)",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				ComponentType: consts.ComponentTypeWorker,
+				Resources:     oneGPU,
+				GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+					Enabled: true,
+					Mode:    nvidiacomv1alpha1.GMSModeInterPod,
+				},
+				Failover: &nvidiacomv1alpha1.FailoverSpec{
+					Enabled:    false,
+					NumShadows: 2,
+				},
+			},
+			expectErr: false,
+		},
+		{
+			name: "intraPod failover with numShadows=2 is rejected",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				ComponentType: consts.ComponentTypeWorker,
+				Resources:     oneGPU,
+				GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+					Enabled: true,
+					Mode:    nvidiacomv1alpha1.GMSModeIntraPod,
+				},
+				Failover: &nvidiacomv1alpha1.FailoverSpec{
+					Enabled:    true,
+					Mode:       nvidiacomv1alpha1.GMSModeIntraPod,
+					NumShadows: 2,
+				},
+			},
+			expectErr:   true,
+			errContains: "numShadows",
+		},
+		{
+			name: "intraPod failover with numShadows=1 is accepted",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				ComponentType: consts.ComponentTypeWorker,
+				Resources:     oneGPU,
+				GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
+					Enabled: true,
+					Mode:    nvidiacomv1alpha1.GMSModeIntraPod,
+				},
+				Failover: &nvidiacomv1alpha1.FailoverSpec{
+					Enabled:    true,
+					Mode:       nvidiacomv1alpha1.GMSModeIntraPod,
+					NumShadows: 1,
+				},
+			},
+			expectErr: false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			validator := NewSharedSpecValidator(tc.spec, "spec", "default-my-dgd")
+			_, err := validator.Validate(context.Background())
+
+			// A rejected spec must produce an error that, when errContains
+			// is set, includes that substring; an accepted spec must
+			// validate without any error.
+			switch {
+			case tc.expectErr && err == nil:
+				t.Fatalf("expected error, got nil")
+			case tc.expectErr && tc.errContains != "" && !contains(err.Error(), tc.errContains):
+				// err is non-nil here: the nil case was handled above.
+				t.Errorf("error %q does not contain %q", err.Error(), tc.errContains)
+			case !tc.expectErr && err != nil:
+				t.Fatalf("unexpected error: %v", err)
+			}
+		})
+	}
+}
+
 // contains checks if s contains substr
 func contains(s, substr string) bool {
 	return len(s) >= len(substr) && (s == substr || len(substr) == 0 ||