feat: add inter-pod GMS (#7777)

a48672f5 · Julien Mancuso · GitHub · 0d635418 · a48672f5 · a48672f5
Unverified Commit a48672f5 authored Apr 23, 2026 by Julien Mancuso Committed by GitHub Apr 23, 2026
20 changed files
--- a/deploy/helm/charts/platform/Chart.yaml
+++ b/deploy/helm/charts/platform/Chart.yaml
@@ -40,6 +40,6 @@ dependencies:
    condition: global.kai-scheduler.install
  - name: grove-charts
    alias: grove
-    version: v0.1.0-alpha.7
+    version: v0.1.0-alpha.8
    repository: oci://ghcr.io/ai-dynamo/grove
    condition: global.grove.install
--- a/deploy/helm/charts/platform/README.md
+++ b/deploy/helm/charts/platform/README.md
@@ -100,7 +100,7 @@ The chart includes built-in validation to prevent all operator conflicts:
 | file://components/operator | dynamo-operator | 1.1.0 |
 | https://charts.bitnami.com/bitnami | etcd | 12.0.18 |
 | https://nats-io.github.io/k8s/helm/charts/ | nats | 1.3.2 |
-| oci://ghcr.io/ai-dynamo/grove | grove(grove-charts) | v0.1.0-alpha.7 |
+| oci://ghcr.io/ai-dynamo/grove | grove(grove-charts) | v0.1.0-alpha.8 |
 | oci://ghcr.io/kai-scheduler/kai-scheduler | kai-scheduler | v0.13.4 |
 ## Values
@@ -207,6 +207,7 @@ For **production environments**, Kai Scheduler and Grove should be installed sep
 | dynamo-platform | kai-scheduler | Grove |
 |-----------------|---------------|-------|
 | 1.0.x           | >= v0.13.0    | >= v0.1.0-alpha.6 |
+| 1.1.x           | >= v0.13.4    | >= v0.1.0-alpha.8 |
 After installing them separately, enable Dynamo integration:

--- a/deploy/helm/charts/platform/README.md.gotmpl
+++ b/deploy/helm/charts/platform/README.md.gotmpl
@@ -133,6 +133,7 @@ For **production environments**, Kai Scheduler and Grove should be installed sep
 | dynamo-platform | kai-scheduler | Grove |
 |-----------------|---------------|-------|
 | 1.0.x           | >= v0.13.0    | >= v0.1.0-alpha.6 |
+| 1.1.x           | >= v0.13.4    | >= v0.1.0-alpha.8 |
 After installing them separately, enable Dynamo integration:

--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamocomponentdeployments.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamocomponentdeployments.yaml
@@ -10551,19 +10551,20 @@ spec:
                  type: object
                failover:
                  description: |-
-                    Failover configures active-passive GPU failover for this service.
+                    Failover configures GMS (GPU Memory Service) failover for this service.
-                    When enabled, the main container is cloned into two engine containers
+                    For intraPod mode: the main container is cloned into two engine containers (active + standby).
-                    (active + standby) sharing GPUs via DRA. Requires gpuMemoryService.enabled.
+                    For interPod mode: the operator creates a dedicated GMS weight server pod and
+                    multiple engine pods per rank that share GPUs via DRA resource claims.
                  properties:
                    enabled:
-                      description: |-
+                      description: Enabled activates failover mode.
-                        Enabled activates failover mode. The main container is cloned into two
-                        engine containers (active + standby) sharing GPUs via DRA. The standby
-                        acquires the flock when the active engine fails.
                      type: boolean
                    mode:
                      default: intraPod
-                      description: Mode selects the failover deployment topology. Must match gpuMemoryService.mode.
+                      description: |-
+                        Mode selects the failover deployment topology.
+                        intraPod: engine containers run within the same pod (requires gpuMemoryService.enabled).
+                        interPod: a dedicated GMS weight server pod + engine pods per rank (requires Grove).
                      enum:
                        - intraPod
                        - interPod
@@ -10571,10 +10572,13 @@ spec:
                    numShadows:
                      default: 1
                      description: |-
-                        NumShadows is the number of shadow (standby) engine containers per rank.
+                        NumShadows is the number of shadow (standby) engine pods per rank.
-                        Reserved for future use — the operator currently creates exactly one shadow.
+                        Total engine pods per rank = NumShadows + 1 (1 primary + NumShadows shadows).
+                        NumShadows is only meaningful for mode=interPod; intraPod uses a fixed
+                        1 primary + 1 shadow sidecar layout and any value other than 1 is
+                        rejected at admission time.
                      format: int32
-                      maximum: 1
                      minimum: 1
                      type: integer
                  required:

--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeployments.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeployments.yaml
@@ -10774,19 +10774,20 @@ spec:
                        type: object
                      failover:
                        description: |-
-                          Failover configures active-passive GPU failover for this service.
+                          Failover configures GMS (GPU Memory Service) failover for this service.
-                          When enabled, the main container is cloned into two engine containers
+                          For intraPod mode: the main container is cloned into two engine containers (active + standby).
-                          (active + standby) sharing GPUs via DRA. Requires gpuMemoryService.enabled.
+                          For interPod mode: the operator creates a dedicated GMS weight server pod and
+                          multiple engine pods per rank that share GPUs via DRA resource claims.
                        properties:
                          enabled:
-                            description: |-
+                            description: Enabled activates failover mode.
-                              Enabled activates failover mode. The main container is cloned into two
-                              engine containers (active + standby) sharing GPUs via DRA. The standby
-                              acquires the flock when the active engine fails.
                            type: boolean
                          mode:
                            default: intraPod
-                            description: Mode selects the failover deployment topology. Must match gpuMemoryService.mode.
+                            description: |-
+                              Mode selects the failover deployment topology.
+                              intraPod: engine containers run within the same pod (requires gpuMemoryService.enabled).
+                              interPod: a dedicated GMS weight server pod + engine pods per rank (requires Grove).
                            enum:
                              - intraPod
                              - interPod
@@ -10794,10 +10795,13 @@ spec:
                          numShadows:
                            default: 1
                            description: |-
-                              NumShadows is the number of shadow (standby) engine containers per rank.
+                              NumShadows is the number of shadow (standby) engine pods per rank.
-                              Reserved for future use — the operator currently creates exactly one shadow.
+                              Total engine pods per rank = NumShadows + 1 (1 primary + NumShadows shadows).
+                              NumShadows is only meaningful for mode=interPod; intraPod uses a fixed
+                              1 primary + 1 shadow sidecar layout and any value other than 1 is
+                              rejected at admission time.
                            format: int32
-                            maximum: 1
                            minimum: 1
                            type: integer
                        required:

--- a/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml
+++ b/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml
@@ -266,6 +266,7 @@ rules:
  verbs:
  - create
  - delete
+  - deletecollection
  - get
  - list
  - patch

--- a/deploy/operator/.gitignore
+++ b/deploy/operator/.gitignore
--- a/deploy/operator/api/config/v1alpha1/types.go
+++ b/deploy/operator/api/config/v1alpha1/types.go
@@ -47,6 +47,9 @@ type OperatorConfiguration struct {
 	// Orchestrator configuration with optional overrides
 	Orchestrators OrchestratorConfiguration `json:"orchestrators"`
+	// DRA (Dynamic Resource Allocation) settings with optional override
+	DRA DRAConfiguration `json:"dra,omitempty"`
 	// Service mesh and infrastructure addresses
 	Infrastructure InfrastructureConfiguration `json:"infrastructure"`
@@ -194,6 +197,24 @@ type KaiSchedulerConfiguration struct {
 	Enabled *bool `json:"enabled,omitempty"`
 }
+// DRAConfiguration holds Dynamic Resource Allocation (resource.k8s.io) settings.
+//
+// NOTE: auto-detection only verifies that the resource.k8s.io API group is
+// registered on the apiserver (Kubernetes 1.32+). It does NOT verify that a
+// GPU-specific DRA resource driver (e.g. nvidia/k8s-dra-driver-gpu) is
+// installed, that its DeviceClass exists, or that node-level GPU drivers are
+// compatible. An admin can set `enabled: false` to force DRA integration off
+// on clusters where the API is present but the GPU driver stack is not wired
+// up — this makes the operator fail GMS / inter-pod failover admissions early
+// with a clear error instead of leaving pods stuck in Pending with a
+// confusing "resourceclaim not found" at schedule time.
+type DRAConfiguration struct {
+	// Enabled overrides auto-detection of the resource.k8s.io API group:
+	// nil = use the detected value; false = force DRA off even if detected;
+	// true = require DRA (the operator exits at startup if detection fails).
+	Enabled *bool `json:"enabled,omitempty"`
+}
 // InfrastructureConfiguration holds service mesh and backend addresses.
 type InfrastructureConfiguration struct {
 	// NATSAddress is the address of the NATS server

--- a/deploy/operator/api/config/v1alpha1/zz_generated.deepcopy.go
+++ b/deploy/operator/api/config/v1alpha1/zz_generated.deepcopy.go
@@ -120,6 +120,26 @@ func (in *CheckpointStorageConfiguration) DeepCopy() *CheckpointStorageConfigura
 	return out
 }
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *DRAConfiguration) DeepCopyInto(out *DRAConfiguration) {
+	*out = *in
+	if in.Enabled != nil {
+		in, out := &in.Enabled, &out.Enabled
+		*out = new(bool)
+		**out = **in
+	}
+}
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRAConfiguration.
+func (in *DRAConfiguration) DeepCopy() *DRAConfiguration {
+	if in == nil {
+		return nil
+	}
+	out := new(DRAConfiguration)
+	in.DeepCopyInto(out)
+	return out
+}
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *DiscoveryConfiguration) DeepCopyInto(out *DiscoveryConfiguration) {
 	*out = *in
@@ -353,6 +373,7 @@ func (in *OperatorConfiguration) DeepCopyInto(out *OperatorConfiguration) {
 	out.LeaderElection = in.LeaderElection
 	out.Namespace = in.Namespace
 	in.Orchestrators.DeepCopyInto(&out.Orchestrators)
+	in.DRA.DeepCopyInto(&out.DRA)
 	out.Infrastructure = in.Infrastructure
 	out.Ingress = in.Ingress
 	out.RBAC = in.RBAC

--- a/deploy/operator/api/v1alpha1/common.go
+++ b/deploy/operator/api/v1alpha1/common.go
@@ -161,7 +161,10 @@ type GPUMemoryServiceMode string
 const (
 	// GMSModeIntraPod runs GMS as a sidecar within the same pod.
 	GMSModeIntraPod GPUMemoryServiceMode = "intraPod"
-	// GMSModeInterPod runs GMS as a separate pod (not yet supported).
+	// GMSModeInterPod runs GMS as a separate weight server pod and one or more
+	// engine pods per rank, sharing GPUs via DRA ResourceClaims and a shared
+	// hostPath volume for UDS sockets. Only valid on FailoverSpec; the
+	// GPUMemoryServiceSpec sidecar always runs in intraPod mode.
 	GMSModeInterPod GPUMemoryServiceMode = "interPod"
 )
@@ -185,23 +188,28 @@ type GPUMemoryServiceSpec struct {
 }
 // FailoverSpec configures active-passive failover for a worker component.
-// Requires gpuMemoryService.enabled and the nvidia.com/dynamo-kube-discovery-mode: container
+// For intraPod mode: requires gpuMemoryService.enabled; the main container is cloned
-// annotation on the DGD.
+// into engine containers (active + standby) within the same pod.
+// For interPod mode: the operator creates a dedicated GMS weight server pod and
+// multiple engine pods per rank that share GPUs via DRA resource claims.
 type FailoverSpec struct {
-	// Enabled activates failover mode. The main container is cloned into two
+	// Enabled activates failover mode.
-	// engine containers (active + standby) sharing GPUs via DRA. The standby
-	// acquires the flock when the active engine fails.
 	Enabled bool `json:"enabled"`
-	// Mode selects the failover deployment topology. Must match gpuMemoryService.mode.
+	// Mode selects the failover deployment topology.
+	// intraPod: engine containers run within the same pod (requires gpuMemoryService.enabled).
+	// interPod: a dedicated GMS weight server pod + engine pods per rank (requires Grove).
 	// +kubebuilder:default=intraPod
 	// +kubebuilder:validation:Enum=intraPod;interPod
 	// +optional
 	Mode GPUMemoryServiceMode `json:"mode,omitempty"`
-	// NumShadows is the number of shadow (standby) engine containers per rank.
+	// NumShadows is the number of shadow (standby) engine pods per rank.
-	// Reserved for future use — the operator currently creates exactly one shadow.
+	// Total engine pods per rank = NumShadows + 1 (1 primary + NumShadows shadows).
+	//
+	// NumShadows is only meaningful for mode=interPod; intraPod uses a fixed
+	// 1 primary + 1 shadow sidecar layout and any value other than 1 is
+	// rejected at admission time.
 	// +kubebuilder:default=1
 	// +kubebuilder:validation:Minimum=1
-	// +kubebuilder:validation:Maximum=1
 	// +optional
 	NumShadows int32 `json:"numShadows,omitempty"`
 }

--- a/deploy/operator/api/v1alpha1/dynamocomponentdeployment_types.go
+++ b/deploy/operator/api/v1alpha1/dynamocomponentdeployment_types.go
@@ -155,9 +155,10 @@ type DynamoComponentDeploymentSharedSpec struct {
 	// +optional
 	GPUMemoryService *GPUMemoryServiceSpec `json:"gpuMemoryService,omitempty"`
-	// Failover configures active-passive GPU failover for this service.
+	// Failover configures GMS (GPU Memory Service) failover for this service.
-	// When enabled, the main container is cloned into two engine containers
+	// For intraPod mode: the main container is cloned into two engine containers (active + standby).
-	// (active + standby) sharing GPUs via DRA. Requires gpuMemoryService.enabled.
+	// For interPod mode: the operator creates a dedicated GMS weight server pod and
+	// multiple engine pods per rank that share GPUs via DRA resource claims.
 	// +optional
 	Failover *FailoverSpec `json:"failover,omitempty"`
 }
@@ -344,6 +345,59 @@ func (s *DynamoComponentDeploymentSharedSpec) GetNumberOfNodes() int32 {
 	return 1
 }
+// IsInterPodGMSEnabled reports whether the inter-pod GMS layout is requested
+// (dedicated GMS weight-server pod per rank + engine pods, sharing GPUs via
+// DRA), i.e. gpuMemoryService is non-nil, enabled, and in interPod mode. It is
+// a layout-only signal and does NOT imply failover: gate shadow engine pods
+// and failover-group cascade labels on IsInterPodFailoverEnabled() as well.
+func (s *DynamoComponentDeploymentSharedSpec) IsInterPodGMSEnabled() bool {
+	return s.GPUMemoryService != nil && s.GPUMemoryService.Enabled &&
+		s.GPUMemoryService.Mode == GMSModeInterPod // NOTE(review): GMSModeInterPod's doc says interPod is "only valid on FailoverSpec" — confirm which is intended
+}
+// IsInterPodFailoverEnabled reports whether failover with hot-spare shadow
+// engine pods is configured in interPod mode (failover is non-nil, enabled,
+// and mode == interPod). It does not itself check IsInterPodGMSEnabled();
+// that layout invariant is expected to be enforced by admission. Gate
+// shadow-pod expansion and failover-cascade labels on this method, and use
+// IsInterPodGMSEnabled() for layout-only decisions (weight-server PCLQ, DRA claims, Grove pathway gating, etc.).
+func (s *DynamoComponentDeploymentSharedSpec) IsInterPodFailoverEnabled() bool {
+	return s.Failover != nil && s.Failover.Enabled && s.Failover.Mode == GMSModeInterPod
+}
+// GetNumShadows returns the number of shadow (standby) engine pods per rank
+// for inter-pod GMS failover. It returns 0 whenever IsInterPodFailoverEnabled()
+// is false (this covers the standalone inter-pod GMS layout and intra-pod
+// failover), and falls back to 1 when failover.numShadows is unset or < 1.
+//
+// Callers that iterate "engine roles" must gate on IsInterPodFailoverEnabled()
+// first — treating a 0 return as "just the primary" is a bug, because the
+// primary is still modeled as a regular single-pod service in that case.
+func (s *DynamoComponentDeploymentSharedSpec) GetNumShadows() int32 {
+	if !s.IsInterPodFailoverEnabled() {
+		return 0
+	}
+	if s.Failover.NumShadows < 1 {
+		return 1
+	}
+	return s.Failover.NumShadows
+}
+// GetTotalEnginePods returns the total number of engine pods (primary +
+// shadows) for the inter-pod GMS layout, computed as GetNumShadows() + 1.
+// That is 1 for the standalone inter-pod layout (no failover) — a single
+// engine pod paired with a dedicated weight-server pod — and NumShadows+1
+// when inter-pod failover is enabled. Non-inter-pod layouts also yield 1.
+//
+// Callers that iterate "engine roles" must gate on IsInterPodGMSEnabled()
+// first — the 1 returned for non-inter-pod services is a convenience for
+// sizing math, NOT a signal that there is a "primary role" to iterate over;
+// the non-inter-pod path models the service as a single clique, not as
+// primary + shadows.
+func (s *DynamoComponentDeploymentSharedSpec) GetTotalEnginePods() int32 {
+	return s.GetNumShadows() + 1
+}
 func (s *DynamoComponentDeployment) GetParentGraphDeploymentName() string {
 	for _, ownerRef := range s.ObjectMeta.OwnerReferences {
 		if ownerRef.Kind == "DynamoGraphDeployment" {

--- a/deploy/operator/cmd/main.go
+++ b/deploy/operator/cmd/main.go
@@ -445,7 +445,23 @@ func main() {
 	}
 	setupLog.Info("Detecting DRA (Dynamic Resource Allocation) availability...")
-	runtimeConfig.DRAEnabled = commonController.DetectDRAAvailability(mainCtx, mgr)
+	draDetected := commonController.DetectDRAAvailability(mainCtx, mgr)
+	switch {
+	case operatorCfg.DRA.Enabled == nil:
+		runtimeConfig.DRAEnabled = draDetected
+	case *operatorCfg.DRA.Enabled:
+		if !draDetected {
+			setupLog.Error(nil,
+				"DRA is explicitly enabled in config but the resource.k8s.io API group"+
+					" was not detected in the cluster (requires Kubernetes 1.32+)",
+			)
+			os.Exit(1)
+		}
+		runtimeConfig.DRAEnabled = true
+	default:
+		setupLog.Info("DRA is explicitly disabled via config override")
+		runtimeConfig.DRAEnabled = false
+	}
 	setupLog.Info("Detected orchestrators availability",
 		"grove", runtimeConfig.GroveEnabled,
@@ -681,6 +697,15 @@ func registerControllers(
 		return fmt.Errorf("unable to create DynamoCheckpoint controller: %w", err)
 	}
+	if runtimeConfig.GroveEnabled {
+		if err = controller.NewFailoverCascadeReconciler(
+			mgr.GetClient(),
+			mgr.GetEventRecorderFor("gms-failover-cascade"),
+		).SetupWithManager(mgr); err != nil {
+			return fmt.Errorf("unable to create GMS FailoverCascade controller: %w", err)
+		}
+	}
 	setupLog.Info("Controllers registered successfully")
 	return nil
 }
@@ -716,7 +741,7 @@ func registerWebhooks(
 		return fmt.Errorf("unable to register DynamoComponentDeployment webhook: %w", err)
 	}
-	dgdHandler := webhookvalidation.NewDynamoGraphDeploymentHandler(mgr, operatorPrincipal)
+	dgdHandler := webhookvalidation.NewDynamoGraphDeploymentHandler(mgr, operatorPrincipal, runtimeConfig.GroveEnabled)
 	if err := dgdHandler.RegisterWithManager(mgr); err != nil {
 		return fmt.Errorf("unable to register DynamoGraphDeployment webhook: %w", err)
 	}

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml
@@ -10551,19 +10551,20 @@ spec:
                  type: object
                failover:
                  description: |-
-                    Failover configures active-passive GPU failover for this service.
+                    Failover configures GMS (GPU Memory Service) failover for this service.
-                    When enabled, the main container is cloned into two engine containers
+                    For intraPod mode: the main container is cloned into two engine containers (active + standby).
-                    (active + standby) sharing GPUs via DRA. Requires gpuMemoryService.enabled.
+                    For interPod mode: the operator creates a dedicated GMS weight server pod and
+                    multiple engine pods per rank that share GPUs via DRA resource claims.
                  properties:
                    enabled:
-                      description: |-
+                      description: Enabled activates failover mode.
-                        Enabled activates failover mode. The main container is cloned into two
-                        engine containers (active + standby) sharing GPUs via DRA. The standby
-                        acquires the flock when the active engine fails.
                      type: boolean
                    mode:
                      default: intraPod
-                      description: Mode selects the failover deployment topology. Must match gpuMemoryService.mode.
+                      description: |-
+                        Mode selects the failover deployment topology.
+                        intraPod: engine containers run within the same pod (requires gpuMemoryService.enabled).
+                        interPod: a dedicated GMS weight server pod + engine pods per rank (requires Grove).
                      enum:
                        - intraPod
                        - interPod
@@ -10571,10 +10572,13 @@ spec:
                    numShadows:
                      default: 1
                      description: |-
-                        NumShadows is the number of shadow (standby) engine containers per rank.
+                        NumShadows is the number of shadow (standby) engine pods per rank.
-                        Reserved for future use — the operator currently creates exactly one shadow.
+                        Total engine pods per rank = NumShadows + 1 (1 primary + NumShadows shadows).
+                        NumShadows is only meaningful for mode=interPod; intraPod uses a fixed
+                        1 primary + 1 shadow sidecar layout and any value other than 1 is
+                        rejected at admission time.
                      format: int32
-                      maximum: 1
                      minimum: 1
                      type: integer
                  required:

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml
@@ -10774,19 +10774,20 @@ spec:
                        type: object
                      failover:
                        description: |-
-                          Failover configures active-passive GPU failover for this service.
+                          Failover configures GMS (GPU Memory Service) failover for this service.
-                          When enabled, the main container is cloned into two engine containers
+                          For intraPod mode: the main container is cloned into two engine containers (active + standby).
-                          (active + standby) sharing GPUs via DRA. Requires gpuMemoryService.enabled.
+                          For interPod mode: the operator creates a dedicated GMS weight server pod and
+                          multiple engine pods per rank that share GPUs via DRA resource claims.
                        properties:
                          enabled:
-                            description: |-
+                            description: Enabled activates failover mode.
-                              Enabled activates failover mode. The main container is cloned into two
-                              engine containers (active + standby) sharing GPUs via DRA. The standby
-                              acquires the flock when the active engine fails.
                            type: boolean
                          mode:
                            default: intraPod
-                            description: Mode selects the failover deployment topology. Must match gpuMemoryService.mode.
+                            description: |-
+                              Mode selects the failover deployment topology.
+                              intraPod: engine containers run within the same pod (requires gpuMemoryService.enabled).
+                              interPod: a dedicated GMS weight server pod + engine pods per rank (requires Grove).
                            enum:
                              - intraPod
                              - interPod
@@ -10794,10 +10795,13 @@ spec:
                          numShadows:
                            default: 1
                            description: |-
-                              NumShadows is the number of shadow (standby) engine containers per rank.
+                              NumShadows is the number of shadow (standby) engine pods per rank.
-                              Reserved for future use — the operator currently creates exactly one shadow.
+                              Total engine pods per rank = NumShadows + 1 (1 primary + NumShadows shadows).
+                              NumShadows is only meaningful for mode=interPod; intraPod uses a fixed
+                              1 primary + 1 shadow sidecar layout and any value other than 1 is
+                              rejected at admission time.
                            format: int32
-                            maximum: 1
                            minimum: 1
                            type: integer
                        required:

--- a/deploy/operator/config/rbac/role.yaml
+++ b/deploy/operator/config/rbac/role.yaml
@@ -35,6 +35,8 @@ rules:
  resources:
  - pods
  verbs:
+  - delete
+  - deletecollection
  - get
  - list
  - watch

--- a/deploy/operator/go.mod
+++ b/deploy/operator/go.mod
@@ -6,7 +6,7 @@ require (
 	emperror.dev/errors v0.8.1
 	github.com/Masterminds/semver/v3 v3.4.0
 	github.com/ai-dynamo/dynamo/deploy/snapshot v0.0.0
-	github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.6
+	github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.8
 	github.com/bsm/gomega v1.27.10
 	github.com/go-logr/logr v1.4.3
 	github.com/google/go-cmp v0.7.0
@@ -26,7 +26,7 @@ require (
 	k8s.io/apiextensions-apiserver v0.34.3
 	k8s.io/apimachinery v0.34.3
 	k8s.io/client-go v0.34.3
-	k8s.io/utils v0.0.0-20251002143259-bc988d571ff4
+	k8s.io/utils v0.0.0-20260108192941-914a6e750570
 	sigs.k8s.io/controller-runtime v0.22.4
 	sigs.k8s.io/gateway-api-inference-extension v1.2.0
 	sigs.k8s.io/lws v0.6.1
@@ -74,7 +74,7 @@ require (
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/prometheus/procfs v0.17.0 // indirect
-	github.com/spf13/cobra v1.9.1 // indirect
+	github.com/spf13/cobra v1.10.1 // indirect
 	github.com/spf13/pflag v1.0.10 // indirect
 	github.com/stoewer/go-strcase v1.3.0 // indirect
 	github.com/x448/float16 v0.8.4 // indirect

--- a/deploy/operator/go.sum
+++ b/deploy/operator/go.sum
@@ -4,8 +4,8 @@ emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
 emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
 github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
 github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
-github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.6 h1:6xspRy93dVsGzwiRebNUhrEnamXUtAGvt5tP50uxkOA=
+github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.8 h1:hsAJ8YOsmxDBqB4bpFlnss5jVBJ8rSZ0W7mPLKIPo5A=
-github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.6/go.mod h1:kg35gYA1E7y8SGGZNfllkj6MBPtsdvc/TCNHL9Ysnpw=
+github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.8/go.mod h1:Vm2I1VlLyNKJRmSsAoaT5KzkMQXuzYKgZ1s2GPaURHQ=
 github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI=
 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -139,9 +139,9 @@ github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUO
 github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
 github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
-github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo=
+github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s=
-github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0=
+github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0=
-github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
+github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
 github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk=
 github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
 github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs=
@@ -294,8 +294,8 @@ k8s.io/kube-aggregator v0.33.4 h1:TdIJKHb0/bLpby7FblXIaVEzyA1jGEjzt/n9cRvwq8U=
 k8s.io/kube-aggregator v0.33.4/go.mod h1:wZuctdRvGde5bwzxkZRs0GYj2KOpCNgx8rRGVoNb62k=
 k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE=
 k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts=
-k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck=
+k8s.io/utils v0.0.0-20260108192941-914a6e750570 h1:JT4W8lsdrGENg9W+YwwdLJxklIuKWdRm+BC+xt33FOY=
-k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
+k8s.io/utils v0.0.0-20260108192941-914a6e750570/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk=
 sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM=
 sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
 sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A=

--- a/deploy/operator/internal/consts/consts.go
+++ b/deploy/operator/internal/consts/consts.go
@@ -121,7 +121,11 @@ const (
 	// Grove multinode role suffixes
 	GroveRoleSuffixLeader = "ldr"
 	GroveRoleSuffixWorker = "wkr"
+	GroveRoleSuffixGMS    = "gms"
+	KubeLabelDynamoFailoverEngineGroupMember = "nvidia.com/dynamo-failover-engine-group-member"
+	DiscoveryBackendKubernetes   = "kubernetes" // label value for KubeLabelDynamoDiscoveryBackend
 	MainContainerName            = "main"
 	FrontendSidecarContainerName = "sidecar-frontend"

--- a/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go
+++ b/deploy/operator/internal/controller/dynamocomponentdeployment_controller_test.go
@@ -770,6 +770,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
 										Command: []string{"/bin/sh", "-c"},
 										Args:    []string{"ray start --head --port=6379 && some dynamo command --tensor-parallel-size 4 --pipeline-parallel-size 1 --distributed-executor-backend ray"},
 										Env: []corev1.EnvVar{
+											{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
 											{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
 											{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
 											{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
@@ -912,6 +913,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
 										Command: []string{"/bin/sh", "-c"},
 										Args:    []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
 										Env: []corev1.EnvVar{
+											{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
 											{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
 											{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
 											{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},

--- a/deploy/operator/internal/controller/dynamographdeployment_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeployment_controller.go
@@ -422,9 +422,14 @@ func (r *DynamoGraphDeploymentReconciler) getUpdatedInProgressForGrove(ctx conte
 		var isReady bool
 		var reason string
-		if component.GetNumberOfNodes() > 1 {
+		// Keep in sync with reconcileGroveScaling: any service that requires a
+		// PodCliqueScalingGroup (multinode OR inter-pod GMS failover) must be
+		// queried via CheckPCSGReady. Otherwise a single-node service with
+		// inter-pod GMS failover stalls in the "in progress" list, because the
+		// PodClique that CheckPodCliqueReady looks up never exists.
+		usesPCSG := component.GetNumberOfNodes() > 1 || component.IsInterPodFailoverEnabled()
+		if usesPCSG {
 			isReady, reason, _ = dynamo.CheckPCSGReady(ctx, r.Client, resourceName, dgd.Namespace, logger)
 		} else {
 			isReady, reason, _ = dynamo.CheckPodCliqueReady(ctx, r.Client, resourceName, dgd.Namespace, logger)
 		}
@@ -625,13 +630,10 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
 			continue
 		}
-		numberOfNodes := component.GetNumberOfNodes()
+		usesPCSG := component.GetNumberOfNodes() > 1 || component.IsInterPodFailoverEnabled()
-		isMultinode := numberOfNodes > 1
-		if isMultinode {
-			// Scale PodCliqueScalingGroup for multinode services
-			// Grove naming pattern: {DGD.name}-{replicaIndex}-{serviceName}
 		resourceName := fmt.Sprintf("%s-%d-%s", dynamoDeployment.Name, replicaIndex, strings.ToLower(serviceName))
+		if usesPCSG {
 			err := r.scaleGroveResource(ctx,
 				resourceName,
 				dynamoDeployment.Namespace,
@@ -642,9 +644,6 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
 				return fmt.Errorf("failed to scale PodCliqueScalingGroup %s: %w", resourceName, err)
 			}
 		} else {
-			// Scale individual PodClique for single-node services
-			// Grove naming pattern: {DGD.name}-{replicaIndex}-{serviceName}
-			resourceName := fmt.Sprintf("%s-%d-%s", dynamoDeployment.Name, replicaIndex, strings.ToLower(serviceName))
 			err := r.scaleGroveResource(ctx,
 				resourceName,
 				dynamoDeployment.Namespace,
@@ -661,11 +660,29 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
 	return nil
 }
-func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (ReconcileResult, error) {
+// reconcileGMSResourceClaimTemplates syncs one ResourceClaimTemplate per
+// service when DRA is available, and otherwise fails fast if any service
+// needs DRA-backed GPU allocation.
+//
+// Both the GMS sidecar (gpuMemoryService.enabled=true) and inter-pod GMS
+// failover (failover.mode=interPod) allocate GPUs via DRA ResourceClaims.
+// Without DRA, pods would be admitted by the webhook but silently reference
+// ResourceClaimTemplates that reconcile never creates, producing a confusing
+// "resourceclaim not found" at schedule time. We fail fast here so the user
+// gets an actionable error instead.
+func (r *DynamoGraphDeploymentReconciler) reconcileGMSResourceClaimTemplates(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error {
 	logger := log.FromContext(ctx)
-	// Sync ResourceClaimTemplates for GMS-enabled components before creating pods.
+	if !r.RuntimeConfig.DRAEnabled {
-	if r.RuntimeConfig.DRAEnabled {
+		for _, component := range dynamoDeployment.Spec.Services {
+			if (component.GPUMemoryService != nil && component.GPUMemoryService.Enabled) ||
+				component.IsInterPodFailoverEnabled() {
+				return fmt.Errorf("gpuMemoryService / inter-pod GMS failover requires DRA (Dynamic Resource Allocation), but DRA is not available (either the resource.k8s.io API group is not registered on this cluster, which requires Kubernetes 1.32+, or DRA has been explicitly disabled in the operator configuration)")
+			}
+		}
+		return nil
+	}
 	for serviceName, component := range dynamoDeployment.Spec.Services {
 		gpuCount, deviceClassName := dra.ExtractGPUParams(component.GPUMemoryService, component.Resources)
 		claimTemplateName := dra.ResourceClaimTemplateName(dynamoDeployment.Name, serviceName)
@@ -674,15 +691,17 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
 		})
 		if err != nil {
 			logger.Error(err, "failed to sync GMS ResourceClaimTemplate", "service", serviceName)
-				return ReconcileResult{}, fmt.Errorf("failed to sync GMS ResourceClaimTemplate for %s: %w", serviceName, err)
+			return fmt.Errorf("failed to sync GMS ResourceClaimTemplate for %s: %w", serviceName, err)
-			}
-		}
-	} else {
-		for _, component := range dynamoDeployment.Spec.Services {
-			if component.GPUMemoryService != nil && component.GPUMemoryService.Enabled {
-				return ReconcileResult{}, fmt.Errorf("gpuMemoryService requires DRA (Dynamic Resource Allocation), but the resource.k8s.io API group is not available on this cluster (requires Kubernetes 1.32+)")
 		}
 	}
+	return nil
+}
+func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (ReconcileResult, error) {
+	logger := log.FromContext(ctx)
+	if err := r.reconcileGMSResourceClaimTemplates(ctx, dynamoDeployment); err != nil {
+		return ReconcileResult{}, err
 	}
 	grovePodCliqueSetAsResource, err := r.reconcileGrovePodCliqueSet(ctx, dynamoDeployment, restartState, checkpointInfos)
@@ -1455,6 +1474,7 @@ func (r *DynamoGraphDeploymentReconciler) buildCheckpointJobPodTemplate(
 		consts.MultinodeDeploymentTypeGrove, // Use Grove (single-node backends return early)
 		serviceName,
 		nil, // No checkpoint info for checkpoint creation jobs
+		nil, // Use default deployer
 	)
 	if err != nil {
 		return corev1.PodTemplateSpec{}, fmt.Errorf("failed to generate base pod spec: %w", err)