feat: add gpu memory service API on engine services (#8148)

Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

feat: add gpu memory service API on engine services (#8148)
Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
47897f36 · mohammedabdulwahhab · GitHub · 617d55c0 · 47897f36 · 47897f36
Unverified Commit 47897f36 authored Apr 14, 2026 by mohammedabdulwahhab Committed by GitHub Apr 14, 2026
20 changed files
--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamocomponentdeployments.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamocomponentdeployments.yaml
@@ -10727,6 +10727,30 @@ spec:
                globalDynamoNamespace:
                  description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace
                  type: boolean
+                gpuMemoryService:
+                  description: |-
+                    GPUMemoryService configures the GPU Memory Service (GMS) sidecar.
+                    When enabled, a GMS sidecar is injected and GPU access is managed via DRA.
+                  properties:
+                    deviceClassName:
+                      default: gpu.nvidia.com
+                      description: DeviceClassName is the DRA DeviceClass to request GPUs from.
+                      type: string
+                    enabled:
+                      description: |-
+                        Enabled activates the GMS sidecar. GPU resources on the main container
+                        are replaced with a DRA ResourceClaim for shared GPU access.
+                      type: boolean
+                    mode:
+                      default: intraPod
+                      description: Mode selects the GMS deployment topology.
+                      enum:
+                        - intraPod
+                        - interPod
+                      type: string
+                  required:
+                    - enabled
+                  type: object
                ingress:
                  description: Ingress config to expose the component outside the cluster (or through a service mesh).
                  properties:

--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeployments.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeployments.yaml
@@ -10950,6 +10950,30 @@ spec:
                      globalDynamoNamespace:
                        description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace
                        type: boolean
+                      gpuMemoryService:
+                        description: |-
+                          GPUMemoryService configures the GPU Memory Service (GMS) sidecar.
+                          When enabled, a GMS sidecar is injected and GPU access is managed via DRA.
+                        properties:
+                          deviceClassName:
+                            default: gpu.nvidia.com
+                            description: DeviceClassName is the DRA DeviceClass to request GPUs from.
+                            type: string
+                          enabled:
+                            description: |-
+                              Enabled activates the GMS sidecar. GPU resources on the main container
+                              are replaced with a DRA ResourceClaim for shared GPU access.
+                            type: boolean
+                          mode:
+                            default: intraPod
+                            description: Mode selects the GMS deployment topology.
+                            enum:
+                              - intraPod
+                              - interPod
+                            type: string
+                        required:
+                          - enabled
+                        type: object
                      ingress:
                        description: Ingress config to expose the component outside the cluster (or through a service mesh).
                        properties:

--- a/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml
+++ b/deploy/helm/charts/platform/components/operator/templates/manager-rbac.yaml
@@ -356,6 +356,18 @@ rules:
  - patch
  - update
  - watch
+- apiGroups:
+  - resource.k8s.io
+  resources:
+  - resourceclaimtemplates
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
 - apiGroups:
  - inference.networking.k8s.io
  resources:
@@ -585,6 +597,46 @@ roleRef:
  kind: ClusterRole
  name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-queue-reader
 subjects:
+- kind: ServiceAccount
+  name: '{{ include "dynamo-operator.fullname" . }}-controller-manager'
+  namespace: '{{ .Release.Namespace }}'
+---
+# ClusterRole for DRA DeviceClass access
+# This is always a ClusterRole since DeviceClass resources are cluster-scoped
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-deviceclass-reader
+  labels:
+    app.kubernetes.io/component: rbac
+    app.kubernetes.io/created-by: dynamo-operator
+    app.kubernetes.io/part-of: dynamo-operator
+  {{- include "dynamo-operator.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - resource.k8s.io
+  resources:
+  - deviceclasses
+  verbs:
+  - get
+  - list
+  - watch
+---
+# ClusterRoleBinding for DRA DeviceClass access
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-deviceclass-reader-binding
+  labels:
+    app.kubernetes.io/component: rbac
+    app.kubernetes.io/created-by: dynamo-operator
+    app.kubernetes.io/part-of: dynamo-operator
+  {{- include "dynamo-operator.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-deviceclass-reader
+subjects:
 - kind: ServiceAccount
  name: '{{ include "dynamo-operator.fullname" . }}-controller-manager'
  namespace: '{{ .Release.Namespace }}'
\ No newline at end of file
--- a/deploy/operator/api/v1alpha1/common.go
+++ b/deploy/operator/api/v1alpha1/common.go
@@ -155,6 +155,35 @@ func (e ExtraPodSpec) MarshalJSON() ([]byte, error) {
 	return json.Marshal(aux)
 }
+// GPUMemoryServiceMode selects the GMS deployment topology.
+type GPUMemoryServiceMode string
+const (
+	// GMSModeIntraPod runs GMS as a sidecar within the same pod.
+	GMSModeIntraPod GPUMemoryServiceMode = "intraPod"
+	// GMSModeInterPod runs GMS as a separate pod (not yet supported).
+	GMSModeInterPod GPUMemoryServiceMode = "interPod"
+)
+// GPUMemoryServiceSpec configures the GPU Memory Service (GMS) sidecar for a worker component.
+// When enabled, the operator injects a GMS sidecar that provides shared GPU memory access
+// via DRA (Dynamic Resource Allocation). The sidecar runs two GMS processes per GPU
+// (weights + kv_cache) and communicates with the main container over UDS sockets.
+type GPUMemoryServiceSpec struct {
+	// Enabled activates the GMS sidecar. GPU resources on the main container
+	// are replaced with a DRA ResourceClaim for shared GPU access.
+	Enabled bool `json:"enabled"`
+	// Mode selects the GMS deployment topology.
+	// +kubebuilder:default=intraPod
+	// +kubebuilder:validation:Enum=intraPod;interPod
+	// +optional
+	Mode GPUMemoryServiceMode `json:"mode,omitempty"`
+	// DeviceClassName is the DRA DeviceClass to request GPUs from.
+	// +kubebuilder:default="gpu.nvidia.com"
+	// +optional
+	DeviceClassName string `json:"deviceClassName,omitempty"`
+}
 // ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter
 // for replica management. When enabled, the DGDSA owns the replicas field and
 // external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource.

--- a/deploy/operator/api/v1alpha1/dynamocomponentdeployment_types.go
+++ b/deploy/operator/api/v1alpha1/dynamocomponentdeployment_types.go
@@ -149,6 +149,11 @@ type DynamoComponentDeploymentSharedSpec struct {
 	// must be narrower than or equal to the spec-level packDomain.
 	// +optional
 	TopologyConstraint *TopologyConstraint `json:"topologyConstraint,omitempty"`
+	// GPUMemoryService configures the GPU Memory Service (GMS) sidecar.
+	// When enabled, a GMS sidecar is injected and GPU access is managed via DRA.
+	// +optional
+	GPUMemoryService *GPUMemoryServiceSpec `json:"gpuMemoryService,omitempty"`
 }
 type MultinodeSpec struct {

--- a/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go
+++ b/deploy/operator/api/v1alpha1/zz_generated.deepcopy.go
@@ -557,6 +557,11 @@ func (in *DynamoComponentDeploymentSharedSpec) DeepCopyInto(out *DynamoComponent
 		*out = new(TopologyConstraint)
 		**out = **in
 	}
+	if in.GPUMemoryService != nil {
+		in, out := &in.GPUMemoryService, &out.GPUMemoryService
+		*out = new(GPUMemoryServiceSpec)
+		**out = **in
+	}
 }
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoComponentDeploymentSharedSpec.
@@ -1251,6 +1256,21 @@ func (in *FrontendSidecarSpec) DeepCopy() *FrontendSidecarSpec {
 	return out
 }
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *GPUMemoryServiceSpec) DeepCopyInto(out *GPUMemoryServiceSpec) {
+	*out = *in
+}
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUMemoryServiceSpec.
+func (in *GPUMemoryServiceSpec) DeepCopy() *GPUMemoryServiceSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(GPUMemoryServiceSpec)
+	in.DeepCopyInto(out)
+	return out
+}
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *IngressSpec) DeepCopyInto(out *IngressSpec) {
 	*out = *in

--- a/deploy/operator/cmd/main.go
+++ b/deploy/operator/cmd/main.go
@@ -444,11 +444,15 @@ func main() {
 		runtimeConfig.KaiSchedulerEnabled = false
 	}
+	setupLog.Info("Detecting DRA (Dynamic Resource Allocation) availability...")
+	runtimeConfig.DRAEnabled = commonController.DetectDRAAvailability(mainCtx, mgr)
 	setupLog.Info("Detected orchestrators availability",
 		"grove", runtimeConfig.GroveEnabled,
 		"lws", runtimeConfig.LWSEnabled,
 		"volcano", volcanoDetected,
 		"kai-scheduler", runtimeConfig.KaiSchedulerEnabled,
+		"dra", runtimeConfig.DRAEnabled,
 	)
 	dockerSecretRetriever := secrets.NewDockerSecretIndexer(mgr.GetClient())

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml
@@ -10727,6 +10727,30 @@ spec:
                globalDynamoNamespace:
                  description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace
                  type: boolean
+                gpuMemoryService:
+                  description: |-
+                    GPUMemoryService configures the GPU Memory Service (GMS) sidecar.
+                    When enabled, a GMS sidecar is injected and GPU access is managed via DRA.
+                  properties:
+                    deviceClassName:
+                      default: gpu.nvidia.com
+                      description: DeviceClassName is the DRA DeviceClass to request GPUs from.
+                      type: string
+                    enabled:
+                      description: |-
+                        Enabled activates the GMS sidecar. GPU resources on the main container
+                        are replaced with a DRA ResourceClaim for shared GPU access.
+                      type: boolean
+                    mode:
+                      default: intraPod
+                      description: Mode selects the GMS deployment topology.
+                      enum:
+                        - intraPod
+                        - interPod
+                      type: string
+                  required:
+                    - enabled
+                  type: object
                ingress:
                  description: Ingress config to expose the component outside the cluster (or through a service mesh).
                  properties:

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml
@@ -10950,6 +10950,30 @@ spec:
                      globalDynamoNamespace:
                        description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace
                        type: boolean
+                      gpuMemoryService:
+                        description: |-
+                          GPUMemoryService configures the GPU Memory Service (GMS) sidecar.
+                          When enabled, a GMS sidecar is injected and GPU access is managed via DRA.
+                        properties:
+                          deviceClassName:
+                            default: gpu.nvidia.com
+                            description: DeviceClassName is the DRA DeviceClass to request GPUs from.
+                            type: string
+                          enabled:
+                            description: |-
+                              Enabled activates the GMS sidecar. GPU resources on the main container
+                              are replaced with a DRA ResourceClaim for shared GPU access.
+                            type: boolean
+                          mode:
+                            default: intraPod
+                            description: Mode selects the GMS deployment topology.
+                            enum:
+                              - intraPod
+                              - interPod
+                            type: string
+                        required:
+                          - enabled
+                        type: object
                      ingress:
                        description: Ingress config to expose the component outside the cluster (or through a service mesh).
                        properties:

--- a/deploy/operator/config/rbac/role.yaml
+++ b/deploy/operator/config/rbac/role.yaml
@@ -244,6 +244,26 @@ rules:
  - get
  - patch
  - update
+- apiGroups:
+  - resource.k8s.io
+  resources:
+  - deviceclasses
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - resource.k8s.io
+  resources:
+  - resourceclaimtemplates
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
 - apiGroups:
  - scheduling.run.ai
  resources:

--- a/deploy/operator/internal/controller/dynamocomponentdeployment_controller.go
+++ b/deploy/operator/internal/controller/dynamocomponentdeployment_controller.go
@@ -30,6 +30,7 @@ import (
 	autoscalingv2 "k8s.io/api/autoscaling/v2"
 	corev1 "k8s.io/api/core/v1"
 	networkingv1 "k8s.io/api/networking/v1"
+	resourcev1 "k8s.io/api/resource/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"emperror.dev/errors"
@@ -183,6 +184,23 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
 		}
 	}
+	// Sync GMS ResourceClaimTemplate before creating workload resources
+	if r.RuntimeConfig.DRAEnabled {
+		serviceName := dynamoComponentDeployment.Spec.ServiceName
+		if serviceName == "" {
+			serviceName = dynamoComponentDeployment.Name
+		}
+		claimTemplateName := dynamo.GMSResourceClaimTemplateName(dynamoComponentDeployment.GetParentGraphDeploymentName(), serviceName)
+		_, _, err = commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*resourcev1.ResourceClaimTemplate, bool, error) {
+			return dynamo.GenerateGMSResourceClaimTemplate(ctx, r.Client, claimTemplateName, dynamoComponentDeployment.Namespace, &dynamoComponentDeployment.Spec.DynamoComponentDeploymentSharedSpec)
+		})
+		if err != nil {
+			return ctrl.Result{}, fmt.Errorf("failed to sync GMS ResourceClaimTemplate: %w", err)
+		}
+	} else if dynamoComponentDeployment.Spec.GPUMemoryService != nil && dynamoComponentDeployment.Spec.GPUMemoryService.Enabled {
+		return ctrl.Result{}, fmt.Errorf("gpuMemoryService requires DRA (Dynamic Resource Allocation), but the resource.k8s.io API group is not available on this cluster (requires Kubernetes 1.32+)")
+	}
 	// Create the appropriate workload resource based on deployment type
 	var componentReconcileResult ComponentReconcileResult
 	if r.RuntimeConfig.LWSEnabled && dynamoComponentDeployment.IsMultinode() {

--- a/deploy/operator/internal/controller/dynamographdeployment_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeployment_controller.go
@@ -35,6 +35,7 @@ import (
 	networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
 	corev1 "k8s.io/api/core/v1"
 	networkingv1 "k8s.io/api/networking/v1"
+	resourcev1 "k8s.io/api/resource/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/types"
@@ -90,6 +91,8 @@ type DynamoGraphDeploymentReconciler struct {
 // +kubebuilder:rbac:groups=grove.io,resources=clustertopologies,verbs=get;list;watch
 // +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list
 // +kubebuilder:rbac:groups=inference.networking.k8s.io,resources=inferencepools,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=resource.k8s.io,resources=deviceclasses,verbs=get;list;watch
 // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
 // +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch
@@ -658,6 +661,27 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
 func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (ReconcileResult, error) {
 	logger := log.FromContext(ctx)
+	// Sync ResourceClaimTemplates for GMS-enabled components before creating pods.
+	if r.RuntimeConfig.DRAEnabled {
+		for serviceName, component := range dynamoDeployment.Spec.Services {
+			svcComponent := component
+			claimTemplateName := dynamo.GMSResourceClaimTemplateName(dynamoDeployment.Name, serviceName)
+			_, _, err := commoncontroller.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*resourcev1.ResourceClaimTemplate, bool, error) {
+				return dynamo.GenerateGMSResourceClaimTemplate(ctx, r.Client, claimTemplateName, dynamoDeployment.Namespace, svcComponent)
+			})
+			if err != nil {
+				logger.Error(err, "failed to sync GMS ResourceClaimTemplate", "service", serviceName)
+				return ReconcileResult{}, fmt.Errorf("failed to sync GMS ResourceClaimTemplate for %s: %w", serviceName, err)
+			}
+		}
+	} else {
+		for _, component := range dynamoDeployment.Spec.Services {
+			if component.GPUMemoryService != nil && component.GPUMemoryService.Enabled {
+				return ReconcileResult{}, fmt.Errorf("gpuMemoryService requires DRA (Dynamic Resource Allocation), but the resource.k8s.io API group is not available on this cluster (requires Kubernetes 1.32+)")
+			}
+		}
+	}
 	grovePodCliqueSetAsResource, err := r.reconcileGrovePodCliqueSet(ctx, dynamoDeployment, restartState, checkpointInfos)
 	if err != nil {
 		logger.Error(err, "failed to reconcile the Grove PodClique Set")

--- a/deploy/operator/internal/controller_common/predicate.go
+++ b/deploy/operator/internal/controller_common/predicate.go
@@ -62,6 +62,12 @@ func DetectInferencePoolAvailability(ctx context.Context, mgr ctrl.Manager) bool
 	return detectAPIGroupAvailability(ctx, mgr, "inference.networking.k8s.io")
 }
+// DetectDRAAvailability checks if Dynamic Resource Allocation is available
+// by checking if the resource.k8s.io API group is registered (Kubernetes 1.32+)
+func DetectDRAAvailability(ctx context.Context, mgr ctrl.Manager) bool {
+	return detectAPIGroupAvailability(ctx, mgr, "resource.k8s.io")
+}
 // detectAPIGroupAvailability checks if a specific API group is registered in the cluster
 func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName string) bool {
 	logger := log.FromContext(ctx)

--- a/deploy/operator/internal/controller_common/runtime.go
+++ b/deploy/operator/internal/controller_common/runtime.go
@@ -26,6 +26,8 @@ type RuntimeConfig struct {
 	LWSEnabled bool
 	// KaiSchedulerEnabled is the resolved Kai-scheduler availability (config override merged with auto-detection)
 	KaiSchedulerEnabled bool
+	// DRAEnabled indicates whether Dynamic Resource Allocation (resource.k8s.io) is available
+	DRAEnabled bool
 	// ExcludedNamespaces for cluster-wide mode namespace filtering
 	ExcludedNamespaces ExcludedNamespacesInterface
 }
--- a/deploy/operator/internal/dynamo/gms.go
+++ b/deploy/operator/internal/dynamo/gms.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package dynamo
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+	"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
+	corev1 "k8s.io/api/core/v1"
+	resourcev1 "k8s.io/api/resource/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/utils/ptr"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+const (
+	gmsSharedVolumeName      = "gms-shared"
+	gmsSharedMountPath       = "/shared"
+	gmsDRAClaimName          = "shared-gpu"
+	defaultDeviceClassName   = "gpu.nvidia.com"
+	gmsProcessesPerGPU       = 2
+	gmsStartupProbeTimeout   = 2 * time.Minute
+	gmsStartupProbePeriodSec = 2
+)
+func isGMSEnabled(component *v1alpha1.DynamoComponentDeploymentSharedSpec) bool {
+	return component.GPUMemoryService != nil && component.GPUMemoryService.Enabled
+}
+// getGPUCount extracts the GPU count from the component resource spec.
+func getGPUCount(component *v1alpha1.DynamoComponentDeploymentSharedSpec) (int, error) {
+	if component.Resources == nil {
+		return 0, fmt.Errorf("resources must be specified when GPU memory service is enabled")
+	}
+	gpuStr := ""
+	if component.Resources.Limits != nil && component.Resources.Limits.GPU != "" {
+		gpuStr = component.Resources.Limits.GPU
+	} else if component.Resources.Requests != nil && component.Resources.Requests.GPU != "" {
+		gpuStr = component.Resources.Requests.GPU
+	}
+	if gpuStr == "" {
+		return 0, fmt.Errorf("GPU count must be specified when GPU memory service is enabled")
+	}
+	count, err := strconv.Atoi(gpuStr)
+	if err != nil {
+		return 0, fmt.Errorf("invalid GPU count %q: %w", gpuStr, err)
+	}
+	return count, nil
+}
+// getDeviceClassName returns the DRA DeviceClass name for the component.
+// It reads from GPUMemoryServiceSpec.DeviceClassName, falling back to the default.
+func getDeviceClassName(component *v1alpha1.DynamoComponentDeploymentSharedSpec) string {
+	if component.GPUMemoryService != nil && component.GPUMemoryService.DeviceClassName != "" {
+		return component.GPUMemoryService.DeviceClassName
+	}
+	return defaultDeviceClassName
+}
+// applyGPUMemoryService transforms a pod spec to include a GMS sidecar with
+// DRA shared GPU access. The main container's GPU resources are replaced with
+// a DRA ResourceClaim, and a GMS init container is added.
+//
+// claimTemplateName is the name of the ResourceClaimTemplate that will provide
+// shared GPU access; callers should compute it via GMSResourceClaimTemplateName.
+func applyGPUMemoryService(
+	podSpec *corev1.PodSpec,
+	component *v1alpha1.DynamoComponentDeploymentSharedSpec,
+	claimTemplateName string,
+) error {
+	if len(podSpec.Containers) == 0 {
+		return fmt.Errorf("pod spec must have at least one container for GPU memory service")
+	}
+	gpuCount, err := getGPUCount(component)
+	if err != nil {
+		return err
+	}
+	mainContainer := &podSpec.Containers[0]
+	// Replace GPU resources with DRA claim on main container
+	removeGPUResources(mainContainer)
+	mainContainer.Resources.Claims = append(mainContainer.Resources.Claims, corev1.ResourceClaim{
+		Name: gmsDRAClaimName,
+	})
+	// Add shared volume mount and TMPDIR to main container
+	mainContainer.VolumeMounts = append(mainContainer.VolumeMounts, corev1.VolumeMount{
+		Name:      gmsSharedVolumeName,
+		MountPath: gmsSharedMountPath,
+	})
+	mainContainer.Env = append(mainContainer.Env, corev1.EnvVar{
+		Name: "TMPDIR", Value: gmsSharedMountPath,
+	})
+	// Add GMS sidecar
+	gmsSidecar := buildGMSSidecar(mainContainer.Image, gpuCount)
+	podSpec.InitContainers = append(podSpec.InitContainers, gmsSidecar)
+	// Add shared volume
+	podSpec.Volumes = append(podSpec.Volumes, gmsSharedVolume())
+	// GPU nodes are typically tainted with nvidia.com/gpu=NoSchedule. With
+	// traditional scheduling the device-plugin injects the matching toleration,
+	// but DRA bypasses that path. Re-add the toleration explicitly so the pod
+	// can schedule on GPU nodes.
+	podSpec.Tolerations = append(podSpec.Tolerations, corev1.Toleration{
+		Key:      commonconsts.KubeResourceGPUNvidia,
+		Operator: corev1.TolerationOpExists,
+		Effect:   corev1.TaintEffectNoSchedule,
+	})
+	// Add pod-level DRA resource claim referencing the ResourceClaimTemplate
+	podSpec.ResourceClaims = append(podSpec.ResourceClaims, corev1.PodResourceClaim{
+		Name:                      gmsDRAClaimName,
+		ResourceClaimTemplateName: &claimTemplateName,
+	})
+	return nil
+}
+// removeGPUResources strips nvidia.com/gpu from container resource limits and requests.
+// GPU allocation is handled by DRA when GMS is enabled.
+func removeGPUResources(container *corev1.Container) {
+	gpuResource := corev1.ResourceName(commonconsts.KubeResourceGPUNvidia)
+	if container.Resources.Limits != nil {
+		delete(container.Resources.Limits, gpuResource)
+	}
+	if container.Resources.Requests != nil {
+		delete(container.Resources.Requests, gpuResource)
+	}
+}
+// buildGMSSidecar creates the GMS weight server as a sidecar init container
+// (restartPolicy: Always). kubelet starts it before regular containers and
+// keeps it running for the pod's lifetime.
+//
+// Each GPU gets two GMS subprocesses (weights + kv_cache) via a bash wrapper
+// that forwards signals and exits if any child dies. TMPDIR is set so
+// UUID-based sockets land in the shared volume.
+func buildGMSSidecar(image string, gpuCount int) corev1.Container {
+	return corev1.Container{
+		Name:          "gms-weights",
+		Image:         image,
+		Command:       []string{"bash", "-c"},
+		Args:          []string{gmsWrapperScript(gpuCount)},
+		RestartPolicy: ptr.To(corev1.ContainerRestartPolicyAlways),
+		Env: []corev1.EnvVar{
+			{Name: "TMPDIR", Value: gmsSharedMountPath},
+		},
+		VolumeMounts: []corev1.VolumeMount{
+			{
+				Name:      gmsSharedVolumeName,
+				MountPath: gmsSharedMountPath,
+			},
+		},
+		StartupProbe: &corev1.Probe{
+			ProbeHandler: corev1.ProbeHandler{
+				Exec: &corev1.ExecAction{
+					Command: gmsReadyCheckCommand(gpuCount),
+				},
+			},
+			PeriodSeconds:    int32(gmsStartupProbePeriodSec),
+			FailureThreshold: int32(gmsStartupProbeTimeout/time.Second) / int32(gmsStartupProbePeriodSec),
+		},
+		Resources: corev1.ResourceRequirements{
+			Claims: []corev1.ResourceClaim{
+				{Name: gmsDRAClaimName},
+			},
+		},
+	}
+}
+// gmsWrapperScript generates a bash script that launches two GMS subprocesses
+// per GPU device (one for weights, one for kv_cache), waits for any to exit,
+// then tears down the process group.
+func gmsWrapperScript(gpuCount int) string {
+	devList := make([]string, gpuCount)
+	for i := range gpuCount {
+		devList[i] = strconv.Itoa(i)
+	}
+	return fmt.Sprintf(
+		`trap 'kill 0 2>/dev/null || true' EXIT
+for dev in %s; do
+  python3 -m gpu_memory_service --device "$dev" --tag weights &
+  echo "Started GMS device=$dev tag=weights pid=$!"
+  python3 -m gpu_memory_service --device "$dev" --tag kv_cache &
+  echo "Started GMS device=$dev tag=kv_cache pid=$!"
+done
+wait -n
+echo "A GMS subprocess exited, shutting down"`, strings.Join(devList, " "))
+}
+// gmsReadyCheckCommand returns the exec probe command that verifies the
+// expected number of GMS UDS sockets exist on the shared volume.
+// With 2-tag GMS (weights + kv_cache), there are 2 sockets per GPU.
+func gmsReadyCheckCommand(gpuCount int) []string {
+	return []string{
+		"sh", "-c",
+		fmt.Sprintf("test $(ls %s/gms_*.sock 2>/dev/null | wc -l) -ge %d", gmsSharedMountPath, gpuCount*gmsProcessesPerGPU),
+	}
+}
+func gmsSharedVolume() corev1.Volume {
+	return corev1.Volume{
+		Name: gmsSharedVolumeName,
+		VolumeSource: corev1.VolumeSource{
+			EmptyDir: &corev1.EmptyDirVolumeSource{},
+		},
+	}
+}
+// GMSResourceClaimTemplateName returns the deterministic name for the
+// ResourceClaimTemplate associated with a GMS-enabled component.
+func GMSResourceClaimTemplateName(parentName, serviceName string) string {
+	return fmt.Sprintf("%s-%s-gpu", parentName, strings.ToLower(serviceName))
+}
+// GenerateGMSResourceClaimTemplate builds the ResourceClaimTemplate that
+// provides shared GPU access to all containers in a GMS-enabled pod via DRA.
+//
+// claimTemplateName is the deterministic name for the template; callers should
+// compute it via GMSResourceClaimTemplateName.
+//
+// When GMS is not enabled for the component, it returns the template skeleton
+// with toDelete=true so that SyncResource cleans up any previously created template.
+//
+// The cl parameter is used to verify the DeviceClass exists before creating
+// the template. Pass nil to skip the DeviceClass check.
+func GenerateGMSResourceClaimTemplate(
+	ctx context.Context,
+	cl client.Client,
+	claimTemplateName, namespace string,
+	component *v1alpha1.DynamoComponentDeploymentSharedSpec,
+) (*resourcev1.ResourceClaimTemplate, bool, error) {
+	template := &resourcev1.ResourceClaimTemplate{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      claimTemplateName,
+			Namespace: namespace,
+		},
+	}
+	if !isGMSEnabled(component) {
+		return template, true, nil
+	}
+	gpuCount, err := getGPUCount(component)
+	if err != nil {
+		return nil, false, fmt.Errorf("failed to get GPU count for ResourceClaimTemplate: %w", err)
+	}
+	deviceClassName := getDeviceClassName(component)
+	// Verify the DeviceClass exists before creating the template
+	if cl != nil {
+		dc := &resourcev1.DeviceClass{}
+		if err := cl.Get(ctx, types.NamespacedName{Name: deviceClassName}, dc); err != nil {
+			if apierrors.IsNotFound(err) {
+				return nil, false, fmt.Errorf(
+					"DeviceClass %q not found: ensure the GPU DRA driver is installed and the device class is registered",
+					deviceClassName)
+			}
+			return nil, false, fmt.Errorf("failed to verify DeviceClass %q: %w", deviceClassName, err)
+		}
+	}
+	template.Spec = resourcev1.ResourceClaimTemplateSpec{
+		Spec: resourcev1.ResourceClaimSpec{
+			Devices: resourcev1.DeviceClaim{
+				Requests: []resourcev1.DeviceRequest{
+					{
+						Name: "gpus",
+						Exactly: &resourcev1.ExactDeviceRequest{
+							DeviceClassName: deviceClassName,
+							AllocationMode:  resourcev1.DeviceAllocationModeExactCount,
+							Count:           int64(gpuCount),
+						},
+					},
+				},
+			},
+		},
+	}
+	return template, false, nil
+}
--- a/deploy/operator/internal/dynamo/gms_test.go
+++ b/deploy/operator/internal/dynamo/gms_test.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package dynamo
+import (
+	"context"
+	"strconv"
+	"testing"
+	"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
+	commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"k8s.io/apimachinery/pkg/util/intstr"
+)
+func gmsComponent(gpuCount int) *v1alpha1.DynamoComponentDeploymentSharedSpec {
+	return &v1alpha1.DynamoComponentDeploymentSharedSpec{
+		ComponentType:    commonconsts.ComponentTypeWorker,
+		GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true},
+		Resources: &v1alpha1.Resources{
+			Limits: &v1alpha1.ResourceItem{GPU: strconv.Itoa(gpuCount)},
+		},
+	}
+}
+func gmsBasePodSpec() corev1.PodSpec {
+	httpPort := intstr.FromString("system")
+	return corev1.PodSpec{
+		Containers: []corev1.Container{
+			{
+				Name:    "main",
+				Image:   "test-image:latest",
+				Command: []string{"python3", "-m", "dynamo.vllm"},
+				Env: []corev1.EnvVar{
+					{Name: "DYN_SYSTEM_PORT", Value: "9090"},
+					{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
+				},
+				Ports: []corev1.ContainerPort{
+					{Name: "system", ContainerPort: 9090, Protocol: corev1.ProtocolTCP},
+				},
+				StartupProbe: &corev1.Probe{
+					ProbeHandler: corev1.ProbeHandler{
+						HTTPGet: &corev1.HTTPGetAction{Path: "/health", Port: httpPort},
+					},
+				},
+				Resources: corev1.ResourceRequirements{
+					Limits: corev1.ResourceList{
+						corev1.ResourceName(commonconsts.KubeResourceGPUNvidia): resource.MustParse("2"),
+					},
+				},
+			},
+		},
+	}
+}
+// --- applyGPUMemoryService ---
+func TestApplyGPUMemoryService_EmptyContainers(t *testing.T) {
+	ps := corev1.PodSpec{}
+	err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "at least one container")
+}
+func TestApplyGPUMemoryService_MainContainerTransformed(t *testing.T) {
+	ps := gmsBasePodSpec()
+	err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
+	require.NoError(t, err)
+	main := ps.Containers[0]
+	// GPU resources should be removed
+	gpuResource := corev1.ResourceName(commonconsts.KubeResourceGPUNvidia)
+	_, hasGPU := main.Resources.Limits[gpuResource]
+	assert.False(t, hasGPU, "main container should not have GPU limits")
+	// Should have DRA claim
+	require.Len(t, main.Resources.Claims, 1)
+	assert.Equal(t, gmsDRAClaimName, main.Resources.Claims[0].Name)
+	// Should have shared volume mount
+	var hasSharedMount bool
+	for _, vm := range main.VolumeMounts {
+		if vm.Name == gmsSharedVolumeName && vm.MountPath == gmsSharedMountPath {
+			hasSharedMount = true
+		}
+	}
+	assert.True(t, hasSharedMount, "main container should have gms-shared volume mount")
+	// Should have TMPDIR
+	envMap := envToMap(main.Env)
+	assert.Equal(t, gmsSharedMountPath, envMap["TMPDIR"])
+}
+func TestApplyGPUMemoryService_GMSSidecarInjected(t *testing.T) {
+	ps := gmsBasePodSpec()
+	err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
+	require.NoError(t, err)
+	require.Len(t, ps.InitContainers, 1)
+	gms := ps.InitContainers[0]
+	assert.Equal(t, "gms-weights", gms.Name)
+	assert.Equal(t, "test-image:latest", gms.Image)
+	assert.Equal(t, []string{"bash", "-c"}, gms.Command)
+	assert.Contains(t, gms.Args[0], "gpu_memory_service --device")
+	assert.NotNil(t, gms.RestartPolicy)
+	assert.Equal(t, corev1.ContainerRestartPolicyAlways, *gms.RestartPolicy)
+	// GMS sidecar should have DRA claim
+	require.Len(t, gms.Resources.Claims, 1)
+	assert.Equal(t, gmsDRAClaimName, gms.Resources.Claims[0].Name)
+	// GMS sidecar should have TMPDIR
+	gmsEnv := envToMap(gms.Env)
+	assert.Equal(t, gmsSharedMountPath, gmsEnv["TMPDIR"])
+}
+func TestApplyGPUMemoryService_SharedVolume(t *testing.T) {
+	ps := gmsBasePodSpec()
+	err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
+	require.NoError(t, err)
+	var found bool
+	for _, v := range ps.Volumes {
+		if v.Name == gmsSharedVolumeName {
+			assert.NotNil(t, v.EmptyDir)
+			found = true
+		}
+	}
+	assert.True(t, found, "should have gms-shared volume")
+}
+func TestApplyGPUMemoryService_GPUToleration(t *testing.T) {
+	ps := gmsBasePodSpec()
+	err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
+	require.NoError(t, err)
+	var found bool
+	for _, tol := range ps.Tolerations {
+		if tol.Key == commonconsts.KubeResourceGPUNvidia && tol.Effect == corev1.TaintEffectNoSchedule {
+			assert.Equal(t, corev1.TolerationOpExists, tol.Operator)
+			found = true
+		}
+	}
+	assert.True(t, found, "should have nvidia.com/gpu NoSchedule toleration")
+}
+func TestApplyGPUMemoryService_DRAResourceClaim(t *testing.T) {
+	ps := gmsBasePodSpec()
+	err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
+	require.NoError(t, err)
+	require.Len(t, ps.ResourceClaims, 1)
+	assert.Equal(t, gmsDRAClaimName, ps.ResourceClaims[0].Name)
+	assert.Equal(t, "myapp-worker-gpu", *ps.ResourceClaims[0].ResourceClaimTemplateName)
+}
+func TestApplyGPUMemoryService_PreservesExistingEnv(t *testing.T) {
+	ps := gmsBasePodSpec()
+	err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
+	require.NoError(t, err)
+	main := ps.Containers[0]
+	envMap := envToMap(main.Env)
+	assert.Equal(t, "kubernetes", envMap[commonconsts.DynamoDiscoveryBackendEnvVar])
+	assert.Equal(t, "9090", envMap["DYN_SYSTEM_PORT"])
+}
+func TestApplyGPUMemoryService_SingleContainer(t *testing.T) {
+	ps := gmsBasePodSpec()
+	err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
+	require.NoError(t, err)
+	// Should still have exactly 1 regular container (no duplication)
+	assert.Len(t, ps.Containers, 1)
+	assert.Equal(t, "main", ps.Containers[0].Name)
+}
+// --- GMS sidecar helpers ---
+func TestGmsWrapperScript_TwoTagsPerDevice(t *testing.T) {
+	script := gmsWrapperScript(3)
+	assert.Contains(t, script, "for dev in 0 1 2")
+	assert.Contains(t, script, "--tag weights")
+	assert.Contains(t, script, "--tag kv_cache")
+	assert.Contains(t, script, "trap 'kill 0")
+	assert.Contains(t, script, "wait -n")
+}
+func TestGmsReadyCheckCommand_TwoSocketsPerGPU(t *testing.T) {
+	cmd := gmsReadyCheckCommand(2)
+	assert.Equal(t, "sh", cmd[0])
+	assert.Equal(t, "-c", cmd[1])
+	assert.Contains(t, cmd[2], "gms_*.sock")
+	// 2 GPUs * 2 tags = 4 sockets
+	assert.Contains(t, cmd[2], "-ge 4")
+}
+func TestGmsReadyCheckCommand_SingleGPU(t *testing.T) {
+	cmd := gmsReadyCheckCommand(1)
+	// 1 GPU * 2 tags = 2 sockets
+	assert.Contains(t, cmd[2], "-ge 2")
+}
+// --- GenerateGMSResourceClaimTemplate ---
+func TestGenerateGMSResourceClaimTemplate_Enabled(t *testing.T) {
+	component := gmsComponent(4)
+	tmpl, toDelete, err := GenerateGMSResourceClaimTemplate(context.Background(), nil, "myapp-worker-gpu", "default", component)
+	require.NoError(t, err)
+	assert.False(t, toDelete)
+	assert.Equal(t, "myapp-worker-gpu", tmpl.Name)
+	assert.Equal(t, "default", tmpl.Namespace)
+	require.Len(t, tmpl.Spec.Spec.Devices.Requests, 1)
+	req := tmpl.Spec.Spec.Devices.Requests[0]
+	assert.Equal(t, "gpus", req.Name)
+	require.NotNil(t, req.Exactly)
+	assert.Equal(t, defaultDeviceClassName, req.Exactly.DeviceClassName)
+	assert.Equal(t, int64(4), req.Exactly.Count)
+}
+func TestGenerateGMSResourceClaimTemplate_CustomDeviceClass(t *testing.T) {
+	component := gmsComponent(2)
+	component.GPUMemoryService.DeviceClassName = "gpu.intel.com/xe"
+	tmpl, toDelete, err := GenerateGMSResourceClaimTemplate(context.Background(), nil, "myapp-worker-gpu", "default", component)
+	require.NoError(t, err)
+	assert.False(t, toDelete)
+	assert.Equal(t, "gpu.intel.com/xe", tmpl.Spec.Spec.Devices.Requests[0].Exactly.DeviceClassName)
+}
+func TestGenerateGMSResourceClaimTemplate_DisabledReturnsDelete(t *testing.T) {
+	component := &v1alpha1.DynamoComponentDeploymentSharedSpec{
+		ComponentType: commonconsts.ComponentTypeWorker,
+	}
+	tmpl, toDelete, err := GenerateGMSResourceClaimTemplate(context.Background(), nil, "myapp-worker-gpu", "default", component)
+	require.NoError(t, err)
+	assert.True(t, toDelete)
+	assert.Equal(t, "myapp-worker-gpu", tmpl.Name)
+}
+func TestGenerateGMSResourceClaimTemplate_NoGPUCountError(t *testing.T) {
+	component := &v1alpha1.DynamoComponentDeploymentSharedSpec{
+		ComponentType:    commonconsts.ComponentTypeWorker,
+		GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true},
+	}
+	_, _, err := GenerateGMSResourceClaimTemplate(context.Background(), nil, "myapp-worker-gpu", "default", component)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "resources must be specified")
+}
+// --- GMSResourceClaimTemplateName ---
+func TestGMSResourceClaimTemplateName(t *testing.T) {
+	assert.Equal(t, "myapp-worker-gpu", GMSResourceClaimTemplateName("myapp", "Worker"))
+	assert.Equal(t, "app-vllmdecodeworker-gpu", GMSResourceClaimTemplateName("app", "VllmDecodeWorker"))
+}
+// --- isGMSEnabled ---
+func TestIsGMSEnabled(t *testing.T) {
+	assert.True(t, isGMSEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{
+		GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true},
+	}))
+	assert.False(t, isGMSEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{
+		GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: false},
+	}))
+	assert.False(t, isGMSEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{}))
+}
+// --- getGPUCount ---
+func TestGetGPUCount(t *testing.T) {
+	tests := []struct {
+		name      string
+		component *v1alpha1.DynamoComponentDeploymentSharedSpec
+		want      int
+		wantErr   bool
+	}{
+		{
+			name:      "from limits",
+			component: &v1alpha1.DynamoComponentDeploymentSharedSpec{Resources: &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{GPU: "4"}}},
+			want:      4,
+		},
+		{
+			name:      "from requests",
+			component: &v1alpha1.DynamoComponentDeploymentSharedSpec{Resources: &v1alpha1.Resources{Requests: &v1alpha1.ResourceItem{GPU: "2"}}},
+			want:      2,
+		},
+		{
+			name:      "no resources",
+			component: &v1alpha1.DynamoComponentDeploymentSharedSpec{},
+			wantErr:   true,
+		},
+		{
+			name:      "invalid GPU string",
+			component: &v1alpha1.DynamoComponentDeploymentSharedSpec{Resources: &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{GPU: "abc"}}},
+			wantErr:   true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := getGPUCount(tt.component)
+			if tt.wantErr {
+				assert.Error(t, err)
+			} else {
+				require.NoError(t, err)
+				assert.Equal(t, tt.want, got)
+			}
+		})
+	}
+}
+// --- getDeviceClassName ---
+func TestGetDeviceClassName(t *testing.T) {
+	assert.Equal(t, defaultDeviceClassName, getDeviceClassName(&v1alpha1.DynamoComponentDeploymentSharedSpec{}))
+	assert.Equal(t, defaultDeviceClassName, getDeviceClassName(&v1alpha1.DynamoComponentDeploymentSharedSpec{
+		GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true},
+	}))
+	assert.Equal(t, "gpu.intel.com/xe", getDeviceClassName(&v1alpha1.DynamoComponentDeploymentSharedSpec{
+		GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, DeviceClassName: "gpu.intel.com/xe"},
+	}))
+}
+// helpers
+func envToMap(envs []corev1.EnvVar) map[string]string {
+	m := make(map[string]string, len(envs))
+	for _, e := range envs {
+		m[e.Name] = e.Value
+	}
+	return m
+}
--- a/deploy/operator/internal/dynamo/graph.go
+++ b/deploy/operator/internal/dynamo/graph.go
@@ -1182,6 +1182,14 @@ func GenerateBasePodSpec(
 		}
 	}
+	// Inject GMS sidecar with DRA shared GPU access when GPU memory service is enabled.
+	if isGMSEnabled(component) {
+		claimTemplateName := GMSResourceClaimTemplateName(parentGraphDeploymentName, serviceName)
+		if err := applyGPUMemoryService(&podSpec, component, claimTemplateName); err != nil {
+			return nil, fmt.Errorf("failed to apply GPU memory service: %w", err)
+		}
+	}
 	return &podSpec, nil
 }

--- a/deploy/operator/internal/webhook/validation/shared.go
+++ b/deploy/operator/internal/webhook/validation/shared.go
@@ -20,6 +20,7 @@ package validation
 import (
 	"context"
 	"fmt"
+	"strconv"
 	"strings"
 	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
@@ -128,6 +129,11 @@ func (v *SharedSpecValidator) Validate(ctx context.Context) (admission.Warnings,
 		return nil, err
 	}
+	// Validate GPU memory service configuration
+	if err := v.validateGPUMemoryService(); err != nil {
+		return nil, err
+	}
 	return warnings, nil
 }
@@ -255,6 +261,57 @@ func (v *SharedSpecValidator) validateFrontendSidecar() error {
 	return nil
 }
+// validateGPUMemoryService validates the GPU memory service configuration.
+func (v *SharedSpecValidator) validateGPUMemoryService() error {
+	if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled {
+		return nil
+	}
+	if v.spec.GPUMemoryService.Mode == nvidiacomv1alpha1.GMSModeInterPod {
+		return fmt.Errorf(
+			"%s.gpuMemoryService: mode \"interPod\" is not yet supported",
+			v.fieldPath)
+	}
+	isWorker := v.spec.ComponentType == consts.ComponentTypeWorker ||
+		v.spec.ComponentType == consts.ComponentTypePrefill ||
+		v.spec.ComponentType == consts.ComponentTypeDecode
+	if !isWorker {
+		return fmt.Errorf(
+			"%s.gpuMemoryService: GPU memory service is only supported for worker components (componentType must be worker, prefill, or decode)",
+			v.fieldPath)
+	}
+	if v.spec.Resources == nil {
+		return fmt.Errorf(
+			"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
+			v.fieldPath)
+	}
+	gpuStr := ""
+	switch {
+	case v.spec.Resources.Limits != nil && v.spec.Resources.Limits.GPU != "":
+		gpuStr = v.spec.Resources.Limits.GPU
+	case v.spec.Resources.Requests != nil && v.spec.Resources.Requests.GPU != "":
+		gpuStr = v.spec.Resources.Requests.GPU
+	}
+	if gpuStr == "" {
+		return fmt.Errorf(
+			"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
+			v.fieldPath)
+	}
+	gpuCount, err := strconv.Atoi(gpuStr)
+	if err != nil || gpuCount < 1 {
+		return fmt.Errorf(
+			"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
+			v.fieldPath)
+	}
+	return nil
+}
 // validateServiceAnnotations validates known annotations on the service-level spec.
 func (v *SharedSpecValidator) validateServiceAnnotations() error {
 	if v.spec.Annotations == nil {

--- a/docs/kubernetes/api-reference.md
+++ b/docs/kubernetes/api-reference.md
@@ -402,6 +402,7 @@ _Appears in:_
 | `frontendSidecar` _[FrontendSidecarSpec](#frontendsidecarspec)_ | FrontendSidecar configures an auto-generated frontend sidecar container.<br />When specified, the operator injects a fully configured frontend container<br />with all standard Dynamo environment variables, health probes, and ports.<br />This eliminates the need to manually specify these in extraPodSpec.containers. (GAIE) |  | Optional: \{\} <br /> |
 | `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from a checkpoint files for faster cold start. |  | Optional: \{\} <br /> |
 | `topologyConstraint` _[TopologyConstraint](#topologyconstraint)_ | TopologyConstraint for this service. packDomain is required.<br />When both this and spec.topologyConstraint.packDomain are set, packDomain<br />must be narrower than or equal to the spec-level packDomain. |  | Optional: \{\} <br /> |
+| `gpuMemoryService` _[GPUMemoryServiceSpec](#gpumemoryservicespec)_ | GPUMemoryService configures the GPU Memory Service (GMS) sidecar.<br />When enabled, a GMS sidecar is injected and GPU access is managed via DRA. |  | Optional: \{\} <br /> |
 #### DynamoComponentDeploymentSpec
@@ -444,6 +445,7 @@ _Appears in:_
 | `frontendSidecar` _[FrontendSidecarSpec](#frontendsidecarspec)_ | FrontendSidecar configures an auto-generated frontend sidecar container.<br />When specified, the operator injects a fully configured frontend container<br />with all standard Dynamo environment variables, health probes, and ports.<br />This eliminates the need to manually specify these in extraPodSpec.containers. (GAIE) |  | Optional: \{\} <br /> |
 | `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from a checkpoint files for faster cold start. |  | Optional: \{\} <br /> |
 | `topologyConstraint` _[TopologyConstraint](#topologyconstraint)_ | TopologyConstraint for this service. packDomain is required.<br />When both this and spec.topologyConstraint.packDomain are set, packDomain<br />must be narrower than or equal to the spec-level packDomain. |  | Optional: \{\} <br /> |
+| `gpuMemoryService` _[GPUMemoryServiceSpec](#gpumemoryservicespec)_ | GPUMemoryService configures the GPU Memory Service (GMS) sidecar.<br />When enabled, a GMS sidecar is injected and GPU access is managed via DRA. |  | Optional: \{\} <br /> |
 #### DynamoGraphDeployment
@@ -821,6 +823,45 @@ _Appears in:_
 | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables for the frontend sidecar.<br />These are merged with (and can override) the auto-generated Dynamo env vars. |  | Optional: \{\} <br /> |
+#### GPUMemoryServiceMode
+_Underlying type:_ _string_
+GPUMemoryServiceMode selects the GMS deployment topology.
+_Appears in:_
+- [GPUMemoryServiceSpec](#gpumemoryservicespec)
+| Field | Description |
+| --- | --- |
+| `intraPod` | GMSModeIntraPod runs GMS as a sidecar within the same pod.<br /> |
+| `interPod` | GMSModeInterPod runs GMS as a separate pod (not yet supported).<br /> |
+#### GPUMemoryServiceSpec
+GPUMemoryServiceSpec configures the GPU Memory Service (GMS) sidecar for a worker component.
+When enabled, the operator injects a GMS sidecar that provides shared GPU memory access
+via DRA (Dynamic Resource Allocation). The sidecar runs two GMS processes per GPU
+(weights + kv_cache) and communicates with the main container over UDS sockets.
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `enabled` _boolean_ | Enabled activates the GMS sidecar. GPU resources on the main container<br />are replaced with a DRA ResourceClaim for shared GPU access. |  |  |
+| `mode` _[GPUMemoryServiceMode](#gpumemoryservicemode)_ | Mode selects the GMS deployment topology. | intraPod | Enum: [intraPod interPod] <br />Optional: \{\} <br /> |
+| `deviceClassName` _string_ | DeviceClassName is the DRA DeviceClass to request GPUs from. | gpu.nvidia.com | Optional: \{\} <br /> |
 #### IngressSpec

--- a/examples/backends/vllm/deploy/agg_gms.yaml
+++ b/examples/backends/vllm/deploy/agg_gms.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# GPU Memory Service (GMS) sidecar example.
+#
+# The operator injects a GMS sidecar init container that provides shared GPU
+# memory access via DRA (Dynamic Resource Allocation). The sidecar runs two GMS
+# processes per GPU (weights + kv_cache) and communicates with the main container
+# over UDS sockets on a shared emptyDir volume.
+#
+# Requires Kubernetes 1.32+ with DRA enabled and the NVIDIA GPU DRA driver installed.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: vllm-agg-gms
+spec:
+  services:
+    Frontend:
+      envFromSecret: hf-token-secret
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+    VllmWorker:
+      envFromSecret: hf-token-secret
+      componentType: worker
+      replicas: 1
+      resources:
+        limits:
+          gpu: "1"
+        requests:
+          custom:
+            ephemeral-storage: "2Gi"
+      gpuMemoryService:
+        enabled: true
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
+          workingDir: /workspace/examples/backends/vllm
+          command:
+            - python3
+            - -m
+            - dynamo.vllm
+          args:
+            - --model
+            - Qwen/Qwen3-0.6B
+            - --load-format
+            - gms