Unverified Commit 47897f36 authored by mohammedabdulwahhab's avatar mohammedabdulwahhab Committed by GitHub
Browse files

feat: add gpu memory service API on engine services (#8148)


Signed-off-by: default avatarmohammedabdulwahhab <furkhan324@berkeley.edu>
Co-authored-by: default avatarClaude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 617d55c0
...@@ -10727,6 +10727,30 @@ spec: ...@@ -10727,6 +10727,30 @@ spec:
globalDynamoNamespace: globalDynamoNamespace:
description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace
type: boolean type: boolean
gpuMemoryService:
description: |-
GPUMemoryService configures the GPU Memory Service (GMS) sidecar.
When enabled, a GMS sidecar is injected and GPU access is managed via DRA.
properties:
deviceClassName:
default: gpu.nvidia.com
description: DeviceClassName is the DRA DeviceClass to request GPUs from.
type: string
enabled:
description: |-
Enabled activates the GMS sidecar. GPU resources on the main container
are replaced with a DRA ResourceClaim for shared GPU access.
type: boolean
mode:
default: intraPod
description: Mode selects the GMS deployment topology.
enum:
- intraPod
- interPod
type: string
required:
- enabled
type: object
ingress: ingress:
description: Ingress config to expose the component outside the cluster (or through a service mesh). description: Ingress config to expose the component outside the cluster (or through a service mesh).
properties: properties:
......
...@@ -10950,6 +10950,30 @@ spec: ...@@ -10950,6 +10950,30 @@ spec:
globalDynamoNamespace: globalDynamoNamespace:
description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace
type: boolean type: boolean
gpuMemoryService:
description: |-
GPUMemoryService configures the GPU Memory Service (GMS) sidecar.
When enabled, a GMS sidecar is injected and GPU access is managed via DRA.
properties:
deviceClassName:
default: gpu.nvidia.com
description: DeviceClassName is the DRA DeviceClass to request GPUs from.
type: string
enabled:
description: |-
Enabled activates the GMS sidecar. GPU resources on the main container
are replaced with a DRA ResourceClaim for shared GPU access.
type: boolean
mode:
default: intraPod
description: Mode selects the GMS deployment topology.
enum:
- intraPod
- interPod
type: string
required:
- enabled
type: object
ingress: ingress:
description: Ingress config to expose the component outside the cluster (or through a service mesh). description: Ingress config to expose the component outside the cluster (or through a service mesh).
properties: properties:
......
...@@ -356,6 +356,18 @@ rules: ...@@ -356,6 +356,18 @@ rules:
- patch - patch
- update - update
- watch - watch
- apiGroups:
- resource.k8s.io
resources:
- resourceclaimtemplates
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups: - apiGroups:
- inference.networking.k8s.io - inference.networking.k8s.io
resources: resources:
...@@ -585,6 +597,46 @@ roleRef: ...@@ -585,6 +597,46 @@ roleRef:
kind: ClusterRole kind: ClusterRole
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-queue-reader name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-queue-reader
subjects: subjects:
- kind: ServiceAccount
name: '{{ include "dynamo-operator.fullname" . }}-controller-manager'
namespace: '{{ .Release.Namespace }}'
---
# ClusterRole for DRA DeviceClass access
# This is always a ClusterRole since DeviceClass resources are cluster-scoped
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-deviceclass-reader
labels:
app.kubernetes.io/component: rbac
app.kubernetes.io/created-by: dynamo-operator
app.kubernetes.io/part-of: dynamo-operator
{{- include "dynamo-operator.labels" . | nindent 4 }}
rules:
- apiGroups:
- resource.k8s.io
resources:
- deviceclasses
verbs:
- get
- list
- watch
---
# ClusterRoleBinding for DRA DeviceClass access
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-deviceclass-reader-binding
labels:
app.kubernetes.io/component: rbac
app.kubernetes.io/created-by: dynamo-operator
app.kubernetes.io/part-of: dynamo-operator
{{- include "dynamo-operator.labels" . | nindent 4 }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-deviceclass-reader
subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: '{{ include "dynamo-operator.fullname" . }}-controller-manager' name: '{{ include "dynamo-operator.fullname" . }}-controller-manager'
namespace: '{{ .Release.Namespace }}' namespace: '{{ .Release.Namespace }}'
\ No newline at end of file
...@@ -155,6 +155,35 @@ func (e ExtraPodSpec) MarshalJSON() ([]byte, error) { ...@@ -155,6 +155,35 @@ func (e ExtraPodSpec) MarshalJSON() ([]byte, error) {
return json.Marshal(aux) return json.Marshal(aux)
} }
// GPUMemoryServiceMode selects the GMS deployment topology.
type GPUMemoryServiceMode string
const (
// GMSModeIntraPod runs GMS as a sidecar within the same pod.
GMSModeIntraPod GPUMemoryServiceMode = "intraPod"
// GMSModeInterPod runs GMS as a separate pod (not yet supported).
GMSModeInterPod GPUMemoryServiceMode = "interPod"
)
// GPUMemoryServiceSpec configures the GPU Memory Service (GMS) sidecar for a worker component.
// When enabled, the operator injects a GMS sidecar that provides shared GPU memory access
// via DRA (Dynamic Resource Allocation). The sidecar runs two GMS processes per GPU
// (weights + kv_cache) and communicates with the main container over UDS sockets.
type GPUMemoryServiceSpec struct {
// Enabled activates the GMS sidecar. GPU resources on the main container
// are replaced with a DRA ResourceClaim for shared GPU access.
Enabled bool `json:"enabled"`
// Mode selects the GMS deployment topology.
// +kubebuilder:default=intraPod
// +kubebuilder:validation:Enum=intraPod;interPod
// +optional
Mode GPUMemoryServiceMode `json:"mode,omitempty"`
// DeviceClassName is the DRA DeviceClass to request GPUs from.
// +kubebuilder:default="gpu.nvidia.com"
// +optional
DeviceClassName string `json:"deviceClassName,omitempty"`
}
// ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter // ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter
// for replica management. When enabled, the DGDSA owns the replicas field and // for replica management. When enabled, the DGDSA owns the replicas field and
// external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource. // external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource.
......
...@@ -149,6 +149,11 @@ type DynamoComponentDeploymentSharedSpec struct { ...@@ -149,6 +149,11 @@ type DynamoComponentDeploymentSharedSpec struct {
// must be narrower than or equal to the spec-level packDomain. // must be narrower than or equal to the spec-level packDomain.
// +optional // +optional
TopologyConstraint *TopologyConstraint `json:"topologyConstraint,omitempty"` TopologyConstraint *TopologyConstraint `json:"topologyConstraint,omitempty"`
// GPUMemoryService configures the GPU Memory Service (GMS) sidecar.
// When enabled, a GMS sidecar is injected and GPU access is managed via DRA.
// +optional
GPUMemoryService *GPUMemoryServiceSpec `json:"gpuMemoryService,omitempty"`
} }
type MultinodeSpec struct { type MultinodeSpec struct {
......
...@@ -557,6 +557,11 @@ func (in *DynamoComponentDeploymentSharedSpec) DeepCopyInto(out *DynamoComponent ...@@ -557,6 +557,11 @@ func (in *DynamoComponentDeploymentSharedSpec) DeepCopyInto(out *DynamoComponent
*out = new(TopologyConstraint) *out = new(TopologyConstraint)
**out = **in **out = **in
} }
if in.GPUMemoryService != nil {
in, out := &in.GPUMemoryService, &out.GPUMemoryService
*out = new(GPUMemoryServiceSpec)
**out = **in
}
} }
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoComponentDeploymentSharedSpec. // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DynamoComponentDeploymentSharedSpec.
...@@ -1251,6 +1256,21 @@ func (in *FrontendSidecarSpec) DeepCopy() *FrontendSidecarSpec { ...@@ -1251,6 +1256,21 @@ func (in *FrontendSidecarSpec) DeepCopy() *FrontendSidecarSpec {
return out return out
} }
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *GPUMemoryServiceSpec) DeepCopyInto(out *GPUMemoryServiceSpec) {
*out = *in
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GPUMemoryServiceSpec.
func (in *GPUMemoryServiceSpec) DeepCopy() *GPUMemoryServiceSpec {
if in == nil {
return nil
}
out := new(GPUMemoryServiceSpec)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *IngressSpec) DeepCopyInto(out *IngressSpec) { func (in *IngressSpec) DeepCopyInto(out *IngressSpec) {
*out = *in *out = *in
......
...@@ -444,11 +444,15 @@ func main() { ...@@ -444,11 +444,15 @@ func main() {
runtimeConfig.KaiSchedulerEnabled = false runtimeConfig.KaiSchedulerEnabled = false
} }
setupLog.Info("Detecting DRA (Dynamic Resource Allocation) availability...")
runtimeConfig.DRAEnabled = commonController.DetectDRAAvailability(mainCtx, mgr)
setupLog.Info("Detected orchestrators availability", setupLog.Info("Detected orchestrators availability",
"grove", runtimeConfig.GroveEnabled, "grove", runtimeConfig.GroveEnabled,
"lws", runtimeConfig.LWSEnabled, "lws", runtimeConfig.LWSEnabled,
"volcano", volcanoDetected, "volcano", volcanoDetected,
"kai-scheduler", runtimeConfig.KaiSchedulerEnabled, "kai-scheduler", runtimeConfig.KaiSchedulerEnabled,
"dra", runtimeConfig.DRAEnabled,
) )
dockerSecretRetriever := secrets.NewDockerSecretIndexer(mgr.GetClient()) dockerSecretRetriever := secrets.NewDockerSecretIndexer(mgr.GetClient())
......
...@@ -10727,6 +10727,30 @@ spec: ...@@ -10727,6 +10727,30 @@ spec:
globalDynamoNamespace: globalDynamoNamespace:
description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace
type: boolean type: boolean
gpuMemoryService:
description: |-
GPUMemoryService configures the GPU Memory Service (GMS) sidecar.
When enabled, a GMS sidecar is injected and GPU access is managed via DRA.
properties:
deviceClassName:
default: gpu.nvidia.com
description: DeviceClassName is the DRA DeviceClass to request GPUs from.
type: string
enabled:
description: |-
Enabled activates the GMS sidecar. GPU resources on the main container
are replaced with a DRA ResourceClaim for shared GPU access.
type: boolean
mode:
default: intraPod
description: Mode selects the GMS deployment topology.
enum:
- intraPod
- interPod
type: string
required:
- enabled
type: object
ingress: ingress:
description: Ingress config to expose the component outside the cluster (or through a service mesh). description: Ingress config to expose the component outside the cluster (or through a service mesh).
properties: properties:
......
...@@ -10950,6 +10950,30 @@ spec: ...@@ -10950,6 +10950,30 @@ spec:
globalDynamoNamespace: globalDynamoNamespace:
description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace description: GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace
type: boolean type: boolean
gpuMemoryService:
description: |-
GPUMemoryService configures the GPU Memory Service (GMS) sidecar.
When enabled, a GMS sidecar is injected and GPU access is managed via DRA.
properties:
deviceClassName:
default: gpu.nvidia.com
description: DeviceClassName is the DRA DeviceClass to request GPUs from.
type: string
enabled:
description: |-
Enabled activates the GMS sidecar. GPU resources on the main container
are replaced with a DRA ResourceClaim for shared GPU access.
type: boolean
mode:
default: intraPod
description: Mode selects the GMS deployment topology.
enum:
- intraPod
- interPod
type: string
required:
- enabled
type: object
ingress: ingress:
description: Ingress config to expose the component outside the cluster (or through a service mesh). description: Ingress config to expose the component outside the cluster (or through a service mesh).
properties: properties:
......
...@@ -244,6 +244,26 @@ rules: ...@@ -244,6 +244,26 @@ rules:
- get - get
- patch - patch
- update - update
- apiGroups:
- resource.k8s.io
resources:
- deviceclasses
verbs:
- get
- list
- watch
- apiGroups:
- resource.k8s.io
resources:
- resourceclaimtemplates
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups: - apiGroups:
- scheduling.run.ai - scheduling.run.ai
resources: resources:
......
...@@ -30,6 +30,7 @@ import ( ...@@ -30,6 +30,7 @@ import (
autoscalingv2 "k8s.io/api/autoscaling/v2" autoscalingv2 "k8s.io/api/autoscaling/v2"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1" networkingv1 "k8s.io/api/networking/v1"
resourcev1 "k8s.io/api/resource/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"emperror.dev/errors" "emperror.dev/errors"
...@@ -183,6 +184,23 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req ...@@ -183,6 +184,23 @@ func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req
} }
} }
// Sync GMS ResourceClaimTemplate before creating workload resources
if r.RuntimeConfig.DRAEnabled {
serviceName := dynamoComponentDeployment.Spec.ServiceName
if serviceName == "" {
serviceName = dynamoComponentDeployment.Name
}
claimTemplateName := dynamo.GMSResourceClaimTemplateName(dynamoComponentDeployment.GetParentGraphDeploymentName(), serviceName)
_, _, err = commonController.SyncResource(ctx, r, dynamoComponentDeployment, func(ctx context.Context) (*resourcev1.ResourceClaimTemplate, bool, error) {
return dynamo.GenerateGMSResourceClaimTemplate(ctx, r.Client, claimTemplateName, dynamoComponentDeployment.Namespace, &dynamoComponentDeployment.Spec.DynamoComponentDeploymentSharedSpec)
})
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to sync GMS ResourceClaimTemplate: %w", err)
}
} else if dynamoComponentDeployment.Spec.GPUMemoryService != nil && dynamoComponentDeployment.Spec.GPUMemoryService.Enabled {
return ctrl.Result{}, fmt.Errorf("gpuMemoryService requires DRA (Dynamic Resource Allocation), but the resource.k8s.io API group is not available on this cluster (requires Kubernetes 1.32+)")
}
// Create the appropriate workload resource based on deployment type // Create the appropriate workload resource based on deployment type
var componentReconcileResult ComponentReconcileResult var componentReconcileResult ComponentReconcileResult
if r.RuntimeConfig.LWSEnabled && dynamoComponentDeployment.IsMultinode() { if r.RuntimeConfig.LWSEnabled && dynamoComponentDeployment.IsMultinode() {
......
...@@ -35,6 +35,7 @@ import ( ...@@ -35,6 +35,7 @@ import (
networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1" networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
networkingv1 "k8s.io/api/networking/v1" networkingv1 "k8s.io/api/networking/v1"
resourcev1 "k8s.io/api/resource/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
...@@ -90,6 +91,8 @@ type DynamoGraphDeploymentReconciler struct { ...@@ -90,6 +91,8 @@ type DynamoGraphDeploymentReconciler struct {
// +kubebuilder:rbac:groups=grove.io,resources=clustertopologies,verbs=get;list;watch // +kubebuilder:rbac:groups=grove.io,resources=clustertopologies,verbs=get;list;watch
// +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list // +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list
// +kubebuilder:rbac:groups=inference.networking.k8s.io,resources=inferencepools,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=inference.networking.k8s.io,resources=inferencepools,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=resource.k8s.io,resources=deviceclasses,verbs=get;list;watch
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
// +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch // +kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch
...@@ -658,6 +661,27 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont ...@@ -658,6 +661,27 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (ReconcileResult, error) { func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (ReconcileResult, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
// Sync ResourceClaimTemplates for GMS-enabled components before creating pods.
if r.RuntimeConfig.DRAEnabled {
for serviceName, component := range dynamoDeployment.Spec.Services {
svcComponent := component
claimTemplateName := dynamo.GMSResourceClaimTemplateName(dynamoDeployment.Name, serviceName)
_, _, err := commoncontroller.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*resourcev1.ResourceClaimTemplate, bool, error) {
return dynamo.GenerateGMSResourceClaimTemplate(ctx, r.Client, claimTemplateName, dynamoDeployment.Namespace, svcComponent)
})
if err != nil {
logger.Error(err, "failed to sync GMS ResourceClaimTemplate", "service", serviceName)
return ReconcileResult{}, fmt.Errorf("failed to sync GMS ResourceClaimTemplate for %s: %w", serviceName, err)
}
}
} else {
for _, component := range dynamoDeployment.Spec.Services {
if component.GPUMemoryService != nil && component.GPUMemoryService.Enabled {
return ReconcileResult{}, fmt.Errorf("gpuMemoryService requires DRA (Dynamic Resource Allocation), but the resource.k8s.io API group is not available on this cluster (requires Kubernetes 1.32+)")
}
}
}
grovePodCliqueSetAsResource, err := r.reconcileGrovePodCliqueSet(ctx, dynamoDeployment, restartState, checkpointInfos) grovePodCliqueSetAsResource, err := r.reconcileGrovePodCliqueSet(ctx, dynamoDeployment, restartState, checkpointInfos)
if err != nil { if err != nil {
logger.Error(err, "failed to reconcile the Grove PodClique Set") logger.Error(err, "failed to reconcile the Grove PodClique Set")
......
...@@ -62,6 +62,12 @@ func DetectInferencePoolAvailability(ctx context.Context, mgr ctrl.Manager) bool ...@@ -62,6 +62,12 @@ func DetectInferencePoolAvailability(ctx context.Context, mgr ctrl.Manager) bool
return detectAPIGroupAvailability(ctx, mgr, "inference.networking.k8s.io") return detectAPIGroupAvailability(ctx, mgr, "inference.networking.k8s.io")
} }
// DetectDRAAvailability checks if Dynamic Resource Allocation is available
// by checking if the resource.k8s.io API group is registered (Kubernetes 1.32+)
func DetectDRAAvailability(ctx context.Context, mgr ctrl.Manager) bool {
return detectAPIGroupAvailability(ctx, mgr, "resource.k8s.io")
}
// detectAPIGroupAvailability checks if a specific API group is registered in the cluster // detectAPIGroupAvailability checks if a specific API group is registered in the cluster
func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName string) bool { func detectAPIGroupAvailability(ctx context.Context, mgr ctrl.Manager, groupName string) bool {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
......
...@@ -26,6 +26,8 @@ type RuntimeConfig struct { ...@@ -26,6 +26,8 @@ type RuntimeConfig struct {
LWSEnabled bool LWSEnabled bool
// KaiSchedulerEnabled is the resolved Kai-scheduler availability (config override merged with auto-detection) // KaiSchedulerEnabled is the resolved Kai-scheduler availability (config override merged with auto-detection)
KaiSchedulerEnabled bool KaiSchedulerEnabled bool
// DRAEnabled indicates whether Dynamic Resource Allocation (resource.k8s.io) is available
DRAEnabled bool
// ExcludedNamespaces for cluster-wide mode namespace filtering // ExcludedNamespaces for cluster-wide mode namespace filtering
ExcludedNamespaces ExcludedNamespacesInterface ExcludedNamespaces ExcludedNamespacesInterface
} }
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package dynamo
import (
"context"
"fmt"
"strconv"
"strings"
"time"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
resourcev1 "k8s.io/api/resource/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
)
const (
gmsSharedVolumeName = "gms-shared"
gmsSharedMountPath = "/shared"
gmsDRAClaimName = "shared-gpu"
defaultDeviceClassName = "gpu.nvidia.com"
gmsProcessesPerGPU = 2
gmsStartupProbeTimeout = 2 * time.Minute
gmsStartupProbePeriodSec = 2
)
func isGMSEnabled(component *v1alpha1.DynamoComponentDeploymentSharedSpec) bool {
return component.GPUMemoryService != nil && component.GPUMemoryService.Enabled
}
// getGPUCount extracts the GPU count from the component resource spec.
func getGPUCount(component *v1alpha1.DynamoComponentDeploymentSharedSpec) (int, error) {
if component.Resources == nil {
return 0, fmt.Errorf("resources must be specified when GPU memory service is enabled")
}
gpuStr := ""
if component.Resources.Limits != nil && component.Resources.Limits.GPU != "" {
gpuStr = component.Resources.Limits.GPU
} else if component.Resources.Requests != nil && component.Resources.Requests.GPU != "" {
gpuStr = component.Resources.Requests.GPU
}
if gpuStr == "" {
return 0, fmt.Errorf("GPU count must be specified when GPU memory service is enabled")
}
count, err := strconv.Atoi(gpuStr)
if err != nil {
return 0, fmt.Errorf("invalid GPU count %q: %w", gpuStr, err)
}
return count, nil
}
// getDeviceClassName returns the DRA DeviceClass name for the component.
// It reads from GPUMemoryServiceSpec.DeviceClassName, falling back to the default.
func getDeviceClassName(component *v1alpha1.DynamoComponentDeploymentSharedSpec) string {
if component.GPUMemoryService != nil && component.GPUMemoryService.DeviceClassName != "" {
return component.GPUMemoryService.DeviceClassName
}
return defaultDeviceClassName
}
// applyGPUMemoryService transforms a pod spec to include a GMS sidecar with
// DRA shared GPU access. The main container's GPU resources are replaced with
// a DRA ResourceClaim, and a GMS init container is added.
//
// claimTemplateName is the name of the ResourceClaimTemplate that will provide
// shared GPU access; callers should compute it via GMSResourceClaimTemplateName.
func applyGPUMemoryService(
podSpec *corev1.PodSpec,
component *v1alpha1.DynamoComponentDeploymentSharedSpec,
claimTemplateName string,
) error {
if len(podSpec.Containers) == 0 {
return fmt.Errorf("pod spec must have at least one container for GPU memory service")
}
gpuCount, err := getGPUCount(component)
if err != nil {
return err
}
mainContainer := &podSpec.Containers[0]
// Replace GPU resources with DRA claim on main container
removeGPUResources(mainContainer)
mainContainer.Resources.Claims = append(mainContainer.Resources.Claims, corev1.ResourceClaim{
Name: gmsDRAClaimName,
})
// Add shared volume mount and TMPDIR to main container
mainContainer.VolumeMounts = append(mainContainer.VolumeMounts, corev1.VolumeMount{
Name: gmsSharedVolumeName,
MountPath: gmsSharedMountPath,
})
mainContainer.Env = append(mainContainer.Env, corev1.EnvVar{
Name: "TMPDIR", Value: gmsSharedMountPath,
})
// Add GMS sidecar
gmsSidecar := buildGMSSidecar(mainContainer.Image, gpuCount)
podSpec.InitContainers = append(podSpec.InitContainers, gmsSidecar)
// Add shared volume
podSpec.Volumes = append(podSpec.Volumes, gmsSharedVolume())
// GPU nodes are typically tainted with nvidia.com/gpu=NoSchedule. With
// traditional scheduling the device-plugin injects the matching toleration,
// but DRA bypasses that path. Re-add the toleration explicitly so the pod
// can schedule on GPU nodes.
podSpec.Tolerations = append(podSpec.Tolerations, corev1.Toleration{
Key: commonconsts.KubeResourceGPUNvidia,
Operator: corev1.TolerationOpExists,
Effect: corev1.TaintEffectNoSchedule,
})
// Add pod-level DRA resource claim referencing the ResourceClaimTemplate
podSpec.ResourceClaims = append(podSpec.ResourceClaims, corev1.PodResourceClaim{
Name: gmsDRAClaimName,
ResourceClaimTemplateName: &claimTemplateName,
})
return nil
}
// removeGPUResources strips nvidia.com/gpu from container resource limits and requests.
// GPU allocation is handled by DRA when GMS is enabled.
func removeGPUResources(container *corev1.Container) {
gpuResource := corev1.ResourceName(commonconsts.KubeResourceGPUNvidia)
if container.Resources.Limits != nil {
delete(container.Resources.Limits, gpuResource)
}
if container.Resources.Requests != nil {
delete(container.Resources.Requests, gpuResource)
}
}
// buildGMSSidecar creates the GMS weight server as a sidecar init container
// (restartPolicy: Always). kubelet starts it before regular containers and
// keeps it running for the pod's lifetime.
//
// Each GPU gets two GMS subprocesses (weights + kv_cache) via a bash wrapper
// that forwards signals and exits if any child dies. TMPDIR is set so
// UUID-based sockets land in the shared volume.
func buildGMSSidecar(image string, gpuCount int) corev1.Container {
return corev1.Container{
Name: "gms-weights",
Image: image,
Command: []string{"bash", "-c"},
Args: []string{gmsWrapperScript(gpuCount)},
RestartPolicy: ptr.To(corev1.ContainerRestartPolicyAlways),
Env: []corev1.EnvVar{
{Name: "TMPDIR", Value: gmsSharedMountPath},
},
VolumeMounts: []corev1.VolumeMount{
{
Name: gmsSharedVolumeName,
MountPath: gmsSharedMountPath,
},
},
StartupProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
Exec: &corev1.ExecAction{
Command: gmsReadyCheckCommand(gpuCount),
},
},
PeriodSeconds: int32(gmsStartupProbePeriodSec),
FailureThreshold: int32(gmsStartupProbeTimeout/time.Second) / int32(gmsStartupProbePeriodSec),
},
Resources: corev1.ResourceRequirements{
Claims: []corev1.ResourceClaim{
{Name: gmsDRAClaimName},
},
},
}
}
// gmsWrapperScript generates a bash script that launches two GMS subprocesses
// per GPU device (one for weights, one for kv_cache), waits for any to exit,
// then tears down the process group.
func gmsWrapperScript(gpuCount int) string {
devList := make([]string, gpuCount)
for i := range gpuCount {
devList[i] = strconv.Itoa(i)
}
return fmt.Sprintf(
`trap 'kill 0 2>/dev/null || true' EXIT
for dev in %s; do
python3 -m gpu_memory_service --device "$dev" --tag weights &
echo "Started GMS device=$dev tag=weights pid=$!"
python3 -m gpu_memory_service --device "$dev" --tag kv_cache &
echo "Started GMS device=$dev tag=kv_cache pid=$!"
done
wait -n
echo "A GMS subprocess exited, shutting down"`, strings.Join(devList, " "))
}
// gmsReadyCheckCommand returns the exec probe command that verifies the
// expected number of GMS UDS sockets exist on the shared volume.
// With 2-tag GMS (weights + kv_cache), there are 2 sockets per GPU.
func gmsReadyCheckCommand(gpuCount int) []string {
return []string{
"sh", "-c",
fmt.Sprintf("test $(ls %s/gms_*.sock 2>/dev/null | wc -l) -ge %d", gmsSharedMountPath, gpuCount*gmsProcessesPerGPU),
}
}
func gmsSharedVolume() corev1.Volume {
return corev1.Volume{
Name: gmsSharedVolumeName,
VolumeSource: corev1.VolumeSource{
EmptyDir: &corev1.EmptyDirVolumeSource{},
},
}
}
// GMSResourceClaimTemplateName returns the deterministic name for the
// ResourceClaimTemplate associated with a GMS-enabled component.
func GMSResourceClaimTemplateName(parentName, serviceName string) string {
return fmt.Sprintf("%s-%s-gpu", parentName, strings.ToLower(serviceName))
}
// GenerateGMSResourceClaimTemplate builds the ResourceClaimTemplate that
// provides shared GPU access to all containers in a GMS-enabled pod via DRA.
//
// claimTemplateName is the deterministic name for the template; callers should
// compute it via GMSResourceClaimTemplateName.
//
// When GMS is not enabled for the component, it returns the template skeleton
// with toDelete=true so that SyncResource cleans up any previously created template.
//
// The cl parameter is used to verify the DeviceClass exists before creating
// the template. Pass nil to skip the DeviceClass check.
func GenerateGMSResourceClaimTemplate(
ctx context.Context,
cl client.Client,
claimTemplateName, namespace string,
component *v1alpha1.DynamoComponentDeploymentSharedSpec,
) (*resourcev1.ResourceClaimTemplate, bool, error) {
template := &resourcev1.ResourceClaimTemplate{
ObjectMeta: metav1.ObjectMeta{
Name: claimTemplateName,
Namespace: namespace,
},
}
if !isGMSEnabled(component) {
return template, true, nil
}
gpuCount, err := getGPUCount(component)
if err != nil {
return nil, false, fmt.Errorf("failed to get GPU count for ResourceClaimTemplate: %w", err)
}
deviceClassName := getDeviceClassName(component)
// Verify the DeviceClass exists before creating the template
if cl != nil {
dc := &resourcev1.DeviceClass{}
if err := cl.Get(ctx, types.NamespacedName{Name: deviceClassName}, dc); err != nil {
if apierrors.IsNotFound(err) {
return nil, false, fmt.Errorf(
"DeviceClass %q not found: ensure the GPU DRA driver is installed and the device class is registered",
deviceClassName)
}
return nil, false, fmt.Errorf("failed to verify DeviceClass %q: %w", deviceClassName, err)
}
}
template.Spec = resourcev1.ResourceClaimTemplateSpec{
Spec: resourcev1.ResourceClaimSpec{
Devices: resourcev1.DeviceClaim{
Requests: []resourcev1.DeviceRequest{
{
Name: "gpus",
Exactly: &resourcev1.ExactDeviceRequest{
DeviceClassName: deviceClassName,
AllocationMode: resourcev1.DeviceAllocationModeExactCount,
Count: int64(gpuCount),
},
},
},
},
},
}
return template, false, nil
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package dynamo
import (
"context"
"strconv"
"testing"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/intstr"
)
func gmsComponent(gpuCount int) *v1alpha1.DynamoComponentDeploymentSharedSpec {
return &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true},
Resources: &v1alpha1.Resources{
Limits: &v1alpha1.ResourceItem{GPU: strconv.Itoa(gpuCount)},
},
}
}
func gmsBasePodSpec() corev1.PodSpec {
httpPort := intstr.FromString("system")
return corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "main",
Image: "test-image:latest",
Command: []string{"python3", "-m", "dynamo.vllm"},
Env: []corev1.EnvVar{
{Name: "DYN_SYSTEM_PORT", Value: "9090"},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
},
Ports: []corev1.ContainerPort{
{Name: "system", ContainerPort: 9090, Protocol: corev1.ProtocolTCP},
},
StartupProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{Path: "/health", Port: httpPort},
},
},
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceName(commonconsts.KubeResourceGPUNvidia): resource.MustParse("2"),
},
},
},
},
}
}
// --- applyGPUMemoryService ---
func TestApplyGPUMemoryService_EmptyContainers(t *testing.T) {
ps := corev1.PodSpec{}
err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
require.Error(t, err)
assert.Contains(t, err.Error(), "at least one container")
}
func TestApplyGPUMemoryService_MainContainerTransformed(t *testing.T) {
ps := gmsBasePodSpec()
err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
require.NoError(t, err)
main := ps.Containers[0]
// GPU resources should be removed
gpuResource := corev1.ResourceName(commonconsts.KubeResourceGPUNvidia)
_, hasGPU := main.Resources.Limits[gpuResource]
assert.False(t, hasGPU, "main container should not have GPU limits")
// Should have DRA claim
require.Len(t, main.Resources.Claims, 1)
assert.Equal(t, gmsDRAClaimName, main.Resources.Claims[0].Name)
// Should have shared volume mount
var hasSharedMount bool
for _, vm := range main.VolumeMounts {
if vm.Name == gmsSharedVolumeName && vm.MountPath == gmsSharedMountPath {
hasSharedMount = true
}
}
assert.True(t, hasSharedMount, "main container should have gms-shared volume mount")
// Should have TMPDIR
envMap := envToMap(main.Env)
assert.Equal(t, gmsSharedMountPath, envMap["TMPDIR"])
}
func TestApplyGPUMemoryService_GMSSidecarInjected(t *testing.T) {
ps := gmsBasePodSpec()
err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
require.NoError(t, err)
require.Len(t, ps.InitContainers, 1)
gms := ps.InitContainers[0]
assert.Equal(t, "gms-weights", gms.Name)
assert.Equal(t, "test-image:latest", gms.Image)
assert.Equal(t, []string{"bash", "-c"}, gms.Command)
assert.Contains(t, gms.Args[0], "gpu_memory_service --device")
assert.NotNil(t, gms.RestartPolicy)
assert.Equal(t, corev1.ContainerRestartPolicyAlways, *gms.RestartPolicy)
// GMS sidecar should have DRA claim
require.Len(t, gms.Resources.Claims, 1)
assert.Equal(t, gmsDRAClaimName, gms.Resources.Claims[0].Name)
// GMS sidecar should have TMPDIR
gmsEnv := envToMap(gms.Env)
assert.Equal(t, gmsSharedMountPath, gmsEnv["TMPDIR"])
}
func TestApplyGPUMemoryService_SharedVolume(t *testing.T) {
ps := gmsBasePodSpec()
err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
require.NoError(t, err)
var found bool
for _, v := range ps.Volumes {
if v.Name == gmsSharedVolumeName {
assert.NotNil(t, v.EmptyDir)
found = true
}
}
assert.True(t, found, "should have gms-shared volume")
}
func TestApplyGPUMemoryService_GPUToleration(t *testing.T) {
ps := gmsBasePodSpec()
err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
require.NoError(t, err)
var found bool
for _, tol := range ps.Tolerations {
if tol.Key == commonconsts.KubeResourceGPUNvidia && tol.Effect == corev1.TaintEffectNoSchedule {
assert.Equal(t, corev1.TolerationOpExists, tol.Operator)
found = true
}
}
assert.True(t, found, "should have nvidia.com/gpu NoSchedule toleration")
}
func TestApplyGPUMemoryService_DRAResourceClaim(t *testing.T) {
ps := gmsBasePodSpec()
err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
require.NoError(t, err)
require.Len(t, ps.ResourceClaims, 1)
assert.Equal(t, gmsDRAClaimName, ps.ResourceClaims[0].Name)
assert.Equal(t, "myapp-worker-gpu", *ps.ResourceClaims[0].ResourceClaimTemplateName)
}
func TestApplyGPUMemoryService_PreservesExistingEnv(t *testing.T) {
ps := gmsBasePodSpec()
err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
require.NoError(t, err)
main := ps.Containers[0]
envMap := envToMap(main.Env)
assert.Equal(t, "kubernetes", envMap[commonconsts.DynamoDiscoveryBackendEnvVar])
assert.Equal(t, "9090", envMap["DYN_SYSTEM_PORT"])
}
func TestApplyGPUMemoryService_SingleContainer(t *testing.T) {
ps := gmsBasePodSpec()
err := applyGPUMemoryService(&ps, gmsComponent(2), "myapp-worker-gpu")
require.NoError(t, err)
// Should still have exactly 1 regular container (no duplication)
assert.Len(t, ps.Containers, 1)
assert.Equal(t, "main", ps.Containers[0].Name)
}
// --- GMS sidecar helpers ---
func TestGmsWrapperScript_TwoTagsPerDevice(t *testing.T) {
script := gmsWrapperScript(3)
assert.Contains(t, script, "for dev in 0 1 2")
assert.Contains(t, script, "--tag weights")
assert.Contains(t, script, "--tag kv_cache")
assert.Contains(t, script, "trap 'kill 0")
assert.Contains(t, script, "wait -n")
}
func TestGmsReadyCheckCommand_TwoSocketsPerGPU(t *testing.T) {
cmd := gmsReadyCheckCommand(2)
assert.Equal(t, "sh", cmd[0])
assert.Equal(t, "-c", cmd[1])
assert.Contains(t, cmd[2], "gms_*.sock")
// 2 GPUs * 2 tags = 4 sockets
assert.Contains(t, cmd[2], "-ge 4")
}
func TestGmsReadyCheckCommand_SingleGPU(t *testing.T) {
cmd := gmsReadyCheckCommand(1)
// 1 GPU * 2 tags = 2 sockets
assert.Contains(t, cmd[2], "-ge 2")
}
// --- GenerateGMSResourceClaimTemplate ---
func TestGenerateGMSResourceClaimTemplate_Enabled(t *testing.T) {
component := gmsComponent(4)
tmpl, toDelete, err := GenerateGMSResourceClaimTemplate(context.Background(), nil, "myapp-worker-gpu", "default", component)
require.NoError(t, err)
assert.False(t, toDelete)
assert.Equal(t, "myapp-worker-gpu", tmpl.Name)
assert.Equal(t, "default", tmpl.Namespace)
require.Len(t, tmpl.Spec.Spec.Devices.Requests, 1)
req := tmpl.Spec.Spec.Devices.Requests[0]
assert.Equal(t, "gpus", req.Name)
require.NotNil(t, req.Exactly)
assert.Equal(t, defaultDeviceClassName, req.Exactly.DeviceClassName)
assert.Equal(t, int64(4), req.Exactly.Count)
}
func TestGenerateGMSResourceClaimTemplate_CustomDeviceClass(t *testing.T) {
component := gmsComponent(2)
component.GPUMemoryService.DeviceClassName = "gpu.intel.com/xe"
tmpl, toDelete, err := GenerateGMSResourceClaimTemplate(context.Background(), nil, "myapp-worker-gpu", "default", component)
require.NoError(t, err)
assert.False(t, toDelete)
assert.Equal(t, "gpu.intel.com/xe", tmpl.Spec.Spec.Devices.Requests[0].Exactly.DeviceClassName)
}
func TestGenerateGMSResourceClaimTemplate_DisabledReturnsDelete(t *testing.T) {
component := &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
}
tmpl, toDelete, err := GenerateGMSResourceClaimTemplate(context.Background(), nil, "myapp-worker-gpu", "default", component)
require.NoError(t, err)
assert.True(t, toDelete)
assert.Equal(t, "myapp-worker-gpu", tmpl.Name)
}
func TestGenerateGMSResourceClaimTemplate_NoGPUCountError(t *testing.T) {
component := &v1alpha1.DynamoComponentDeploymentSharedSpec{
ComponentType: commonconsts.ComponentTypeWorker,
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true},
}
_, _, err := GenerateGMSResourceClaimTemplate(context.Background(), nil, "myapp-worker-gpu", "default", component)
require.Error(t, err)
assert.Contains(t, err.Error(), "resources must be specified")
}
// --- GMSResourceClaimTemplateName ---
func TestGMSResourceClaimTemplateName(t *testing.T) {
assert.Equal(t, "myapp-worker-gpu", GMSResourceClaimTemplateName("myapp", "Worker"))
assert.Equal(t, "app-vllmdecodeworker-gpu", GMSResourceClaimTemplateName("app", "VllmDecodeWorker"))
}
// --- isGMSEnabled ---
func TestIsGMSEnabled(t *testing.T) {
assert.True(t, isGMSEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true},
}))
assert.False(t, isGMSEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: false},
}))
assert.False(t, isGMSEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{}))
}
// --- getGPUCount ---
func TestGetGPUCount(t *testing.T) {
tests := []struct {
name string
component *v1alpha1.DynamoComponentDeploymentSharedSpec
want int
wantErr bool
}{
{
name: "from limits",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{Resources: &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{GPU: "4"}}},
want: 4,
},
{
name: "from requests",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{Resources: &v1alpha1.Resources{Requests: &v1alpha1.ResourceItem{GPU: "2"}}},
want: 2,
},
{
name: "no resources",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{},
wantErr: true,
},
{
name: "invalid GPU string",
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{Resources: &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{GPU: "abc"}}},
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := getGPUCount(tt.component)
if tt.wantErr {
assert.Error(t, err)
} else {
require.NoError(t, err)
assert.Equal(t, tt.want, got)
}
})
}
}
// --- getDeviceClassName ---
func TestGetDeviceClassName(t *testing.T) {
assert.Equal(t, defaultDeviceClassName, getDeviceClassName(&v1alpha1.DynamoComponentDeploymentSharedSpec{}))
assert.Equal(t, defaultDeviceClassName, getDeviceClassName(&v1alpha1.DynamoComponentDeploymentSharedSpec{
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true},
}))
assert.Equal(t, "gpu.intel.com/xe", getDeviceClassName(&v1alpha1.DynamoComponentDeploymentSharedSpec{
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, DeviceClassName: "gpu.intel.com/xe"},
}))
}
// helpers
func envToMap(envs []corev1.EnvVar) map[string]string {
m := make(map[string]string, len(envs))
for _, e := range envs {
m[e.Name] = e.Value
}
return m
}
...@@ -1182,6 +1182,14 @@ func GenerateBasePodSpec( ...@@ -1182,6 +1182,14 @@ func GenerateBasePodSpec(
} }
} }
// Inject GMS sidecar with DRA shared GPU access when GPU memory service is enabled.
if isGMSEnabled(component) {
claimTemplateName := GMSResourceClaimTemplateName(parentGraphDeploymentName, serviceName)
if err := applyGPUMemoryService(&podSpec, component, claimTemplateName); err != nil {
return nil, fmt.Errorf("failed to apply GPU memory service: %w", err)
}
}
return &podSpec, nil return &podSpec, nil
} }
......
...@@ -20,6 +20,7 @@ package validation ...@@ -20,6 +20,7 @@ package validation
import ( import (
"context" "context"
"fmt" "fmt"
"strconv"
"strings" "strings"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
...@@ -128,6 +129,11 @@ func (v *SharedSpecValidator) Validate(ctx context.Context) (admission.Warnings, ...@@ -128,6 +129,11 @@ func (v *SharedSpecValidator) Validate(ctx context.Context) (admission.Warnings,
return nil, err return nil, err
} }
// Validate GPU memory service configuration
if err := v.validateGPUMemoryService(); err != nil {
return nil, err
}
return warnings, nil return warnings, nil
} }
...@@ -255,6 +261,57 @@ func (v *SharedSpecValidator) validateFrontendSidecar() error { ...@@ -255,6 +261,57 @@ func (v *SharedSpecValidator) validateFrontendSidecar() error {
return nil return nil
} }
// validateGPUMemoryService validates the GPU memory service configuration.
func (v *SharedSpecValidator) validateGPUMemoryService() error {
if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled {
return nil
}
if v.spec.GPUMemoryService.Mode == nvidiacomv1alpha1.GMSModeInterPod {
return fmt.Errorf(
"%s.gpuMemoryService: mode \"interPod\" is not yet supported",
v.fieldPath)
}
isWorker := v.spec.ComponentType == consts.ComponentTypeWorker ||
v.spec.ComponentType == consts.ComponentTypePrefill ||
v.spec.ComponentType == consts.ComponentTypeDecode
if !isWorker {
return fmt.Errorf(
"%s.gpuMemoryService: GPU memory service is only supported for worker components (componentType must be worker, prefill, or decode)",
v.fieldPath)
}
if v.spec.Resources == nil {
return fmt.Errorf(
"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
v.fieldPath)
}
gpuStr := ""
switch {
case v.spec.Resources.Limits != nil && v.spec.Resources.Limits.GPU != "":
gpuStr = v.spec.Resources.Limits.GPU
case v.spec.Resources.Requests != nil && v.spec.Resources.Requests.GPU != "":
gpuStr = v.spec.Resources.Requests.GPU
}
if gpuStr == "" {
return fmt.Errorf(
"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
v.fieldPath)
}
gpuCount, err := strconv.Atoi(gpuStr)
if err != nil || gpuCount < 1 {
return fmt.Errorf(
"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
v.fieldPath)
}
return nil
}
// validateServiceAnnotations validates known annotations on the service-level spec. // validateServiceAnnotations validates known annotations on the service-level spec.
func (v *SharedSpecValidator) validateServiceAnnotations() error { func (v *SharedSpecValidator) validateServiceAnnotations() error {
if v.spec.Annotations == nil { if v.spec.Annotations == nil {
......
...@@ -402,6 +402,7 @@ _Appears in:_ ...@@ -402,6 +402,7 @@ _Appears in:_
| `frontendSidecar` _[FrontendSidecarSpec](#frontendsidecarspec)_ | FrontendSidecar configures an auto-generated frontend sidecar container.<br />When specified, the operator injects a fully configured frontend container<br />with all standard Dynamo environment variables, health probes, and ports.<br />This eliminates the need to manually specify these in extraPodSpec.containers. (GAIE) | | Optional: \{\} <br /> | | `frontendSidecar` _[FrontendSidecarSpec](#frontendsidecarspec)_ | FrontendSidecar configures an auto-generated frontend sidecar container.<br />When specified, the operator injects a fully configured frontend container<br />with all standard Dynamo environment variables, health probes, and ports.<br />This eliminates the need to manually specify these in extraPodSpec.containers. (GAIE) | | Optional: \{\} <br /> |
| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from a checkpoint files for faster cold start. | | Optional: \{\} <br /> | | `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from a checkpoint files for faster cold start. | | Optional: \{\} <br /> |
| `topologyConstraint` _[TopologyConstraint](#topologyconstraint)_ | TopologyConstraint for this service. packDomain is required.<br />When both this and spec.topologyConstraint.packDomain are set, packDomain<br />must be narrower than or equal to the spec-level packDomain. | | Optional: \{\} <br /> | | `topologyConstraint` _[TopologyConstraint](#topologyconstraint)_ | TopologyConstraint for this service. packDomain is required.<br />When both this and spec.topologyConstraint.packDomain are set, packDomain<br />must be narrower than or equal to the spec-level packDomain. | | Optional: \{\} <br /> |
| `gpuMemoryService` _[GPUMemoryServiceSpec](#gpumemoryservicespec)_ | GPUMemoryService configures the GPU Memory Service (GMS) sidecar.<br />When enabled, a GMS sidecar is injected and GPU access is managed via DRA. | | Optional: \{\} <br /> |
#### DynamoComponentDeploymentSpec #### DynamoComponentDeploymentSpec
...@@ -444,6 +445,7 @@ _Appears in:_ ...@@ -444,6 +445,7 @@ _Appears in:_
| `frontendSidecar` _[FrontendSidecarSpec](#frontendsidecarspec)_ | FrontendSidecar configures an auto-generated frontend sidecar container.<br />When specified, the operator injects a fully configured frontend container<br />with all standard Dynamo environment variables, health probes, and ports.<br />This eliminates the need to manually specify these in extraPodSpec.containers. (GAIE) | | Optional: \{\} <br /> | | `frontendSidecar` _[FrontendSidecarSpec](#frontendsidecarspec)_ | FrontendSidecar configures an auto-generated frontend sidecar container.<br />When specified, the operator injects a fully configured frontend container<br />with all standard Dynamo environment variables, health probes, and ports.<br />This eliminates the need to manually specify these in extraPodSpec.containers. (GAIE) | | Optional: \{\} <br /> |
| `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from a checkpoint files for faster cold start. | | Optional: \{\} <br /> | | `checkpoint` _[ServiceCheckpointConfig](#servicecheckpointconfig)_ | Checkpoint configures container checkpointing for this service.<br />When enabled, pods can be restored from a checkpoint files for faster cold start. | | Optional: \{\} <br /> |
| `topologyConstraint` _[TopologyConstraint](#topologyconstraint)_ | TopologyConstraint for this service. packDomain is required.<br />When both this and spec.topologyConstraint.packDomain are set, packDomain<br />must be narrower than or equal to the spec-level packDomain. | | Optional: \{\} <br /> | | `topologyConstraint` _[TopologyConstraint](#topologyconstraint)_ | TopologyConstraint for this service. packDomain is required.<br />When both this and spec.topologyConstraint.packDomain are set, packDomain<br />must be narrower than or equal to the spec-level packDomain. | | Optional: \{\} <br /> |
| `gpuMemoryService` _[GPUMemoryServiceSpec](#gpumemoryservicespec)_ | GPUMemoryService configures the GPU Memory Service (GMS) sidecar.<br />When enabled, a GMS sidecar is injected and GPU access is managed via DRA. | | Optional: \{\} <br /> |
#### DynamoGraphDeployment #### DynamoGraphDeployment
...@@ -821,6 +823,45 @@ _Appears in:_ ...@@ -821,6 +823,45 @@ _Appears in:_
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables for the frontend sidecar.<br />These are merged with (and can override) the auto-generated Dynamo env vars. | | Optional: \{\} <br /> | | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables for the frontend sidecar.<br />These are merged with (and can override) the auto-generated Dynamo env vars. | | Optional: \{\} <br /> |
#### GPUMemoryServiceMode
_Underlying type:_ _string_
GPUMemoryServiceMode selects the GMS deployment topology.
_Appears in:_
- [GPUMemoryServiceSpec](#gpumemoryservicespec)
| Field | Description |
| --- | --- |
| `intraPod` | GMSModeIntraPod runs GMS as a sidecar within the same pod.<br /> |
| `interPod` | GMSModeInterPod runs GMS as a separate pod (not yet supported).<br /> |
#### GPUMemoryServiceSpec
GPUMemoryServiceSpec configures the GPU Memory Service (GMS) sidecar for a worker component.
When enabled, the operator injects a GMS sidecar that provides shared GPU memory access
via DRA (Dynamic Resource Allocation). The sidecar runs two GMS processes per GPU
(weights + kv_cache) and communicates with the main container over UDS sockets.
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | Enabled activates the GMS sidecar. GPU resources on the main container<br />are replaced with a DRA ResourceClaim for shared GPU access. | | |
| `mode` _[GPUMemoryServiceMode](#gpumemoryservicemode)_ | Mode selects the GMS deployment topology. | intraPod | Enum: [intraPod interPod] <br />Optional: \{\} <br /> |
| `deviceClassName` _string_ | DeviceClassName is the DRA DeviceClass to request GPUs from. | gpu.nvidia.com | Optional: \{\} <br /> |
#### IngressSpec #### IngressSpec
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# GPU Memory Service (GMS) sidecar example.
#
# The operator injects a GMS sidecar init container that provides shared GPU
# memory access via DRA (Dynamic Resource Allocation). The sidecar runs two GMS
# processes per GPU (weights + kv_cache) and communicates with the main container
# over UDS sockets on a shared emptyDir volume.
#
# Requires Kubernetes 1.32+ with DRA enabled and the NVIDIA GPU DRA driver installed.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-agg-gms
spec:
services:
Frontend:
envFromSecret: hf-token-secret
componentType: frontend
replicas: 1
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
VllmWorker:
envFromSecret: hf-token-secret
componentType: worker
replicas: 1
resources:
limits:
gpu: "1"
requests:
custom:
ephemeral-storage: "2Gi"
gpuMemoryService:
enabled: true
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm
command:
- python3
- -m
- dynamo.vllm
args:
- --model
- Qwen/Qwen3-0.6B
- --load-format
- gms
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment