"lib/mocker/src/vscode:/vscode.git/clone" did not exist on "1dc0975b4bde4b5be30b9d1b73beb6d34b0b60d9"
Unverified Commit a48672f5 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

feat: add inter-pod GMS (#7777)

parent 0d635418
...@@ -40,6 +40,6 @@ dependencies: ...@@ -40,6 +40,6 @@ dependencies:
condition: global.kai-scheduler.install condition: global.kai-scheduler.install
- name: grove-charts - name: grove-charts
alias: grove alias: grove
version: v0.1.0-alpha.7 version: v0.1.0-alpha.8
repository: oci://ghcr.io/ai-dynamo/grove repository: oci://ghcr.io/ai-dynamo/grove
condition: global.grove.install condition: global.grove.install
...@@ -100,7 +100,7 @@ The chart includes built-in validation to prevent all operator conflicts: ...@@ -100,7 +100,7 @@ The chart includes built-in validation to prevent all operator conflicts:
| file://components/operator | dynamo-operator | 1.1.0 | | file://components/operator | dynamo-operator | 1.1.0 |
| https://charts.bitnami.com/bitnami | etcd | 12.0.18 | | https://charts.bitnami.com/bitnami | etcd | 12.0.18 |
| https://nats-io.github.io/k8s/helm/charts/ | nats | 1.3.2 | | https://nats-io.github.io/k8s/helm/charts/ | nats | 1.3.2 |
| oci://ghcr.io/ai-dynamo/grove | grove(grove-charts) | v0.1.0-alpha.7 | | oci://ghcr.io/ai-dynamo/grove | grove(grove-charts) | v0.1.0-alpha.8 |
| oci://ghcr.io/kai-scheduler/kai-scheduler | kai-scheduler | v0.13.4 | | oci://ghcr.io/kai-scheduler/kai-scheduler | kai-scheduler | v0.13.4 |
## Values ## Values
...@@ -207,6 +207,7 @@ For **production environments**, Kai Scheduler and Grove should be installed sep ...@@ -207,6 +207,7 @@ For **production environments**, Kai Scheduler and Grove should be installed sep
| dynamo-platform | kai-scheduler | Grove | | dynamo-platform | kai-scheduler | Grove |
|-----------------|---------------|-------| |-----------------|---------------|-------|
| 1.0.x | >= v0.13.0 | >= v0.1.0-alpha.6 | | 1.0.x | >= v0.13.0 | >= v0.1.0-alpha.6 |
| 1.1.x | >= v0.13.4 | >= v0.1.0-alpha.8 |
After installing them separately, enable Dynamo integration: After installing them separately, enable Dynamo integration:
......
...@@ -133,6 +133,7 @@ For **production environments**, Kai Scheduler and Grove should be installed sep ...@@ -133,6 +133,7 @@ For **production environments**, Kai Scheduler and Grove should be installed sep
| dynamo-platform | kai-scheduler | Grove | | dynamo-platform | kai-scheduler | Grove |
|-----------------|---------------|-------| |-----------------|---------------|-------|
| 1.0.x | >= v0.13.0 | >= v0.1.0-alpha.6 | | 1.0.x | >= v0.13.0 | >= v0.1.0-alpha.6 |
| 1.1.x | >= v0.13.4 | >= v0.1.0-alpha.8 |
After installing them separately, enable Dynamo integration: After installing them separately, enable Dynamo integration:
......
...@@ -10551,19 +10551,20 @@ spec: ...@@ -10551,19 +10551,20 @@ spec:
type: object type: object
failover: failover:
description: |- description: |-
Failover configures active-passive GPU failover for this service. Failover configures GMS (GPU Memory Service) failover for this service.
When enabled, the main container is cloned into two engine containers For intraPod mode: the main container is cloned into two engine containers (active + standby).
(active + standby) sharing GPUs via DRA. Requires gpuMemoryService.enabled. For interPod mode: the operator creates a dedicated GMS weight server pod and
multiple engine pods per rank that share GPUs via DRA resource claims.
properties: properties:
enabled: enabled:
description: |- description: Enabled activates failover mode.
Enabled activates failover mode. The main container is cloned into two
engine containers (active + standby) sharing GPUs via DRA. The standby
acquires the flock when the active engine fails.
type: boolean type: boolean
mode: mode:
default: intraPod default: intraPod
description: Mode selects the failover deployment topology. Must match gpuMemoryService.mode. description: |-
Mode selects the failover deployment topology.
intraPod: engine containers run within the same pod (requires gpuMemoryService.enabled).
interPod: a dedicated GMS weight server pod + engine pods per rank (requires Grove).
enum: enum:
- intraPod - intraPod
- interPod - interPod
...@@ -10571,10 +10572,13 @@ spec: ...@@ -10571,10 +10572,13 @@ spec:
numShadows: numShadows:
default: 1 default: 1
description: |- description: |-
NumShadows is the number of shadow (standby) engine containers per rank. NumShadows is the number of shadow (standby) engine pods per rank.
Reserved for future use — the operator currently creates exactly one shadow. Total engine pods per rank = NumShadows + 1 (1 primary + NumShadows shadows).
NumShadows is only meaningful for mode=interPod; intraPod uses a fixed
1 primary + 1 shadow sidecar layout and any value other than 1 is
rejected at admission time.
format: int32 format: int32
maximum: 1
minimum: 1 minimum: 1
type: integer type: integer
required: required:
......
...@@ -10774,19 +10774,20 @@ spec: ...@@ -10774,19 +10774,20 @@ spec:
type: object type: object
failover: failover:
description: |- description: |-
Failover configures active-passive GPU failover for this service. Failover configures GMS (GPU Memory Service) failover for this service.
When enabled, the main container is cloned into two engine containers For intraPod mode: the main container is cloned into two engine containers (active + standby).
(active + standby) sharing GPUs via DRA. Requires gpuMemoryService.enabled. For interPod mode: the operator creates a dedicated GMS weight server pod and
multiple engine pods per rank that share GPUs via DRA resource claims.
properties: properties:
enabled: enabled:
description: |- description: Enabled activates failover mode.
Enabled activates failover mode. The main container is cloned into two
engine containers (active + standby) sharing GPUs via DRA. The standby
acquires the flock when the active engine fails.
type: boolean type: boolean
mode: mode:
default: intraPod default: intraPod
description: Mode selects the failover deployment topology. Must match gpuMemoryService.mode. description: |-
Mode selects the failover deployment topology.
intraPod: engine containers run within the same pod (requires gpuMemoryService.enabled).
interPod: a dedicated GMS weight server pod + engine pods per rank (requires Grove).
enum: enum:
- intraPod - intraPod
- interPod - interPod
...@@ -10794,10 +10795,13 @@ spec: ...@@ -10794,10 +10795,13 @@ spec:
numShadows: numShadows:
default: 1 default: 1
description: |- description: |-
NumShadows is the number of shadow (standby) engine containers per rank. NumShadows is the number of shadow (standby) engine pods per rank.
Reserved for future use — the operator currently creates exactly one shadow. Total engine pods per rank = NumShadows + 1 (1 primary + NumShadows shadows).
NumShadows is only meaningful for mode=interPod; intraPod uses a fixed
1 primary + 1 shadow sidecar layout and any value other than 1 is
rejected at admission time.
format: int32 format: int32
maximum: 1
minimum: 1 minimum: 1
type: integer type: integer
required: required:
......
...@@ -266,6 +266,7 @@ rules: ...@@ -266,6 +266,7 @@ rules:
verbs: verbs:
- create - create
- delete - delete
- deletecollection
- get - get
- list - list
- patch - patch
......
...@@ -47,6 +47,9 @@ type OperatorConfiguration struct { ...@@ -47,6 +47,9 @@ type OperatorConfiguration struct {
// Orchestrator configuration with optional overrides // Orchestrator configuration with optional overrides
Orchestrators OrchestratorConfiguration `json:"orchestrators"` Orchestrators OrchestratorConfiguration `json:"orchestrators"`
// DRA (Dynamic Resource Allocation) settings with optional override
DRA DRAConfiguration `json:"dra,omitempty"`
// Service mesh and infrastructure addresses // Service mesh and infrastructure addresses
Infrastructure InfrastructureConfiguration `json:"infrastructure"` Infrastructure InfrastructureConfiguration `json:"infrastructure"`
...@@ -194,6 +197,24 @@ type KaiSchedulerConfiguration struct { ...@@ -194,6 +197,24 @@ type KaiSchedulerConfiguration struct {
Enabled *bool `json:"enabled,omitempty"` Enabled *bool `json:"enabled,omitempty"`
} }
// DRAConfiguration carries the operator's Dynamic Resource Allocation
// (resource.k8s.io) settings.
//
// Scope of auto-detection: it only confirms that the resource.k8s.io API
// group is registered on the apiserver (Kubernetes 1.32+). It does NOT
// confirm that:
//   - a GPU-specific DRA resource driver (e.g. nvidia/k8s-dra-driver-gpu) is
//     installed,
//   - the driver's DeviceClass exists, or
//   - node-level GPU drivers are compatible.
//
// On clusters where the API is present but the GPU driver stack is not wired
// up, an admin can set `enabled: false` to force DRA integration off. The
// operator then rejects GMS / inter-pod failover admissions early with a clear
// error, rather than leaving pods Pending with a confusing
// "resourceclaim not found" at schedule time.
type DRAConfiguration struct {
	// Enabled is an optional override for auto-detection of the
	// resource.k8s.io API group:
	//   - nil:   rely on auto-detection.
	//   - true:  detection must also succeed, otherwise the operator exits at
	//            startup.
	//   - false: DRA integration is forced off.
	Enabled *bool `json:"enabled,omitempty"`
}
// InfrastructureConfiguration holds service mesh and backend addresses. // InfrastructureConfiguration holds service mesh and backend addresses.
type InfrastructureConfiguration struct { type InfrastructureConfiguration struct {
// NATSAddress is the address of the NATS server // NATSAddress is the address of the NATS server
......
...@@ -120,6 +120,26 @@ func (in *CheckpointStorageConfiguration) DeepCopy() *CheckpointStorageConfigura ...@@ -120,6 +120,26 @@ func (in *CheckpointStorageConfiguration) DeepCopy() *CheckpointStorageConfigura
return out return out
} }
// DeepCopyInto is an autogenerated deepcopy function: it copies the receiver
// into out, allocating a fresh bool for Enabled so the copy never aliases the
// receiver's pointer. in must be non-nil.
func (in *DRAConfiguration) DeepCopyInto(out *DRAConfiguration) {
	*out = *in
	if in.Enabled == nil {
		return
	}
	// Replace the pointer copied by the struct assignment above with one that
	// refers to independent storage.
	enabled := new(bool)
	*enabled = *in.Enabled
	out.Enabled = enabled
}
// DeepCopy is an autogenerated deepcopy function: it returns a newly allocated
// DRAConfiguration populated via DeepCopyInto, or nil when the receiver is nil.
func (in *DRAConfiguration) DeepCopy() *DRAConfiguration {
	if in == nil {
		return nil
	}
	var clone DRAConfiguration
	in.DeepCopyInto(&clone)
	return &clone
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *DiscoveryConfiguration) DeepCopyInto(out *DiscoveryConfiguration) { func (in *DiscoveryConfiguration) DeepCopyInto(out *DiscoveryConfiguration) {
*out = *in *out = *in
...@@ -353,6 +373,7 @@ func (in *OperatorConfiguration) DeepCopyInto(out *OperatorConfiguration) { ...@@ -353,6 +373,7 @@ func (in *OperatorConfiguration) DeepCopyInto(out *OperatorConfiguration) {
out.LeaderElection = in.LeaderElection out.LeaderElection = in.LeaderElection
out.Namespace = in.Namespace out.Namespace = in.Namespace
in.Orchestrators.DeepCopyInto(&out.Orchestrators) in.Orchestrators.DeepCopyInto(&out.Orchestrators)
in.DRA.DeepCopyInto(&out.DRA)
out.Infrastructure = in.Infrastructure out.Infrastructure = in.Infrastructure
out.Ingress = in.Ingress out.Ingress = in.Ingress
out.RBAC = in.RBAC out.RBAC = in.RBAC
......
...@@ -161,7 +161,10 @@ type GPUMemoryServiceMode string ...@@ -161,7 +161,10 @@ type GPUMemoryServiceMode string
const ( const (
// GMSModeIntraPod runs GMS as a sidecar within the same pod. // GMSModeIntraPod runs GMS as a sidecar within the same pod.
GMSModeIntraPod GPUMemoryServiceMode = "intraPod" GMSModeIntraPod GPUMemoryServiceMode = "intraPod"
// GMSModeInterPod runs GMS as a separate pod (not yet supported). // GMSModeInterPod runs GMS as a separate weight server pod and one or more
// engine pods per rank, sharing GPUs via DRA ResourceClaims and a shared
// hostPath volume for UDS sockets. Only valid on FailoverSpec; the
// GPUMemoryServiceSpec sidecar always runs in intraPod mode.
GMSModeInterPod GPUMemoryServiceMode = "interPod" GMSModeInterPod GPUMemoryServiceMode = "interPod"
) )
...@@ -185,23 +188,28 @@ type GPUMemoryServiceSpec struct { ...@@ -185,23 +188,28 @@ type GPUMemoryServiceSpec struct {
} }
// FailoverSpec configures active-passive failover for a worker component. // FailoverSpec configures active-passive failover for a worker component.
// Requires gpuMemoryService.enabled and the nvidia.com/dynamo-kube-discovery-mode: container // For intraPod mode: requires gpuMemoryService.enabled; the main container is cloned
// annotation on the DGD. // into engine containers (active + standby) within the same pod.
// For interPod mode: the operator creates a dedicated GMS weight server pod and
// multiple engine pods per rank that share GPUs via DRA resource claims.
type FailoverSpec struct { type FailoverSpec struct {
// Enabled activates failover mode. The main container is cloned into two // Enabled activates failover mode.
// engine containers (active + standby) sharing GPUs via DRA. The standby
// acquires the flock when the active engine fails.
Enabled bool `json:"enabled"` Enabled bool `json:"enabled"`
// Mode selects the failover deployment topology. Must match gpuMemoryService.mode. // Mode selects the failover deployment topology.
// intraPod: engine containers run within the same pod (requires gpuMemoryService.enabled).
// interPod: a dedicated GMS weight server pod + engine pods per rank (requires Grove).
// +kubebuilder:default=intraPod // +kubebuilder:default=intraPod
// +kubebuilder:validation:Enum=intraPod;interPod // +kubebuilder:validation:Enum=intraPod;interPod
// +optional // +optional
Mode GPUMemoryServiceMode `json:"mode,omitempty"` Mode GPUMemoryServiceMode `json:"mode,omitempty"`
// NumShadows is the number of shadow (standby) engine containers per rank. // NumShadows is the number of shadow (standby) engine pods per rank.
// Reserved for future use — the operator currently creates exactly one shadow. // Total engine pods per rank = NumShadows + 1 (1 primary + NumShadows shadows).
//
// NumShadows is only meaningful for mode=interPod; intraPod uses a fixed
// 1 primary + 1 shadow sidecar layout and any value other than 1 is
// rejected at admission time.
// +kubebuilder:default=1 // +kubebuilder:default=1
// +kubebuilder:validation:Minimum=1 // +kubebuilder:validation:Minimum=1
// +kubebuilder:validation:Maximum=1
// +optional // +optional
NumShadows int32 `json:"numShadows,omitempty"` NumShadows int32 `json:"numShadows,omitempty"`
} }
......
...@@ -155,9 +155,10 @@ type DynamoComponentDeploymentSharedSpec struct { ...@@ -155,9 +155,10 @@ type DynamoComponentDeploymentSharedSpec struct {
// +optional // +optional
GPUMemoryService *GPUMemoryServiceSpec `json:"gpuMemoryService,omitempty"` GPUMemoryService *GPUMemoryServiceSpec `json:"gpuMemoryService,omitempty"`
// Failover configures active-passive GPU failover for this service. // Failover configures GMS (GPU Memory Service) failover for this service.
// When enabled, the main container is cloned into two engine containers // For intraPod mode: the main container is cloned into two engine containers (active + standby).
// (active + standby) sharing GPUs via DRA. Requires gpuMemoryService.enabled. // For interPod mode: the operator creates a dedicated GMS weight server pod and
// multiple engine pods per rank that share GPUs via DRA resource claims.
// +optional // +optional
Failover *FailoverSpec `json:"failover,omitempty"` Failover *FailoverSpec `json:"failover,omitempty"`
} }
...@@ -344,6 +345,59 @@ func (s *DynamoComponentDeploymentSharedSpec) GetNumberOfNodes() int32 { ...@@ -344,6 +345,59 @@ func (s *DynamoComponentDeploymentSharedSpec) GetNumberOfNodes() int32 {
return 1 return 1
} }
// IsInterPodGMSEnabled returns true when the spec asks for the inter-pod GMS
// layout: GPUMemoryService is set, enabled, and its mode is GMSModeInterPod
// (a dedicated GMS weight-server pod per rank plus engine pods, sharing GPUs
// via DRA).
//
// This is purely a layout signal; it says nothing about failover. Code that
// decides whether to add shadow engine pods or apply failover-group cascade
// labels must also check IsInterPodFailoverEnabled().
func (s *DynamoComponentDeploymentSharedSpec) IsInterPodGMSEnabled() bool {
	gms := s.GPUMemoryService
	if gms == nil || !gms.Enabled {
		return false
	}
	return gms.Mode == GMSModeInterPod
}
// IsInterPodFailoverEnabled returns true when failover with hot-spare shadow
// engine pods is configured for the inter-pod GMS layout: Failover is set,
// enabled, and its mode is GMSModeInterPod.
//
// A true result is also meant to imply IsInterPodGMSEnabled(); that layout
// invariant is enforced by admission rather than checked here. Gate shadow-pod
// expansion and failover-cascade labels on this method, and use
// IsInterPodGMSEnabled() for layout-only decisions (weight-server PCLQ, DRA
// claims, Grove pathway gating, etc.).
func (s *DynamoComponentDeploymentSharedSpec) IsInterPodFailoverEnabled() bool {
	failover := s.Failover
	if failover == nil {
		return false
	}
	return failover.Enabled && failover.Mode == GMSModeInterPod
}
// GetNumShadows reports how many shadow engine replicas are configured for
// inter-pod GMS failover.
//
// Results:
//   - 0 when inter-pod failover is not enabled (this covers the standalone
//     inter-pod GMS layout as well as intra-pod failover);
//   - Failover.NumShadows when it is at least 1;
//   - 1 as the default when inter-pod failover is enabled but NumShadows is
//     unset or below 1.
//
// Callers iterating over "engine roles" must check IsInterPodFailoverEnabled()
// first. Reading a 0 result as "just the primary" is a bug: in that case the
// primary is still modeled as a regular single-pod service.
func (s *DynamoComponentDeploymentSharedSpec) GetNumShadows() int32 {
	if !s.IsInterPodFailoverEnabled() {
		return 0
	}
	if shadows := s.Failover.NumShadows; shadows >= 1 {
		return shadows
	}
	// Unset (zero) or invalid values fall back to a single shadow.
	return 1
}
// GetTotalEnginePods reports the total engine pod count (one primary plus all
// shadows) for the inter-pod GMS layout, computed as GetNumShadows() + 1.
//
// Results:
//   - N+1 when inter-pod failover is enabled with N shadows;
//   - 1 for the standalone inter-pod layout (no failover), i.e. a single
//     engine pod paired with a dedicated weight-server pod;
//   - 1 for non-inter-pod layouts, purely as a sizing convenience.
//
// Callers iterating over "engine roles" must check IsInterPodGMSEnabled()
// first. The 1 returned for non-inter-pod services exists for sizing math and
// does NOT mean there is a "primary role" to iterate over: that path models
// the service as a single clique, not as primary + shadows.
func (s *DynamoComponentDeploymentSharedSpec) GetTotalEnginePods() int32 {
	const primaryPods int32 = 1
	return s.GetNumShadows() + primaryPods
}
func (s *DynamoComponentDeployment) GetParentGraphDeploymentName() string { func (s *DynamoComponentDeployment) GetParentGraphDeploymentName() string {
for _, ownerRef := range s.ObjectMeta.OwnerReferences { for _, ownerRef := range s.ObjectMeta.OwnerReferences {
if ownerRef.Kind == "DynamoGraphDeployment" { if ownerRef.Kind == "DynamoGraphDeployment" {
......
...@@ -445,7 +445,23 @@ func main() { ...@@ -445,7 +445,23 @@ func main() {
} }
setupLog.Info("Detecting DRA (Dynamic Resource Allocation) availability...") setupLog.Info("Detecting DRA (Dynamic Resource Allocation) availability...")
runtimeConfig.DRAEnabled = commonController.DetectDRAAvailability(mainCtx, mgr) draDetected := commonController.DetectDRAAvailability(mainCtx, mgr)
switch {
case operatorCfg.DRA.Enabled == nil:
runtimeConfig.DRAEnabled = draDetected
case *operatorCfg.DRA.Enabled:
if !draDetected {
setupLog.Error(nil,
"DRA is explicitly enabled in config but the resource.k8s.io API group"+
" was not detected in the cluster (requires Kubernetes 1.32+)",
)
os.Exit(1)
}
runtimeConfig.DRAEnabled = true
default:
setupLog.Info("DRA is explicitly disabled via config override")
runtimeConfig.DRAEnabled = false
}
setupLog.Info("Detected orchestrators availability", setupLog.Info("Detected orchestrators availability",
"grove", runtimeConfig.GroveEnabled, "grove", runtimeConfig.GroveEnabled,
...@@ -681,6 +697,15 @@ func registerControllers( ...@@ -681,6 +697,15 @@ func registerControllers(
return fmt.Errorf("unable to create DynamoCheckpoint controller: %w", err) return fmt.Errorf("unable to create DynamoCheckpoint controller: %w", err)
} }
if runtimeConfig.GroveEnabled {
if err = controller.NewFailoverCascadeReconciler(
mgr.GetClient(),
mgr.GetEventRecorderFor("gms-failover-cascade"),
).SetupWithManager(mgr); err != nil {
return fmt.Errorf("unable to create GMS FailoverCascade controller: %w", err)
}
}
setupLog.Info("Controllers registered successfully") setupLog.Info("Controllers registered successfully")
return nil return nil
} }
...@@ -716,7 +741,7 @@ func registerWebhooks( ...@@ -716,7 +741,7 @@ func registerWebhooks(
return fmt.Errorf("unable to register DynamoComponentDeployment webhook: %w", err) return fmt.Errorf("unable to register DynamoComponentDeployment webhook: %w", err)
} }
dgdHandler := webhookvalidation.NewDynamoGraphDeploymentHandler(mgr, operatorPrincipal) dgdHandler := webhookvalidation.NewDynamoGraphDeploymentHandler(mgr, operatorPrincipal, runtimeConfig.GroveEnabled)
if err := dgdHandler.RegisterWithManager(mgr); err != nil { if err := dgdHandler.RegisterWithManager(mgr); err != nil {
return fmt.Errorf("unable to register DynamoGraphDeployment webhook: %w", err) return fmt.Errorf("unable to register DynamoGraphDeployment webhook: %w", err)
} }
......
...@@ -10551,19 +10551,20 @@ spec: ...@@ -10551,19 +10551,20 @@ spec:
type: object type: object
failover: failover:
description: |- description: |-
Failover configures active-passive GPU failover for this service. Failover configures GMS (GPU Memory Service) failover for this service.
When enabled, the main container is cloned into two engine containers For intraPod mode: the main container is cloned into two engine containers (active + standby).
(active + standby) sharing GPUs via DRA. Requires gpuMemoryService.enabled. For interPod mode: the operator creates a dedicated GMS weight server pod and
multiple engine pods per rank that share GPUs via DRA resource claims.
properties: properties:
enabled: enabled:
description: |- description: Enabled activates failover mode.
Enabled activates failover mode. The main container is cloned into two
engine containers (active + standby) sharing GPUs via DRA. The standby
acquires the flock when the active engine fails.
type: boolean type: boolean
mode: mode:
default: intraPod default: intraPod
description: Mode selects the failover deployment topology. Must match gpuMemoryService.mode. description: |-
Mode selects the failover deployment topology.
intraPod: engine containers run within the same pod (requires gpuMemoryService.enabled).
interPod: a dedicated GMS weight server pod + engine pods per rank (requires Grove).
enum: enum:
- intraPod - intraPod
- interPod - interPod
...@@ -10571,10 +10572,13 @@ spec: ...@@ -10571,10 +10572,13 @@ spec:
numShadows: numShadows:
default: 1 default: 1
description: |- description: |-
NumShadows is the number of shadow (standby) engine containers per rank. NumShadows is the number of shadow (standby) engine pods per rank.
Reserved for future use — the operator currently creates exactly one shadow. Total engine pods per rank = NumShadows + 1 (1 primary + NumShadows shadows).
NumShadows is only meaningful for mode=interPod; intraPod uses a fixed
1 primary + 1 shadow sidecar layout and any value other than 1 is
rejected at admission time.
format: int32 format: int32
maximum: 1
minimum: 1 minimum: 1
type: integer type: integer
required: required:
......
...@@ -10774,19 +10774,20 @@ spec: ...@@ -10774,19 +10774,20 @@ spec:
type: object type: object
failover: failover:
description: |- description: |-
Failover configures active-passive GPU failover for this service. Failover configures GMS (GPU Memory Service) failover for this service.
When enabled, the main container is cloned into two engine containers For intraPod mode: the main container is cloned into two engine containers (active + standby).
(active + standby) sharing GPUs via DRA. Requires gpuMemoryService.enabled. For interPod mode: the operator creates a dedicated GMS weight server pod and
multiple engine pods per rank that share GPUs via DRA resource claims.
properties: properties:
enabled: enabled:
description: |- description: Enabled activates failover mode.
Enabled activates failover mode. The main container is cloned into two
engine containers (active + standby) sharing GPUs via DRA. The standby
acquires the flock when the active engine fails.
type: boolean type: boolean
mode: mode:
default: intraPod default: intraPod
description: Mode selects the failover deployment topology. Must match gpuMemoryService.mode. description: |-
Mode selects the failover deployment topology.
intraPod: engine containers run within the same pod (requires gpuMemoryService.enabled).
interPod: a dedicated GMS weight server pod + engine pods per rank (requires Grove).
enum: enum:
- intraPod - intraPod
- interPod - interPod
...@@ -10794,10 +10795,13 @@ spec: ...@@ -10794,10 +10795,13 @@ spec:
numShadows: numShadows:
default: 1 default: 1
description: |- description: |-
NumShadows is the number of shadow (standby) engine containers per rank. NumShadows is the number of shadow (standby) engine pods per rank.
Reserved for future use — the operator currently creates exactly one shadow. Total engine pods per rank = NumShadows + 1 (1 primary + NumShadows shadows).
NumShadows is only meaningful for mode=interPod; intraPod uses a fixed
1 primary + 1 shadow sidecar layout and any value other than 1 is
rejected at admission time.
format: int32 format: int32
maximum: 1
minimum: 1 minimum: 1
type: integer type: integer
required: required:
......
...@@ -35,6 +35,8 @@ rules: ...@@ -35,6 +35,8 @@ rules:
resources: resources:
- pods - pods
verbs: verbs:
- delete
- deletecollection
- get - get
- list - list
- watch - watch
......
...@@ -6,7 +6,7 @@ require ( ...@@ -6,7 +6,7 @@ require (
emperror.dev/errors v0.8.1 emperror.dev/errors v0.8.1
github.com/Masterminds/semver/v3 v3.4.0 github.com/Masterminds/semver/v3 v3.4.0
github.com/ai-dynamo/dynamo/deploy/snapshot v0.0.0 github.com/ai-dynamo/dynamo/deploy/snapshot v0.0.0
github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.6 github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.8
github.com/bsm/gomega v1.27.10 github.com/bsm/gomega v1.27.10
github.com/go-logr/logr v1.4.3 github.com/go-logr/logr v1.4.3
github.com/google/go-cmp v0.7.0 github.com/google/go-cmp v0.7.0
...@@ -26,7 +26,7 @@ require ( ...@@ -26,7 +26,7 @@ require (
k8s.io/apiextensions-apiserver v0.34.3 k8s.io/apiextensions-apiserver v0.34.3
k8s.io/apimachinery v0.34.3 k8s.io/apimachinery v0.34.3
k8s.io/client-go v0.34.3 k8s.io/client-go v0.34.3
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 k8s.io/utils v0.0.0-20260108192941-914a6e750570
sigs.k8s.io/controller-runtime v0.22.4 sigs.k8s.io/controller-runtime v0.22.4
sigs.k8s.io/gateway-api-inference-extension v1.2.0 sigs.k8s.io/gateway-api-inference-extension v1.2.0
sigs.k8s.io/lws v0.6.1 sigs.k8s.io/lws v0.6.1
...@@ -74,7 +74,7 @@ require ( ...@@ -74,7 +74,7 @@ require (
github.com/pkg/errors v0.9.1 // indirect github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/procfs v0.17.0 // indirect github.com/prometheus/procfs v0.17.0 // indirect
github.com/spf13/cobra v1.9.1 // indirect github.com/spf13/cobra v1.10.1 // indirect
github.com/spf13/pflag v1.0.10 // indirect github.com/spf13/pflag v1.0.10 // indirect
github.com/stoewer/go-strcase v1.3.0 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect
github.com/x448/float16 v0.8.4 // indirect github.com/x448/float16 v0.8.4 // indirect
......
...@@ -4,8 +4,8 @@ emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0= ...@@ -4,8 +4,8 @@ emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE= emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.6 h1:6xspRy93dVsGzwiRebNUhrEnamXUtAGvt5tP50uxkOA= github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.8 h1:hsAJ8YOsmxDBqB4bpFlnss5jVBJ8rSZ0W7mPLKIPo5A=
github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.6/go.mod h1:kg35gYA1E7y8SGGZNfllkj6MBPtsdvc/TCNHL9Ysnpw= github.com/ai-dynamo/grove/operator/api v0.1.0-alpha.8/go.mod h1:Vm2I1VlLyNKJRmSsAoaT5KzkMQXuzYKgZ1s2GPaURHQ=
github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI=
github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
...@@ -139,9 +139,9 @@ github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUO ...@@ -139,9 +139,9 @@ github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUO
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= github.com/spf13/cobra v1.10.1 h1:lJeBwCfmrnXthfAupyUTzJ/J4Nc1RsHC/mSRU2dll/s=
github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= github.com/spf13/cobra v1.10.1/go.mod h1:7SmJGaTHFVBY0jW4NXGluQoLvhqFQM+6XSKD+P4XaB0=
github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk=
github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs=
...@@ -294,8 +294,8 @@ k8s.io/kube-aggregator v0.33.4 h1:TdIJKHb0/bLpby7FblXIaVEzyA1jGEjzt/n9cRvwq8U= ...@@ -294,8 +294,8 @@ k8s.io/kube-aggregator v0.33.4 h1:TdIJKHb0/bLpby7FblXIaVEzyA1jGEjzt/n9cRvwq8U=
k8s.io/kube-aggregator v0.33.4/go.mod h1:wZuctdRvGde5bwzxkZRs0GYj2KOpCNgx8rRGVoNb62k= k8s.io/kube-aggregator v0.33.4/go.mod h1:wZuctdRvGde5bwzxkZRs0GYj2KOpCNgx8rRGVoNb62k=
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE= k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE=
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts=
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 h1:SjGebBtkBqHFOli+05xYbK8YF1Dzkbzn+gDM4X9T4Ck= k8s.io/utils v0.0.0-20260108192941-914a6e750570 h1:JT4W8lsdrGENg9W+YwwdLJxklIuKWdRm+BC+xt33FOY=
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20260108192941-914a6e750570/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A=
......
...@@ -121,7 +121,11 @@ const ( ...@@ -121,7 +121,11 @@ const (
// Grove multinode role suffixes // Grove multinode role suffixes
GroveRoleSuffixLeader = "ldr" GroveRoleSuffixLeader = "ldr"
GroveRoleSuffixWorker = "wkr" GroveRoleSuffixWorker = "wkr"
GroveRoleSuffixGMS = "gms"
KubeLabelDynamoFailoverEngineGroupMember = "nvidia.com/dynamo-failover-engine-group-member"
DiscoveryBackendKubernetes = "kubernetes" // label value for KubeLabelDynamoDiscoveryBackend
MainContainerName = "main" MainContainerName = "main"
FrontendSidecarContainerName = "sidecar-frontend" FrontendSidecarContainerName = "sidecar-frontend"
......
...@@ -770,6 +770,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -770,6 +770,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Command: []string{"/bin/sh", "-c"}, Command: []string{"/bin/sh", "-c"},
Args: []string{"ray start --head --port=6379 && some dynamo command --tensor-parallel-size 4 --pipeline-parallel-size 1 --distributed-executor-backend ray"}, Args: []string{"ray start --head --port=6379 && some dynamo command --tensor-parallel-size 4 --pipeline-parallel-size 1 --distributed-executor-backend ray"},
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
...@@ -912,6 +913,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -912,6 +913,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Command: []string{"/bin/sh", "-c"}, Command: []string{"/bin/sh", "-c"},
Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"}, Args: []string{"ray start --address=$(LWS_LEADER_ADDRESS):6379 --block"},
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"}, {Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"}, {Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
......
...@@ -422,9 +422,14 @@ func (r *DynamoGraphDeploymentReconciler) getUpdatedInProgressForGrove(ctx conte ...@@ -422,9 +422,14 @@ func (r *DynamoGraphDeploymentReconciler) getUpdatedInProgressForGrove(ctx conte
var isReady bool var isReady bool
var reason string var reason string
if component.GetNumberOfNodes() > 1 { // Keep in sync with reconcileGroveScaling: any service that requires a
// PodCliqueScalingGroup (multinode OR inter-pod GMS failover) must be
// queried via CheckPCSGReady, otherwise single-node GMS services stall
// in the "in progress" list because the corresponding PodClique never
// exists.
usesPCSG := component.GetNumberOfNodes() > 1 || component.IsInterPodFailoverEnabled()
if usesPCSG {
isReady, reason, _ = dynamo.CheckPCSGReady(ctx, r.Client, resourceName, dgd.Namespace, logger) isReady, reason, _ = dynamo.CheckPCSGReady(ctx, r.Client, resourceName, dgd.Namespace, logger)
} else { } else {
isReady, reason, _ = dynamo.CheckPodCliqueReady(ctx, r.Client, resourceName, dgd.Namespace, logger) isReady, reason, _ = dynamo.CheckPodCliqueReady(ctx, r.Client, resourceName, dgd.Namespace, logger)
} }
...@@ -625,13 +630,10 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont ...@@ -625,13 +630,10 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
continue continue
} }
numberOfNodes := component.GetNumberOfNodes() usesPCSG := component.GetNumberOfNodes() > 1 || component.IsInterPodFailoverEnabled()
isMultinode := numberOfNodes > 1
if isMultinode {
// Scale PodCliqueScalingGroup for multinode services
// Grove naming pattern: {DGD.name}-{replicaIndex}-{serviceName}
resourceName := fmt.Sprintf("%s-%d-%s", dynamoDeployment.Name, replicaIndex, strings.ToLower(serviceName)) resourceName := fmt.Sprintf("%s-%d-%s", dynamoDeployment.Name, replicaIndex, strings.ToLower(serviceName))
if usesPCSG {
err := r.scaleGroveResource(ctx, err := r.scaleGroveResource(ctx,
resourceName, resourceName,
dynamoDeployment.Namespace, dynamoDeployment.Namespace,
...@@ -642,9 +644,6 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont ...@@ -642,9 +644,6 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
return fmt.Errorf("failed to scale PodCliqueScalingGroup %s: %w", resourceName, err) return fmt.Errorf("failed to scale PodCliqueScalingGroup %s: %w", resourceName, err)
} }
} else { } else {
// Scale individual PodClique for single-node services
// Grove naming pattern: {DGD.name}-{replicaIndex}-{serviceName}
resourceName := fmt.Sprintf("%s-%d-%s", dynamoDeployment.Name, replicaIndex, strings.ToLower(serviceName))
err := r.scaleGroveResource(ctx, err := r.scaleGroveResource(ctx,
resourceName, resourceName,
dynamoDeployment.Namespace, dynamoDeployment.Namespace,
...@@ -661,11 +660,29 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont ...@@ -661,11 +660,29 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
return nil return nil
} }
func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (ReconcileResult, error) { // reconcileGMSResourceClaimTemplates syncs one ResourceClaimTemplate per
// service when DRA is available, and otherwise fails fast if any service
// needs DRA-backed GPU allocation.
//
// Both the GMS sidecar (gpuMemoryService.enabled=true) and inter-pod GMS
// failover (failover.mode=interPod) allocate GPUs via DRA ResourceClaims.
// Without DRA, pods would be admitted by the webhook but silently reference
// ResourceClaimTemplates that reconcile never creates, producing a confusing
// "resourceclaim not found" at schedule time. We fail fast here so the user
// gets an actionable error instead.
func (r *DynamoGraphDeploymentReconciler) reconcileGMSResourceClaimTemplates(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) error {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
// Sync ResourceClaimTemplates for GMS-enabled components before creating pods. if !r.RuntimeConfig.DRAEnabled {
if r.RuntimeConfig.DRAEnabled { for _, component := range dynamoDeployment.Spec.Services {
if (component.GPUMemoryService != nil && component.GPUMemoryService.Enabled) ||
component.IsInterPodFailoverEnabled() {
return fmt.Errorf("gpuMemoryService / inter-pod GMS failover requires DRA (Dynamic Resource Allocation), but DRA is not available (either the resource.k8s.io API group is not registered on this cluster, which requires Kubernetes 1.32+, or DRA has been explicitly disabled in the operator configuration)")
}
}
return nil
}
for serviceName, component := range dynamoDeployment.Spec.Services { for serviceName, component := range dynamoDeployment.Spec.Services {
gpuCount, deviceClassName := dra.ExtractGPUParams(component.GPUMemoryService, component.Resources) gpuCount, deviceClassName := dra.ExtractGPUParams(component.GPUMemoryService, component.Resources)
claimTemplateName := dra.ResourceClaimTemplateName(dynamoDeployment.Name, serviceName) claimTemplateName := dra.ResourceClaimTemplateName(dynamoDeployment.Name, serviceName)
...@@ -674,15 +691,17 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co ...@@ -674,15 +691,17 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
}) })
if err != nil { if err != nil {
logger.Error(err, "failed to sync GMS ResourceClaimTemplate", "service", serviceName) logger.Error(err, "failed to sync GMS ResourceClaimTemplate", "service", serviceName)
return ReconcileResult{}, fmt.Errorf("failed to sync GMS ResourceClaimTemplate for %s: %w", serviceName, err) return fmt.Errorf("failed to sync GMS ResourceClaimTemplate for %s: %w", serviceName, err)
}
}
} else {
for _, component := range dynamoDeployment.Spec.Services {
if component.GPUMemoryService != nil && component.GPUMemoryService.Enabled {
return ReconcileResult{}, fmt.Errorf("gpuMemoryService requires DRA (Dynamic Resource Allocation), but the resource.k8s.io API group is not available on this cluster (requires Kubernetes 1.32+)")
} }
} }
return nil
}
func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment, restartState *dynamo.RestartState, checkpointInfos map[string]*checkpoint.CheckpointInfo) (ReconcileResult, error) {
logger := log.FromContext(ctx)
if err := r.reconcileGMSResourceClaimTemplates(ctx, dynamoDeployment); err != nil {
return ReconcileResult{}, err
} }
grovePodCliqueSetAsResource, err := r.reconcileGrovePodCliqueSet(ctx, dynamoDeployment, restartState, checkpointInfos) grovePodCliqueSetAsResource, err := r.reconcileGrovePodCliqueSet(ctx, dynamoDeployment, restartState, checkpointInfos)
...@@ -1455,6 +1474,7 @@ func (r *DynamoGraphDeploymentReconciler) buildCheckpointJobPodTemplate( ...@@ -1455,6 +1474,7 @@ func (r *DynamoGraphDeploymentReconciler) buildCheckpointJobPodTemplate(
consts.MultinodeDeploymentTypeGrove, // Use Grove (single-node backends return early) consts.MultinodeDeploymentTypeGrove, // Use Grove (single-node backends return early)
serviceName, serviceName,
nil, // No checkpoint info for checkpoint creation jobs nil, // No checkpoint info for checkpoint creation jobs
nil, // Use default deployer
) )
if err != nil { if err != nil {
return corev1.PodTemplateSpec{}, fmt.Errorf("failed to generate base pod spec: %w", err) return corev1.PodTemplateSpec{}, fmt.Errorf("failed to generate base pod spec: %w", err)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment