Unverified Commit a48672f5 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

feat: add inter-pod GMS (#7777)

parent 0d635418
......@@ -770,7 +770,9 @@ func Test_reconcileGroveResources(t *testing.T) {
name string
dgdSpec v1alpha1.DynamoGraphDeploymentSpec
existingGroveResources []client.Object
draEnabled bool
wantReconcileResult ReconcileResult
wantErrSubstring string
}{
{
name: "singular frontend service with 2 replicas - creates a PodClique with 2 replicas - ready",
......@@ -1038,6 +1040,25 @@ func Test_reconcileGroveResources(t *testing.T) {
},
},
},
{
name: "inter-pod GMS failover requires DRA - returns clear error when DRA is disabled",
dgdSpec: v1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
"decode": {
ComponentType: string(commonconsts.ComponentTypeDecode),
Replicas: ptr.To(int32(1)),
Failover: &v1alpha1.FailoverSpec{
Enabled: true,
Mode: v1alpha1.GMSModeInterPod,
NumShadows: 1,
},
},
},
},
draEnabled: false,
wantErrSubstring: "requires DRA",
},
}
for _, tt := range tests {
......@@ -1073,7 +1094,7 @@ func Test_reconcileGroveResources(t *testing.T) {
Client: fakeKubeClient,
Recorder: recorder,
Config: &configv1alpha1.OperatorConfiguration{},
RuntimeConfig: &controller_common.RuntimeConfig{},
RuntimeConfig: &controller_common.RuntimeConfig{DRAEnabled: tt.draEnabled},
ScaleClient: &mockScaleClient{},
DockerSecretRetriever: &mockDockerSecretRetriever{
GetSecretsFunc: func(namespace, imageName string) ([]string, error) {
......@@ -1083,6 +1104,11 @@ func Test_reconcileGroveResources(t *testing.T) {
}
result, err := reconciler.reconcileGroveResources(ctx, dgd, nil, nil)
if tt.wantErrSubstring != "" {
g.Expect(err).To(gomega.HaveOccurred())
g.Expect(err.Error()).To(gomega.ContainSubstring(tt.wantErrSubstring))
return
}
g.Expect(err).NotTo(gomega.HaveOccurred())
g.Expect(result).To(gomega.Equal(tt.wantReconcileResult))
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package controller
import (
"context"
"fmt"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
)
// Grove labels that together uniquely identify an "engine group" — the set of
// pods (one per rank in multi-node, or a single pod in single-node) that share
// the same pod index within a PCSG replica. When any one of them terminates,
// the whole group must be torn down so Grove can recreate it as a healthy unit.
const (
// groveLabelPCSG is the pod label key holding the PCSG name.
groveLabelPCSG = "grove.io/podcliquescalinggroup"
// groveLabelPCSGReplicaIndex is the pod label key holding the PCSG replica index.
groveLabelPCSGReplicaIndex = "grove.io/podcliquescalinggroup-replica-index"
// groveLabelPodIndex is the pod label key holding the pod's index within its clique.
groveLabelPodIndex = "grove.io/podclique-pod-index"
)
// FailoverCascadeReconciler watches GMS failover pods (restartPolicy: Never)
// and cascade-deletes all pods in the same engine group when any member
// reaches a terminal phase (Failed or Succeeded). This ensures broken
// distributed inference groups are restarted cleanly by Grove.
//
// Background: GMS (GPU Memory Service) pods run with restartPolicy: Never so
// that Kubernetes does not attempt to restart them in-place — a partial
// restart would leave the distributed inference group in an inconsistent
// state. Instead, this controller detects the terminal pod and deletes the
// entire group. Grove then sees the missing pods and recreates the whole
// group from scratch.
//
// An engine group is identified by three Grove labels:
// - grove.io/podcliquescalinggroup (PCSG name)
// - grove.io/podcliquescalinggroup-replica-index (PCSG replica — which copy of the group)
// - grove.io/podclique-pod-index (pod index within the clique)
//
// Only pods carrying the dynamo failover engine-group-member label are
// considered; see failoverCascadePredicate().
type FailoverCascadeReconciler struct {
// Client is the embedded Kubernetes client; Reconcile uses it to fetch the
// triggering pod and to delete the pods of its engine group.
client.Client
// Recorder publishes the "FailoverCascade" warning event that Reconcile
// attaches to the triggering pod after a cascade delete.
Recorder record.EventRecorder
}
// NewFailoverCascadeReconciler returns a FailoverCascadeReconciler that talks
// to the cluster through c and publishes events through recorder.
func NewFailoverCascadeReconciler(c client.Client, recorder record.EventRecorder) *FailoverCascadeReconciler {
	reconciler := new(FailoverCascadeReconciler)
	reconciler.Client = c
	reconciler.Recorder = recorder
	return reconciler
}
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;delete;deletecollection
// Reconcile cascade-deletes the engine group of the pod named by req.
//
// It returns an empty result and a nil error without deleting anything when
// the pod no longer exists, is not in a terminal phase, is already marked for
// deletion, no longer carries the failover engine-group-member label, or is
// missing any of the three Grove labels that identify its engine group.
// Otherwise every pod in the pod's namespace that carries the member label
// and the same three Grove label values is deleted with a zero grace period,
// and a "FailoverCascade" warning event is recorded on the triggering pod.
//
// Reconciles for several pods of the same group may overlap: a later one
// either finds its pod gone or finds it marked for deletion, and returns early.
func (r *FailoverCascadeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	pod := corev1.Pod{}
	switch err := r.Get(ctx, req.NamespacedName, &pod); {
	case err == nil:
		// Pod fetched; carry on with the checks below.
	case errors.IsNotFound(err):
		return ctrl.Result{}, nil
	default:
		return ctrl.Result{}, err
	}

	switch {
	case !isTerminalPhase(pod.Status.Phase):
		return ctrl.Result{}, nil
	case pod.DeletionTimestamp != nil:
		// Another reconcile may have cascade-deleted this pod after the
		// predicate accepted the event: the object is still readable but is
		// already on its way out, so there is nothing left to do.
		return ctrl.Result{}, nil
	case pod.Labels[commonconsts.KubeLabelDynamoFailoverEngineGroupMember] != commonconsts.KubeLabelValueTrue:
		// The predicate filters on this label too, but it can be removed
		// before we run (e.g. someone quarantining the pod by hand). A pod
		// that was explicitly unlabeled must never trigger a cascade.
		return ctrl.Result{}, nil
	}

	var (
		pcsg        = pod.Labels[groveLabelPCSG]
		pcsgReplica = pod.Labels[groveLabelPCSGReplicaIndex]
		podIndex    = pod.Labels[groveLabelPodIndex]
	)
	if pcsg == "" || pcsgReplica == "" || podIndex == "" {
		logger.Info("failover pod missing Grove labels, skipping cascade",
			"pod", pod.Name,
			groveLabelPCSG, pcsg,
			groveLabelPCSGReplicaIndex, pcsgReplica,
			groveLabelPodIndex, podIndex,
		)
		return ctrl.Result{}, nil
	}

	// Force delete (grace=0) on purpose: the distributed inference group is
	// already broken at this point, so a SIGTERM window for the surviving
	// engines would only delay Grove's recreation of the cohort and risk
	// half-torn-down NCCL/CUDA IPC state and stale UDS sockets on the shared
	// hostPath. preStop hooks and graceful shutdown are deliberately skipped;
	// do NOT soften this to a positive grace period.
	deleteOpts := []client.DeleteAllOfOption{
		client.InNamespace(pod.Namespace),
		client.MatchingLabels{
			commonconsts.KubeLabelDynamoFailoverEngineGroupMember: commonconsts.KubeLabelValueTrue,
			groveLabelPCSG:             pcsg,
			groveLabelPCSGReplicaIndex: pcsgReplica,
			groveLabelPodIndex:         podIndex,
		},
		client.GracePeriodSeconds(0),
	}
	if err := r.DeleteAllOf(ctx, &corev1.Pod{}, deleteOpts...); err != nil {
		return ctrl.Result{}, fmt.Errorf("failed to cascade-delete engine group: %w", err)
	}

	logger.Info("cascade-deleted engine group",
		"trigger", pod.Name,
		"pcsg", pcsg,
		"pcsgReplica", pcsgReplica,
		"podIndex", podIndex,
	)
	r.Recorder.Eventf(&pod, corev1.EventTypeWarning, "FailoverCascade",
		"Pod %s terminated (phase=%s); cascade-deleted engine group (pcsg=%s, replica=%s, index=%s)",
		pod.Name, pod.Status.Phase, pcsg, pcsgReplica, podIndex,
	)
	return ctrl.Result{}, nil
}
// SetupWithManager registers the "gms-failover-cascade" controller with mgr.
// The controller watches Pods directly (not through an owner reference),
// enqueues each accepted pod under its own namespace/name, and relies on
// failoverCascadePredicate to decide which pod events are accepted.
func (r *FailoverCascadeReconciler) SetupWithManager(mgr ctrl.Manager) error {
	podEventFilter := builder.WithPredicates(failoverCascadePredicate())

	controllerBuilder := ctrl.NewControllerManagedBy(mgr).Named("gms-failover-cascade")
	controllerBuilder = controllerBuilder.Watches(&corev1.Pod{}, &handler.EnqueueRequestForObject{}, podEventFilter)
	return controllerBuilder.Complete(r)
}
// isTerminalPhase reports whether phase is PodFailed or PodSucceeded.
func isTerminalPhase(phase corev1.PodPhase) bool {
	switch phase {
	case corev1.PodFailed, corev1.PodSucceeded:
		return true
	default:
		return false
	}
}
// failoverCascadePredicate builds the event filter used by the cascade
// controller so that only relevant pod events are turned into reconcile
// requests.
//
// An event passes only if the pod carries the dynamo failover
// engine-group-member label, and then:
//
//   - Create: passes when the pod is already Failed/Succeeded. This covers
//     pods that reached a terminal phase before the informer started, for
//     which no Update event will ever be observed.
//
//   - Update: passes when the pod is not marked for deletion and its phase
//     moved from non-terminal (old object) to terminal (new object). Requiring
//     a real transition keeps unrelated updates on an already-terminal pod
//     from enqueueing it again.
//
//   - Delete / Generic: never pass. Pod deletions are the outcome of a
//     cascade, not a trigger for one.
func failoverCascadePredicate() predicate.Predicate {
	isGroupMember := func(podLabels map[string]string) bool {
		return podLabels[commonconsts.KubeLabelDynamoFailoverEngineGroupMember] == commonconsts.KubeLabelValueTrue
	}

	onCreate := func(e event.CreateEvent) bool {
		if !isGroupMember(e.Object.GetLabels()) {
			return false
		}
		created, isPod := e.Object.(*corev1.Pod)
		return isPod && isTerminalPhase(created.Status.Phase)
	}

	onUpdate := func(e event.UpdateEvent) bool {
		switch {
		case !isGroupMember(e.ObjectNew.GetLabels()):
			return false
		case e.ObjectNew.GetDeletionTimestamp() != nil:
			// Already being deleted: most likely our own cascade, which sets
			// deletionTimestamp before the pod leaves the cache.
			return false
		}

		current, isPod := e.ObjectNew.(*corev1.Pod)
		if !isPod {
			return false
		}
		previous, isPod := e.ObjectOld.(*corev1.Pod)
		if !isPod {
			return false
		}

		wasTerminal := isTerminalPhase(previous.Status.Phase)
		return !wasTerminal && isTerminalPhase(current.Status.Phase)
	}

	return predicate.Funcs{
		CreateFunc:  onCreate,
		UpdateFunc:  onUpdate,
		DeleteFunc:  func(event.DeleteEvent) bool { return false },
		GenericFunc: func(event.GenericEvent) bool { return false },
	}
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"context"
"testing"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)
const (
// cascadeTestNamespace is the namespace used for the fixture pods and
// reconcile requests in these tests.
cascadeTestNamespace = "test-ns"
// cascadeTestPCSG is the PCSG label value newFailoverPod puts on fixture pods.
cascadeTestPCSG = "my-pcsg"
)
// newFailoverPod builds a fixture pod in cascadeTestNamespace with the given
// name and phase. The pod carries the failover engine-group-member label, the
// shared test PCSG name, and the supplied PCSG replica index and pod index.
func newFailoverPod(name string, phase corev1.PodPhase, replicaIdx, podIdx string) *corev1.Pod {
	engineGroupLabels := map[string]string{
		commonconsts.KubeLabelDynamoFailoverEngineGroupMember: commonconsts.KubeLabelValueTrue,
		groveLabelPCSG:             cascadeTestPCSG,
		groveLabelPCSGReplicaIndex: replicaIdx,
		groveLabelPodIndex:         podIdx,
	}
	meta := metav1.ObjectMeta{
		Name:      name,
		Namespace: cascadeTestNamespace,
		Labels:    engineGroupLabels,
	}
	return &corev1.Pod{
		ObjectMeta: meta,
		Status:     corev1.PodStatus{Phase: phase},
	}
}
// newCascadeReconciler returns a FailoverCascadeReconciler backed by a fake
// client pre-loaded with objs, together with that client so tests can inspect
// the resulting cluster state. Events go to a fake recorder with a buffer of 16.
func newCascadeReconciler(objs ...client.Object) (*FailoverCascadeReconciler, client.Client) {
	scheme := runtime.NewScheme()
	_ = corev1.AddToScheme(scheme)

	fakeClient := fake.NewClientBuilder().
		WithScheme(scheme).
		WithStatusSubresource(&corev1.Pod{}).
		WithObjects(objs...).
		Build()

	recorder := record.NewFakeRecorder(16)
	return NewFailoverCascadeReconciler(fakeClient, recorder), fakeClient
}
func TestFailoverCascade_FailedPodDeletesEntireGroup(t *testing.T) {
failedPod := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
sibling1 := newFailoverPod("gms-0-0", corev1.PodRunning, "0", "0")
sibling2 := newFailoverPod("wkr-1-0", corev1.PodRunning, "0", "0")
r, c := newCascadeReconciler(failedPod, sibling1, sibling2)
result, err := r.Reconcile(context.Background(), ctrl.Request{
NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
})
require.NoError(t, err)
assert.Equal(t, ctrl.Result{}, result)
var remaining corev1.PodList
require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
assert.Empty(t, remaining.Items, "all pods in the engine group should be deleted")
}
func TestFailoverCascade_SucceededPodDeletesEntireGroup(t *testing.T) {
succeededPod := newFailoverPod("ldr-0", corev1.PodSucceeded, "0", "0")
sibling := newFailoverPod("gms-0-0", corev1.PodRunning, "0", "0")
r, c := newCascadeReconciler(succeededPod, sibling)
result, err := r.Reconcile(context.Background(), ctrl.Request{
NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
})
require.NoError(t, err)
assert.Equal(t, ctrl.Result{}, result)
var remaining corev1.PodList
require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
assert.Empty(t, remaining.Items, "succeeded pod should also trigger cascade")
}
// TestFailoverCascade_DifferentGroupUnaffected checks that a pod with a
// different pod index (i.e. a different engine group) survives the cascade
// triggered by a Failed pod.
func TestFailoverCascade_DifferentGroupUnaffected(t *testing.T) {
	failedPod := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
	differentGroup := newFailoverPod("ldr-1", corev1.PodRunning, "0", "1")
	r, c := newCascadeReconciler(failedPod, differentGroup)

	_, err := r.Reconcile(context.Background(), ctrl.Request{
		NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
	})
	require.NoError(t, err)

	var remaining corev1.PodList
	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
	// require (not assert): indexing Items[0] below would panic on an empty
	// list and hide the real failure behind an index-out-of-range.
	require.Len(t, remaining.Items, 1, "only the different engine group pod should remain")
	assert.Equal(t, "ldr-1", remaining.Items[0].Name)
}
func TestFailoverCascade_MultipleFailedPodsAllDeleted(t *testing.T) {
failedPod := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
alsoFailed := newFailoverPod("wkr-1-0", corev1.PodFailed, "0", "0")
running := newFailoverPod("gms-0-0", corev1.PodRunning, "0", "0")
r, c := newCascadeReconciler(failedPod, alsoFailed, running)
_, err := r.Reconcile(context.Background(), ctrl.Request{
NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
})
require.NoError(t, err)
var remaining corev1.PodList
require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
assert.Empty(t, remaining.Items, "all pods in the engine group should be deleted")
}
// TestFailoverCascade_PodWithoutLabelIgnored checks that a Failed pod without
// the failover engine-group-member label is left alone: the reconcile succeeds
// and the pod is still present afterwards.
func TestFailoverCascade_PodWithoutLabelIgnored(t *testing.T) {
	unlabeled := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "random-pod",
			Namespace: cascadeTestNamespace,
		},
		Status: corev1.PodStatus{Phase: corev1.PodFailed},
	}
	r, c := newCascadeReconciler(unlabeled)

	result, err := r.Reconcile(context.Background(), ctrl.Request{
		NamespacedName: types.NamespacedName{Name: "random-pod", Namespace: cascadeTestNamespace},
	})
	require.NoError(t, err)
	assert.Equal(t, ctrl.Result{}, result)

	// "Ignored" must mean the pod survives, not merely that no error occurred.
	var remaining corev1.PodList
	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
	require.Len(t, remaining.Items, 1, "unlabeled pod must not be deleted")
	assert.Equal(t, "random-pod", remaining.Items[0].Name)
}
func TestFailoverCascade_NonFailedPodIsNoop(t *testing.T) {
runningPod := newFailoverPod("ldr-0", corev1.PodRunning, "0", "0")
sibling := newFailoverPod("gms-0-0", corev1.PodRunning, "0", "0")
r, c := newCascadeReconciler(runningPod, sibling)
_, err := r.Reconcile(context.Background(), ctrl.Request{
NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
})
require.NoError(t, err)
var remaining corev1.PodList
require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
assert.Len(t, remaining.Items, 2, "running pod should not trigger cascade")
}
// TestFailoverCascade_NotFoundPodIsNoop checks that a request for a pod that
// does not exist returns an empty result and no error.
func TestFailoverCascade_NotFoundPodIsNoop(t *testing.T) {
	r, _ := newCascadeReconciler()
	missing := types.NamespacedName{Namespace: cascadeTestNamespace, Name: "gone"}

	result, err := r.Reconcile(context.Background(), ctrl.Request{NamespacedName: missing})

	require.NoError(t, err)
	assert.Equal(t, ctrl.Result{}, result)
}
// TestFailoverCascade_MissingGroveLabelsIsNoop checks that a Failed member pod
// that has the PCSG label but lacks the replica-index and pod-index labels
// does not trigger a cascade: the reconcile succeeds and the pod is still
// present afterwards.
func TestFailoverCascade_MissingGroveLabelsIsNoop(t *testing.T) {
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "partial-labels",
			Namespace: cascadeTestNamespace,
			Labels: map[string]string{
				commonconsts.KubeLabelDynamoFailoverEngineGroupMember: commonconsts.KubeLabelValueTrue,
				groveLabelPCSG: cascadeTestPCSG,
			},
		},
		Status: corev1.PodStatus{Phase: corev1.PodFailed},
	}
	r, c := newCascadeReconciler(pod)

	result, err := r.Reconcile(context.Background(), ctrl.Request{
		NamespacedName: types.NamespacedName{Name: "partial-labels", Namespace: cascadeTestNamespace},
	})
	require.NoError(t, err)
	assert.Equal(t, ctrl.Result{}, result)

	// A no-op must leave the pod in place; verify it rather than only
	// checking that no error was returned.
	var remaining corev1.PodList
	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
	require.Len(t, remaining.Items, 1, "pod with incomplete Grove labels must not be deleted")
	assert.Equal(t, "partial-labels", remaining.Items[0].Name)
}
// TestFailoverCascade_DifferentPCSGReplicaUnaffected checks that a pod with
// the same pod index but a different PCSG replica index survives the cascade
// triggered by a Failed pod.
func TestFailoverCascade_DifferentPCSGReplicaUnaffected(t *testing.T) {
	failedPod := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
	differentReplica := newFailoverPod("ldr-r1-0", corev1.PodRunning, "1", "0")
	r, c := newCascadeReconciler(failedPod, differentReplica)

	_, err := r.Reconcile(context.Background(), ctrl.Request{
		NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
	})
	require.NoError(t, err)

	var remaining corev1.PodList
	require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
	// require (not assert): indexing Items[0] below would panic on an empty
	// list and hide the real failure behind an index-out-of-range.
	require.Len(t, remaining.Items, 1, "only the different PCSG replica pod should remain")
	assert.Equal(t, "ldr-r1-0", remaining.Items[0].Name)
}
// TestFailoverCascade_DeletingPodIsSkipped checks that a Failed pod which
// already has a deletion timestamp does not trigger a cascade. The fixture pod
// is given a finalizer alongside its deletion timestamp.
func TestFailoverCascade_DeletingPodIsSkipped(t *testing.T) {
	ctx := context.Background()
	deletedAt := metav1.Now()

	terminating := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
	terminating.Finalizers = []string{"test-finalizer"}
	terminating.DeletionGracePeriodSeconds = ptr.To(int64(0))
	terminating.DeletionTimestamp = &deletedAt

	r, c := newCascadeReconciler(
		terminating,
		newFailoverPod("gms-0-0", corev1.PodRunning, "0", "0"),
	)
	req := ctrl.Request{
		NamespacedName: types.NamespacedName{Namespace: cascadeTestNamespace, Name: "ldr-0"},
	}

	result, err := r.Reconcile(ctx, req)
	require.NoError(t, err)
	assert.Equal(t, ctrl.Result{}, result)

	var pods corev1.PodList
	require.NoError(t, c.List(ctx, &pods, client.InNamespace(cascadeTestNamespace)))
	assert.Len(t, pods.Items, 2, "already-deleting pod should not trigger a cascade")
}
func TestFailoverCascade_ConcurrentReconcileIsIdempotent(t *testing.T) {
pod1 := newFailoverPod("ldr-0", corev1.PodFailed, "0", "0")
pod2 := newFailoverPod("wkr-1-0", corev1.PodFailed, "0", "0")
r, c := newCascadeReconciler(pod1, pod2)
_, err := r.Reconcile(context.Background(), ctrl.Request{
NamespacedName: types.NamespacedName{Name: "ldr-0", Namespace: cascadeTestNamespace},
})
require.NoError(t, err)
// Second reconcile for the other pod — it's already gone (NotFound).
_, err = r.Reconcile(context.Background(), ctrl.Request{
NamespacedName: types.NamespacedName{Name: "wkr-1-0", Namespace: cascadeTestNamespace},
})
require.NoError(t, err)
var remaining corev1.PodList
require.NoError(t, c.List(context.Background(), &remaining, client.InNamespace(cascadeTestNamespace)))
assert.Empty(t, remaining.Items)
}
......@@ -25,7 +25,11 @@ const (
// ClaimName is the pod-level DRA ResourceClaim name for shared GPU access.
ClaimName = "intrapod-shared-gpu"
defaultDeviceClassName = "gpu.nvidia.com"
// DefaultDeviceClassName is the default DRA DeviceClass name used when a
// component does not specify an explicit gpuType. It matches the
// DeviceClass that ships with the NVIDIA DRA Driver and is the single
// source of truth for this string across the operator.
DefaultDeviceClassName = "gpu.nvidia.com"
)
// ApplyClaim replaces the first container's nvidia.com/gpu resources with a
......@@ -120,7 +124,7 @@ func GenerateResourceClaimTemplate(
}
if deviceClassName == "" {
deviceClassName = defaultDeviceClassName
deviceClassName = DefaultDeviceClassName
}
if cl != nil {
......
......@@ -100,7 +100,7 @@ func TestGenerateResourceClaimTemplate_Enabled(t *testing.T) {
assert.Equal(t, "myapp-worker-gpu", tmpl.Name)
require.Len(t, tmpl.Spec.Spec.Devices.Requests, 1)
req := tmpl.Spec.Spec.Devices.Requests[0]
assert.Equal(t, defaultDeviceClassName, req.Exactly.DeviceClassName)
assert.Equal(t, DefaultDeviceClassName, req.Exactly.DeviceClassName)
assert.Equal(t, int64(4), req.Exactly.Count)
}
......
......@@ -29,6 +29,25 @@ type VLLMBackend struct {
}
func (b *VLLMBackend) UpdateContainer(container *corev1.Container, numberOfNodes int32, role Role, component *v1alpha1.DynamoComponentDeploymentSharedSpec, serviceName string, multinodeDeployer MultinodeDeployer) {
// The inter-pod GMS layout (with or without failover) requires the engine
// to load weights from the dedicated GMS weight-server pod rather than
// from disk. --load-format gms and DYN_VLLM_GMS_SHADOW_MODE activate the
// vLLM-side GMS client path and apply to both standalone inter-pod GMS
// and inter-pod GMS + failover; the "shadow mode" name is a vLLM upstream
// naming convention, not a statement about whether shadow pods are
// present.
if component.IsInterPodGMSEnabled() {
if !containerHasArg(container, "--load-format", "gms") {
injectFlagsIntoContainerCommand(container, "--load-format gms", false, "vllm")
}
// DYN_VLLM_GMS_SHADOW_MODE is a vLLM-engine-specific switch (activates
// the vLLM-side GMS client path for shadow weight loading). It is
// injected here — in the vLLM backend — rather than in the backend-
// agnostic GMS helpers so non-vLLM backends do not inherit a stray,
// meaningless env var if/when inter-pod GMS is extended to them.
container.Env = append(container.Env, corev1.EnvVar{Name: "DYN_VLLM_GMS_SHADOW_MODE", Value: "true"})
}
isMultinode := numberOfNodes > 1
if isMultinode {
......
......@@ -980,3 +980,73 @@ func TestShouldUseMpBackend(t *testing.T) {
})
}
}
// TestVLLMBackend_UpdateContainer_InterPodGMS asserts that when the inter-pod
// GMS layout is enabled (gpuMemoryService.mode=interPod, with or without
// failover), the vLLM backend is the one responsible for injecting both the
// --load-format=gms flag and the DYN_VLLM_GMS_SHADOW_MODE env var. These are
// vLLM-runtime switches and must live in the backend adapter, not in the
// backend-agnostic GMS helpers (see gmsEngineEnvVars).
func TestVLLMBackend_UpdateContainer_InterPodGMS(t *testing.T) {
	backend := &VLLMBackend{}
	component := &v1alpha1.DynamoComponentDeploymentSharedSpec{
		GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{
			Enabled: true,
			Mode:    v1alpha1.GMSModeInterPod,
		},
		Failover: &v1alpha1.FailoverSpec{
			Enabled: true,
			Mode:    v1alpha1.GMSModeInterPod,
		},
	}
	container := &corev1.Container{
		Command: []string{"python3"},
		Args:    []string{"-m", "dynamo.vllm"},
	}

	backend.UpdateContainer(container, 1, RoleMain, component, "svc", &GroveMultinodeDeployer{})

	// --load-format gms must have been injected. containerHasArg already
	// returns a bool, so test it directly; the args are only formatted when
	// the check fails.
	if !containerHasArg(container, "--load-format", "gms") {
		t.Errorf("expected --load-format gms to be injected; got args=%q", container.Args)
	}

	// DYN_VLLM_GMS_SHADOW_MODE must be set exactly once, with value "true".
	count := 0
	for _, e := range container.Env {
		if e.Name != "DYN_VLLM_GMS_SHADOW_MODE" {
			continue
		}
		count++
		if e.Value != "true" {
			t.Errorf("DYN_VLLM_GMS_SHADOW_MODE value = %q, want %q", e.Value, "true")
		}
	}
	if count != 1 {
		t.Errorf("DYN_VLLM_GMS_SHADOW_MODE env var count = %d, want 1", count)
	}
}
// TestVLLMBackend_UpdateContainer_NoInterPodGMS covers the opposite case: with
// an empty component spec (no GPU memory service and no failover configured),
// the vLLM backend must not add the DYN_VLLM_GMS_SHADOW_MODE env var.
func TestVLLMBackend_UpdateContainer_NoInterPodGMS(t *testing.T) {
	var (
		backend   = &VLLMBackend{}
		component = &v1alpha1.DynamoComponentDeploymentSharedSpec{}
		container = &corev1.Container{
			Command: []string{"python3"},
			Args:    []string{"-m", "dynamo.vllm"},
		}
	)

	backend.UpdateContainer(container, 1, RoleMain, component, "svc", &GroveMultinodeDeployer{})

	for i := range container.Env {
		if container.Env[i].Name != "DYN_VLLM_GMS_SHADOW_MODE" {
			continue
		}
		t.Errorf("DYN_VLLM_GMS_SHADOW_MODE must not be injected when inter-pod GMS is disabled")
	}
}
......@@ -93,6 +93,10 @@ func (b *BaseComponentDefaults) getCommonContainer(context ComponentContext) cor
},
}
container.Env = []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: commonconsts.DynamoNamespaceEnvVar,
Value: context.DynamoNamespace,
......@@ -144,10 +148,9 @@ func (b *BaseComponentDefaults) getCommonContainer(context ComponentContext) cor
}
if context.Discovery.Mode == configv1alpha1.KubeDiscoveryModeContainer {
container.Env = append(container.Env, corev1.EnvVar{
Name: "CONTAINER_NAME",
Value: container.Name,
})
// CONTAINER_NAME is already injected unconditionally above with
// MainContainerName (which equals container.Name here); do not append
// it again or we end up with two env entries of the same name.
container.Env = append(container.Env, corev1.EnvVar{
Name: "DYN_KUBE_DISCOVERY_MODE",
Value: string(configv1alpha1.KubeDiscoveryModeContainer),
......
......@@ -82,6 +82,7 @@ func TestPlannerDefaults_GetBaseContainer(t *testing.T) {
FailureThreshold: 720,
},
Env: []corev1.EnvVar{
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "dynamo-namespace"},
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypePlanner},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "name"},
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dynamo
......@@ -9,22 +21,389 @@ import (
"fmt"
"path/filepath"
"strconv"
"strings"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dra"
gmsruntime "github.com/ai-dynamo/dynamo/deploy/operator/internal/gms"
grovev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
corev1 "k8s.io/api/core/v1"
resourcev1 "k8s.io/api/resource/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)
// failoverLockFile is the path of the failover lock file: "failover.lock"
// joined onto gmsruntime.SharedMountPath.
// NOTE(review): gmsEngineEnvVars builds FAILOVER_LOCK_PATH from
// gmsSharedMountPath and gmsFailoverLockFile instead of this variable —
// confirm the two are meant to stay in sync.
var failoverLockFile = filepath.Join(gmsruntime.SharedMountPath, "failover.lock")
// ──────────────────────────────────────────────────────────────────────────────
// Inter-pod GMS failover (Mode: interPod)
//
// A dedicated GMS weight server pod is created per rank. Engine pods share GPU
// memory via DRA ResourceClaims and a hostPath volume for UDS sockets.
// ──────────────────────────────────────────────────────────────────────────────
const (
// gmsSharedVolumeName is presumably the pod volume name of the shared GMS
// volume (its use is outside this view — verify against gmsSharedVolume).
gmsSharedVolumeName = "gms-shared"
// gmsHostPathBase is presumably the host-side base directory of the
// rank-isolated hostPath volume — TODO confirm against gmsSharedVolume.
gmsHostPathBase = "/run/gms"
// gmsSharedMountPath is the in-container directory where the GMS UDS
// sockets (gms_*.sock) and the failover lock file live.
gmsSharedMountPath = "/run/gms/shared"
// gmsFailoverLockFile is the lock file name appended to gmsSharedMountPath
// to form FAILOVER_LOCK_PATH.
gmsFailoverLockFile = "failover.lock"
// gmsPermFixInitName is presumably the name of the permission-fix init
// container — TODO confirm against gmsPermFixInitContainer.
gmsPermFixInitName = "fix-gms-perms"
)
// gmsWrapperScript renders the bash script run by the GMS weight-server
// container. The script:
//
//  1. removes leftover gms_*.sock files under gmsSharedMountPath;
//  2. installs a SIGTERM/SIGINT trap that signals the script's process group
//     and exits with the recorded code (1 until the server has exited);
//  3. starts the GMS server module (gmsruntime.ServerModule) with python3 in
//     the background and waits for it;
//  4. records the server's exit code and exits with it, so the container's
//     exitCode reflects the actual failure instead of always being 1.
//
// Per the original author's notes, the server auto-discovers the
// DRA-allocated GPUs and exposes a "weights" and a "kv_cache" UDS socket per
// device.
func gmsWrapperScript() string {
	const scriptTemplate = `rm -f %s/gms_*.sock
rc=1
cleanup() { kill -- -$$ 2>/dev/null; exit "$rc"; }
trap cleanup SIGTERM SIGINT
python3 -m %s &
echo "Started GMS server pid=$!"
wait -n
rc=$?
echo "GMS server exited (code=$rc), shutting down"
cleanup`

	return fmt.Sprintf(scriptTemplate, gmsSharedMountPath, gmsruntime.ServerModule)
}
// gmsStartupProbeCommand builds the exec probe command for the GMS server's
// startup probe. The command succeeds once at least 2*gpuCount gms_*.sock
// files exist under gmsSharedMountPath — two sockets (weights and kv_cache)
// per allocated GPU.
func gmsStartupProbeCommand(gpuCount int) []string {
	expectedSockets := 2 * gpuCount
	socketCheck := fmt.Sprintf(
		"test $(ls %s/gms_*.sock 2>/dev/null | wc -l) -ge %d",
		gmsSharedMountPath, expectedSockets,
	)
	return []string{"sh", "-c", socketCheck}
}
// applyGMSSharedResources applies, in place, the setup shared by GMS
// weight-server pods and engine pods. It calls removeGPUFromLimits on the
// container (DRA handles GPU allocation) and addGPUToleration on the pod
// spec, adds the shared volume for the given rank to the pod and mounts it in
// the container, and appends the permission-fix init container (built from
// the rank and the container's image) to the end of the pod's init containers.
func applyGMSSharedResources(podSpec *corev1.PodSpec, c *corev1.Container, rank int32) {
	removeGPUFromLimits(c)
	addGPUToleration(podSpec)

	sharedVolume, sharedMount := gmsSharedVolume(rank)
	podSpec.Volumes = append(podSpec.Volumes, sharedVolume)
	c.VolumeMounts = append(c.VolumeMounts, sharedMount)

	permFix := gmsPermFixInitContainer(rank, c.Image)
	podSpec.InitContainers = append(podSpec.InitContainers, permFix)
}
// gmsWeightServerPodSpec derives a GMS weight-server pod spec from a deep copy
// of basePodSpec; the base spec itself is not modified. If the copy has no
// containers it is returned unchanged. Otherwise the first container is
// rewritten to run the GMS wrapper script under bash, its liveness and
// readiness probes are removed, a startup probe that waits for the expected
// number of GMS UDS sockets is installed (2s period x 150 failures = 5 min),
// the socket-directory env var is appended, and the shared GMS resources for
// the given rank are attached.
//
// RestartPolicy is deliberately not touched here, so the pod keeps whatever
// the base spec / Grove default provides (Always, per the original author). A
// GMS server process holds only local state — GPU allocations (via DRA, which
// survive the container), hostPath UDS sockets (recreated by gmsWrapperScript
// on startup), and in-memory weight buffers (re-sharded on reconnection by
// the engine clients) — so an in-place kubelet restart is a fast, correct
// recovery path.
//
// In the standalone inter-pod GMS layout the paired engine pod follows the
// same policy (a restarted engine re-imports IPC handles from the
// still-running GMS server). In the inter-pod GMS failover layout,
// augmentEngineForGMS overrides the engine's RestartPolicy to Never so the
// cohort can only be recovered via FailoverCascadeReconciler; see the comment
// on augmentEngineForGMS.
func gmsWeightServerPodSpec(basePodSpec *corev1.PodSpec, rank int32, gpuCount int) *corev1.PodSpec {
	spec := basePodSpec.DeepCopy()
	if len(spec.Containers) < 1 {
		return spec
	}

	server := &spec.Containers[0]
	server.Command = []string{"bash", "-c"}
	server.Args = []string{gmsWrapperScript()}

	socketsReady := corev1.ExecAction{Command: gmsStartupProbeCommand(gpuCount)}
	server.StartupProbe = &corev1.Probe{
		ProbeHandler:     corev1.ProbeHandler{Exec: &socketsReady},
		PeriodSeconds:    2,
		TimeoutSeconds:   2,
		FailureThreshold: 150, // 2s * 150 = 5 min
	}
	server.LivenessProbe, server.ReadinessProbe = nil, nil

	socketDirEnv := corev1.EnvVar{Name: gmsruntime.EnvSocketDir, Value: gmsSharedMountPath}
	server.Env = append(server.Env, socketDirEnv)

	applyGMSSharedResources(spec, server, rank)
	return spec
}
// gmsEngineEnvVars returns the backend-agnostic environment variables for GMS
// engine pods:
//
//   - ENGINE_ID: read from the pod's grove.io/podclique-pod-index label
//   - the GMS socket directory (gmsruntime.EnvSocketDir)
//   - FAILOVER_LOCK_PATH: the lock file inside the shared mount
//   - DYN_SYSTEM_STARTING_HEALTH_STATUS: "notready"
//
// Backend-specific switches (e.g. the vLLM DYN_VLLM_GMS_SHADOW_MODE flag) are
// left to the backend's UpdateContainer path so other backends do not inherit
// stray env vars.
func gmsEngineEnvVars() []corev1.EnvVar {
	podIndexLabel := &corev1.ObjectFieldSelector{
		FieldPath: "metadata.labels['grove.io/podclique-pod-index']",
	}
	lockPath := gmsSharedMountPath + "/" + gmsFailoverLockFile

	return []corev1.EnvVar{
		{Name: "ENGINE_ID", ValueFrom: &corev1.EnvVarSource{FieldRef: podIndexLabel}},
		{Name: gmsruntime.EnvSocketDir, Value: gmsSharedMountPath},
		{Name: "FAILOVER_LOCK_PATH", Value: lockPath},
		{Name: "DYN_SYSTEM_STARTING_HEALTH_STATUS", Value: "notready"},
	}
}
// augmentEngineForGMS adapts an engine pod spec, in place, to the inter-pod
// GMS layout. On the first container it appends gmsEngineEnvVars, removes
// DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS, and applies the shared GMS resources
// for the given rank. A pod spec without containers is left untouched.
//
// The only difference between the two inter-pod layouts is RestartPolicy:
//
//   - Standalone inter-pod GMS (isInterPodFailover=false): RestartPolicy is
//     not touched, so the engine inherits Always like the GMS weight-server
//     pod. Kubelet restarts a crashed engine in place; the GMS server keeps
//     running and the new engine container reconnects to the existing UDS
//     sockets and re-imports CUDA IPC handles during --load-format gms
//     startup. There is no cohort state to protect — it is one engine paired
//     with one GMS server per rank.
//
//   - Inter-pod GMS failover (isInterPodFailover=true): RestartPolicy is
//     forced to Never. Engines in a failover cohort hold distributed state
//     that does not survive an in-place container restart (active NCCL
//     collectives, torch.distributed TCPStore membership, primary/shadow
//     coordination through the failover lock file and
//     DYN_VLLM_GMS_SHADOW_MODE). An in-place restart would leave the cohort
//     half torn down and block recovery. Instead the pod exits,
//     FailoverCascadeReconciler (failover_cascade_controller.go) force-deletes
//     the whole engine group selected by the
//     KubeLabelDynamoFailoverEngineGroupMember label, and Grove recreates the
//     cohort. That label is applied in graph.go only when isInterPodFailover
//     is true, which is why Never must not be forced in the standalone case:
//     failed engine pods would have nothing force-deleting them.
func augmentEngineForGMS(podSpec *corev1.PodSpec, rank int32, isInterPodFailover bool) {
	if len(podSpec.Containers) < 1 {
		return
	}

	engine := &podSpec.Containers[0]
	engine.Env = append(engine.Env, gmsEngineEnvVars()...)
	removeEnvVar(engine, "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS")
	applyGMSSharedResources(podSpec, engine, rank)

	if !isInterPodFailover {
		return
	}
	podSpec.RestartPolicy = corev1.RestartPolicyNever
}
// gmsSharedVolume builds the hostPath volume shared between GMS and engine
// pods together with its mount. The volume points at gmsHostPathBase with type
// DirectoryOrCreate; the mount places it at gmsSharedMountPath and uses a
// subPathExpr of the form "$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)/rank-<rank>"
// so that every PCSG replica and every rank gets its own subdirectory.
func gmsSharedVolume(rank int32) (corev1.Volume, corev1.VolumeMount) {
	subPath := fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)/rank-%d", rank)
	sharedMount := corev1.VolumeMount{
		Name:        gmsSharedVolumeName,
		MountPath:   gmsSharedMountPath,
		SubPathExpr: subPath,
	}

	hostPath := &corev1.HostPathVolumeSource{
		Path: gmsHostPathBase,
		Type: ptr.To(corev1.HostPathDirectoryOrCreate),
	}
	sharedVolume := corev1.Volume{
		Name:         gmsSharedVolumeName,
		VolumeSource: corev1.VolumeSource{HostPath: hostPath},
	}

	return sharedVolume, sharedMount
}
// gmsPermFixInitContainer builds the init container that opens up the shared
// hostPath directory (chmod 1777 on gmsSharedMountPath) so that the non-root
// application user can create UDS sockets and lock files in it. The container
// reuses the given image and mounts the shared volume with the same
// subPathExpr as the main container, so kubelet creates the isolated
// subdirectory before the chmod runs.
func gmsPermFixInitContainer(rank int32, image string) corev1.Container {
	_, sharedMount := gmsSharedVolume(rank)
	chmodCmd := fmt.Sprintf("chmod 1777 %s", gmsSharedMountPath)

	// chmod on the hostPath mount needs uid 0. RunAsNonRoot is set to false
	// explicitly next to RunAsUser=0.
	// NOTE(review): presumably this overrides a pod-level runAsNonRoot=true
	// default so the root init container is not refused — confirm against the
	// PodSecurity configuration of the target clusters.
	rootContext := &corev1.SecurityContext{
		RunAsUser:    ptr.To[int64](0),
		RunAsNonRoot: ptr.To(false),
	}

	return corev1.Container{
		Name:            gmsPermFixInitName,
		Image:           image,
		Command:         []string{"sh", "-c", chmodCmd},
		SecurityContext: rootContext,
		VolumeMounts:    []corev1.VolumeMount{sharedMount},
	}
}
// removeGPUFromLimits drops the nvidia.com/gpu entry from both the limits and
// the requests of the container. GMS pods get their GPUs through DRA, so the
// classic extended-resource entries must not be present. Missing (nil)
// resource lists are tolerated.
func removeGPUFromLimits(c *corev1.Container) {
	resourceLists := []corev1.ResourceList{c.Resources.Limits, c.Resources.Requests}
	for _, list := range resourceLists {
		// delete is a no-op on a nil map or a missing key.
		delete(list, "nvidia.com/gpu")
	}
}
// addGPUToleration appends a NoSchedule toleration (operator Exists) for the
// nvidia.com/gpu taint, so that pods without explicit GPU limits can still be
// scheduled on GPU nodes. The call is idempotent: if the pod spec already
// carries a toleration with the same key and effect, nothing is added.
func addGPUToleration(podSpec *corev1.PodSpec) {
	const gpuTaintKey = "nvidia.com/gpu"

	for i := range podSpec.Tolerations {
		existing := &podSpec.Tolerations[i]
		if existing.Key == gpuTaintKey && existing.Effect == corev1.TaintEffectNoSchedule {
			return
		}
	}

	podSpec.Tolerations = append(podSpec.Tolerations, corev1.Toleration{
		Key:      gpuTaintKey,
		Operator: corev1.TolerationOpExists,
		Effect:   corev1.TaintEffectNoSchedule,
	})
}
// removeEnvVar deletes every env var called name from the container. The
// remaining entries keep their relative order. Filtering happens in place, so
// the existing backing array of c.Env is reused.
func removeEnvVar(c *corev1.Container, name string) {
	kept := 0
	for i := range c.Env {
		if c.Env[i].Name == name {
			continue
		}
		// Compact surviving entries towards the front of the slice.
		c.Env[kept] = c.Env[i]
		kept++
	}
	c.Env = c.Env[:kept]
}
// getGPUCount reads the GPU count from the component's resource limits.
// It returns 0 when resources or limits are absent, when the GPU field is
// empty, or when the field is not a base-10 integer that fits into 32 bits.
func getGPUCount(resources *v1alpha1.Resources) int32 {
	if resources == nil || resources.Limits == nil {
		return 0
	}

	raw := resources.Limits.GPU
	if raw == "" {
		return 0
	}

	parsed, err := strconv.ParseInt(raw, 10, 32)
	if err != nil {
		// Unparsable values are treated as "no GPUs requested".
		return 0
	}
	return int32(parsed)
}
// getDeviceClassName picks the DRA device class for a component: the gpuType
// from its resource limits when one is set, otherwise
// dra.DefaultDeviceClassName, the default device class shipped with the NVIDIA
// DRA driver. The default literal is intentionally not repeated here; the dra
// package is its single source of truth.
func getDeviceClassName(resources *v1alpha1.Resources) string {
	if resources == nil || resources.Limits == nil || resources.Limits.GPUType == "" {
		return dra.DefaultDeviceClassName
	}
	return resources.Limits.GPUType
}
// gmsRCTName builds the ResourceClaimTemplate name for one rank of a service.
// The result is deterministic and has the form "<serviceName>-gpu-rank-<rank>".
func gmsRCTName(serviceName string, rank int32) string {
	name := fmt.Sprintf("%s-gpu-rank-%d", serviceName, rank)
	return name
}
// gmsResourceClaimTemplateConfigs produces the PCS-level
// ResourceClaimTemplateConfig list for a service: one entry per distinct rank
// found in roles, in order of first appearance. Every entry requests the same
// GPUs (device class from getDeviceClassName, exact count from getGPUCount)
// but is named per rank via gmsRCTName, so that the GMS and engine pods of
// each rank get a ResourceClaim of their own.
func gmsResourceClaimTemplateConfigs(serviceName string, resources *v1alpha1.Resources, roles []ServiceRole) []grovev1alpha1.ResourceClaimTemplateConfig {
	// The GPU request is identical for all ranks; resolve it once.
	deviceClass := getDeviceClassName(resources)
	gpuCount := int64(getGPUCount(resources))

	emitted := make(map[int32]struct{}, len(roles))
	out := make([]grovev1alpha1.ResourceClaimTemplateConfig, 0, len(roles))

	for i := range roles {
		rank := roles[i].Rank
		if _, done := emitted[rank]; done {
			continue
		}
		emitted[rank] = struct{}{}

		gpuRequest := resourcev1.DeviceRequest{
			Name: "gpu",
			Exactly: &resourcev1.ExactDeviceRequest{
				DeviceClassName: deviceClass,
				AllocationMode:  resourcev1.DeviceAllocationModeExactCount,
				Count:           gpuCount,
			},
		}

		var cfg grovev1alpha1.ResourceClaimTemplateConfig
		cfg.Name = gmsRCTName(serviceName, rank)
		cfg.TemplateSpec = resourcev1.ResourceClaimTemplateSpec{
			Spec: resourcev1.ResourceClaimSpec{
				Devices: resourcev1.DeviceClaim{
					Requests: []resourcev1.DeviceRequest{gpuRequest},
				},
			},
		}
		out = append(out, cfg)
	}
	return out
}
// gmsResourceSharingEntries produces the PCSG-level resource sharing specs for
// a service: one entry per distinct rank found in roles, in order of first
// appearance. Each entry references the rank's ResourceClaimTemplate
// (gmsRCTName), uses PerReplica scope, and is filtered to the lower-cased
// names of the roles belonging to that rank — the GMS clique and the engine
// clique — which keeps GPUs isolated between ranks.
func gmsResourceSharingEntries(serviceName string, roles []ServiceRole) []grovev1alpha1.PCSGResourceSharingSpec {
	cliquesByRank := make(map[int32][]string)
	var ranksInOrder []int32

	for i := range roles {
		rank := roles[i].Rank
		if _, known := cliquesByRank[rank]; !known {
			ranksInOrder = append(ranksInOrder, rank)
		}
		cliquesByRank[rank] = append(cliquesByRank[rank], strings.ToLower(roles[i].Name))
	}

	entries := make([]grovev1alpha1.PCSGResourceSharingSpec, 0, len(ranksInOrder))
	for _, rank := range ranksInOrder {
		var entry grovev1alpha1.PCSGResourceSharingSpec
		entry.ResourceSharingSpec = grovev1alpha1.ResourceSharingSpec{
			Name:  gmsRCTName(serviceName, rank),
			Scope: grovev1alpha1.ResourceSharingScopePerReplica,
		}
		entry.Filter = &grovev1alpha1.PCSGResourceSharingFilter{
			ChildCliqueNames: cliquesByRank[rank],
		}
		entries = append(entries, entry)
	}
	return entries
}
// ──────────────────────────────────────────────────────────────────────────────
// Intra-pod GMS failover (Mode: intraPod)
//
// The main container is cloned into two engine containers (active + standby)
// within the same pod. GPU access is shared via DRA and a GMS sidecar
// injects weights via the shared emptyDir volume.
// ──────────────────────────────────────────────────────────────────────────────
// intraPodFailoverLockFile is the lock file path used by engine containers to
// coordinate active/standby election within the same pod. It is the file
// "failover.lock" joined onto gmsruntime.SharedMountPath.
var intraPodFailoverLockFile = filepath.Join(gmsruntime.SharedMountPath, "failover.lock")
const (
// failoverEngineCount is presumably the number of engine containers (active +
// standby) created per pod in intra-pod failover mode — TODO confirm against
// buildFailoverPod, which is not visible here.
failoverEngineCount = 2
)
// isFailoverEnabled reports whether the component uses intra-pod failover,
// i.e. whether failover is configured, enabled, and its mode is
// v1alpha1.GMSModeIntraPod. In that mode the main container is cloned into
// active + standby containers within the same pod.
//
// Inter-pod failover (Mode=interPod) deliberately yields false: it is handled
// separately via expandRolesForService and generatePodSpecForRole and does not
// use container cloning.
//
// NOTE(review): a FailoverSpec with an empty Mode also yields false here;
// presumably API defaulting fills in the mode before this is called — confirm.
func isFailoverEnabled(component *v1alpha1.DynamoComponentDeploymentSharedSpec) bool {
	return component.Failover != nil && component.Failover.Enabled &&
		component.Failover.Mode == v1alpha1.GMSModeIntraPod
}
// buildFailoverPod clones the main container into two engine containers (active + standby).
......@@ -95,11 +474,10 @@ func buildEngineContainer(base corev1.Container, engineID int, systemPort int) c
}
}
containerName := fmt.Sprintf("engine-%d", engineID)
failoverEnvs := []corev1.EnvVar{
{Name: "ENGINE_ID", Value: strconv.Itoa(engineID)},
{Name: "CONTAINER_NAME", Value: containerName},
{Name: "FAILOVER_LOCK_PATH", Value: failoverLockFile},
{Name: "CONTAINER_NAME", Value: engine.Name},
{Name: "FAILOVER_LOCK_PATH", Value: intraPodFailoverLockFile},
{Name: "DYN_SYSTEM_STARTING_HEALTH_STATUS", Value: "notready"},
{Name: "DYN_SYSTEM_PORT", Value: strconv.Itoa(systemPort)},
{Name: "DYN_SYSTEM_ENABLED", Value: "true"},
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dynamo
......@@ -14,16 +26,435 @@ import (
commonconsts "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/dra"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/gms"
grovev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
k8sresource "k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/intstr"
)
// failoverPodSpec returns a pod spec that has already been transformed by
// ──────────────────────────────────────────────────────────────────────────────
// Inter-pod GMS failover tests
// ──────────────────────────────────────────────────────────────────────────────
// TestGmsWeightServerPodSpec checks the full transformation of an engine pod
// spec into a GMS weight-server pod spec: command/args, probes, GPU limit
// stripping, toleration, shared volume and mount, socket-dir env var, and the
// permission-fix init container. It also verifies that the base spec passed in
// is not mutated.
//
// Pointers that are dereferenced later are guarded with require.NotNil so a
// regression fails the test cleanly instead of panicking on a nil dereference.
func TestGmsWeightServerPodSpec(t *testing.T) {
	base := &corev1.PodSpec{
		Containers: []corev1.Container{{
			Name:    "engine",
			Command: []string{"python3", "-m", "vllm.entrypoints.openai.api_server"},
			Args:    []string{"--model", "meta-llama/Llama-3-8B"},
			LivenessProbe: &corev1.Probe{
				ProbeHandler: corev1.ProbeHandler{
					HTTPGet: &corev1.HTTPGetAction{Path: "/health"},
				},
			},
			ReadinessProbe: &corev1.Probe{
				ProbeHandler: corev1.ProbeHandler{
					HTTPGet: &corev1.HTTPGetAction{Path: "/ready"},
				},
			},
			Resources: corev1.ResourceRequirements{
				Limits: corev1.ResourceList{
					"nvidia.com/gpu":      k8sresource.MustParse("8"),
					corev1.ResourceMemory: k8sresource.MustParse("64Gi"),
				},
			},
		}},
	}

	result := gmsWeightServerPodSpec(base, 0, 8)
	require.Len(t, result.Containers, 1)
	c := result.Containers[0]

	assert.Equal(t, []string{"bash", "-c"}, c.Command, "should use bash")
	require.Len(t, c.Args, 1)
	assert.Contains(t, c.Args[0], gms.ServerModule, "should run gpu_memory_service.cli.server")

	assert.Nil(t, c.LivenessProbe, "liveness probe should be nil")
	assert.Nil(t, c.ReadinessProbe, "readiness probe should be nil")
	// require (not assert): the probe is dereferenced on the following lines.
	require.NotNil(t, c.StartupProbe, "startup probe should be set")
	require.NotNil(t, c.StartupProbe.Exec, "startup probe should be an exec probe")
	assert.Equal(t, gmsStartupProbeCommand(8), c.StartupProbe.Exec.Command)

	assert.NotContains(t, c.Resources.Limits, corev1.ResourceName("nvidia.com/gpu"), "GPU should be stripped")
	assert.Contains(t, c.Resources.Limits, corev1.ResourceMemory, "non-GPU limits should remain")
	assert.True(t, hasToleration(result, "nvidia.com/gpu"), "should have GPU toleration")
	assert.True(t, hasVolume(result, gmsSharedVolumeName), "should have shared volume")
	assert.True(t, hasVolumeMount(c, gmsSharedMountPath), "should have shared volume mount")
	assert.True(t, hasEnvVar(c, gms.EnvSocketDir, gmsSharedMountPath), "should set GMS_SOCKET_DIR")

	require.Len(t, result.InitContainers, 1, "should have perm-fix init container")
	initC := result.InitContainers[0]
	assert.Equal(t, gmsPermFixInitName, initC.Name)
	assert.Equal(t, c.Image, initC.Image, "init container should reuse the service image")
	require.NotNil(t, initC.SecurityContext)
	require.NotNil(t, initC.SecurityContext.RunAsUser, "init container should set RunAsUser")
	assert.Equal(t, int64(0), *initC.SecurityContext.RunAsUser)

	// Verify original is not mutated
	assert.Len(t, base.Containers[0].Command, 3, "original command should be unchanged")
}
// TestGmsWeightServerPodSpec_EmptyContainers verifies that a base spec without
// containers is handled without panicking and yields no containers.
func TestGmsWeightServerPodSpec_EmptyContainers(t *testing.T) {
	empty := &corev1.PodSpec{}

	got := gmsWeightServerPodSpec(empty, 0, 1)

	assert.Empty(t, got.Containers)
}
// TestGmsWeightServerPodSpec_SubPathExpr verifies that the shared volume mount
// of the GMS container carries a rank-specific subPathExpr.
func TestGmsWeightServerPodSpec_SubPathExpr(t *testing.T) {
	base := &corev1.PodSpec{
		Containers: []corev1.Container{{Name: "engine"}},
	}

	cases := []struct {
		name        string
		rank        int32
		wantSubPath string
	}{
		{name: "rank 0", rank: 0, wantSubPath: "$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)/rank-0"},
		{name: "rank 3", rank: 3, wantSubPath: "$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)/rank-3"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			spec := gmsWeightServerPodSpec(base, tc.rank, 4)
			sharedMount := findVolumeMount(spec.Containers[0], gmsSharedMountPath)
			require.NotNil(t, sharedMount, "GMS container should mount shared volume")
			assert.Equal(t, tc.wantSubPath, sharedMount.SubPathExpr)
		})
	}
}
// TestAugmentEngineForGMS covers the inter-pod failover path of
// augmentEngineForGMS: injected and removed env vars, GPU limit stripping,
// toleration, shared volume, the permission-fix init container, and the forced
// RestartPolicyNever.
func TestAugmentEngineForGMS(t *testing.T) {
	engine := corev1.Container{
		Name: "engine",
		Env: []corev1.EnvVar{
			{Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", Value: "true"},
			{Name: "KEEP_ME", Value: "yes"},
		},
		Resources: corev1.ResourceRequirements{
			Limits: corev1.ResourceList{
				"nvidia.com/gpu": k8sresource.MustParse("4"),
			},
		},
	}
	podSpec := &corev1.PodSpec{Containers: []corev1.Container{engine}}

	augmentEngineForGMS(podSpec, 1, true)

	c := podSpec.Containers[0]

	// Injected env vars.
	assert.True(t, hasEnvVar(c, "ENGINE_ID", ""), "ENGINE_ID should be set (via Downward API)")
	assert.True(t, hasEnvVar(c, gms.EnvSocketDir, gmsSharedMountPath))
	assert.True(t, hasEnvVar(c, "FAILOVER_LOCK_PATH", gmsSharedMountPath+"/"+gmsFailoverLockFile))
	assert.True(t, hasEnvVar(c, "DYN_SYSTEM_STARTING_HEALTH_STATUS", "notready"))

	// DYN_VLLM_GMS_SHADOW_MODE is backend-specific: VLLMBackend.UpdateContainer
	// injects it, not augmentEngineForGMS. See
	// TestVLLMBackend_UpdateContainer_InterPodGMS in backend_vllm_test.go.
	assert.False(t, hasEnvVar(c, "DYN_VLLM_GMS_SHADOW_MODE", "true"),
		"vLLM-specific env var must not leak into backend-agnostic GMS helpers")

	// Pre-existing env vars: unrelated ones stay, the health-status switch goes.
	assert.True(t, hasEnvVar(c, "KEEP_ME", "yes"), "unrelated env vars should be preserved")
	for i := range c.Env {
		assert.NotEqual(t, "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS", c.Env[i].Name, "should be removed")
	}

	// Shared GMS resources.
	assert.NotContains(t, c.Resources.Limits, corev1.ResourceName("nvidia.com/gpu"))
	assert.True(t, hasToleration(podSpec, "nvidia.com/gpu"))
	assert.True(t, hasVolume(podSpec, gmsSharedVolumeName))

	// Permission-fix init container.
	require.Len(t, podSpec.InitContainers, 1, "should have perm-fix init container")
	permFix := podSpec.InitContainers[0]
	assert.Equal(t, gmsPermFixInitName, permFix.Name)
	assert.Equal(t, c.Image, permFix.Image, "init container should reuse the service image")
	require.NotNil(t, permFix.SecurityContext)
	assert.Equal(t, int64(0), *permFix.SecurityContext.RunAsUser)
	permFixMount := findVolumeMount(permFix, gmsSharedMountPath)
	require.NotNil(t, permFixMount, "init container should mount shared volume")
	assert.Equal(t, "$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)/rank-1", permFixMount.SubPathExpr)

	assert.Equal(t, corev1.RestartPolicyNever, podSpec.RestartPolicy,
		"inter-pod failover engines must be RestartPolicyNever so the "+
			"FailoverCascadeReconciler is the sole recovery path")
}
// TestAugmentEngineForGMS_StandaloneDoesNotForceRestartNever locks in the
// standalone inter-pod GMS behavior: RestartPolicy must stay untouched.
//
// The cascade-group label is applied only when isInterPodFailover is true (see
// graph.go:GenerateGrovePodCliqueSet). Forcing Never in standalone mode would
// therefore leave a crashed engine in Failed state with nothing force-deleting
// the PCSG replica. The engine instead inherits the default (Always), kubelet
// restarts it in place — same as the paired GMS weight-server pod — and the
// restarted engine reconnects to the still-running GMS server over UDS during
// --load-format gms startup.
func TestAugmentEngineForGMS_StandaloneDoesNotForceRestartNever(t *testing.T) {
	engine := corev1.Container{
		Name: "engine",
		Resources: corev1.ResourceRequirements{
			Limits: corev1.ResourceList{
				"nvidia.com/gpu": k8sresource.MustParse("4"),
			},
		},
	}
	podSpec := &corev1.PodSpec{Containers: []corev1.Container{engine}}

	augmentEngineForGMS(podSpec, 0, false)

	var unset corev1.RestartPolicy
	assert.Equal(t, unset, podSpec.RestartPolicy,
		"standalone inter-pod GMS engine must not have RestartPolicy overridden; "+
			"kubelet restart is the correct recovery path")
	assert.True(t, hasVolume(podSpec, gmsSharedVolumeName),
		"standalone engine still needs the shared hostPath for UDS sockets")
	assert.True(t, hasEnvVar(podSpec.Containers[0], gms.EnvSocketDir, gmsSharedMountPath),
		"standalone engine still needs the socket-dir env var to reach the GMS server")
}
// TestAugmentEngineForGMS_EmptyContainers verifies that a pod spec without
// containers is tolerated and no container is added.
func TestAugmentEngineForGMS_EmptyContainers(t *testing.T) {
	var emptySpec corev1.PodSpec

	augmentEngineForGMS(&emptySpec, 0, true)

	assert.Empty(t, emptySpec.Containers)
}
// TestRemoveGPUFromLimits verifies that the GPU entry disappears from both
// limits and requests while other limits are left in place.
func TestRemoveGPUFromLimits(t *testing.T) {
	const gpu = corev1.ResourceName("nvidia.com/gpu")

	limits := corev1.ResourceList{
		"nvidia.com/gpu":      k8sresource.MustParse("8"),
		corev1.ResourceMemory: k8sresource.MustParse("64Gi"),
	}
	requests := corev1.ResourceList{
		"nvidia.com/gpu": k8sresource.MustParse("8"),
	}
	container := &corev1.Container{
		Resources: corev1.ResourceRequirements{Limits: limits, Requests: requests},
	}

	removeGPUFromLimits(container)

	assert.NotContains(t, container.Resources.Limits, gpu)
	assert.Contains(t, container.Resources.Limits, corev1.ResourceMemory)
	assert.NotContains(t, container.Resources.Requests, gpu)
}
// TestAddGPUToleration_Idempotent verifies that calling addGPUToleration twice
// leaves exactly one nvidia.com/gpu toleration on the pod spec.
func TestAddGPUToleration_Idempotent(t *testing.T) {
	podSpec := &corev1.PodSpec{}
	for call := 0; call < 2; call++ {
		addGPUToleration(podSpec)
	}

	gpuTolerations := 0
	for i := range podSpec.Tolerations {
		if podSpec.Tolerations[i].Key != "nvidia.com/gpu" {
			continue
		}
		gpuTolerations++
	}
	assert.Equal(t, 1, gpuTolerations, "toleration should be added only once")
}
// TestRemoveEnvVar verifies that every occurrence of the named env var is
// removed and that the remaining entries keep their order.
func TestRemoveEnvVar(t *testing.T) {
	container := &corev1.Container{}
	container.Env = []corev1.EnvVar{
		{Name: "A", Value: "1"},
		{Name: "REMOVE_ME", Value: "x"},
		{Name: "B", Value: "2"},
		{Name: "REMOVE_ME", Value: "y"},
	}

	removeEnvVar(container, "REMOVE_ME")

	assert.Len(t, container.Env, 2)
	assert.Equal(t, "A", container.Env[0].Name)
	assert.Equal(t, "B", container.Env[1].Name)
}
// TestGetGPUCount covers absent resources/limits, an empty GPU string, a valid
// count, and an unparsable value.
func TestGetGPUCount(t *testing.T) {
	withGPU := func(gpu string) *v1alpha1.Resources {
		return &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{GPU: gpu}}
	}

	cases := []struct {
		name      string
		resources *v1alpha1.Resources
		want      int32
	}{
		{name: "nil resources", resources: nil, want: 0},
		{name: "nil limits", resources: &v1alpha1.Resources{}, want: 0},
		{name: "empty gpu string", resources: withGPU(""), want: 0},
		{name: "valid gpu count", resources: withGPU("8"), want: 8},
		{name: "invalid gpu string", resources: withGPU("abc"), want: 0},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := getGPUCount(tc.resources)
			assert.Equal(t, tc.want, got)
		})
	}
}
// TestGetDeviceClassName covers the fallback to the default device class and
// the use of a custom gpuType.
func TestGetDeviceClassName(t *testing.T) {
	cases := []struct {
		name      string
		resources *v1alpha1.Resources
		want      string
	}{
		{name: "nil resources", resources: nil, want: "gpu.nvidia.com"},
		{name: "nil limits", resources: &v1alpha1.Resources{}, want: "gpu.nvidia.com"},
		{
			name:      "empty gpuType",
			resources: &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{}},
			want:      "gpu.nvidia.com",
		},
		{
			name:      "custom gpuType",
			resources: &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{GPUType: "gpu.nvidia.com/h100"}},
			want:      "gpu.nvidia.com/h100",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := getDeviceClassName(tc.resources)
			assert.Equal(t, tc.want, got)
		})
	}
}
// TestGmsEngineEnvVars verifies the set of backend-agnostic env vars returned
// by gmsEngineEnvVars and that ENGINE_ID is sourced through the Downward API
// from the grove.io/podclique-pod-index label.
//
// The ValueFrom / FieldRef pointers are guarded with require.NotNil because
// they are dereferenced right afterwards; a regression then fails the test
// instead of panicking.
func TestGmsEngineEnvVars(t *testing.T) {
	envs := gmsEngineEnvVars()

	names := make(map[string]bool, len(envs))
	for _, e := range envs {
		names[e.Name] = true
	}

	assert.True(t, names["ENGINE_ID"])
	assert.True(t, names[gms.EnvSocketDir])
	assert.True(t, names["FAILOVER_LOCK_PATH"])
	assert.True(t, names["DYN_SYSTEM_STARTING_HEALTH_STATUS"])

	// DYN_VLLM_GMS_SHADOW_MODE is backend-specific and is injected by
	// VLLMBackend.UpdateContainer, not by gmsEngineEnvVars. See
	// TestVLLMBackend_UpdateContainer_InterPodGMS in backend_vllm_test.go.
	assert.False(t, names["DYN_VLLM_GMS_SHADOW_MODE"],
		"vLLM-specific env var must not leak into backend-agnostic GMS helpers")

	for _, e := range envs {
		if e.Name != "ENGINE_ID" {
			continue
		}
		require.NotNil(t, e.ValueFrom, "ENGINE_ID should use Downward API")
		require.NotNil(t, e.ValueFrom.FieldRef)
		assert.Contains(t, e.ValueFrom.FieldRef.FieldPath, "grove.io/podclique-pod-index")
	}
}
// TestGroveMultinodeDeployer_GMS verifies node-rank and hostname generation of
// GroveMultinodeDeployer with and without the inter-pod GMS layout.
func TestGroveMultinodeDeployer_GMS(t *testing.T) {
	t.Run("GetNodeRank returns static rank for GMS", func(t *testing.T) {
		deployer := &GroveMultinodeDeployer{IsInterPodGMS: true, Rank: 2}

		nodeRank, shellExpr := deployer.GetNodeRank()

		assert.Equal(t, "2", nodeRank)
		assert.False(t, shellExpr, "GMS rank should be static, not a shell expression")
	})

	t.Run("GetNodeRank returns shell expr for non-GMS", func(t *testing.T) {
		deployer := &GroveMultinodeDeployer{IsInterPodGMS: false}

		nodeRank, shellExpr := deployer.GetNodeRank()

		assert.Contains(t, nodeRank, "GROVE_PCLQ_POD_INDEX")
		assert.True(t, shellExpr)
	})

	t.Run("GetHostNames for GMS multinode", func(t *testing.T) {
		deployer := &GroveMultinodeDeployer{IsInterPodGMS: true, Rank: 0}

		hosts := deployer.GetHostNames("svc", 3)

		assert.Len(t, hosts, 3)
		assert.Contains(t, hosts[0], "ldr-$(GROVE_PCLQ_POD_INDEX)")
		assert.Contains(t, hosts[1], "wkr-1-$(GROVE_PCLQ_POD_INDEX)")
		assert.Contains(t, hosts[2], "wkr-2-$(GROVE_PCLQ_POD_INDEX)")
	})

	t.Run("GetHostNames for non-GMS multinode", func(t *testing.T) {
		deployer := &GroveMultinodeDeployer{IsInterPodGMS: false}

		hosts := deployer.GetHostNames("svc", 3)

		assert.Len(t, hosts, 3)
		assert.Contains(t, hosts[0], "ldr")
		assert.Contains(t, hosts[1], "wkr-0")
		assert.Contains(t, hosts[2], "wkr-1")
	})
}
// TestGmsRCTName pins the "<service>-gpu-rank-<rank>" naming scheme.
func TestGmsRCTName(t *testing.T) {
	cases := []struct {
		service string
		rank    int32
		want    string
	}{
		{service: "my-svc", rank: 0, want: "my-svc-gpu-rank-0"},
		{service: "llama", rank: 2, want: "llama-gpu-rank-2"},
	}
	for _, tc := range cases {
		assert.Equal(t, tc.want, gmsRCTName(tc.service, tc.rank))
	}
}
func TestGmsResourceClaimTemplateConfigs_SingleNode(t *testing.T) {
resources := &v1alpha1.Resources{
Limits: &v1alpha1.ResourceItem{GPU: "8", GPUType: "gpu.nvidia.com/h100"},
}
roles := []ServiceRole{
{Name: "svc-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
{Name: "svc", Role: RoleMain, Rank: 0, Replicas: 2},
}
configs := gmsResourceClaimTemplateConfigs("svc", resources, roles)
require.Len(t, configs, 1)
assert.Equal(t, "svc-gpu-rank-0", configs[0].Name)
req := configs[0].TemplateSpec.Spec.Devices.Requests[0]
require.NotNil(t, req.Exactly)
assert.Equal(t, "gpu.nvidia.com/h100", req.Exactly.DeviceClassName)
assert.Equal(t, int64(8), req.Exactly.Count)
}
func TestGmsResourceClaimTemplateConfigs_Multinode(t *testing.T) {
resources := &v1alpha1.Resources{
Limits: &v1alpha1.ResourceItem{GPU: "4"},
}
roles := []ServiceRole{
{Name: "svc-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
{Name: "svc-ldr", Role: RoleLeader, Rank: 0, Replicas: 3},
{Name: "svc-gms-1", Role: RoleGMS, Rank: 1, Replicas: 1},
{Name: "svc-wkr-1", Role: RoleWorker, Rank: 1, Replicas: 3},
}
configs := gmsResourceClaimTemplateConfigs("svc", resources, roles)
require.Len(t, configs, 2)
assert.Equal(t, "svc-gpu-rank-0", configs[0].Name)
assert.Equal(t, "svc-gpu-rank-1", configs[1].Name)
req := configs[1].TemplateSpec.Spec.Devices.Requests[0]
require.NotNil(t, req.Exactly)
assert.Equal(t, "gpu.nvidia.com", req.Exactly.DeviceClassName)
assert.Equal(t, int64(4), req.Exactly.Count)
}
// TestGmsResourceSharingEntries_SingleNode verifies that the GMS clique and
// the engine clique of the single rank share one PerReplica entry.
func TestGmsResourceSharingEntries_SingleNode(t *testing.T) {
	roles := []ServiceRole{
		{Name: "svc-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
		{Name: "svc", Role: RoleMain, Rank: 0, Replicas: 2},
	}

	entries := gmsResourceSharingEntries("svc", roles)

	require.Len(t, entries, 1)
	only := entries[0]
	assert.Equal(t, "svc-gpu-rank-0", only.Name)
	assert.Equal(t, grovev1alpha1.ResourceSharingScopePerReplica, only.Scope)
	require.NotNil(t, only.Filter)
	assert.Equal(t, []string{"svc-gms-0", "svc"}, only.Filter.ChildCliqueNames)
}
// TestGmsResourceSharingEntries_Multinode verifies that every rank gets its
// own PerReplica entry listing only that rank's GMS and engine cliques.
func TestGmsResourceSharingEntries_Multinode(t *testing.T) {
	roles := []ServiceRole{
		{Name: "svc-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
		{Name: "svc-ldr", Role: RoleLeader, Rank: 0, Replicas: 3},
		{Name: "svc-gms-1", Role: RoleGMS, Rank: 1, Replicas: 1},
		{Name: "svc-wkr-1", Role: RoleWorker, Rank: 1, Replicas: 3},
	}

	entries := gmsResourceSharingEntries("svc", roles)
	require.Len(t, entries, 2)

	expected := []struct {
		name    string
		cliques []string
	}{
		{name: "svc-gpu-rank-0", cliques: []string{"svc-gms-0", "svc-ldr"}},
		{name: "svc-gpu-rank-1", cliques: []string{"svc-gms-1", "svc-wkr-1"}},
	}
	for i, want := range expected {
		assert.Equal(t, want.name, entries[i].Name)
		assert.Equal(t, grovev1alpha1.ResourceSharingScopePerReplica, entries[i].Scope)
		require.NotNil(t, entries[i].Filter)
		assert.Equal(t, want.cliques, entries[i].Filter.ChildCliqueNames)
	}
}
// ──────────────────────────────────────────────────────────────────────────────
// Intra-pod failover tests
// ──────────────────────────────────────────────────────────────────────────────
// intraPodFailoverPodSpec returns a pod spec that has already been transformed by
// applyGPUMemoryService (DRA claims, shared volume, TMPDIR set), including
// a frontend sidecar to verify sidecar preservation.
func failoverPodSpec() corev1.PodSpec {
func intraPodFailoverPodSpec() corev1.PodSpec {
httpPort := intstr.FromString("system")
return corev1.PodSpec{
Containers: []corev1.Container{
......@@ -72,10 +503,8 @@ func failoverPodSpec() corev1.PodSpec {
}
}
// --- buildFailoverPod ---
func TestBuildFailoverPod_TwoEnginesPlusSidecar(t *testing.T) {
ps := failoverPodSpec()
ps := intraPodFailoverPodSpec()
err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
require.NoError(t, err)
......@@ -94,14 +523,14 @@ func TestBuildFailoverPod_EmptyContainers(t *testing.T) {
}
func TestBuildFailoverPod_RejectsNonVLLM(t *testing.T) {
ps := failoverPodSpec()
ps := intraPodFailoverPodSpec()
err := buildFailoverPod(&ps, 1, BackendFrameworkSGLang)
require.Error(t, err)
assert.Contains(t, err.Error(), "currently supported only for vLLM")
}
func TestBuildFailoverPod_EngineEnvVars(t *testing.T) {
ps := failoverPodSpec()
ps := intraPodFailoverPodSpec()
err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
require.NoError(t, err)
......@@ -110,8 +539,7 @@ func TestBuildFailoverPod_EngineEnvVars(t *testing.T) {
env := envToMap(engine.Env)
assert.Equal(t, strconv.Itoa(i), env["ENGINE_ID"], "engine-%d ENGINE_ID", i)
assert.Equal(t, fmt.Sprintf("engine-%d", i), env["CONTAINER_NAME"], "engine-%d CONTAINER_NAME", i)
assert.Equal(t, failoverLockFile, env["FAILOVER_LOCK_PATH"], "engine-%d FAILOVER_LOCK_PATH", i)
assert.Equal(t, "true", env["DYN_VLLM_GMS_SHADOW_MODE"], "engine-%d shadow mode", i)
assert.Equal(t, intraPodFailoverLockFile, env["FAILOVER_LOCK_PATH"], "engine-%d FAILOVER_LOCK_PATH", i)
assert.Equal(t, "notready", env["DYN_SYSTEM_STARTING_HEALTH_STATUS"], "engine-%d starting health", i)
assert.Equal(t, "true", env["DYN_SYSTEM_ENABLED"], "engine-%d system enabled", i)
......@@ -124,7 +552,7 @@ func TestBuildFailoverPod_EngineEnvVars(t *testing.T) {
}
func TestBuildFailoverPod_StaggeredPorts(t *testing.T) {
ps := failoverPodSpec()
ps := intraPodFailoverPodSpec()
err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
require.NoError(t, err)
......@@ -139,7 +567,7 @@ func TestBuildFailoverPod_StaggeredPorts(t *testing.T) {
}
func TestBuildFailoverPod_ProbesRetargetedToNamedPort(t *testing.T) {
ps := failoverPodSpec()
ps := intraPodFailoverPodSpec()
err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
require.NoError(t, err)
......@@ -159,7 +587,7 @@ func TestBuildFailoverPod_ProbesRetargetedToNamedPort(t *testing.T) {
}
func TestBuildFailoverPod_PreservesDRAClaim(t *testing.T) {
ps := failoverPodSpec()
ps := intraPodFailoverPodSpec()
err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
require.NoError(t, err)
......@@ -171,7 +599,7 @@ func TestBuildFailoverPod_PreservesDRAClaim(t *testing.T) {
}
func TestBuildFailoverPod_PreservesDiscoveryBackend(t *testing.T) {
ps := failoverPodSpec()
ps := intraPodFailoverPodSpec()
err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
require.NoError(t, err)
......@@ -182,7 +610,7 @@ func TestBuildFailoverPod_PreservesDiscoveryBackend(t *testing.T) {
}
func TestBuildFailoverPod_MultinodeNNODES(t *testing.T) {
ps := failoverPodSpec()
ps := intraPodFailoverPodSpec()
err := buildFailoverPod(&ps, 4, BackendFrameworkVLLM)
require.NoError(t, err)
......@@ -193,7 +621,7 @@ func TestBuildFailoverPod_MultinodeNNODES(t *testing.T) {
}
func TestBuildFailoverPod_SingleNodeNoNNODES(t *testing.T) {
ps := failoverPodSpec()
ps := intraPodFailoverPodSpec()
err := buildFailoverPod(&ps, 1, BackendFrameworkVLLM)
require.NoError(t, err)
......@@ -204,18 +632,70 @@ func TestBuildFailoverPod_SingleNodeNoNNODES(t *testing.T) {
}
}
// --- isFailoverEnabled ---
// TestIsFailoverEnabled verifies that isFailoverEnabled is true only for an
// enabled failover spec in intra-pod mode, and false for inter-pod mode, for a
// disabled spec, and when no failover spec is present.
func TestIsFailoverEnabled(t *testing.T) {
	assert.True(t, isFailoverEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{
		Failover: &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeIntraPod},
	}))
	assert.False(t, isFailoverEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{
		Failover: &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
	}), "inter-pod mode must not trigger intra-pod container cloning")
	assert.False(t, isFailoverEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{
		Failover: &v1alpha1.FailoverSpec{Enabled: false, Mode: v1alpha1.GMSModeIntraPod},
	}))
	assert.False(t, isFailoverEnabled(&v1alpha1.DynamoComponentDeploymentSharedSpec{}))
}
// ──────────────────────────────────────────────────────────────────────────────
// Helpers
// ──────────────────────────────────────────────────────────────────────────────
// hasToleration reports whether the pod spec carries a toleration for key.
func hasToleration(podSpec *corev1.PodSpec, key string) bool {
	for i := range podSpec.Tolerations {
		if podSpec.Tolerations[i].Key == key {
			return true
		}
	}
	return false
}
// hasVolume reports whether podSpec declares a volume with the given name.
func hasVolume(podSpec *corev1.PodSpec, name string) bool {
	volumes := podSpec.Volumes
	for i := range volumes {
		if volumes[i].Name == name {
			return true
		}
	}
	return false
}
// hasVolumeMount reports whether container c has a volume mount whose
// MountPath equals mountPath.
func hasVolumeMount(c corev1.Container, mountPath string) bool {
	for i := range c.VolumeMounts {
		if c.VolumeMounts[i].MountPath != mountPath {
			continue
		}
		return true
	}
	return false
}
// findVolumeMount returns a pointer to the first volume mount of c whose
// MountPath equals mountPath, or nil when there is none. The pointer refers to
// the element inside c.VolumeMounts, not to a copy of it.
func findVolumeMount(c corev1.Container, mountPath string) *corev1.VolumeMount {
	mounts := c.VolumeMounts
	for idx := 0; idx < len(mounts); idx++ {
		if mount := &mounts[idx]; mount.MountPath == mountPath {
			return mount
		}
	}
	return nil
}
// hasEnvVar reports whether container c defines an environment variable called
// name. An empty value acts as a wildcard (any value matches); otherwise the
// variable's literal Value must equal value.
func hasEnvVar(c corev1.Container, name, value string) bool {
	matchAnyValue := value == ""
	for _, env := range c.Env {
		if env.Name != name {
			continue
		}
		if matchAnyValue || env.Value == value {
			return true
		}
	}
	return false
}
func envToMap(envs []corev1.EnvVar) map[string]string {
m := make(map[string]string, len(envs))
for _, e := range envs {
......
......@@ -522,30 +522,33 @@ func resolveImagePullSecrets(retriever SecretsRetriever, namespace, image string
}
// applyCliqueStartupDependencies configures StartsAfter dependencies for cliques in a PodCliqueSet
// based on the backend framework and multinode deployment patterns.
// based on the backend framework, multinode deployment patterns, and the
// inter-pod GMS layout.
//
// Rules:
// - For VLLM and SGLang: worker cliques start after leader clique
// - For TRTLLM: leader clique starts after worker cliques
// - Only applies to multinode deployments (numberOfNodes > 1)
// - For TRTLLM multinode: leader clique starts after worker cliques
// - For inter-pod GMS: engine PCLQs start after their corresponding GMS PCLQ
// (per rank). This applies both to the standalone inter-pod layout and to
// the inter-pod layout with failover; the ordering reflects that engines
// load weights from the weight-server pod regardless of whether shadows are
// present.
// - Sets the PodCliqueSet StartupType to Explicit if any dependencies are configured
func applyCliqueStartupDependencies(
gangSet *grovev1alpha1.PodCliqueSet,
roles []ServiceRole,
backendFramework BackendFramework,
numberOfNodes int32,
isInterPodGMS bool,
) {
// enabled for TRTLLM multinode deployments only
// TODO: reactivate for all backends when we have a better way to handle the readiness probe for the leader.
enabled := backendFramework == BackendFrameworkTRTLLM && numberOfNodes > 1
if !enabled {
return // No dependencies for single-node deployments
enabledMultinode := backendFramework == BackendFrameworkTRTLLM && numberOfNodes > 1
if !enabledMultinode && !isInterPodGMS {
return
}
// Build maps of leader and worker clique names
var leaderCliqueName string
var workerCliqueNames []string
// For GMS: map rank -> GMS clique name
gmsCliqueByRank := map[int32]string{}
for _, r := range roles {
cliqueName := strings.ToLower(r.Name)
......@@ -554,30 +557,49 @@ func applyCliqueStartupDependencies(
leaderCliqueName = cliqueName
case RoleWorker:
workerCliqueNames = append(workerCliqueNames, cliqueName)
case RoleGMS:
gmsCliqueByRank[r.Rank] = cliqueName
}
}
// Apply dependencies to cliques
hasDependencies := false
for _, clique := range gangSet.Spec.Template.Cliques {
// Find the corresponding role for this clique
var cliqueRole Role
var cliqueRank int32
found := false
for _, r := range roles {
if strings.ToLower(r.Name) == clique.Name {
cliqueRole = r.Role
cliqueRank = r.Rank
found = true
break
}
}
if !found {
continue
}
var startsAfter []string
// GMS dependencies: engine PCLQs start after their rank's GMS PCLQ
if isInterPodGMS && cliqueRole != RoleGMS {
if gmsName, ok := gmsCliqueByRank[cliqueRank]; ok {
startsAfter = append(startsAfter, gmsName)
}
}
// Existing multinode dependencies
if enabledMultinode {
multiDeps := getCliqueStartupDependencies(cliqueRole, backendFramework, leaderCliqueName, workerCliqueNames)
startsAfter = append(startsAfter, multiDeps...)
}
// Determine dependencies for this clique
startsAfter := getCliqueStartupDependencies(cliqueRole, backendFramework, leaderCliqueName, workerCliqueNames)
if len(startsAfter) > 0 {
clique.Spec.StartsAfter = startsAfter
hasDependencies = true
}
}
// Set explicit startup type if we have any dependencies
if hasDependencies {
explicitStartupType := grovev1alpha1.CliqueStartupTypeExplicit
gangSet.Spec.Template.StartupType = &explicitStartupType
......@@ -660,7 +682,7 @@ func GenerateComponentService(params ComponentServiceParams) (*corev1.Service, e
labels[k] = v
}
if params.IsK8sDiscovery {
labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = "kubernetes"
labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = commonconsts.DiscoveryBackendKubernetes
labels[commonconsts.KubeLabelDynamoDiscoveryEnabled] = commonconsts.KubeLabelValueTrue
}
......@@ -822,28 +844,116 @@ const (
RoleWorker Role = "worker"
RoleMain Role = "main"
RoleCheckpoint Role = "checkpoint"
RoleGMS Role = "gms"
)
// ServiceRole describes one PodClique (PCLQ) to be materialised for a
// service. A single DynamoComponentDeploymentSharedSpec can expand into
// multiple ServiceRoles depending on the deployment topology:
//
//   - single-node, no GMS: 1 role (RoleMain)
//   - multinode, no GMS: 2 roles (RoleLeader + RoleWorker)
//   - single-node, inter-pod GMS: 1 engine PCLQ (replicated) + 1 RoleGMS
//     weight-server PCLQ
//   - multinode, inter-pod GMS: N engine PCLQs (one per rank, replicated)
//     + N RoleGMS weight-server PCLQs (one per rank)
//
// The fields carry the information buildCliqueForRole needs to produce a
// concrete PodCliqueTemplateSpec:
//
//   - Name: PCLQ name suffix used for Grove resource naming and hostname
//     derivation.
//   - Role: the pod's semantic role (main/leader/worker/gms). Drives
//     backend-specific wiring (e.g. --load-format, --node-rank, discovery
//     labels).
//   - Replicas: the PCLQ replica count. For engine PCLQs in the inter-pod
//     GMS layouts this is the number of engine pods per rank (primary +
//     NumShadows shadows); RoleGMS weight-server PCLQs always use 1. Without
//     inter-pod GMS, RoleMain carries the service replica count (1 when
//     unset), RoleLeader uses 1 and RoleWorker uses numberOfNodes - 1.
//   - Rank: static node rank (0 = leader/main, 1..N-1 = workers).
//     Non-trivial for inter-pod GMS because each rank becomes its own
//     PCLQ and shares a pod index across shadows; for non-GMS multinode
//     pods the rank is derived dynamically from GROVE_PCLQ_POD_INDEX and
//     this field is left at its zero value.
type ServiceRole struct {
	Name     string
	Role     Role
	Replicas int32
	Rank     int32 // node rank: 0 = leader/main, 1..N-1 = workers
}
// Update expandRolesForService to use Role
func expandRolesForService(serviceName string, serviceReplicas *int32, numberOfNodes int32) []ServiceRole {
var roles []ServiceRole
if numberOfNodes > 1 {
roles = append(roles, ServiceRole{Name: serviceName + "-" + commonconsts.GroveRoleSuffixLeader, Role: RoleLeader, Replicas: 1})
roles = append(roles, ServiceRole{Name: serviceName + "-" + commonconsts.GroveRoleSuffixWorker, Role: RoleWorker, Replicas: numberOfNodes - 1})
} else {
// expandRolesForService turns a service's (numberOfNodes,
// gpuMemoryService.mode, failover.mode, replicas) tuple into the concrete
// list of ServiceRole entries the rest of the Grove rendering pipeline
// iterates over. It is the single place that decides how many PodCliques a
// service produces and what each PCLQ looks like (name, role, replicas,
// static rank).
//
// The inter-pod GMS branch is selected by IsInterPodGMSEnabled() (layout)
// rather than IsInterPodFailoverEnabled() (hot-spares): both the standalone
// inter-pod layout (1 engine pod + 1 weight-server pod per rank) and the
// inter-pod layout with failover (primary + N shadows + 1 weight-server pod
// per rank) use the same PCLQ topology, differing only in the per-rank engine
// clique's Replicas (derived from GetTotalEnginePods).
//
// Callers that iterate "engine roles" must still gate on
// IsInterPodGMSEnabled() — this function emits the GMS weight-server PCLQ
// as a regular ServiceRole, not as a separate concept.
func expandRolesForService(serviceName string, serviceReplicas *int32, numberOfNodes int32, component *v1alpha1.DynamoComponentDeploymentSharedSpec) []ServiceRole {
isInterPodGMS := component.IsInterPodGMSEnabled()
isMultinode := numberOfNodes > 1
switch {
case isMultinode && isInterPodGMS:
return expandMultinodeGMSRoles(serviceName, numberOfNodes, component.GetTotalEnginePods())
case isMultinode:
return expandMultinodeRoles(serviceName, numberOfNodes)
case isInterPodGMS:
return expandSingleNodeGMSRoles(serviceName, component.GetTotalEnginePods())
default:
return expandSingleNodeRoles(serviceName, serviceReplicas)
}
}
// expandSingleNodeRoles returns the single RoleMain entry used for a
// single-node service without inter-pod GMS. The role is named after the
// service and carries *serviceReplicas replicas, defaulting to 1 when
// serviceReplicas is nil.
func expandSingleNodeRoles(serviceName string, serviceReplicas *int32) []ServiceRole {
	replicas := int32(1)
	if serviceReplicas != nil {
		replicas = *serviceReplicas
	}
	return []ServiceRole{
		{Name: serviceName, Role: RoleMain, Replicas: replicas},
	}
}
// expandMultinodeRoles returns the leader/worker pair for a multinode service
// without inter-pod GMS: one leader replica plus a single worker clique that
// holds the remaining numberOfNodes - 1 nodes. Rank is left at its zero value
// on both entries.
func expandMultinodeRoles(serviceName string, numberOfNodes int32) []ServiceRole {
	leader := ServiceRole{
		Name:     serviceName + "-" + commonconsts.GroveRoleSuffixLeader,
		Role:     RoleLeader,
		Replicas: 1,
	}
	workers := ServiceRole{
		Name:     serviceName + "-" + commonconsts.GroveRoleSuffixWorker,
		Role:     RoleWorker,
		Replicas: numberOfNodes - 1,
	}
	return []ServiceRole{leader, workers}
}
// expandSingleNodeGMSRoles returns the two roles of the single-node inter-pod
// GMS layout, in order: the rank-0 GMS weight-server clique (one replica,
// named "<service>-<gms suffix>-0") followed by the engine clique (RoleMain,
// named after the service) with totalEnginePods replicas.
func expandSingleNodeGMSRoles(serviceName string, totalEnginePods int32) []ServiceRole {
	weightServer := ServiceRole{
		Name:     fmt.Sprintf("%s-%s-0", serviceName, commonconsts.GroveRoleSuffixGMS),
		Role:     RoleGMS,
		Replicas: 1,
		Rank:     0,
	}
	engine := ServiceRole{
		Name:     serviceName,
		Role:     RoleMain,
		Replicas: totalEnginePods,
		Rank:     0,
	}
	return []ServiceRole{weightServer, engine}
}
// expandMultinodeGMSRoles returns the roles of the multinode inter-pod GMS
// layout. For every rank in [0, numberOfNodes) it emits, in this order:
//
//   - a GMS weight-server clique "<service>-<gms suffix>-<rank>" with one
//     replica, and
//   - an engine clique with totalEnginePods replicas: the leader
//     ("<service>-<leader suffix>") for rank 0, or a per-rank worker
//     ("<service>-<worker suffix>-<rank>") for every other rank.
func expandMultinodeGMSRoles(serviceName string, numberOfNodes int32, totalEnginePods int32) []ServiceRole {
	roles := make([]ServiceRole, 0, numberOfNodes*2)
	for rank := int32(0); rank < numberOfNodes; rank++ {
		weightServer := ServiceRole{
			Name:     fmt.Sprintf("%s-%s-%d", serviceName, commonconsts.GroveRoleSuffixGMS, rank),
			Role:     RoleGMS,
			Replicas: 1,
			Rank:     rank,
		}

		engine := ServiceRole{Replicas: totalEnginePods, Rank: rank}
		switch rank {
		case 0:
			engine.Name = serviceName + "-" + commonconsts.GroveRoleSuffixLeader
			engine.Role = RoleLeader
		default:
			engine.Name = fmt.Sprintf("%s-%s-%d", serviceName, commonconsts.GroveRoleSuffixWorker, rank)
			engine.Role = RoleWorker
		}

		roles = append(roles, weightServer, engine)
	}
	return roles
}
......@@ -1002,6 +1112,7 @@ func GenerateBasePodSpec(
multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info (resolved by ResolveCheckpointForService)
deployerOverride MultinodeDeployer, // Optional: overrides factory-created deployer when non-nil
) (*corev1.PodSpec, error) {
// Start with base container generated per component type
componentContext := generateComponentContext(component, parentGraphDeploymentName, namespace, numberOfNodes, NewDiscoveryContext(operatorConfig.Discovery.Backend, component.Annotations))
......@@ -1119,10 +1230,13 @@ func GenerateBasePodSpec(
})
}
// Apply backend-specific container modifications
multinodeDeployer := MultinodeDeployerFactory(multinodeDeploymentType)
multinodeDeployer := deployerOverride
if multinodeDeployer == nil {
multinodeDeployer = MultinodeDeployerFactory(multinodeDeploymentType)
if multinodeDeployer == nil {
return nil, fmt.Errorf("unsupported multinode deployment type: %s", multinodeDeploymentType)
}
}
backend := BackendFactory(backendFramework, operatorConfig, parentGraphDeploymentName)
if backend == nil {
return nil, fmt.Errorf("unsupported backend framework: %s", backendFramework)
......@@ -1184,8 +1298,17 @@ func GenerateBasePodSpec(
}
}
// GMS: replace nvidia.com/gpu with a shared DRA claim and add the server sidecar.
if component.GPUMemoryService != nil && component.GPUMemoryService.Enabled {
// Intra-pod GMS: replace nvidia.com/gpu with a shared DRA claim and add the server
// sidecar directly into this pod.
//
// Inter-pod GMS (gpuMemoryService.mode=interPod, with or without failover)
// must be skipped here — that layout wires DRA claims and the GMS server
// on a dedicated weight-server pod at the PCSG level (see
// generateGrovePodCliqueSet → gmsWeightServerPodSpec); re-applying the
// claim and injecting a sidecar here would produce a double-wired engine
// pod (stray GMS sidecar, conflicting claim).
if component.GPUMemoryService != nil && component.GPUMemoryService.Enabled &&
!component.IsInterPodGMSEnabled() {
claimTemplateName := dra.ResourceClaimTemplateName(parentGraphDeploymentName, serviceName)
if err := dra.ApplyClaim(&podSpec, claimTemplateName); err != nil {
return nil, fmt.Errorf("failed to apply DRA claim for GMS: %w", err)
......@@ -1283,7 +1406,8 @@ func generateFrontendSidecar(
return container, nil
}
// GeneratePodSpecForComponent creates a PodSpec for Grove deployments (simplified wrapper)
// GeneratePodSpecForComponent creates a PodSpec for Grove deployments (simplified wrapper).
// deployerOverride, when non-nil, overrides the default MultinodeDeployer from the factory.
func GeneratePodSpecForComponent(
component *v1alpha1.DynamoComponentDeploymentSharedSpec,
backendFramework BackendFramework,
......@@ -1294,7 +1418,8 @@ func GeneratePodSpecForComponent(
operatorConfig *configv1alpha1.OperatorConfiguration,
multinodeDeploymentType commonconsts.MultinodeDeploymentType,
serviceName string,
checkpointInfo *checkpoint.CheckpointInfo, // Optional checkpoint info
checkpointInfo *checkpoint.CheckpointInfo,
deployerOverride MultinodeDeployer,
) (*corev1.PodSpec, error) {
if len(dynamoDeployment.Spec.Envs) > 0 {
component.Envs = MergeEnvs(dynamoDeployment.Spec.Envs, component.Envs)
......@@ -1303,7 +1428,7 @@ func GeneratePodSpecForComponent(
propagateDGDAnnotations(dynamoDeployment.GetAnnotations(), component)
propagateDGDSpecMetadata(dynamoDeployment.Spec.Annotations, dynamoDeployment.Spec.Labels, component)
podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, operatorConfig, multinodeDeploymentType, serviceName, checkpointInfo)
podSpec, err := GenerateBasePodSpec(component, backendFramework, secretsRetriever, dynamoDeployment.Name, dynamoDeployment.Namespace, role, numberOfNodes, operatorConfig, multinodeDeploymentType, serviceName, checkpointInfo, deployerOverride)
if err != nil {
return nil, err
}
......@@ -1359,6 +1484,150 @@ func propagateDGDSpecMetadata(annotations, labels map[string]string, component *
}
// GenerateGrovePodCliqueSet generates a Grove PodCliqueSet for the given deployment, supporting both single-node and multinode cases.
// cliqueParams groups the context needed to build a single PodClique template
// from a ServiceRole. All fields come from the enclosing GenerateGrovePodCliqueSet
// loop iteration and are read-only.
type cliqueParams struct {
	// Inputs forwarded to generatePodSpecForRole to render the pod spec for
	// role r of service serviceName.
	r                          ServiceRole
	component                  *v1alpha1.DynamoComponentDeploymentSharedSpec
	backendFramework           BackendFramework
	secretsRetriever           SecretsRetriever
	dynamoDeployment           *v1alpha1.DynamoGraphDeployment
	numberOfNodes              int32
	// operatorConfig is also consulted directly: Checkpoint.Enabled gates
	// checkpoint injection into the rendered pod spec.
	operatorConfig             *configv1alpha1.OperatorConfiguration
	// runtimeConfig and validatedQueueName feed injectKaiSchedulerIfEnabled.
	runtimeConfig              *controller_common.RuntimeConfig
	serviceName                string
	checkpointInfo             *checkpoint.CheckpointInfo
	// isMultinode and isInterPodFailover together decide the clique's
	// MinAvailable (Replicas on multinode without inter-pod failover, else 1).
	isMultinode                bool
	// usesPCSG suppresses the clique-level TopologyConstraint when true.
	usesPCSG                   bool
	// NOTE(review): isInterPodGMS is populated by the caller but
	// buildCliqueForRole does not appear to read it — confirm before removing.
	isInterPodGMS              bool
	// isInterPodFailover additionally adds the failover engine-group-member
	// label to every non-RoleGMS clique.
	isInterPodFailover         bool
	// NOTE(review): discoveryBackend is populated by the caller but
	// buildCliqueForRole does not appear to read it — confirm before removing.
	discoveryBackend           configv1alpha1.DiscoveryBackend
	// discoveryContext is passed to generateLabels.
	discoveryContext           DiscoveryContext
	// restartState and existingRestartAnnotations feed applyRestartAnnotation.
	restartState               *RestartState
	existingRestartAnnotations map[string]string
	validatedQueueName         string
	// kubeClient and ctx are used only for checkpoint injection.
	// NOTE(review): storing a context.Context in a struct goes against the
	// usual Go guidance of passing it as the first function argument.
	kubeClient                 ctrlclient.Reader
	ctx                        context.Context
}
// buildCliqueForRole renders the PodCliqueTemplateSpec for the role described
// by p. In order, it:
//
//  1. generates the role's pod spec (generatePodSpecForRole),
//  2. injects checkpoint configuration when the operator has checkpointing
//     enabled,
//  3. assembles the clique (lower-cased name/role name, replicas,
//     MinAvailable, optional topology constraint),
//  4. attaches labels and annotations, including restore metadata and the
//     restart annotation, and
//  5. applies kai-scheduler settings.
//
// Any failure in steps 1, 2 or 4 is returned wrapped; no clique is returned
// in that case.
func buildCliqueForRole(p cliqueParams) (*grovev1alpha1.PodCliqueTemplateSpec, error) {
	role := p.r

	podSpec, err := generatePodSpecForRole(
		role, p.component, p.backendFramework, p.secretsRetriever,
		p.dynamoDeployment, p.numberOfNodes, p.operatorConfig, p.serviceName, p.checkpointInfo,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to generate podSpec for role %s: %w", role.Name, err)
	}

	if p.operatorConfig.Checkpoint.Enabled {
		err = checkpoint.InjectCheckpointIntoPodSpec(
			p.ctx, p.kubeClient, p.dynamoDeployment.Namespace, podSpec, p.checkpointInfo,
		)
		if err != nil {
			return nil, fmt.Errorf("failed to inject checkpoint config for role %s: %w", role.Name, err)
		}
	}

	// MinAvailable is Grove's gang-scheduling threshold: the clique counts as
	// available once that many replicas are Ready. The rule is "all replicas
	// unless some of them are redundant":
	//
	//   - Multinode without inter-pod failover: the replicas of a clique are
	//     NCCL peers, so losing one breaks the collective and every replica
	//     is required. Standalone inter-pod GMS on multinode takes the same
	//     path; with one engine pod per PCLQ this still yields 1.
	//   - Inter-pod GMS failover: Replicas = primary + shadows, and shadows
	//     are hot spares. Requiring all of them would defeat failover, so a
	//     single Ready replica suffices.
	//   - Single-node cliques: replicas are independent of each other, so a
	//     single Ready replica suffices.
	minAvailable := role.Replicas
	if !p.isMultinode || p.isInterPodFailover {
		minAvailable = 1
	}

	cliqueName := strings.ToLower(role.Name)
	clique := &grovev1alpha1.PodCliqueTemplateSpec{
		Name: cliqueName,
		Spec: grovev1alpha1.PodCliqueSpec{
			RoleName:     cliqueName,
			Replicas:     role.Replicas,
			MinAvailable: ptr.To(minAvailable),
			PodSpec:      *podSpec,
		},
	}
	// Cliques grouped under a PCSG do not get their own topology constraint.
	if !p.usesPCSG {
		clique.TopologyConstraint = toGroveTopologyConstraint(p.component.TopologyConstraint)
	}

	labels, err := generateLabels(p.component, p.dynamoDeployment, p.serviceName, p.discoveryContext)
	if err != nil {
		return nil, fmt.Errorf("failed to generate labels: %w", err)
	}
	switch {
	case role.Role == RoleGMS:
		// generateLabels applies the discovery labels to every role for
		// container-mode Pod reflector filtering (see #8067), but GMS
		// weight-server pods run gpu_memory_service.cli.server rather than
		// the dynamo runtime and never register a DynamoWorkerMetadata CR.
		// Keeping the labels would only make the Rust discovery daemon track
		// these pods and wake its debounce loop on every GMS restart or
		// fast-kill event.
		delete(labels, commonconsts.KubeLabelDynamoDiscoveryBackend)
		delete(labels, commonconsts.KubeLabelDynamoDiscoveryEnabled)
	case p.isInterPodFailover:
		// Engine (non-GMS) cliques of an inter-pod failover layout are marked
		// as failover engine-group members.
		labels[commonconsts.KubeLabelDynamoFailoverEngineGroupMember] = commonconsts.KubeLabelValueTrue
	}
	clique.Labels = labels

	annotations, err := generateAnnotations(p.component)
	if err != nil {
		return nil, fmt.Errorf("failed to generate annotations: %w", err)
	}
	checkpoint.ApplyRestorePodMetadata(labels, annotations, p.checkpointInfo)
	clique.Annotations = applyRestartAnnotation(annotations, p.serviceName, p.restartState, p.existingRestartAnnotations)

	injectKaiSchedulerIfEnabled(clique, p.runtimeConfig, p.validatedQueueName)
	return clique, nil
}
// applyRestartAnnotation stamps the restart annotation for serviceName and
// returns the (possibly newly allocated) annotation map.
//
// The timestamp is restartState.Timestamp when restartState says the service
// should be annotated; otherwise the service's entry in
// existingRestartAnnotations is carried over, if there is one. When neither
// source applies, annotations is returned untouched (and may still be nil).
func applyRestartAnnotation(annotations map[string]string, serviceName string, restartState *RestartState, existingRestartAnnotations map[string]string) map[string]string {
	var timestamp string
	if restartState.ShouldAnnotateService(serviceName) {
		timestamp = restartState.Timestamp
	} else {
		// Indexing a nil map is safe and simply reports "not found".
		previous, found := existingRestartAnnotations[serviceName]
		if !found {
			return annotations
		}
		timestamp = previous
	}

	if annotations == nil {
		annotations = make(map[string]string)
	}
	annotations[commonconsts.RestartAnnotation] = timestamp
	return annotations
}
func GenerateGrovePodCliqueSet(
ctx context.Context,
dynamoDeployment *v1alpha1.DynamoGraphDeployment,
......@@ -1402,7 +1671,16 @@ func GenerateGrovePodCliqueSet(
discoveryContext := NewDiscoveryContext(operatorConfig.Discovery.Backend, dynamoDeployment.Annotations)
var scalingGroups []grovev1alpha1.PodCliqueScalingGroupConfig
for serviceName, component := range dynamoDeployment.Spec.Services {
var resourceClaimTemplates []grovev1alpha1.ResourceClaimTemplateConfig
sortedServiceNames := make([]string, 0, len(dynamoDeployment.Spec.Services))
for name := range dynamoDeployment.Spec.Services {
sortedServiceNames = append(sortedServiceNames, name)
}
sort.Strings(sortedServiceNames)
for _, serviceName := range sortedServiceNames {
component := dynamoDeployment.Spec.Services[serviceName]
dynamoNamespace := GetDynamoNamespace(dynamoDeployment, component)
component.DynamoNamespace = &dynamoNamespace
// Determine backend framework using hybrid approach
......@@ -1426,113 +1704,121 @@ func GenerateGrovePodCliqueSet(
numberOfNodes := component.GetNumberOfNodes()
isMultinode := numberOfNodes > 1
roles := expandRolesForService(serviceName, component.Replicas, numberOfNodes)
isInterPodGMS := component.IsInterPodGMSEnabled()
isInterPodFailover := component.IsInterPodFailoverEnabled()
usesPCSG := isMultinode || isInterPodGMS
roles := expandRolesForService(serviceName, component.Replicas, numberOfNodes, component)
var cliqueNames []string
for _, r := range roles {
podSpec, err := GeneratePodSpecForComponent(
component,
backendFramework,
secretsRetriever,
dynamoDeployment,
r.Role,
numberOfNodes,
operatorConfig,
commonconsts.MultinodeDeploymentTypeGrove,
serviceName,
checkpointInfo,
)
clique, err := buildCliqueForRole(cliqueParams{
r: r,
component: component,
backendFramework: backendFramework,
secretsRetriever: secretsRetriever,
dynamoDeployment: dynamoDeployment,
numberOfNodes: numberOfNodes,
operatorConfig: operatorConfig,
runtimeConfig: runtimeConfig,
serviceName: serviceName,
checkpointInfo: checkpointInfo,
isMultinode: isMultinode,
usesPCSG: usesPCSG,
isInterPodGMS: isInterPodGMS,
isInterPodFailover: isInterPodFailover,
discoveryBackend: discoveryBackend,
discoveryContext: discoveryContext,
restartState: restartState,
existingRestartAnnotations: existingRestartAnnotations,
validatedQueueName: validatedQueueName,
kubeClient: kubeClient,
ctx: ctx,
})
if err != nil {
return nil, fmt.Errorf("failed to generate podSpec for role %s: %w", r.Name, err)
}
if operatorConfig.Checkpoint.Enabled {
if err := checkpoint.InjectCheckpointIntoPodSpec(
ctx,
kubeClient,
dynamoDeployment.Namespace,
podSpec,
checkpointInfo,
); err != nil {
return nil, fmt.Errorf("failed to inject checkpoint config for role %s: %w", r.Name, err)
return nil, err
}
gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, clique)
cliqueNames = append(cliqueNames, strings.ToLower(r.Name))
}
minAvailable := int32(1)
if isMultinode {
minAvailable = r.Replicas
}
applyCliqueStartupDependencies(gangSet, roles, backendFramework, numberOfNodes, isInterPodGMS)
clique := &grovev1alpha1.PodCliqueTemplateSpec{
Name: strings.ToLower(r.Name),
Spec: grovev1alpha1.PodCliqueSpec{
RoleName: strings.ToLower(r.Name),
Replicas: r.Replicas,
MinAvailable: ptr.To(minAvailable),
PodSpec: *podSpec,
},
if isInterPodGMS {
resourceClaimTemplates = append(resourceClaimTemplates, gmsResourceClaimTemplateConfigs(serviceName, component.Resources, roles)...)
}
// For single-node services, set topology constraint directly on the clique.
// For multinode services, the constraint goes on the PCSG instead;
// child cliques inherit from PCSG and should NOT have explicit constraints.
if !isMultinode {
clique.TopologyConstraint = toGroveTopologyConstraint(component.TopologyConstraint)
if usesPCSG {
pcsg := grovev1alpha1.PodCliqueScalingGroupConfig{
Name: strings.ToLower(serviceName),
CliqueNames: cliqueNames,
Replicas: component.Replicas,
MinAvailable: ptr.To(int32(1)),
TopologyConstraint: toGroveTopologyConstraint(component.TopologyConstraint),
}
labels, err := generateLabels(component, dynamoDeployment, serviceName, discoveryContext)
if err != nil {
return nil, fmt.Errorf("failed to generate labels: %w", err)
if isInterPodGMS {
pcsg.ResourceSharing = gmsResourceSharingEntries(serviceName, roles)
}
clique.Labels = labels
annotations, err := generateAnnotations(component)
if err != nil {
return nil, fmt.Errorf("failed to generate annotations: %w", err)
scalingGroups = append(scalingGroups, pcsg)
}
checkpoint.ApplyRestorePodMetadata(labels, annotations, checkpointInfo)
// Apply restart annotation if this service should be restarted.
// For services not in the current restart order, preserve their existing annotation
// to avoid triggering unwanted rollouts when a new restart begins.
if restartState.ShouldAnnotateService(serviceName) {
if annotations == nil {
annotations = make(map[string]string)
}
annotations[commonconsts.RestartAnnotation] = restartState.Timestamp
} else if existingRestartAnnotations != nil {
if existingTimestamp, ok := existingRestartAnnotations[serviceName]; ok {
if annotations == nil {
annotations = make(map[string]string)
}
annotations[commonconsts.RestartAnnotation] = existingTimestamp
if len(scalingGroups) > 0 {
gangSet.Spec.Template.PodCliqueScalingGroupConfigs = scalingGroups
}
if len(resourceClaimTemplates) > 0 {
gangSet.Spec.Template.ResourceClaimTemplates = resourceClaimTemplates
}
clique.Annotations = annotations
// Inject kai-scheduler settings if enabled
injectKaiSchedulerIfEnabled(clique, runtimeConfig, validatedQueueName)
return gangSet, nil
}
gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, clique)
cliqueNames = append(cliqueNames, strings.ToLower(r.Name))
// generatePodSpecForRole builds the pod spec for a single role, handling GMS
// weight server pods and GMS engine pods differently from regular pods.
func generatePodSpecForRole(
r ServiceRole,
component *v1alpha1.DynamoComponentDeploymentSharedSpec,
backendFramework BackendFramework,
secretsRetriever SecretsRetriever,
dynamoDeployment *v1alpha1.DynamoGraphDeployment,
numberOfNodes int32,
operatorConfig *configv1alpha1.OperatorConfiguration,
serviceName string,
checkpointInfo *checkpoint.CheckpointInfo,
) (*corev1.PodSpec, error) {
isInterPodGMS := component.IsInterPodGMSEnabled()
if r.Role == RoleGMS {
// GMS weight server: generate a base engine spec then transform it
basePodSpec, err := GeneratePodSpecForComponent(
component, backendFramework, secretsRetriever, dynamoDeployment,
RoleMain, 1, operatorConfig,
commonconsts.MultinodeDeploymentTypeGrove, serviceName, checkpointInfo, nil,
)
if err != nil {
return nil, fmt.Errorf("failed to generate base podSpec for GMS: %w", err)
}
return gmsWeightServerPodSpec(basePodSpec, r.Rank, int(getGPUCount(component.Resources))), nil
}
// Apply startup dependencies for this service
applyCliqueStartupDependencies(gangSet, roles, backendFramework, numberOfNodes)
if isMultinode {
scalingGroups = append(scalingGroups, grovev1alpha1.PodCliqueScalingGroupConfig{
Name: strings.ToLower(serviceName),
CliqueNames: cliqueNames,
Replicas: component.Replicas,
MinAvailable: ptr.To(int32(1)),
TopologyConstraint: toGroveTopologyConstraint(component.TopologyConstraint),
})
// Engine pod (or non-GMS pod): optionally use a rank-aware deployer for multinode inter-pod GMS
var deployer MultinodeDeployer
if isInterPodGMS && numberOfNodes > 1 {
deployer = &GroveMultinodeDeployer{IsInterPodGMS: true, Rank: r.Rank}
}
podSpec, err := GeneratePodSpecForComponent(
component, backendFramework, secretsRetriever, dynamoDeployment,
r.Role, numberOfNodes, operatorConfig,
commonconsts.MultinodeDeploymentTypeGrove, serviceName, checkpointInfo, deployer,
)
if err != nil {
return nil, err
}
if len(scalingGroups) > 0 {
gangSet.Spec.Template.PodCliqueScalingGroupConfigs = scalingGroups
if isInterPodGMS {
augmentEngineForGMS(podSpec, r.Rank, component.IsInterPodFailoverEnabled())
}
return gangSet, nil
return podSpec, nil
}
func generateLabels(
......@@ -1579,9 +1865,17 @@ func generateLabels(
if workerHash := component.Labels[commonconsts.KubeLabelDynamoWorkerHash]; workerHash != "" {
labels[commonconsts.KubeLabelDynamoWorkerHash] = workerHash
}
// Discovery labels on pod template — needed for Pod reflector filtering in container mode
// Discovery labels on pod template — needed for Pod reflector filtering in
// container mode (see lib/runtime/src/discovery/kube/daemon.rs). Applied to
// every role by default because any role may host the dynamo runtime — for
// example, multinode vLLM workers in data-parallel hybrid-lb mode run their
// own API server (see RoleWorker branch in injectDataParallelLaunchFlags).
// Callers that render non-dynamo pods (specifically the RoleGMS weight
// server, which runs gpu_memory_service.cli.server and never registers a
// DynamoWorkerMetadata CR) are responsible for stripping these labels after
// the fact — see buildCliqueForRole.
if discovery.Backend == configv1alpha1.DiscoveryBackendKubernetes {
labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = "kubernetes"
labels[commonconsts.KubeLabelDynamoDiscoveryBackend] = commonconsts.DiscoveryBackendKubernetes
labels[commonconsts.KubeLabelDynamoDiscoveryEnabled] = commonconsts.KubeLabelValueTrue
}
return labels, nil
......@@ -1783,6 +2077,7 @@ func GenerateBasePodSpecForController(
multinodeDeploymentType,
serviceName,
checkpointInfo,
nil, // use default deployer
)
if err != nil {
return nil, err
......
......@@ -1491,6 +1491,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
Env: []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: "DYN_HTTP_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
......@@ -1697,6 +1701,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
FailureThreshold: 720,
},
Env: []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
Value: "1",
......@@ -2092,6 +2100,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
Env: []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
Value: "1",
......@@ -2302,6 +2314,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
Env: []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
Value: "1",
......@@ -2489,6 +2505,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
Env: []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: "DYN_HTTP_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
......@@ -2686,6 +2706,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
FailureThreshold: 720,
},
Env: []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
Value: "1",
......@@ -3103,6 +3127,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
Env: []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
Value: "1",
......@@ -3300,6 +3328,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
Env: []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
Value: "1",
......@@ -3487,6 +3519,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
},
},
Env: []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: "DYN_HTTP_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoServicePort),
......@@ -3684,6 +3720,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
FailureThreshold: 720,
},
Env: []corev1.EnvVar{
{
Name: "CONTAINER_NAME",
Value: commonconsts.MainContainerName,
},
{
Name: "DYNAMO_POD_GANG_SET_REPLICAS",
Value: "1",
......@@ -4013,6 +4053,7 @@ func TestGeneratePodSpecForComponent_SGLang(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"worker",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if tt.expectError {
......@@ -4171,6 +4212,7 @@ func TestGeneratePodSpecForComponent_VLLM(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"worker",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if tt.expectError {
......@@ -4258,6 +4300,7 @@ func TestGeneratePodSpecForComponent_UnsupportedBackend(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"worker",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if tt.expectError {
......@@ -4283,6 +4326,7 @@ func TestExpandRolesForService(t *testing.T) {
serviceName string
numberOfNodes int32
serviceReplicas *int32
component *v1alpha1.DynamoComponentDeploymentSharedSpec
expected []ServiceRole
}{
{
......@@ -4338,11 +4382,99 @@ func TestExpandRolesForService(t *testing.T) {
{Name: "test-service", Role: RoleMain, Replicas: 0},
},
},
{
name: "single-node GMS with 1 shadow",
serviceName: "svc",
numberOfNodes: 1,
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
Failover: &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod, NumShadows: 1},
},
expected: []ServiceRole{
{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
{Name: "svc", Role: RoleMain, Replicas: 2, Rank: 0},
},
},
{
name: "single-node GMS with 3 shadows",
serviceName: "svc",
numberOfNodes: 1,
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
Failover: &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod, NumShadows: 3},
},
expected: []ServiceRole{
{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
{Name: "svc", Role: RoleMain, Replicas: 4, Rank: 0},
},
},
{
name: "single-node standalone inter-pod GMS (no failover)",
serviceName: "svc",
numberOfNodes: 1,
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
},
expected: []ServiceRole{
{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
{Name: "svc", Role: RoleMain, Replicas: 1, Rank: 0},
},
},
{
name: "multinode GMS 2 nodes 1 shadow",
serviceName: "svc",
numberOfNodes: 2,
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
Failover: &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod, NumShadows: 1},
},
expected: []ServiceRole{
{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
{Name: "svc-ldr", Role: RoleLeader, Replicas: 2, Rank: 0},
{Name: "svc-gms-1", Role: RoleGMS, Replicas: 1, Rank: 1},
{Name: "svc-wkr-1", Role: RoleWorker, Replicas: 2, Rank: 1},
},
},
{
name: "multinode GMS 3 nodes 2 shadows",
serviceName: "svc",
numberOfNodes: 3,
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
Failover: &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod, NumShadows: 2},
},
expected: []ServiceRole{
{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
{Name: "svc-ldr", Role: RoleLeader, Replicas: 3, Rank: 0},
{Name: "svc-gms-1", Role: RoleGMS, Replicas: 1, Rank: 1},
{Name: "svc-wkr-1", Role: RoleWorker, Replicas: 3, Rank: 1},
{Name: "svc-gms-2", Role: RoleGMS, Replicas: 1, Rank: 2},
{Name: "svc-wkr-2", Role: RoleWorker, Replicas: 3, Rank: 2},
},
},
{
name: "multinode standalone inter-pod GMS (no failover)",
serviceName: "svc",
numberOfNodes: 2,
component: &v1alpha1.DynamoComponentDeploymentSharedSpec{
GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
},
expected: []ServiceRole{
{Name: "svc-gms-0", Role: RoleGMS, Replicas: 1, Rank: 0},
{Name: "svc-ldr", Role: RoleLeader, Replicas: 1, Rank: 0},
{Name: "svc-gms-1", Role: RoleGMS, Replicas: 1, Rank: 1},
{Name: "svc-wkr-1", Role: RoleWorker, Replicas: 1, Rank: 1},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := expandRolesForService(tt.serviceName, tt.serviceReplicas, tt.numberOfNodes)
component := tt.component
if component == nil {
component = &v1alpha1.DynamoComponentDeploymentSharedSpec{}
}
result := expandRolesForService(tt.serviceName, tt.serviceReplicas, tt.numberOfNodes, component)
if !reflect.DeepEqual(result, tt.expected) {
t.Errorf("expandRolesForService() = %v, want %v", result, tt.expected)
}
......@@ -4802,8 +4934,8 @@ func TestApplyCliqueStartupDependencies(t *testing.T) {
gangSet.Spec.Template.Cliques = append(gangSet.Spec.Template.Cliques, clique)
}
// Apply dependencies
applyCliqueStartupDependencies(gangSet, tt.roles, tt.backendFramework, tt.numberOfNodes)
// Apply dependencies (non-GMS)
applyCliqueStartupDependencies(gangSet, tt.roles, tt.backendFramework, tt.numberOfNodes, false)
// Verify StartupType
if tt.expectStartupType {
......@@ -4832,6 +4964,80 @@ func TestApplyCliqueStartupDependencies(t *testing.T) {
}
}
// TestApplyCliqueStartupDependencies_GMS checks the startup ordering produced
// by applyCliqueStartupDependencies when the inter-pod GMS flag is set: the
// engine clique must start after its GMS clique, the GMS clique itself has no
// dependency, and cliques that are not part of the supplied roles are left
// untouched.
func TestApplyCliqueStartupDependencies_GMS(t *testing.T) {
	t.Run("gms_single_node_engine_starts_after_gms", func(t *testing.T) {
		roles := []ServiceRole{
			{Name: "svc-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
			{Name: "svc", Role: RoleMain, Rank: 0, Replicas: 2},
		}
		pcs := &grovev1alpha1.PodCliqueSet{
			Spec: grovev1alpha1.PodCliqueSetSpec{
				Template: grovev1alpha1.PodCliqueSetTemplateSpec{
					Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{
						{Name: "svc-gms-0", Spec: grovev1alpha1.PodCliqueSpec{RoleName: "svc-gms-0", Replicas: 1}},
						{Name: "svc", Spec: grovev1alpha1.PodCliqueSpec{RoleName: "svc", Replicas: 2}},
					},
				},
			},
		}

		applyCliqueStartupDependencies(pcs, roles, BackendFrameworkVLLM, 1, true)

		// The startup type must be switched to explicit ordering.
		startup := pcs.Spec.Template.StartupType
		if startup == nil || *startup != grovev1alpha1.CliqueStartupTypeExplicit {
			t.Fatal("expected CliqueStartupTypeExplicit")
		}

		for _, clique := range pcs.Spec.Template.Cliques {
			after := clique.Spec.StartsAfter
			if clique.Name == "svc-gms-0" {
				if after != nil {
					t.Errorf("GMS clique should have no startsAfter, got %v", after)
				}
			} else if clique.Name == "svc" {
				if !reflect.DeepEqual(after, []string{"svc-gms-0"}) {
					t.Errorf("engine clique startsAfter = %v, want [svc-gms-0]", after)
				}
			}
		}
	})

	t.Run("gms_does_not_leak_startsAfter_to_unrelated_cliques", func(t *testing.T) {
		roles := []ServiceRole{
			{Name: "engine-gms-0", Role: RoleGMS, Rank: 0, Replicas: 1},
			{Name: "engine", Role: RoleMain, Rank: 0, Replicas: 2},
		}
		// The "frontend" clique is deliberately absent from roles.
		pcs := &grovev1alpha1.PodCliqueSet{
			Spec: grovev1alpha1.PodCliqueSetSpec{
				Template: grovev1alpha1.PodCliqueSetTemplateSpec{
					Cliques: []*grovev1alpha1.PodCliqueTemplateSpec{
						{Name: "frontend", Spec: grovev1alpha1.PodCliqueSpec{RoleName: "frontend", Replicas: 1}},
						{Name: "engine-gms-0", Spec: grovev1alpha1.PodCliqueSpec{RoleName: "engine-gms-0", Replicas: 1}},
						{Name: "engine", Spec: grovev1alpha1.PodCliqueSpec{RoleName: "engine", Replicas: 2}},
					},
				},
			},
		}

		applyCliqueStartupDependencies(pcs, roles, BackendFrameworkVLLM, 1, true)

		for _, clique := range pcs.Spec.Template.Cliques {
			after := clique.Spec.StartsAfter
			if clique.Name == "frontend" {
				if after != nil {
					t.Errorf("frontend clique should have no startsAfter, got %v", after)
				}
			} else if clique.Name == "engine-gms-0" {
				if after != nil {
					t.Errorf("GMS clique should have no startsAfter, got %v", after)
				}
			} else if clique.Name == "engine" {
				if !reflect.DeepEqual(after, []string{"engine-gms-0"}) {
					t.Errorf("engine clique startsAfter = %v, want [engine-gms-0]", after)
				}
			}
		}
	})
}
func TestGetCliqueStartupDependencies(t *testing.T) {
tests := []struct {
name string
......@@ -5064,6 +5270,7 @@ func TestGenerateBasePodSpec_Frontend(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if (err != nil) != tt.wantErr {
......@@ -5140,6 +5347,7 @@ func TestGenerateBasePodSpec_PlannerServiceAccount(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if err != nil {
......@@ -5263,6 +5471,7 @@ func TestGenerateBasePodSpec_DisableImagePullSecretDiscovery(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if err != nil {
......@@ -5369,6 +5578,7 @@ func TestGenerateBasePodSpec_DiscoverBackend(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if !assert.NoError(t, err) {
return
......@@ -5421,7 +5631,9 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
Args: []string{"-m", "dynamo.worker"},
Env: []corev1.EnvVar{
{Name: "ANOTHER_COMPONENTENV", Value: "true"},
{Name: "ANOTHER_CONTAINER_ENV", Value: "true"}, {Name: commonconsts.DynamoComponentEnvVar, Value: "worker"},
{Name: "ANOTHER_CONTAINER_ENV", Value: "true"},
{Name: "CONTAINER_NAME", Value: commonconsts.MainContainerName},
{Name: commonconsts.DynamoComponentEnvVar, Value: "worker"},
{Name: commonconsts.DynamoDiscoveryBackendEnvVar, Value: "kubernetes"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "false"},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default-test-deployment"},
......@@ -5537,6 +5749,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if err != nil {
......@@ -5634,6 +5847,7 @@ func TestGenerateBasePodSpec_VolumeMounts(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if tt.expectError {
......@@ -5870,6 +6084,7 @@ func TestGenerateBasePodSpec_ResourceClaims(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if tt.expectError {
......@@ -6082,6 +6297,7 @@ func TestGenerateBasePodSpec_UseAsCompilationCache_BackendSupport(t *testing.T)
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if tt.expectError {
......@@ -6268,6 +6484,7 @@ func TestGenerateBasePodSpec_SecurityContext(t *testing.T) {
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil, // No checkpoint info in tests
nil, // Use default deployer
)
if err != nil {
......@@ -6954,6 +7171,134 @@ func TestGenerateLabels_ReassertsRestoreIdentityLabelsAfterMetadataMerge(t *test
assert.Equal(t, "workerhash", labels[commonconsts.KubeLabelDynamoWorkerHash])
}
// TestGenerateGrovePodCliqueSet_GMSPodsDoNotCarryDiscoveryLabels locks in the
// labelling contract for inter-pod GMS deployments: weight-server cliques
// (RoleGMS) must NOT carry the kubernetes discovery labels, whereas engine
// cliques (RoleMain / RoleLeader / RoleWorker) must, matching the behavior
// introduced by #8067 "per-container kube discovery for multi-engine pods".
// The Rust discovery daemon (lib/runtime/src/discovery/kube/daemon.rs) uses
// these labels as a reflector filter. GMS pods run
// gpu_memory_service.cli.server rather than the dynamo runtime and never
// register a DynamoWorkerMetadata CR, so labelling them would only bloat the
// reflector store and cause spurious wake-ups.
func TestGenerateGrovePodCliqueSet_GMSPodsDoNotCarryDiscoveryLabels(t *testing.T) {
	decode := &v1alpha1.DynamoComponentDeploymentSharedSpec{
		ComponentType: commonconsts.ComponentTypeDecode,
		Replicas:      ptr.To(int32(1)),
		Resources: &v1alpha1.Resources{
			Limits: &v1alpha1.ResourceItem{GPU: "1"},
		},
		GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{
			Enabled: true,
			Mode:    v1alpha1.GMSModeInterPod,
		},
		Failover: &v1alpha1.FailoverSpec{
			Enabled:    true,
			Mode:       v1alpha1.GMSModeInterPod,
			NumShadows: 1,
		},
	}
	dgd := &v1alpha1.DynamoGraphDeployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-dgd",
			Namespace: "test-ns",
		},
		Spec: v1alpha1.DynamoGraphDeploymentSpec{
			BackendFramework: "vllm",
			Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"decode": decode,
			},
		},
	}
	operatorConfig := &configv1alpha1.OperatorConfiguration{
		Discovery: configv1alpha1.DiscoveryConfiguration{Backend: "kubernetes"},
		Infrastructure: configv1alpha1.InfrastructureConfiguration{
			ETCDAddress: "etcd-address",
			NATSAddress: "nats-address",
		},
	}
	runtimeConfig := &controller_common.RuntimeConfig{DRAEnabled: true}

	got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, operatorConfig, runtimeConfig, nil, nil, nil, nil, nil)
	require.NoError(t, err)
	require.NotNil(t, got)

	foundGMS, foundEngine := false, false
	for _, clique := range got.Spec.Template.Cliques {
		_, hasBackend := clique.Labels[commonconsts.KubeLabelDynamoDiscoveryBackend]
		_, hasEnabled := clique.Labels[commonconsts.KubeLabelDynamoDiscoveryEnabled]

		// Cliques are classified purely by name: anything containing "gms"
		// is treated as a weight-server clique.
		if !strings.Contains(clique.Name, "gms") {
			foundEngine = true
			assert.True(t, hasBackend, "engine clique %q must carry KubeLabelDynamoDiscoveryBackend (#8067 contract)", clique.Name)
			assert.True(t, hasEnabled, "engine clique %q must carry KubeLabelDynamoDiscoveryEnabled (#8067 contract)", clique.Name)
			continue
		}
		foundGMS = true
		assert.False(t, hasBackend, "GMS clique %q must not carry KubeLabelDynamoDiscoveryBackend", clique.Name)
		assert.False(t, hasEnabled, "GMS clique %q must not carry KubeLabelDynamoDiscoveryEnabled", clique.Name)
	}

	// Guard against a vacuous pass: both kinds of clique must be present.
	assert.True(t, foundGMS, "test setup should produce at least one GMS clique")
	assert.True(t, foundEngine, "test setup should produce at least one engine clique")
}
// TestGenerateGrovePodCliqueSet_MinAvailable_FailoverShadowsAreRedundant locks
// in that per-rank engine cliques of an inter-pod failover cohort keep
// MinAvailable=1 even in a multinode deployment (numberOfNodes > 1). The
// replicas of such a clique are the primary plus its shadows AT THAT RANK:
// redundant hot spares of one another, NOT NCCL peers. Gang-scheduling them
// (MinAvailable = Replicas) would force every shadow at every rank to be Ready
// before Grove treats the clique as available, which defeats failover. See the
// minAvailable comment in renderClique for the full rationale.
func TestGenerateGrovePodCliqueSet_MinAvailable_FailoverShadowsAreRedundant(t *testing.T) {
	const nodeCount int32 = 2
	const shadowCount int32 = 1
	const podsPerRank = shadowCount + 1 // primary + shadows per rank

	decode := &v1alpha1.DynamoComponentDeploymentSharedSpec{
		ComponentType:    commonconsts.ComponentTypeDecode,
		Replicas:         ptr.To(int32(1)),
		Multinode:        &v1alpha1.MultinodeSpec{NodeCount: nodeCount},
		Resources:        &v1alpha1.Resources{Limits: &v1alpha1.ResourceItem{GPU: "1"}},
		GPUMemoryService: &v1alpha1.GPUMemoryServiceSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod},
		Failover:         &v1alpha1.FailoverSpec{Enabled: true, Mode: v1alpha1.GMSModeInterPod, NumShadows: shadowCount},
	}
	dgd := &v1alpha1.DynamoGraphDeployment{
		ObjectMeta: metav1.ObjectMeta{Name: "test-dgd", Namespace: "test-ns"},
		Spec: v1alpha1.DynamoGraphDeploymentSpec{
			BackendFramework: "vllm",
			Services: map[string]*v1alpha1.DynamoComponentDeploymentSharedSpec{
				"decode": decode,
			},
		},
	}
	operatorConfig := &configv1alpha1.OperatorConfiguration{
		Discovery:      configv1alpha1.DiscoveryConfiguration{Backend: "kubernetes"},
		Infrastructure: configv1alpha1.InfrastructureConfiguration{ETCDAddress: "etcd-address", NATSAddress: "nats-address"},
	}
	runtimeConfig := &controller_common.RuntimeConfig{DRAEnabled: true}

	got, err := GenerateGrovePodCliqueSet(context.Background(), dgd, operatorConfig, runtimeConfig, nil, nil, nil, nil, nil)
	require.NoError(t, err)
	require.NotNil(t, got)

	foundEngine := false
	for _, clique := range got.Spec.Template.Cliques {
		require.NotNil(t, clique.Spec.MinAvailable, "clique %q has nil MinAvailable", clique.Name)

		if strings.Contains(clique.Name, "gms") {
			// Weight-server cliques are always a single pod.
			assert.EqualValues(t, 1, *clique.Spec.MinAvailable, "GMS clique %q MinAvailable", clique.Name)
			assert.EqualValues(t, 1, clique.Spec.Replicas, "GMS clique %q Replicas", clique.Name)
		} else {
			foundEngine = true
			assert.EqualValues(t, podsPerRank, clique.Spec.Replicas,
				"multinode failover engine clique %q Replicas should be primary+shadows=%d", clique.Name, podsPerRank)
			assert.EqualValues(t, 1, *clique.Spec.MinAvailable,
				"multinode failover engine clique %q MinAvailable must be 1 (shadows are redundant hot spares, NOT NCCL peers)", clique.Name)
		}
	}

	// Guard against a vacuous pass when no engine clique was rendered.
	assert.True(t, foundEngine, "test setup should produce at least one engine (non-GMS) clique")
}
func TestIsWorkerComponent(t *testing.T) {
workers := []string{commonconsts.ComponentTypeWorker, commonconsts.ComponentTypePrefill, commonconsts.ComponentTypeDecode}
nonWorkers := []string{commonconsts.ComponentTypeFrontend, commonconsts.ComponentTypePlanner, commonconsts.ComponentTypeEPP, "custom", ""}
......@@ -7235,7 +7580,8 @@ func TestGenerateBasePodSpec_FrontendSidecar(t *testing.T) {
controllerConfig,
commonconsts.MultinodeDeploymentTypeGrove,
"test-service",
nil,
nil, // checkpointInfo
nil, // deployerOverride
)
if (err != nil) != tt.wantErr {
......
......@@ -23,31 +23,55 @@ import (
// GroveMultinodeDeployer supplies the Grove-specific hostnames and node ranks
// used when rendering multinode pod specs. It embeds MultinodeDeployer and
// switches its naming/rank scheme on IsInterPodGMS.
type GroveMultinodeDeployer struct {
MultinodeDeployer
// IsInterPodGMS is true when this deployer produces pod specs for an
// engine PCLQ that uses the inter-pod GMS *layout* (one engine pod per
// rank, per shadow, with a dedicated GMS weight server pod). It is a
// layout/topology flag — not a failover policy flag — and governs how
// hostnames, node ranks, and per-pod wiring are computed. Today this
// layout is only produced when inter-pod GMS failover is enabled, but
// the deployer itself should not encode that assumption.
IsInterPodGMS bool
// Rank is the explicit node rank of the pods this deployer renders. It is
// only consulted when IsInterPodGMS is true; GetNodeRank then returns it
// verbatim instead of a shell expression derived from the pod index.
Rank int32
}
// GetLeaderHostname returns the hostname template of the leader pod for
// serviceName. The result contains $(GROVE_*) placeholders and the lower-cased
// service name followed by the leader role suffix.
//
// In the inter-pod GMS layout each PCLQ has multiple replicas, and pods at the
// same index across ranks form a communication group, so the leader is
// addressed with the dynamic $(GROVE_PCLQ_POD_INDEX). Otherwise the leader is
// always replica 0 of the leader clique.
func (d *GroveMultinodeDeployer) GetLeaderHostname(serviceName string) string {
	if d.IsInterPodGMS {
		// GMS: address the leader replica that shares this pod's index.
		return fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-$(GROVE_PCLQ_POD_INDEX).$(GROVE_HEADLESS_SERVICE)",
			strings.ToLower(serviceName), commonconsts.GroveRoleSuffixLeader)
	}
	return fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-0.$(GROVE_HEADLESS_SERVICE)",
		strings.ToLower(serviceName), commonconsts.GroveRoleSuffixLeader)
}
// GetNodeRank returns the node rank to hand to the engine, plus a flag telling
// the caller whether the returned string must go through shell expansion.
func (d *GroveMultinodeDeployer) GetNodeRank() (string, bool) {
	if !d.IsInterPodGMS {
		// The rank is derived from the pod index with an arithmetic
		// expression, which only a shell can evaluate.
		return "$((GROVE_PCLQ_POD_INDEX + 1))", true
	}
	// Inter-pod GMS: the rank is fixed on the deployer, so a literal suffices.
	return fmt.Sprintf("%d", d.Rank), false
}
// NeedsDNSWait reports whether pods must wait for DNS before starting. It is
// always false for the Grove deployer, regardless of IsInterPodGMS.
func (d *GroveMultinodeDeployer) NeedsDNSWait() bool {
// Grove doesn't need DNS wait - it handles startup coordination differently
return false
}
func (d *GroveMultinodeDeployer) GetHostNames(serviceName string, numberOfNodes int32) []string {
hostnames := make([]string, 0, numberOfNodes)
leaderHostname := d.GetLeaderHostname(serviceName)
hostnames = append(hostnames, leaderHostname)
// Add worker hostnames
hostnames = append(hostnames, d.GetLeaderHostname(serviceName))
if d.IsInterPodGMS {
for rank := int32(1); rank < numberOfNodes; rank++ {
hostname := fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-%d-$(GROVE_PCLQ_POD_INDEX).$(GROVE_HEADLESS_SERVICE)",
strings.ToLower(serviceName), commonconsts.GroveRoleSuffixWorker, rank)
hostnames = append(hostnames, hostname)
}
} else {
for i := int32(0); i < numberOfNodes-1; i++ {
workerHostname := fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-%d.$(GROVE_HEADLESS_SERVICE)",
hostname := fmt.Sprintf("$(GROVE_PCSG_NAME)-$(GROVE_PCSG_INDEX)-%s-%s-%d.$(GROVE_HEADLESS_SERVICE)",
strings.ToLower(serviceName), commonconsts.GroveRoleSuffixWorker, i)
hostnames = append(hostnames, workerHostname)
hostnames = append(hostnames, hostname)
}
}
return hostnames
}
......@@ -63,18 +87,16 @@ func GetComponentReadinessAndServiceReplicaStatuses(ctx context.Context, client
serviceStatuses := make(map[string]v1alpha1.ServiceReplicaStatus, len(dgd.Spec.Services))
for serviceName, component := range dgd.Spec.Services {
isMultinode := component.GetNumberOfNodes() > 1
usesPCSG := component.GetNumberOfNodes() > 1 || component.IsInterPodGMSEnabled()
resourceName := fmt.Sprintf("%s-0-%s", dgd.Name, strings.ToLower(serviceName))
if isMultinode {
// Check PodCliqueScalingGroup: spec.replicas == status.availableReplicas
if usesPCSG {
ok, reason, serviceStatus := CheckPCSGReady(ctx, client, resourceName, dgd.Namespace, logger)
serviceStatuses[serviceName] = serviceStatus
if !ok {
notReadyComponents = append(notReadyComponents, fmt.Sprintf("pcsg/%s: %s", resourceName, reason))
}
} else {
// Check PodClique: spec.replicas == status.readyReplicas
ok, reason, serviceStatus := CheckPodCliqueReady(ctx, client, resourceName, dgd.Namespace, logger)
serviceStatuses[serviceName] = serviceStatus
if !ok {
......
......@@ -54,6 +54,27 @@ func shellQuoteForBashC(s string) string {
return s
}
// containerHasArg reports whether the container already carries the given
// flag/value pair in its Args. It is used to make flag injection idempotent.
//
// Three spellings are recognized:
//   - adjacent tokens: Args[i] == flag and Args[i+1] == value;
//   - a single "flag=value" token;
//   - "flag value" or "flag=value" embedded inside a longer shell string.
//
// Embedded matches must sit on token boundaries, so that a value which is
// merely a prefix of another value (e.g. "--tp 1" inside "--tp 16") or a flag
// which is a suffix of another flag is not mistaken for the requested pair.
// A nil container never has the argument.
func containerHasArg(container *corev1.Container, flag, value string) bool {
	if container == nil {
		return false
	}
	spaced := flag + " " + value
	assigned := flag + "=" + value
	for i, arg := range container.Args {
		if arg == flag && i+1 < len(container.Args) && container.Args[i+1] == value {
			return true
		}
		if containsArgToken(arg, spaced) || containsArgToken(arg, assigned) {
			return true
		}
	}
	return false
}

// containsArgToken reports whether token occurs in s delimited on both sides.
// A side counts as delimited when it touches the start/end of s, when the
// token's own edge byte is not a word byte (e.g. it ends in ' ' or '='), or
// when the neighbouring byte in s is not a word byte.
func containsArgToken(s, token string) bool {
	if token == "" {
		return false
	}
	for offset := 0; offset+len(token) <= len(s); {
		idx := strings.Index(s[offset:], token)
		if idx < 0 {
			return false
		}
		start := offset + idx
		end := start + len(token)
		leftOK := start == 0 || !isArgWordByte(token[0]) || !isArgWordByte(s[start-1])
		rightOK := end == len(s) || !isArgWordByte(token[len(token)-1]) || !isArgWordByte(s[end])
		if leftOK && rightOK {
			return true
		}
		// Keep scanning: a later occurrence may be properly delimited.
		offset = start + 1
	}
	return false
}

// isArgWordByte reports whether b can be part of a flag name or of an
// unquoted flag value (ASCII letters, digits and "_-./:,").
func isArgWordByte(b byte) bool {
	switch {
	case b >= 'a' && b <= 'z', b >= 'A' && b <= 'Z', b >= '0' && b <= '9':
		return true
	}
	return strings.IndexByte("_-./:,", b) >= 0
}
func injectFlagsIntoContainerCommand(container *corev1.Container, flags string, needsShell bool, framework string) {
if len(container.Command) > 0 && isPythonCommand(container.Command[0]) {
// Direct python command case
......
......@@ -43,6 +43,11 @@ const (
// Pod names follow formats like: <pcs-name>-<pcs-index>-<pcsg-name>-<pcsg-index>-<pclq-name>-<random>
// The random string and hyphens consume additional characters, leaving 45 for the resource names.
maxCombinedResourceNameLength = 45
// backendFrameworkVLLM is the spec.backendFramework value that identifies
// a vLLM deployment. Duplicated here (instead of importing from
// internal/dynamo) to avoid a webhook -> dynamo import cycle.
backendFrameworkVLLM = "vllm"
)
// DynamoGraphDeploymentValidator validates DynamoGraphDeployment resources.
......@@ -50,21 +55,24 @@ const (
type DynamoGraphDeploymentValidator struct {
// deployment is the DynamoGraphDeployment being validated.
deployment *nvidiacomv1alpha1.DynamoGraphDeployment
mgr ctrl.Manager // Optional: for API group detection via discovery client
// groveEnabled should reflect the operator's runtime config
// (global.grove.enabled); both constructors set it from their argument.
groveEnabled bool
}
// NewDynamoGraphDeploymentValidator creates a new validator for DynamoGraphDeployment.
func NewDynamoGraphDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoGraphDeployment) *DynamoGraphDeploymentValidator {
// groveEnabled should reflect the operator's runtime config (global.grove.enabled).
func NewDynamoGraphDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoGraphDeployment, groveEnabled bool) *DynamoGraphDeploymentValidator {
return &DynamoGraphDeploymentValidator{
deployment: deployment,
mgr: nil,
groveEnabled: groveEnabled,
}
}
// NewDynamoGraphDeploymentValidatorWithManager creates a validator with a manager for API group detection.
func NewDynamoGraphDeploymentValidatorWithManager(deployment *nvidiacomv1alpha1.DynamoGraphDeployment, mgr ctrl.Manager) *DynamoGraphDeploymentValidator {
func NewDynamoGraphDeploymentValidatorWithManager(deployment *nvidiacomv1alpha1.DynamoGraphDeployment, mgr ctrl.Manager, groveEnabled bool) *DynamoGraphDeploymentValidator {
return &DynamoGraphDeploymentValidator{
deployment: deployment,
mgr: mgr,
groveEnabled: groveEnabled,
}
}
......@@ -176,6 +184,44 @@ func (v *DynamoGraphDeploymentValidator) validateImmutableFields(old *nvidiacomv
}
}
// Validate inter-pod GMS layout and failover immutability.
//
// Flipping the inter-pod GMS layout or toggling failover within an
// inter-pod layout both change the PodClique topology (weight-server PCLQ,
// per-rank engine PCLQs, shadow PCLQs, DRA ResourceClaimTemplates), which
// Grove cannot transform in place. Force the user to delete and recreate.
for serviceName, newService := range v.deployment.Spec.Services {
oldService, exists := old.Spec.Services[serviceName]
if !exists {
continue
}
oldInterPodGMS := oldService.IsInterPodGMSEnabled()
newInterPodGMS := newService.IsInterPodGMSEnabled()
if oldInterPodGMS != newInterPodGMS {
errs = append(errs, fmt.Errorf(
"spec.services[%s].gpuMemoryService.mode: the inter-pod GMS layout cannot be toggled after creation; "+
"delete and recreate the DynamoGraphDeployment",
serviceName,
))
}
oldInterPodFailover := oldService.IsInterPodFailoverEnabled()
newInterPodFailover := newService.IsInterPodFailoverEnabled()
if oldInterPodFailover != newInterPodFailover {
errs = append(errs, fmt.Errorf(
"spec.services[%s].failover: inter-pod GMS failover cannot be toggled after creation; "+
"delete and recreate the DynamoGraphDeployment",
serviceName,
))
}
if oldInterPodFailover && newInterPodFailover && oldService.Failover.NumShadows != newService.Failover.NumShadows {
errs = append(errs, fmt.Errorf(
"spec.services[%s].failover.numShadows is immutable for inter-pod GMS failover; "+
"delete and recreate the DynamoGraphDeployment to change it",
serviceName,
))
}
}
// Validate topology constraint immutability
if err := v.validateTopologyConstraintImmutability(old); err != nil {
errs = append(errs, err)
......@@ -279,6 +325,41 @@ func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv
// validateService validates a single service configuration using SharedSpecValidator.
// Returns warnings and error.
func (v *DynamoGraphDeploymentValidator) validateService(ctx context.Context, serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) (admission.Warnings, error) {
// The inter-pod GMS layout (with or without failover) requires the Grove
// pathway: the weight-server pod, per-rank PCLQs, and DRA ResourceClaim
// templates are all wired at the PodCliqueScalingGroup level, which only
// the Grove renderer produces.
if service.IsInterPodGMSEnabled() && !v.isGrovePathway() {
if !v.groveEnabled {
return nil, fmt.Errorf(
"spec.services[%s]: gpuMemoryService.mode=%q requires the Grove pathway, but Grove is disabled at the operator level (global.grove.enabled=false)",
serviceName, nvidiacomv1alpha1.GMSModeInterPod)
}
return nil, fmt.Errorf(
"spec.services[%s]: gpuMemoryService.mode=%q requires the Grove pathway; remove or unset the %q annotation (currently %q)",
serviceName, nvidiacomv1alpha1.GMSModeInterPod,
consts.KubeAnnotationEnableGrove, v.deployment.Annotations[consts.KubeAnnotationEnableGrove])
}
// The inter-pod GMS layout is currently implemented only for vLLM (the
// engine relies on vLLM-specific runtime hooks like --load-format gms and
// DYN_VLLM_GMS_SHADOW_MODE that activate the GMS client path). Fail fast
// at admission rather than producing a broken deployment when another or
// no backend is configured — an empty BackendFramework means the operator
// cannot confirm the engine speaks vLLM, which is a hard prerequisite for
// inter-pod GMS (both standalone and with failover).
if service.IsInterPodGMSEnabled() &&
v.deployment.Spec.BackendFramework != backendFrameworkVLLM {
detected := v.deployment.Spec.BackendFramework
if detected == "" {
detected = "<unset>"
}
return nil, fmt.Errorf(
"spec.services[%s]: the inter-pod GMS layout (gpuMemoryService.mode=%q) is currently supported only for vLLM (detected: %s); "+
"set spec.backendFramework=%q",
serviceName, nvidiacomv1alpha1.GMSModeInterPod, detected, backendFrameworkVLLM)
}
// Validate service name length constraints for Grove PodCliqueSet naming
// Only validate when Grove pathway may be in use
if v.isGrovePathway() {
......@@ -318,44 +399,69 @@ func (v *DynamoGraphDeploymentValidator) validateServiceNameLength(serviceName s
dgdName := v.deployment.Name
lowerServiceName := strings.ToLower(serviceName)
// Check if this is a multinode service
isMultinode := service.GetNumberOfNodes() > 1
isInterPodGMS := service.IsInterPodGMSEnabled()
// Determine the longest PodClique name that will be generated.
// Grove validates: len(PCS name) + len(PCSG name) + len(PCLQ name) <= 45
var longestPCLQName string
var pcsgName string
switch {
case isInterPodGMS:
// GMS services always get a PCSG named after the service.
// Longest PCLQ name is "serviceName-gms-0" (len + 6) or "serviceName-wkr-N".
pcsgName = lowerServiceName
gmsName := fmt.Sprintf("%s-%s-0", lowerServiceName, consts.GroveRoleSuffixGMS)
longestPCLQName = gmsName
if isMultinode {
// For multinode: PodCliqueSet name + PodCliqueScalingGroup name + PodClique name (with leader suffix)
// The PodClique name is serviceName + "-ldr" (using GroveRoleSuffixLeader)
leaderPodCliqueName := lowerServiceName + "-" + consts.GroveRoleSuffixLeader
combinedLength := len(dgdName) + len(lowerServiceName) + len(leaderPodCliqueName)
// For high node counts, "svc-wkr-NN" can be longer than "svc-gms-0"
maxRank := service.GetNumberOfNodes() - 1
workerName := fmt.Sprintf("%s-%s-%d", lowerServiceName, consts.GroveRoleSuffixWorker, maxRank)
if len(workerName) > len(longestPCLQName) {
longestPCLQName = workerName
}
}
case isMultinode:
pcsgName = lowerServiceName
longestPCLQName = lowerServiceName + "-" + consts.GroveRoleSuffixLeader
default:
// Single-node non-GMS: no PCSG, only PCS + PCLQ
combinedLength := len(dgdName) + len(lowerServiceName)
if combinedLength > maxCombinedResourceNameLength {
return fmt.Errorf("spec.services[%s]: combined resource name length %d exceeds %d-character limit required for pod naming. "+
"Consider shortening the DynamoGraphDeployment name '%s' (length %d) or service name '%s' (length %d). "+
"For multinode services, the combined length of DGD name + service name + service name with role suffix (e.g., '%s-ldr') must not exceed %d characters",
"The combined length of DGD name + service name must not exceed %d characters",
serviceName, combinedLength, maxCombinedResourceNameLength,
dgdName, len(dgdName), serviceName, len(serviceName),
lowerServiceName, maxCombinedResourceNameLength)
maxCombinedResourceNameLength)
}
return nil
}
} else {
// For single-node: PodCliqueSet name + PodClique name
combinedLength := len(dgdName) + len(lowerServiceName)
// For services with PCSG: PCS name + PCSG name + longest PCLQ name
combinedLength := len(dgdName) + len(pcsgName) + len(longestPCLQName)
if combinedLength > maxCombinedResourceNameLength {
return fmt.Errorf("spec.services[%s]: combined resource name length %d exceeds %d-character limit required for pod naming. "+
"Consider shortening the DynamoGraphDeployment name '%s' (length %d) or service name '%s' (length %d). "+
"The combined length of DGD name + service name must not exceed %d characters",
"The combined length of DGD name + PCSG name + longest PodClique name ('%s') must not exceed %d characters",
serviceName, combinedLength, maxCombinedResourceNameLength,
dgdName, len(dgdName), serviceName, len(serviceName),
maxCombinedResourceNameLength)
}
longestPCLQName, maxCombinedResourceNameLength)
}
return nil
}
// isGrovePathway determines if Grove pathway may be used for this deployment.
// Grove is used when the nvidia.com/enable-grove annotation is NOT explicitly set to "false".
// This is a conservative check - if Grove might be used, we validate the name length constraints.
// Grove requires both operator-level enablement (global.grove.enabled) and the
// per-DGD annotation not being explicitly set to "false".
func (v *DynamoGraphDeploymentValidator) isGrovePathway() bool {
	// Operator-level gate: with Grove switched off for the operator, no
	// deployment can take the Grove pathway regardless of its annotations.
	if !v.groveEnabled {
		return false
	}

	// No annotations at all means the deployment never opted out.
	annotations := v.deployment.Annotations
	if annotations == nil {
		return true
	}

	// The opt-out is matched case-insensitively; any other value, including a
	// missing annotation (empty string), keeps the Grove pathway in play.
	optOut := strings.ToLower(annotations[consts.KubeAnnotationEnableGrove])
	return optOut != consts.KubeLabelValueFalse
}
......@@ -797,18 +903,22 @@ func (v *DynamoGraphDeploymentValidator) validateNoRestartDuringRollingUpdate(ol
}
// validateFailoverRequiresDiscoveryMode checks that when any service has
// failover enabled, the DGD carries the nvidia.com/dynamo-kube-discovery-mode
// annotation set to "container". Failover pods produce multiple engine
// containers that each need their own discovery identity.
// intra-pod failover enabled, the DGD carries the nvidia.com/dynamo-kube-discovery-mode
// annotation set to "container". Intra-pod failover produces multiple engine
// containers within the same pod that each need their own discovery identity.
// Inter-pod failover uses separate pods, so the annotation is not required.
func (v *DynamoGraphDeploymentValidator) validateFailoverRequiresDiscoveryMode() error {
hasFailover := false
hasIntraPodFailover := false
for _, svc := range v.deployment.Spec.Services {
if svc != nil && svc.Failover != nil && svc.Failover.Enabled {
hasFailover = true
if svc == nil || svc.Failover == nil || !svc.Failover.Enabled {
continue
}
if svc.Failover.Mode == nvidiacomv1alpha1.GMSModeIntraPod {
hasIntraPodFailover = true
break
}
}
if !hasFailover {
if !hasIntraPodFailover {
return nil
}
......
......@@ -43,15 +43,18 @@ const (
type DynamoGraphDeploymentHandler struct {
// mgr is handed to the validator, which uses it for API group detection.
mgr manager.Manager
// operatorPrincipal is the full Kubernetes ServiceAccount username of the
// operator, used to authorize replica changes on scaling-adapter-enabled
// services.
operatorPrincipal string
// groveEnabled reflects the operator's runtime config (global.grove.enabled)
// and is forwarded to the validators built in ValidateCreate and
// ValidateUpdate.
groveEnabled bool
}
// NewDynamoGraphDeploymentHandler creates a new handler for DynamoGraphDeployment Webhook.
// operatorPrincipal is the full Kubernetes SA username of the operator, used to authorize
// replica changes on scaling-adapter-enabled services (#7656).
func NewDynamoGraphDeploymentHandler(mgr manager.Manager, operatorPrincipal string) *DynamoGraphDeploymentHandler {
// groveEnabled reflects the operator's runtime config (global.grove.enabled).
func NewDynamoGraphDeploymentHandler(mgr manager.Manager, operatorPrincipal string, groveEnabled bool) *DynamoGraphDeploymentHandler {
	// Start from a zero-valued handler and store each argument verbatim; no
	// validation or defaulting happens at construction time.
	handler := new(DynamoGraphDeploymentHandler)
	handler.mgr = mgr
	handler.operatorPrincipal = operatorPrincipal
	handler.groveEnabled = groveEnabled
	return handler
}
......@@ -67,7 +70,7 @@ func (h *DynamoGraphDeploymentHandler) ValidateCreate(ctx context.Context, obj r
logger.Info("validate create", "name", deployment.Name, "namespace", deployment.Namespace)
// Create validator with manager for API group detection and perform validation
validator := NewDynamoGraphDeploymentValidatorWithManager(deployment, h.mgr)
validator := NewDynamoGraphDeploymentValidatorWithManager(deployment, h.mgr, h.groveEnabled)
return validator.Validate(ctx)
}
......@@ -94,7 +97,7 @@ func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldOb
}
// Create validator with manager for API group detection and perform validation.
validator := NewDynamoGraphDeploymentValidatorWithManager(newDeployment, h.mgr)
validator := NewDynamoGraphDeploymentValidatorWithManager(newDeployment, h.mgr, h.groveEnabled)
warnings, err := validator.Validate(ctx)
if err != nil {
return warnings, err
......
......@@ -42,6 +42,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
tests := []struct {
name string
deployment *nvidiacomv1alpha1.DynamoGraphDeployment
groveEnabled bool
wantErr bool
errMsg string
errContains bool
......@@ -511,6 +512,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
// Service name length validation tests
{
name: "service name too long for single-node deployment",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "verylongdynamographdeploymentname",
......@@ -528,6 +530,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
},
{
name: "service name too long for multinode deployment",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "vllm-agg",
......@@ -549,6 +552,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
},
{
name: "valid service name length for single-node",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "dgd",
......@@ -564,6 +568,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
},
{
name: "valid service name length for multinode",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "dgd",
......@@ -583,6 +588,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
},
{
name: "boundary case - exactly at 45 char limit for single-node",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
// DGD name (3 chars) + service name (42 chars) = 45 chars (exactly at limit)
......@@ -600,6 +606,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
},
{
name: "boundary case - one char over limit for single-node",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
// DGD name (3 chars) + service name (43 chars) = 46 chars (over limit)
......@@ -620,6 +627,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
// Grove disabled tests - service name length validation should be skipped
{
name: "long service name allowed when Grove disabled via annotation",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "verylongdynamographdeploymentname",
......@@ -638,6 +646,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
},
{
name: "long multinode service name allowed when Grove disabled via annotation",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "vllm-agg",
......@@ -660,6 +669,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
},
{
name: "Grove annotation case insensitive - FALSE",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "verylongdynamographdeploymentname",
......@@ -676,6 +686,280 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
},
wantErr: false,
},
// GMS failover validation test cases
{
name: "valid GMS failover single-node with GPU",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-gms",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Failover: &nvidiacomv1alpha1.FailoverSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
NumShadows: 1,
},
Resources: &nvidiacomv1alpha1.Resources{
Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
},
},
},
},
},
wantErr: false,
},
{
name: "valid standalone inter-pod GMS (no failover) single-node with GPU",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-gms-standalone",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Resources: &nvidiacomv1alpha1.Resources{
Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
},
},
},
},
},
wantErr: false,
},
{
name: "GMS failover without GPU",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-gms",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Failover: &nvidiacomv1alpha1.FailoverSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
NumShadows: 1,
},
},
},
},
},
wantErr: true,
errContains: true,
// validateGPUMemoryService fires first when the inter-pod layout
// is declared without any GPU resources.
errMsg: "requires resources.limits.gpu",
},
{
name: "inter-pod GMS on frontend component rejected",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-gms",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"fe": {
ComponentType: "frontend",
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Resources: &nvidiacomv1alpha1.Resources{
Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "1"},
},
},
},
},
},
wantErr: true,
errContains: true,
errMsg: "GPU memory service is only supported for worker components",
},
{
name: "GMS failover requires Grove pathway - annotation disabled",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-gms",
Namespace: "default",
Annotations: map[string]string{
consts.KubeAnnotationEnableGrove: "false",
},
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Failover: &nvidiacomv1alpha1.FailoverSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
NumShadows: 1,
},
Resources: &nvidiacomv1alpha1.Resources{
Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
},
},
},
},
},
wantErr: true,
errContains: true,
errMsg: "requires the Grove pathway",
},
{
name: "GMS failover requires Grove pathway - operator grove disabled",
groveEnabled: false,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-gms",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Failover: &nvidiacomv1alpha1.FailoverSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
NumShadows: 1,
},
Resources: &nvidiacomv1alpha1.Resources{
Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
},
},
},
},
},
wantErr: true,
errContains: true,
errMsg: "requires the Grove pathway",
},
{
name: "inter-pod GMS rejected on non-vLLM backend",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-gms",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Failover: &nvidiacomv1alpha1.FailoverSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
NumShadows: 1,
},
Resources: &nvidiacomv1alpha1.Resources{
Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
},
},
},
},
},
wantErr: true,
errContains: true,
errMsg: "currently supported only for vLLM",
},
{
name: "inter-pod GMS rejected when backendFramework is unset",
groveEnabled: true,
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-gms",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
// BackendFramework intentionally left empty — the
// inter-pod gate must fail closed rather than silently
// accept a deployment whose engine may not speak vLLM.
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Failover: &nvidiacomv1alpha1.FailoverSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
NumShadows: 1,
},
Resources: &nvidiacomv1alpha1.Resources{
Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "8"},
},
},
},
},
},
wantErr: true,
errContains: true,
errMsg: "currently supported only for vLLM",
},
{
name: "GMS failover disabled is valid without GPU",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-gms",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
Failover: &nvidiacomv1alpha1.FailoverSpec{
Enabled: false,
},
},
},
},
},
wantErr: false,
},
// Annotation validation test cases
{
name: "valid annotation vllm-distributed-executor-backend=mp",
......@@ -1245,7 +1529,7 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentValidator(tt.deployment)
validator := NewDynamoGraphDeploymentValidator(tt.deployment, tt.groveEnabled)
_, err := validator.Validate(context.Background())
if (err != nil) != tt.wantErr {
......@@ -1928,11 +2212,119 @@ func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) {
},
wantErr: false,
},
{
name: "toggling GMS failover is immutable",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Failover: &nvidiacomv1alpha1.FailoverSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
NumShadows: 1,
},
},
},
},
},
wantErr: true,
errMsg: "failover cannot be toggled after creation",
},
{
name: "toggling inter-pod GMS layout is immutable",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
},
},
},
},
wantErr: true,
errMsg: "inter-pod GMS layout cannot be toggled after creation",
},
{
name: "changing numShadows is immutable",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Failover: &nvidiacomv1alpha1.FailoverSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
NumShadows: 1,
},
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "vllm",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"worker": {
ComponentType: consts.ComponentTypeWorker,
GPUMemoryService: &nvidiacomv1alpha1.GPUMemoryServiceSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
},
Failover: &nvidiacomv1alpha1.FailoverSpec{
Enabled: true,
Mode: nvidiacomv1alpha1.GMSModeInterPod,
NumShadows: 3,
},
},
},
},
},
wantErr: true,
errMsg: "failover.numShadows is immutable",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentValidator(tt.newDeployment)
validator := NewDynamoGraphDeploymentValidator(tt.newDeployment, true)
// Pass nil userInfo and empty operatorPrincipal - these tests don't modify replicas, so it's safe
warnings, err := validator.ValidateUpdate(tt.oldDeployment, nil, "")
......
......@@ -19,6 +19,7 @@ package validation
import (
"context"
"errors"
"fmt"
"strconv"
"strings"
......@@ -129,12 +130,12 @@ func (v *SharedSpecValidator) Validate(ctx context.Context) (admission.Warnings,
return nil, err
}
// Validate GPU memory service configuration
// Validate GPU memory service configuration (intra-pod and inter-pod layouts)
if err := v.validateGPUMemoryService(); err != nil {
return nil, err
}
// Validate failover configuration
// Validate GMS failover constraints
if err := v.validateFailover(); err != nil {
return nil, err
}
......@@ -266,50 +267,128 @@ func (v *SharedSpecValidator) validateFrontendSidecar() error {
return nil
}
// validateFailover validates the failover configuration for a service.
// Structural checks only — DRA/DeviceClass availability is checked by the controller
// at reconcile time (same pattern as Grove orchestrator availability).
// parseGPUCount reports how many GPUs a Resources block asks for.
//
// The value in Limits wins when it is non-empty; otherwise the value in
// Requests is used. A nil block, or one with no GPU value in either place,
// yields (0, nil). A value that is not a base-10 integer yields an error that
// quotes the offending string and wraps the strconv failure.
func parseGPUCount(r *nvidiacomv1alpha1.Resources) (int, error) {
	if r == nil {
		return 0, nil
	}

	var raw string
	if r.Limits != nil && r.Limits.GPU != "" {
		raw = r.Limits.GPU
	} else if r.Requests != nil && r.Requests.GPU != "" {
		raw = r.Requests.GPU
	} else {
		// Neither side specifies a GPU quantity.
		return 0, nil
	}

	count, convErr := strconv.Atoi(raw)
	if convErr != nil {
		return 0, fmt.Errorf("invalid value %q: %w", raw, convErr)
	}
	return count, nil
}
// validateFailover validates GMS failover configuration constraints.
//
// The layout (intra-pod sidecar vs. inter-pod weight-server pod) is declared
// by gpuMemoryService.mode. failover is an independent toggle: when enabled,
// failover.mode MUST match gpuMemoryService.mode so the two knobs describe a
// consistent topology. It is also valid to configure gpuMemoryService without
// failover (no shadows; a single engine + GMS server pair) — see
// validateGPUMemoryService below.
func (v *SharedSpecValidator) validateFailover() error {
if v.spec.Failover == nil || !v.spec.Failover.Enabled {
// When failover.enabled is false the sub-fields (mode, numShadows)
// are dormant configuration and the render path ignores them
// (GetNumShadows returns 0). We deliberately do not validate them
// here so users can stage a failover config before flipping
// enabled=true — matching the K8s convention that fields on a
// disabled feature are not constrained.
return nil
}
// Failover requires GPU memory service
var errs []error
// For intra-pod mode: require gpuMemoryService.enabled and validate mode matching.
if v.spec.Failover.Mode == nvidiacomv1alpha1.GMSModeIntraPod {
if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled {
return fmt.Errorf(
"%s.failover: failover requires gpuMemoryService.enabled to be true",
v.fieldPath)
errs = append(errs, fmt.Errorf(
"%s.failover: intraPod failover requires gpuMemoryService.enabled to be true",
v.fieldPath))
} else if v.spec.GPUMemoryService.Mode != "" &&
v.spec.GPUMemoryService.Mode != nvidiacomv1alpha1.GMSModeIntraPod {
errs = append(errs, fmt.Errorf(
"%s.failover: failover.mode %q must match gpuMemoryService.mode %q",
v.fieldPath, v.spec.Failover.Mode, v.spec.GPUMemoryService.Mode))
}
// Failover mode must match GMS mode when both are set
if v.spec.Failover.Mode != "" && v.spec.GPUMemoryService.Mode != "" &&
v.spec.Failover.Mode != v.spec.GPUMemoryService.Mode {
return fmt.Errorf(
"%s.failover: failover.mode %q must match gpuMemoryService.mode %q",
v.fieldPath, v.spec.Failover.Mode, v.spec.GPUMemoryService.Mode)
// intraPod is a fixed 1 primary + 1 shadow sidecar layout; numShadows
// is meaningless here and any value other than the implicit 1 is
// almost certainly a configuration error (user probably wanted
// mode=interPod).
if v.spec.Failover.NumShadows != 0 && v.spec.Failover.NumShadows != 1 {
errs = append(errs, fmt.Errorf(
"%s.failover.numShadows=%d is invalid for mode=%q: intraPod uses a fixed 1 primary + 1 shadow sidecar; "+
"use failover.mode=%q to configure numShadows",
v.fieldPath, v.spec.Failover.NumShadows, nvidiacomv1alpha1.GMSModeIntraPod, nvidiacomv1alpha1.GMSModeInterPod))
}
}
// interPod failover is not yet supported
// For inter-pod mode: require the inter-pod GMS layout (gpuMemoryService
// with mode=interPod) so failover hot-spares are added on top of an
// already-declared weight-server pod layout.
if v.spec.Failover.Mode == nvidiacomv1alpha1.GMSModeInterPod {
return fmt.Errorf(
"%s.failover: mode \"interPod\" is not yet supported",
v.fieldPath)
if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled {
errs = append(errs, fmt.Errorf(
"%s.failover: interPod failover requires gpuMemoryService.enabled=true and gpuMemoryService.mode=%q",
v.fieldPath, nvidiacomv1alpha1.GMSModeInterPod))
} else if v.spec.GPUMemoryService.Mode != nvidiacomv1alpha1.GMSModeInterPod {
// An unset gpuMemoryService.mode defaults to the intra-pod sidecar
// layout, which is incompatible with inter-pod failover; the user
// must set gpuMemoryService.mode=interPod explicitly.
detected := string(v.spec.GPUMemoryService.Mode)
if detected == "" {
detected = "<unset>"
}
errs = append(errs, fmt.Errorf(
"%s.failover: interPod failover requires gpuMemoryService.mode=%q (got %q)",
v.fieldPath, nvidiacomv1alpha1.GMSModeInterPod, detected))
}
return nil
if v.spec.Failover.NumShadows < 1 {
errs = append(errs, fmt.Errorf("%s.failover.numShadows must be >= 1", v.fieldPath))
}
gpuCount, err := parseGPUCount(v.spec.Resources)
if err != nil {
errs = append(errs, fmt.Errorf("%s.resources.limits.gpu: %w", v.fieldPath, err))
} else if gpuCount < 1 {
errs = append(errs, fmt.Errorf("%s: GMS failover requires at least 1 GPU in resources.limits.gpu", v.fieldPath))
}
switch v.spec.ComponentType {
case consts.ComponentTypeEPP, consts.ComponentTypeFrontend, consts.ComponentTypePlanner:
errs = append(errs, fmt.Errorf("%s: GMS failover is not supported for componentType %q", v.fieldPath, v.spec.ComponentType))
}
}
return errors.Join(errs...)
}
// validateGPUMemoryService validates gpuMemoryService constraints.
//
// gpuMemoryService declares the GMS layout (intra-pod sidecar vs. inter-pod
// dedicated weight-server pod) and may be enabled independently of failover:
// the intra-pod layout gives the engine a GMS sidecar in the same pod, and
// the inter-pod layout gives it a dedicated weight-server pod paired with one
// engine pod. Failover adds shadow engine pods on top of the declared layout
// (see validateFailover); it is not the sole way to request the inter-pod
// layout.
func (v *SharedSpecValidator) validateGPUMemoryService() error {
if v.spec.GPUMemoryService == nil || !v.spec.GPUMemoryService.Enabled {
return nil
}
if v.spec.GPUMemoryService.Mode == nvidiacomv1alpha1.GMSModeInterPod {
return fmt.Errorf(
"%s.gpuMemoryService: mode \"interPod\" is not yet supported",
v.fieldPath)
}
isWorker := v.spec.ComponentType == consts.ComponentTypeWorker ||
v.spec.ComponentType == consts.ComponentTypePrefill ||
v.spec.ComponentType == consts.ComponentTypeDecode
......@@ -319,27 +398,7 @@ func (v *SharedSpecValidator) validateGPUMemoryService() error {
v.fieldPath)
}
if v.spec.Resources == nil {
return fmt.Errorf(
"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
v.fieldPath)
}
gpuStr := ""
switch {
case v.spec.Resources.Limits != nil && v.spec.Resources.Limits.GPU != "":
gpuStr = v.spec.Resources.Limits.GPU
case v.spec.Resources.Requests != nil && v.spec.Resources.Requests.GPU != "":
gpuStr = v.spec.Resources.Requests.GPU
}
if gpuStr == "" {
return fmt.Errorf(
"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
v.fieldPath)
}
gpuCount, err := strconv.Atoi(gpuStr)
gpuCount, err := parseGPUCount(v.spec.Resources)
if err != nil || gpuCount < 1 {
return fmt.Errorf(
"%s.gpuMemoryService: GPU memory service requires resources.limits.gpu >= 1",
......
......@@ -396,6 +396,195 @@ func TestSharedSpecValidator_Validate_Warnings(t *testing.T) {
}
}
// TestSharedSpecValidator_Failover_ModeConstraints covers the layout/failover
// symmetry invariants enforced by validateFailover / validateGPUMemoryService:
//
// 1. gpuMemoryService declares the layout (intra-pod sidecar vs. inter-pod
// weight-server pod). Both modes are valid on their own (standalone GMS
// with no failover), and both may be paired with failover of a matching
// mode.
// 2. failover.mode=intraPod requires gpuMemoryService.enabled=true and a
// matching (or unset) gpuMemoryService.mode.
// 3. failover.mode=interPod requires gpuMemoryService.enabled=true AND
// gpuMemoryService.mode=interPod — the symmetric counterpart of (2).
// 4. intraPod failover with numShadows other than 0 (unset) or 1 is
// rejected (intraPod is a fixed 1 primary + 1 shadow layout).
// 5. When failover.enabled=false, sub-fields (mode, numShadows) are dormant
// configuration and are intentionally NOT validated — the render path
// ignores them and users may stage a config before enabling failover.
func TestSharedSpecValidator_Failover_ModeConstraints(t *testing.T) {
	// Local aliases keep the table rows short; they denote the same API types.
	type (
		sharedSpec   = nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
		gmsSpec      = nvidiacomv1alpha1.GPUMemoryServiceSpec
		failoverSpec = nvidiacomv1alpha1.FailoverSpec
	)

	// Every case is a worker component with a single GPU limit; the same
	// Resources value is shared by all rows.
	oneGPU := &nvidiacomv1alpha1.Resources{
		Limits: &nvidiacomv1alpha1.ResourceItem{GPU: "1"},
	}

	// worker builds the spec under test. Passing nil for either argument
	// leaves the corresponding section unset.
	worker := func(gms *gmsSpec, failover *failoverSpec) *sharedSpec {
		return &sharedSpec{
			ComponentType:    consts.ComponentTypeWorker,
			Resources:        oneGPU,
			GPUMemoryService: gms,
			Failover:         failover,
		}
	}

	cases := []struct {
		name      string
		spec      *sharedSpec
		wantErr   bool
		errSubstr string
	}{
		{
			name: "standalone inter-pod GMS (no failover) is accepted",
			spec: worker(
				&gmsSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeInterPod},
				nil,
			),
		},
		{
			name: "sidecar gpuMemoryService mode=intraPod is accepted",
			spec: worker(
				&gmsSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeIntraPod},
				nil,
			),
		},
		{
			name: "sidecar gpuMemoryService mode unset is accepted",
			spec: worker(
				&gmsSpec{Enabled: true},
				nil,
			),
		},
		{
			name: "inter-pod failover requires gpuMemoryService.enabled",
			spec: worker(
				nil,
				&failoverSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeInterPod, NumShadows: 1},
			),
			wantErr:   true,
			errSubstr: "gpuMemoryService.enabled=true",
		},
		{
			name: "inter-pod failover requires gpuMemoryService.mode=interPod",
			spec: worker(
				&gmsSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeIntraPod},
				&failoverSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeInterPod, NumShadows: 1},
			),
			wantErr:   true,
			errSubstr: "requires gpuMemoryService.mode",
		},
		{
			name: "inter-pod failover with matching gpuMemoryService.mode=interPod is accepted",
			spec: worker(
				&gmsSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeInterPod},
				&failoverSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeInterPod, NumShadows: 1},
			),
		},
		{
			// With failover.enabled=false the sub-fields are dormant: the
			// validator leaves them unconstrained so a configuration can be
			// staged before failover is switched on.
			name: "numShadows with failover.enabled=false is accepted (dormant config)",
			spec: worker(
				&gmsSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeInterPod},
				&failoverSpec{Enabled: false, NumShadows: 2},
			),
		},
		{
			name: "intraPod failover with numShadows=2 is rejected",
			spec: worker(
				&gmsSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeIntraPod},
				&failoverSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeIntraPod, NumShadows: 2},
			),
			wantErr:   true,
			errSubstr: "numShadows",
		},
		{
			name: "intraPod failover with numShadows=1 is accepted",
			spec: worker(
				&gmsSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeIntraPod},
				&failoverSpec{Enabled: true, Mode: nvidiacomv1alpha1.GMSModeIntraPod, NumShadows: 1},
			),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			validator := NewSharedSpecValidator(tc.spec, "spec", "default-my-dgd")
			_, err := validator.Validate(context.Background())

			// Acceptance cases: any error is a failure, nothing else to check.
			if !tc.wantErr {
				if err != nil {
					t.Fatalf("unexpected error: %v", err)
				}
				return
			}

			// Rejection cases: an error must be present and, when a substring
			// is specified, the message must mention it.
			if err == nil {
				t.Fatalf("expected error, got nil")
			}
			if tc.errSubstr == "" || contains(err.Error(), tc.errSubstr) {
				return
			}
			t.Errorf("error %q does not contain %q", err.Error(), tc.errSubstr)
		})
	}
}
// contains reports whether s contains substr.
func contains(s, substr string) bool {
return len(s) >= len(substr) && (s == substr || len(substr) == 0 ||
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment