"lib/bindings/python/vscode:/vscode.git/clone" did not exist on "aaf283bbb898bb7fae2909fe11a2762b6350d1cc"
Unverified Commit a31cbd0d authored by Raghav Potluri's avatar Raghav Potluri Committed by GitHub
Browse files

fix(operator): prevent nil pointer panic when DGD service omits replicas (#6739)


Signed-off-by: default avatarRaghav Potluri <raghav.potluri21@gmail.com>
parent 0f974b47
...@@ -835,7 +835,11 @@ func expandRolesForService(serviceName string, serviceReplicas *int32, numberOfN ...@@ -835,7 +835,11 @@ func expandRolesForService(serviceName string, serviceReplicas *int32, numberOfN
roles = append(roles, ServiceRole{Name: serviceName + "-" + commonconsts.GroveRoleSuffixLeader, Role: RoleLeader, Replicas: 1}) roles = append(roles, ServiceRole{Name: serviceName + "-" + commonconsts.GroveRoleSuffixLeader, Role: RoleLeader, Replicas: 1})
roles = append(roles, ServiceRole{Name: serviceName + "-" + commonconsts.GroveRoleSuffixWorker, Role: RoleWorker, Replicas: numberOfNodes - 1}) roles = append(roles, ServiceRole{Name: serviceName + "-" + commonconsts.GroveRoleSuffixWorker, Role: RoleWorker, Replicas: numberOfNodes - 1})
} else { } else {
roles = append(roles, ServiceRole{Name: serviceName, Role: RoleMain, Replicas: *serviceReplicas}) replicas := int32(1)
if serviceReplicas != nil {
replicas = *serviceReplicas
}
roles = append(roles, ServiceRole{Name: serviceName, Role: RoleMain, Replicas: replicas})
} }
return roles return roles
} }
......
...@@ -4243,14 +4243,14 @@ func TestExpandRolesForService(t *testing.T) { ...@@ -4243,14 +4243,14 @@ func TestExpandRolesForService(t *testing.T) {
name string name string
serviceName string serviceName string
numberOfNodes int32 numberOfNodes int32
serviceReplicas int32 serviceReplicas *int32
expected []ServiceRole expected []ServiceRole
}{ }{
{ {
name: "single node", name: "single node",
serviceName: "test-service", serviceName: "test-service",
numberOfNodes: 1, numberOfNodes: 1,
serviceReplicas: 2, serviceReplicas: ptr.To(int32(2)),
expected: []ServiceRole{ expected: []ServiceRole{
{Name: "test-service", Role: RoleMain, Replicas: 2}, {Name: "test-service", Role: RoleMain, Replicas: 2},
}, },
...@@ -4277,16 +4277,33 @@ func TestExpandRolesForService(t *testing.T) { ...@@ -4277,16 +4277,33 @@ func TestExpandRolesForService(t *testing.T) {
name: "zero nodes should return main", name: "zero nodes should return main",
serviceName: "test-service", serviceName: "test-service",
numberOfNodes: 0, numberOfNodes: 0,
serviceReplicas: 1, serviceReplicas: ptr.To(int32(1)),
expected: []ServiceRole{ expected: []ServiceRole{
{Name: "test-service", Role: RoleMain, Replicas: 1}, {Name: "test-service", Role: RoleMain, Replicas: 1},
}, },
}, },
{
name: "nil replicas defaults to 1",
serviceName: "test-service",
numberOfNodes: 1,
expected: []ServiceRole{
{Name: "test-service", Role: RoleMain, Replicas: 1},
},
},
{
name: "zero replicas preserved",
serviceName: "test-service",
numberOfNodes: 1,
serviceReplicas: ptr.To(int32(0)),
expected: []ServiceRole{
{Name: "test-service", Role: RoleMain, Replicas: 0},
},
},
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
result := expandRolesForService(tt.serviceName, &tt.serviceReplicas, tt.numberOfNodes) result := expandRolesForService(tt.serviceName, tt.serviceReplicas, tt.numberOfNodes)
if !reflect.DeepEqual(result, tt.expected) { if !reflect.DeepEqual(result, tt.expected) {
t.Errorf("expandRolesForService() = %v, want %v", result, tt.expected) t.Errorf("expandRolesForService() = %v, want %v", result, tt.expected)
} }
......
...@@ -25,6 +25,7 @@ import ( ...@@ -25,6 +25,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
admissionv1 "k8s.io/api/admission/v1" admissionv1 "k8s.io/api/admission/v1"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission" "sigs.k8s.io/controller-runtime/pkg/webhook/admission"
...@@ -50,8 +51,9 @@ func NewDGDDefaulter(operatorVersion string) *DGDDefaulter { ...@@ -50,8 +51,9 @@ func NewDGDDefaulter(operatorVersion string) *DGDDefaulter {
} }
// Default implements admission.CustomDefaulter. // Default implements admission.CustomDefaulter.
// On every operation: defaults nil Replicas to 1 for all services.
// On CREATE: stamps nvidia.com/dynamo-operator-origin-version with the operator version. // On CREATE: stamps nvidia.com/dynamo-operator-origin-version with the operator version.
// On UPDATE: does nothing -- the origin version is immutable once set. // On UPDATE/DELETE: the origin version annotation is immutable once set.
func (d *DGDDefaulter) Default(ctx context.Context, obj runtime.Object) error { func (d *DGDDefaulter) Default(ctx context.Context, obj runtime.Object) error {
logger := log.FromContext(ctx).WithName(dgdDefaultingWebhookName) logger := log.FromContext(ctx).WithName(dgdDefaultingWebhookName)
...@@ -66,6 +68,18 @@ func (d *DGDDefaulter) Default(ctx context.Context, obj runtime.Object) error { ...@@ -66,6 +68,18 @@ func (d *DGDDefaulter) Default(ctx context.Context, obj runtime.Object) error {
return nil return nil
} }
// Default nil replicas to 1 for all services. The Replicas field is
// *int32 with omitempty, so users can legally omit it. Without this
// default the controller panics on a nil pointer dereference in
// expandRolesForService(). Apply on every operation so that services
// added via UPDATE also get the default.
for name, svc := range dgd.Spec.Services {
if svc != nil && svc.Replicas == nil {
svc.Replicas = ptr.To(int32(1))
logger.V(1).Info("defaulted nil replicas to 1", "service", name)
}
}
if req.Operation == admissionv1.Create { if req.Operation == admissionv1.Create {
if dgd.Annotations == nil { if dgd.Annotations == nil {
dgd.Annotations = make(map[string]string) dgd.Annotations = make(map[string]string)
......
...@@ -25,6 +25,7 @@ import ( ...@@ -25,6 +25,7 @@ import (
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
admissionv1 "k8s.io/api/admission/v1" admissionv1 "k8s.io/api/admission/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission" "sigs.k8s.io/controller-runtime/pkg/webhook/admission"
) )
...@@ -165,3 +166,92 @@ func TestDGDDefaulter_Default(t *testing.T) { ...@@ -165,3 +166,92 @@ func TestDGDDefaulter_Default(t *testing.T) {
}) })
} }
} }
func TestDGDDefaulter_DefaultsNilReplicas(t *testing.T) {
tests := []struct {
name string
op admissionv1.Operation
services map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
wantReplicas map[string]int32
}{
{
name: "CREATE defaults nil replicas to 1",
op: admissionv1.Create,
services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"Frontend": {Replicas: nil},
"VllmWorker": {Replicas: ptr.To(int32(3))},
"NilService": {Replicas: nil},
},
wantReplicas: map[string]int32{
"Frontend": 1,
"VllmWorker": 3,
"NilService": 1,
},
},
{
name: "UPDATE defaults nil replicas to 1",
op: admissionv1.Update,
services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"NewService": {Replicas: nil},
},
wantReplicas: map[string]int32{
"NewService": 1,
},
},
{
name: "does not overwrite explicit replicas",
op: admissionv1.Create,
services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"Worker": {Replicas: ptr.To(int32(5))},
},
wantReplicas: map[string]int32{
"Worker": 5,
},
},
{
name: "preserves explicit zero replicas",
op: admissionv1.Create,
services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"Idle": {Replicas: ptr.To(int32(0))},
},
wantReplicas: map[string]int32{
"Idle": 0,
},
},
{
name: "nil service pointer in map is safe",
op: admissionv1.Create,
services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"Ghost": nil,
},
wantReplicas: map[string]int32{},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
defaulter := NewDGDDefaulter("0.9.0")
dgd := &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default"},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: tt.services,
},
}
if err := defaulter.Default(admissionCtx(tt.op), dgd); err != nil {
t.Fatalf("Default() unexpected error: %v", err)
}
for name, want := range tt.wantReplicas {
svc := dgd.Spec.Services[name]
if svc.Replicas == nil {
t.Errorf("service %q: replicas is nil, want %d", name, want)
continue
}
if *svc.Replicas != want {
t.Errorf("service %q: replicas = %d, want %d", name, *svc.Replicas, want)
}
}
})
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment