Unverified Commit 538d3035 authored by Jonathan Tong's avatar Jonathan Tong Committed by GitHub
Browse files

fix: make node topology immutable in DynamoGraphDeployment (#4969)


Signed-off-by: default avatarJont828 <jt572@cornell.edu>
parent 8b5c8039
...@@ -280,6 +280,10 @@ func (s *DynamoComponentDeployment) GetNumberOfNodes() int32 { ...@@ -280,6 +280,10 @@ func (s *DynamoComponentDeployment) GetNumberOfNodes() int32 {
return s.Spec.GetNumberOfNodes() return s.Spec.GetNumberOfNodes()
} }
func (s *DynamoComponentDeploymentSharedSpec) IsMultinode() bool {
return s.GetNumberOfNodes() > 1
}
func (s *DynamoComponentDeploymentSharedSpec) GetNumberOfNodes() int32 { func (s *DynamoComponentDeploymentSharedSpec) GetNumberOfNodes() int32 {
if s.Multinode != nil { if s.Multinode != nil {
return s.Multinode.NodeCount return s.Multinode.NodeCount
......
...@@ -91,11 +91,32 @@ func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.D ...@@ -91,11 +91,32 @@ func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.D
// validateImmutableFields checks that immutable fields have not been changed. // validateImmutableFields checks that immutable fields have not been changed.
// Appends warnings to the provided slice. // Appends warnings to the provided slice.
func (v *DynamoGraphDeploymentValidator) validateImmutableFields(old *nvidiacomv1alpha1.DynamoGraphDeployment, warnings *admission.Warnings) error { func (v *DynamoGraphDeploymentValidator) validateImmutableFields(old *nvidiacomv1alpha1.DynamoGraphDeployment, warnings *admission.Warnings) error {
var errs []error
if v.deployment.Spec.BackendFramework != old.Spec.BackendFramework { if v.deployment.Spec.BackendFramework != old.Spec.BackendFramework {
*warnings = append(*warnings, "Changing spec.backendFramework may cause unexpected behavior") *warnings = append(*warnings, "Changing spec.backendFramework may cause unexpected behavior")
return fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation") errs = append(errs, fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation"))
} }
return nil
// Validate that node topology (single-node vs multi-node) is not changed for each service.
for serviceName, newService := range v.deployment.Spec.Services {
// Get old service (if exists)
oldService, exists := old.Spec.Services[serviceName]
if !exists {
// New service, no comparison needed
continue
}
if oldService.IsMultinode() != newService.IsMultinode() {
errs = append(errs, fmt.Errorf(
"spec.services[%s] cannot change node topology (between single-node and multi-node) after creation",
serviceName,
))
}
}
return errors.Join(errs...)
} }
// validateReplicasChanges checks if replicas were changed for services with scaling adapter enabled. // validateReplicasChanges checks if replicas were changed for services with scaling adapter enabled.
......
...@@ -414,6 +414,303 @@ func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) { ...@@ -414,6 +414,303 @@ func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) {
}, },
wantErr: false, wantErr: false,
}, },
{
name: "changing service from single-node to multi-node",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
// Single-node (nil Multinode)
Multinode: nil,
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
// Multi-node (NodeCount > 1)
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 2,
},
},
},
},
},
wantErr: true,
errMsg: "spec.services[main] cannot change node topology (between single-node and multi-node) after creation",
},
{
name: "changing service from multi-node to single-node",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
// Multi-node
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 3,
},
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
// Single-node (nil Multinode)
Multinode: nil,
},
},
},
},
wantErr: true,
errMsg: "spec.services[main] cannot change node topology (between single-node and multi-node) after creation",
},
{
name: "changing multinode NodeCount within multi-node range is allowed",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 2,
},
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 4,
},
},
},
},
},
wantErr: false,
},
{
name: "keeping service as single-node is allowed",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: nil,
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: nil,
},
},
},
},
wantErr: false,
},
{
name: "keeping service as multi-node is allowed",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 3,
},
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 3,
},
},
},
},
},
wantErr: false,
},
{
name: "changing from single-node (NodeCount=1) to multi-node (NodeCount=2)",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 1,
},
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 2,
},
},
},
},
},
wantErr: true,
errMsg: "spec.services[main] cannot change node topology (between single-node and multi-node) after creation",
},
{
name: "changing from multi-node (NodeCount=2) to single-node (NodeCount=1)",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 2,
},
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 1,
},
},
},
},
},
wantErr: true,
errMsg: "spec.services[main] cannot change node topology (between single-node and multi-node) after creation",
},
{
name: "multiple services with one changing topology",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: nil,
},
"prefill": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 2,
},
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
// Changing from single-node to multi-node
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 3,
},
},
"prefill": {
// Keeping as multi-node (OK)
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 4,
},
},
},
},
},
wantErr: true,
errMsg: "spec.services[main] cannot change node topology (between single-node and multi-node) after creation",
},
{
name: "adding new service with multinode is allowed",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: nil,
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: nil,
},
"decode": {
// New service with multinode - should be allowed
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 4,
},
},
},
},
},
wantErr: false,
},
{
name: "adding new service without multinode is allowed",
oldDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 2,
},
},
},
},
},
newDeployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
BackendFramework: "sglang",
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"main": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 2,
},
},
"gateway": {
// New service without multinode - should be allowed
Multinode: nil,
},
},
},
},
wantErr: false,
},
} }
for _, tt := range tests { for _, tt := range tests {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment