Unverified Commit a3f1e7ec authored by Ashna Mehrotra's avatar Ashna Mehrotra Committed by GitHub
Browse files

fix: add DGD service name length validation (#5449)


Signed-off-by: default avatarashnamehrotra <ashnamehrotra@gmail.com>
parent 2132fd81
...@@ -22,6 +22,7 @@ import ( ...@@ -22,6 +22,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"sort" "sort"
"strings"
semver "github.com/Masterminds/semver/v3" semver "github.com/Masterminds/semver/v3"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
...@@ -32,6 +33,15 @@ import ( ...@@ -32,6 +33,15 @@ import (
"sigs.k8s.io/controller-runtime/pkg/webhook/admission" "sigs.k8s.io/controller-runtime/pkg/webhook/admission"
) )
const (
// maxCombinedResourceNameLength is the maximum allowed combined length for Grove resource names.
// This constraint comes from Grove's PodCliqueSet webhook validation which enforces a 45-character
// limit on the combined length of PodCliqueSet name + PodCliqueScalingGroup name + PodClique name.
// Pod names follow formats like: <pcs-name>-<pcs-index>-<pcsg-name>-<pcsg-index>-<pclq-name>-<random>
// The random string and hyphens consume additional characters, leaving 45 for the resource names.
maxCombinedResourceNameLength = 45
)
// DynamoGraphDeploymentValidator validates DynamoGraphDeployment resources. // DynamoGraphDeploymentValidator validates DynamoGraphDeployment resources.
// This validator can be used by both webhooks and controllers for consistent validation. // This validator can be used by both webhooks and controllers for consistent validation.
type DynamoGraphDeploymentValidator struct { type DynamoGraphDeploymentValidator struct {
...@@ -244,6 +254,14 @@ func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv ...@@ -244,6 +254,14 @@ func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv
// validateService validates a single service configuration using SharedSpecValidator. // validateService validates a single service configuration using SharedSpecValidator.
// Returns warnings and error. // Returns warnings and error.
func (v *DynamoGraphDeploymentValidator) validateService(ctx context.Context, serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) (admission.Warnings, error) { func (v *DynamoGraphDeploymentValidator) validateService(ctx context.Context, serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) (admission.Warnings, error) {
// Validate service name length constraints for Grove PodCliqueSet naming
// Only validate when Grove pathway may be in use
if v.isGrovePathway() {
if err := v.validateServiceNameLength(serviceName, service); err != nil {
return nil, err
}
}
// Use SharedSpecValidator to validate service spec (which is a DynamoComponentDeploymentSharedSpec) // Use SharedSpecValidator to validate service spec (which is a DynamoComponentDeploymentSharedSpec)
fieldPath := fmt.Sprintf("spec.services[%s]", serviceName) fieldPath := fmt.Sprintf("spec.services[%s]", serviceName)
calculatedNamespace := v.deployment.GetDynamoNamespaceForService(service) calculatedNamespace := v.deployment.GetDynamoNamespaceForService(service)
...@@ -258,6 +276,65 @@ func (v *DynamoGraphDeploymentValidator) validateService(ctx context.Context, se ...@@ -258,6 +276,65 @@ func (v *DynamoGraphDeploymentValidator) validateService(ctx context.Context, se
return sharedValidator.Validate(ctx) return sharedValidator.Validate(ctx)
} }
// validateServiceNameLength validates that the service name combined with the DGD name
// won't exceed Grove's 45-character limit for resource naming.
//
// Grove generates PodCliqueSet resources with the following naming patterns:
// - PodCliqueSet name: DGD name (e.g., "vllm-agg")
// - For multinode services:
// - PodCliqueScalingGroup name: lowercase(serviceName) (e.g., "vllmprefillworker")
// - PodClique names: lowercase(serviceName + "-ldr") and lowercase(serviceName + "-wkr")
//
// - For single-node services:
// - PodClique name: lowercase(serviceName)
//
// The combined length of these names must not exceed 45 characters.
func (v *DynamoGraphDeploymentValidator) validateServiceNameLength(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) error {
dgdName := v.deployment.Name
lowerServiceName := strings.ToLower(serviceName)
// Check if this is a multinode service
isMultinode := service.GetNumberOfNodes() > 1
if isMultinode {
// For multinode: PodCliqueSet name + PodCliqueScalingGroup name + PodClique name (with leader suffix)
// The PodClique name is serviceName + "-ldr" (using GroveRoleSuffixLeader)
leaderPodCliqueName := lowerServiceName + "-" + consts.GroveRoleSuffixLeader
combinedLength := len(dgdName) + len(lowerServiceName) + len(leaderPodCliqueName)
if combinedLength > maxCombinedResourceNameLength {
return fmt.Errorf("spec.services[%s]: combined resource name length %d exceeds %d-character limit required for pod naming. "+
"Consider shortening the DynamoGraphDeployment name '%s' (length %d) or service name '%s' (length %d). "+
"For multinode services, the combined length of DGD name + service name + service name with role suffix (e.g., '%s-ldr') must not exceed %d characters",
serviceName, combinedLength, maxCombinedResourceNameLength,
dgdName, len(dgdName), serviceName, len(serviceName),
lowerServiceName, maxCombinedResourceNameLength)
}
} else {
// For single-node: PodCliqueSet name + PodClique name
combinedLength := len(dgdName) + len(lowerServiceName)
if combinedLength > maxCombinedResourceNameLength {
return fmt.Errorf("spec.services[%s]: combined resource name length %d exceeds %d-character limit required for pod naming. "+
"Consider shortening the DynamoGraphDeployment name '%s' (length %d) or service name '%s' (length %d). "+
"The combined length of DGD name + service name must not exceed %d characters",
serviceName, combinedLength, maxCombinedResourceNameLength,
dgdName, len(dgdName), serviceName, len(serviceName),
maxCombinedResourceNameLength)
}
}
return nil
}
// isGrovePathway determines if Grove pathway may be used for this deployment.
// Grove is used when the nvidia.com/enable-grove annotation is NOT explicitly set to "false".
// This is a conservative check - if Grove might be used, we validate the name length constraints.
func (v *DynamoGraphDeploymentValidator) isGrovePathway() bool {
return v.deployment.Annotations == nil ||
strings.ToLower(v.deployment.Annotations[consts.KubeAnnotationEnableGrove]) != consts.KubeLabelValueFalse
}
// validatePVCs validates the PVC configurations. // validatePVCs validates the PVC configurations.
func (v *DynamoGraphDeploymentValidator) validatePVCs() error { func (v *DynamoGraphDeploymentValidator) validatePVCs() error {
for i, pvc := range v.deployment.Spec.PVCs { for i, pvc := range v.deployment.Spec.PVCs {
......
...@@ -508,6 +508,174 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) { ...@@ -508,6 +508,174 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
}, },
wantErr: false, wantErr: false,
}, },
// Service name length validation tests
{
name: "service name too long for single-node deployment",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "verylongdynamographdeploymentname",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"VeryLongServiceNameThatExceedsLimit": {},
},
},
},
wantErr: true,
errContains: true,
errMsg: "combined resource name length",
},
{
name: "service name too long for multinode deployment",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "vllm-agg",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"VllmPrefillWorker": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 2,
},
},
},
},
},
wantErr: true,
errContains: true,
errMsg: "combined resource name length",
},
{
name: "valid service name length for single-node",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "dgd",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"Frontend": {},
},
},
},
wantErr: false,
},
{
name: "valid service name length for multinode",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "dgd",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"Worker": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 2,
},
},
},
},
},
wantErr: false,
},
{
name: "boundary case - exactly at 45 char limit for single-node",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
// DGD name (3 chars) + service name (42 chars) = 45 chars (exactly at limit)
Name: "dgd",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
// 42 character service name
"abcdefghijklmnopqrstuvwxyz0123456789ABCDEF": {},
},
},
},
wantErr: false,
},
{
name: "boundary case - one char over limit for single-node",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
// DGD name (3 chars) + service name (43 chars) = 46 chars (over limit)
Name: "dgd",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
// 43 character service name
"abcdefghijklmnopqrstuvwxyz0123456789ABCDEFG": {},
},
},
},
wantErr: true,
errContains: true,
errMsg: "combined resource name length 46 exceeds 45-character limit",
},
// Grove disabled tests - service name length validation should be skipped
{
name: "long service name allowed when Grove disabled via annotation",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "verylongdynamographdeploymentname",
Namespace: "default",
Annotations: map[string]string{
consts.KubeAnnotationEnableGrove: "false",
},
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"VeryLongServiceNameThatExceedsLimit": {},
},
},
},
wantErr: false,
},
{
name: "long multinode service name allowed when Grove disabled via annotation",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "vllm-agg",
Namespace: "default",
Annotations: map[string]string{
consts.KubeAnnotationEnableGrove: "false",
},
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"VllmPrefillWorker": {
Multinode: &nvidiacomv1alpha1.MultinodeSpec{
NodeCount: 2,
},
},
},
},
},
wantErr: false,
},
{
name: "Grove annotation case insensitive - FALSE",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "verylongdynamographdeploymentname",
Namespace: "default",
Annotations: map[string]string{
consts.KubeAnnotationEnableGrove: "FALSE",
},
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"VeryLongServiceNameThatExceedsLimit": {},
},
},
},
wantErr: false,
},
// Annotation validation test cases // Annotation validation test cases
{ {
name: "no annotations is valid", name: "no annotations is valid",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment