Unverified Commit 09f2314d authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

feat: add scaling adapter (#4699)


Signed-off-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
parent 1f9b69b0
......@@ -1034,7 +1034,7 @@ func GenerateGrovePodCliqueSet(
PodSpec: *podSpec,
},
}
labels, err := generateLabels(component, dynamoDeployment, r.Name)
labels, err := generateLabels(component, dynamoDeployment, serviceName)
if err != nil {
return nil, fmt.Errorf("failed to generate labels: %w", err)
}
......@@ -1075,6 +1075,7 @@ func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dyn
labels := make(map[string]string)
labels[commonconsts.KubeLabelDynamoSelector] = GetDynamoComponentName(dynamoDeployment, componentName)
labels[commonconsts.KubeLabelDynamoGraphDeploymentName] = dynamoDeployment.Name
labels[commonconsts.KubeLabelDynamoComponent] = componentName
if component.DynamoNamespace != nil {
labels[commonconsts.KubeLabelDynamoNamespace] = *component.DynamoNamespace
}
......
......@@ -121,7 +121,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment",
},
Autoscaling: nil,
},
},
},
......@@ -153,7 +152,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
Custom: map[string]string{},
},
},
Autoscaling: nil,
},
},
},
......@@ -229,7 +227,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment",
},
Autoscaling: nil,
},
},
},
......@@ -261,7 +258,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
Custom: map[string]string{},
},
},
Autoscaling: nil,
},
},
},
......@@ -341,7 +337,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment",
},
Autoscaling: nil,
Ingress: &v1alpha1.IngressSpec{
Enabled: true,
Host: "test-dynamographdeployment",
......@@ -377,7 +372,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
Custom: map[string]string{},
},
},
Autoscaling: nil,
},
},
},
......@@ -465,7 +459,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment",
},
Autoscaling: nil,
Envs: []corev1.EnvVar{
{
Name: "DYN_DEPLOYMENT_CONFIG",
......@@ -503,7 +496,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
Custom: map[string]string{},
},
},
Autoscaling: nil,
Envs: []corev1.EnvVar{
{
Name: "DYN_DEPLOYMENT_CONFIG",
......@@ -599,7 +591,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
commonconsts.KubeLabelDynamoNamespace: "default-test-dynamographdeployment",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment",
},
Autoscaling: nil,
ExtraPodSpec: &v1alpha1.ExtraPodSpec{
MainContainer: &corev1.Container{
Command: []string{"sh", "-c"},
......@@ -644,7 +635,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
Custom: map[string]string{},
},
},
Autoscaling: nil,
Envs: []corev1.EnvVar{
{
Name: "TEST_ENV",
......@@ -1307,6 +1297,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: "frontend",
Labels: map[string]string{
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend",
commonconsts.KubeLabelDynamoComponent: "Frontend",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeFrontend,
commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component",
......@@ -1483,6 +1474,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Labels: map[string]string{
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner",
commonconsts.KubeLabelDynamoComponent: "Planner",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner,
commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment",
......@@ -1884,8 +1876,9 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker,
commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-ldr",
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoComponent: "worker",
commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment",
"nvidia.com/label1": "label1",
"nvidia.com/label2": "label2",
......@@ -2059,8 +2052,9 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker,
commonconsts.KubeLabelDynamoSubComponentType: "test-sub-component",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-wkr",
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoComponent: "worker",
commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment",
"nvidia.com/label1": "label1",
"nvidia.com/label2": "label2",
......@@ -2200,6 +2194,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeFrontend,
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoComponent: "Frontend",
commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment",
},
Annotations: map[string]string{},
......@@ -2358,6 +2353,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: "planner",
Labels: map[string]string{
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner",
commonconsts.KubeLabelDynamoComponent: "Planner",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner,
......@@ -2779,7 +2775,8 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
{
Name: "worker-ldr",
Labels: map[string]string{
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-ldr",
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker",
commonconsts.KubeLabelDynamoComponent: "worker",
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker,
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
......@@ -2943,7 +2940,8 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Labels: map[string]string{
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypeWorker,
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker-wkr",
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-worker",
commonconsts.KubeLabelDynamoComponent: "worker",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment",
"nvidia.com/label1": "label1",
......@@ -3084,6 +3082,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-frontend",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoComponent: "Frontend",
commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment",
},
Annotations: map[string]string{},
......@@ -3243,6 +3242,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Labels: map[string]string{
commonconsts.KubeLabelMetricsEnabled: commonconsts.KubeLabelValueTrue,
commonconsts.KubeLabelDynamoSelector: "test-dynamo-graph-deployment-planner",
commonconsts.KubeLabelDynamoComponent: "Planner",
commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
commonconsts.KubeLabelDynamoComponentType: commonconsts.ComponentTypePlanner,
commonconsts.KubeLabelDynamoNamespace: "test-namespace-test-dynamo-graph-deployment",
......
......@@ -19,7 +19,9 @@ package webhook
import (
"context"
"strings"
authenticationv1 "k8s.io/api/authentication/v1"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
logf "sigs.k8s.io/controller-runtime/pkg/log"
......@@ -118,3 +120,54 @@ func (v *LeaseAwareValidator) shouldSkipValidation(obj runtime.Object) bool {
return false
}
// DGDReplicasModifierSuffixes defines suffixes for service accounts that are authorized
// to modify DGD replicas when scaling adapter is enabled.
// Service accounts matching any of these suffixes are allowed regardless of namespace.
var DGDReplicasModifierSuffixes = []string{
// Dynamo operator controller manager (handles DGDSA reconciliation)
// Example: "dynamo-platform-dynamo-operator-controller-manager"
"-dynamo-operator-controller-manager",
// Planner service account (manages DGD replicas for autoscaling)
// Example: "planner-serviceaccount"
"planner-serviceaccount",
}
// CanModifyDGDReplicas checks if the request comes from a service account authorized
// to modify DGD replicas when scaling adapter is enabled.
// Service accounts are identified by username format: system:serviceaccount:<namespace>:<name>
//
// Authorized service accounts (by suffix):
// - *-dynamo-operator-controller-manager (for DGDSA reconciliation)
// - *planner-serviceaccount (for Planner autoscaling)
func CanModifyDGDReplicas(userInfo authenticationv1.UserInfo) bool {
username := userInfo.Username
// Service accounts have username format: system:serviceaccount:<namespace>:<name>
if !strings.HasPrefix(username, "system:serviceaccount:") {
return false
}
// Parse: system:serviceaccount:<namespace>:<name>
parts := strings.Split(username, ":")
if len(parts) != 4 {
return false
}
namespace := parts[2]
saName := parts[3]
// Check against authorized suffixes
for _, suffix := range DGDReplicasModifierSuffixes {
if strings.HasSuffix(saName, suffix) {
webhookCommonLog.V(1).Info("allowing DGD replicas modification",
"serviceAccount", saName,
"namespace", namespace,
"matchedSuffix", suffix)
return true
}
}
return false
}
......@@ -42,13 +42,10 @@ func NewDynamoComponentDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoC
func (v *DynamoComponentDeploymentValidator) Validate() (admission.Warnings, error) {
// Validate shared spec fields using SharedSpecValidator
sharedValidator := NewSharedSpecValidator(&v.deployment.Spec.DynamoComponentDeploymentSharedSpec, "spec")
if err := sharedValidator.Validate(); err != nil {
return nil, err
}
// DCD-specific validation would go here (currently none)
return nil, nil
return sharedValidator.Validate()
}
// ValidateUpdate performs stateful validation comparing old and new DynamoComponentDeployment.
......
......@@ -47,11 +47,6 @@ func TestDynamoComponentDeploymentValidator_Validate(t *testing.T) {
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Replicas: &validReplicas,
Autoscaling: &nvidiacomv1alpha1.Autoscaling{
Enabled: true,
MinReplicas: 1,
MaxReplicas: 10,
},
},
BackendFramework: "sglang",
},
......@@ -74,26 +69,6 @@ func TestDynamoComponentDeploymentValidator_Validate(t *testing.T) {
wantErr: true,
errMsg: "spec.replicas must be non-negative",
},
{
name: "invalid autoscaling",
deployment: &nvidiacomv1alpha1.DynamoComponentDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-deployment",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Autoscaling: &nvidiacomv1alpha1.Autoscaling{
Enabled: true,
MinReplicas: 5,
MaxReplicas: 3,
},
},
},
},
wantErr: true,
errMsg: "spec.autoscaling.maxReplicas must be > minReplicas",
},
{
name: "invalid ingress",
deployment: &nvidiacomv1alpha1.DynamoComponentDeployment{
......
......@@ -22,6 +22,8 @@ import (
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
authenticationv1 "k8s.io/api/authentication/v1"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
......@@ -51,30 +53,106 @@ func (v *DynamoGraphDeploymentValidator) Validate() (admission.Warnings, error)
return nil, err
}
var allWarnings admission.Warnings
// Validate each service
for serviceName, service := range v.deployment.Spec.Services {
if err := v.validateService(serviceName, service); err != nil {
warnings, err := v.validateService(serviceName, service)
if err != nil {
return nil, err
}
allWarnings = append(allWarnings, warnings...)
}
return nil, nil
return allWarnings, nil
}
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeployment.
// userInfo is used for identity-based validation (replica protection).
// If userInfo is nil, replica changes for DGDSA-enabled services are rejected (fail closed).
// Returns warnings and error.
func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment) (admission.Warnings, error) {
// Validate that BackendFramework is not changed (immutable)
func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) (admission.Warnings, error) {
var warnings admission.Warnings
// Validate immutable fields
if err := v.validateImmutableFields(old, &warnings); err != nil {
return warnings, err
}
// Validate replicas changes for services with scaling adapter enabled
// Pass userInfo (may be nil - will fail closed for DGDSA-enabled services)
if err := v.validateReplicasChanges(old, userInfo); err != nil {
return warnings, err
}
return warnings, nil
}
// validateImmutableFields checks that immutable fields have not been changed.
// Appends warnings to the provided slice.
func (v *DynamoGraphDeploymentValidator) validateImmutableFields(old *nvidiacomv1alpha1.DynamoGraphDeployment, warnings *admission.Warnings) error {
if v.deployment.Spec.BackendFramework != old.Spec.BackendFramework {
warning := "Changing spec.backendFramework may cause unexpected behavior"
return admission.Warnings{warning}, fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation")
*warnings = append(*warnings, "Changing spec.backendFramework may cause unexpected behavior")
return fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation")
}
return nil
}
// validateReplicasChanges checks if replicas were changed for services with scaling adapter enabled.
// Only authorized service accounts (operator controller, planner) can modify these fields.
// If userInfo is nil, all replica changes for DGDSA-enabled services are rejected (fail closed).
func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) error {
// If the request comes from an authorized service account, allow the change
if userInfo != nil && internalwebhook.CanModifyDGDReplicas(*userInfo) {
return nil
}
var errs []error
for serviceName, newService := range v.deployment.Spec.Services {
// Check if scaling adapter is enabled for this service (enabled by default)
scalingAdapterEnabled := true
if newService.ScalingAdapter != nil && newService.ScalingAdapter.Disable {
scalingAdapterEnabled = false
}
if !scalingAdapterEnabled {
// Scaling adapter is disabled, users can modify replicas directly
continue
}
// Get old service (if exists)
oldService, exists := old.Spec.Services[serviceName]
if !exists {
// New service, no comparison needed
continue
}
// Check if replicas changed
oldReplicas := int32(1) // default
if oldService.Replicas != nil {
oldReplicas = *oldService.Replicas
}
newReplicas := int32(1) // default
if newService.Replicas != nil {
newReplicas = *newService.Replicas
}
return nil, nil
if oldReplicas != newReplicas {
errs = append(errs, fmt.Errorf(
"spec.services[%s].replicas cannot be modified directly when scaling adapter is enabled; "+
"scale or update the related DynamoGraphDeploymentScalingAdapter instead",
serviceName))
}
}
return errors.Join(errs...)
}
// validateService validates a single service configuration using SharedSpecValidator.
func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) error {
// Returns warnings and error.
func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) (admission.Warnings, error) {
// Use SharedSpecValidator to validate service spec (which is a DynamoComponentDeploymentSharedSpec)
fieldPath := fmt.Sprintf("spec.services[%s]", serviceName)
sharedValidator := NewSharedSpecValidator(service, fieldPath)
......
......@@ -23,6 +23,7 @@ import (
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
authenticationv1 "k8s.io/api/authentication/v1"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
......@@ -91,9 +92,24 @@ func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldOb
return warnings, err
}
// Validate stateful rules (immutability)
updateWarnings, err := validator.ValidateUpdate(oldDeployment)
// Get user info from admission request context for identity-based validation
var userInfo *authenticationv1.UserInfo
req, err := admission.RequestFromContext(ctx)
if err != nil {
logger.Error(err, "failed to get admission request from context, replica changes for DGDSA-enabled services will be rejected")
// userInfo remains nil - validateReplicasChanges will fail closed
} else {
userInfo = &req.UserInfo
}
// Validate stateful rules (immutability + replicas protection)
updateWarnings, err := validator.ValidateUpdate(oldDeployment, userInfo)
if err != nil {
username := "<unknown>"
if userInfo != nil {
username = userInfo.Username
}
logger.Info("validation failed", "error", err.Error(), "user", username)
return updateWarnings, err
}
......
......@@ -93,28 +93,6 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
wantErr: true,
errMsg: "spec.services[main].replicas must be non-negative",
},
{
name: "service with invalid autoscaling",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
ObjectMeta: metav1.ObjectMeta{
Name: "test-graph",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
"prefill": {
Autoscaling: &nvidiacomv1alpha1.Autoscaling{
Enabled: true,
MinReplicas: 10,
MaxReplicas: 5,
},
},
},
},
},
wantErr: true,
errMsg: "spec.services[prefill].autoscaling.maxReplicas must be > minReplicas",
},
{
name: "service with invalid ingress",
deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
......@@ -441,7 +419,8 @@ func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentValidator(tt.newDeployment)
warnings, err := validator.ValidateUpdate(tt.oldDeployment)
// Pass nil userInfo - these tests don't modify replicas, so it's safe
warnings, err := validator.ValidateUpdate(tt.oldDeployment, nil)
if (err != nil) != tt.wantErr {
t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr)
......
......@@ -21,6 +21,7 @@ import (
"fmt"
nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// SharedSpecValidator validates DynamoComponentDeploymentSharedSpec fields.
......@@ -41,61 +42,45 @@ func NewSharedSpecValidator(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSha
}
// Validate performs validation on the shared spec fields.
// Returns an error if validation fails.
func (v *SharedSpecValidator) Validate() error {
// Returns warnings (e.g., deprecation notices) and error if validation fails.
func (v *SharedSpecValidator) Validate() (admission.Warnings, error) {
// Validate replicas if specified
if v.spec.Replicas != nil && *v.spec.Replicas < 0 {
return fmt.Errorf("%s.replicas must be non-negative", v.fieldPath)
}
// Validate autoscaling configuration if specified
if v.spec.Autoscaling != nil {
if err := v.validateAutoscaling(); err != nil {
return err
}
return nil, fmt.Errorf("%s.replicas must be non-negative", v.fieldPath)
}
// Validate ingress configuration if enabled
if v.spec.Ingress != nil && v.spec.Ingress.Enabled {
if err := v.validateIngress(); err != nil {
return err
return nil, err
}
}
// Validate volume mounts
if err := v.validateVolumeMounts(); err != nil {
return err
return nil, err
}
// Validate shared memory
if v.spec.SharedMemory != nil {
if err := v.validateSharedMemory(); err != nil {
return err
}
return nil, err
}
return nil
}
// validateAutoscaling validates the autoscaling configuration.
func (v *SharedSpecValidator) validateAutoscaling() error {
autoscaling := v.spec.Autoscaling
if !autoscaling.Enabled {
return nil
}
// Validate minReplicas
if autoscaling.MinReplicas < 1 {
return fmt.Errorf("%s.autoscaling.minReplicas must be >= 1", v.fieldPath)
}
// Collect warnings (e.g., deprecation notices)
var warnings admission.Warnings
// Validate maxReplicas
if autoscaling.MaxReplicas <= autoscaling.MinReplicas {
return fmt.Errorf("%s.autoscaling.maxReplicas must be > minReplicas", v.fieldPath)
// Check for deprecated autoscaling field
//nolint:staticcheck // SA1019: Intentionally checking deprecated field to warn users
if v.spec.Autoscaling != nil {
warnings = append(warnings, fmt.Sprintf(
"%s.autoscaling is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter "+
"with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md",
v.fieldPath))
}
return nil
return warnings, nil
}
// validateIngress validates the ingress configuration.
......
......@@ -41,11 +41,6 @@ func TestSharedSpecValidator_Validate(t *testing.T) {
name: "valid spec with all fields",
spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Replicas: &validReplicas,
Autoscaling: &nvidiacomv1alpha1.Autoscaling{
Enabled: true,
MinReplicas: 1,
MaxReplicas: 10,
},
Ingress: &nvidiacomv1alpha1.IngressSpec{
Enabled: true,
Host: "example.com",
......@@ -77,44 +72,6 @@ func TestSharedSpecValidator_Validate(t *testing.T) {
wantErr: true,
errMsg: "spec.replicas must be non-negative",
},
{
name: "autoscaling minReplicas too low",
spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Autoscaling: &nvidiacomv1alpha1.Autoscaling{
Enabled: true,
MinReplicas: 0,
MaxReplicas: 10,
},
},
fieldPath: "spec",
wantErr: true,
errMsg: "spec.autoscaling.minReplicas must be >= 1",
},
{
name: "autoscaling maxReplicas less than minReplicas",
spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Autoscaling: &nvidiacomv1alpha1.Autoscaling{
Enabled: true,
MinReplicas: 5,
MaxReplicas: 3,
},
},
fieldPath: "spec",
wantErr: true,
errMsg: "spec.autoscaling.maxReplicas must be > minReplicas",
},
{
name: "autoscaling disabled - no validation",
spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Autoscaling: &nvidiacomv1alpha1.Autoscaling{
Enabled: false,
MinReplicas: 0,
MaxReplicas: 0,
},
},
fieldPath: "spec",
wantErr: false,
},
{
name: "ingress enabled without host",
spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
......@@ -227,7 +184,7 @@ func TestSharedSpecValidator_Validate(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
validator := NewSharedSpecValidator(tt.spec, tt.fieldPath)
err := validator.Validate()
_, err := validator.Validate()
if (err != nil) != tt.wantErr {
t.Errorf("SharedSpecValidator.Validate() error = %v, wantErr %v", err, tt.wantErr)
......@@ -240,3 +197,53 @@ func TestSharedSpecValidator_Validate(t *testing.T) {
})
}
}
func TestSharedSpecValidator_Validate_Warnings(t *testing.T) {
validReplicas := int32(3)
tests := []struct {
name string
spec *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
fieldPath string
wantWarnings int
}{
{
name: "no warnings for spec without autoscaling",
spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Replicas: &validReplicas,
},
fieldPath: "spec",
wantWarnings: 0,
},
{
name: "warning for deprecated autoscaling field enabled",
spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
Replicas: &validReplicas,
//nolint:staticcheck // SA1019: Intentionally testing deprecated field
Autoscaling: &nvidiacomv1alpha1.Autoscaling{
Enabled: true,
MinReplicas: 1,
MaxReplicas: 10,
},
},
fieldPath: "spec",
wantWarnings: 1,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
validator := NewSharedSpecValidator(tt.spec, tt.fieldPath)
warnings, err := validator.Validate()
if err != nil {
t.Errorf("SharedSpecValidator.Validate() unexpected error = %v", err)
return
}
if len(warnings) != tt.wantWarnings {
t.Errorf("SharedSpecValidator.Validate() warnings count = %d, want %d", len(warnings), tt.wantWarnings)
}
})
}
}
......@@ -10,3 +10,4 @@ Deployment Guide
Webhooks <../kubernetes/webhooks>
Minikube Setup <../kubernetes/deployment/minikube>
Managing Models with DynamoModel <../kubernetes/deployment/dynamomodel-guide>
Autoscaling <../kubernetes/autoscaling>
......@@ -37,6 +37,7 @@ Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API
- [DynamoComponentDeployment](#dynamocomponentdeployment)
- [DynamoGraphDeployment](#dynamographdeployment)
- [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
- [DynamoModel](#dynamomodel)
......@@ -45,7 +46,9 @@ Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API
Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
for migration guidance. This field will be removed in a future API version.
......@@ -55,11 +58,11 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `enabled` _boolean_ | | | |
| `minReplicas` _integer_ | | | |
| `maxReplicas` _integer_ | | | |
| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | | | |
| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | | | |
| `enabled` _boolean_ | Deprecated: This field is ignored. | | |
| `minReplicas` _integer_ | Deprecated: This field is ignored. | | |
| `maxReplicas` _integer_ | Deprecated: This field is ignored. | | |
| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | Deprecated: This field is ignored. | | |
| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | Deprecated: This field is ignored. | | |
......@@ -165,7 +168,7 @@ _Appears in:_
| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.<br />The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\} <br /> |
| `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | |
| `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,<br />GPUs/devices, and any runtime-specific resources. | | |
| `autoscaling` _[Autoscaling](#autoscaling)_ | Autoscaling config for this component (replica range, target utilization, etc.). | | |
| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter<br />with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md<br />for migration guidance. This field will be removed in a future API version. | | |
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | |
| `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as<br />environment variables in the component containers. | | |
| `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | |
......@@ -176,8 +179,9 @@ _Appears in:_
| `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.<br />It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field<br />that allows overriding the main container configuration. | | |
| `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. | | |
| `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. | | |
| `replicas` _integer_ | Replicas is the desired number of Pods for this component when autoscaling is not used. | | |
| `replicas` _integer_ | Replicas is the desired number of Pods for this component.<br />When scalingAdapter is enabled (default), this field is managed by the<br />DynamoGraphDeploymentScalingAdapter and should not be modified directly. | | Minimum: 0 <br /> |
| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | |
| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled (default), replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. | | |
#### DynamoComponentDeploymentSpec
......@@ -202,7 +206,7 @@ _Appears in:_
| `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.<br />The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component | | Optional: \{\} <br /> |
| `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace | | |
| `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,<br />GPUs/devices, and any runtime-specific resources. | | |
| `autoscaling` _[Autoscaling](#autoscaling)_ | Autoscaling config for this component (replica range, target utilization, etc.). | | |
| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter<br />with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md<br />for migration guidance. This field will be removed in a future API version. | | |
| `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. | | |
| `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as<br />environment variables in the component containers. | | |
| `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. | | |
......@@ -213,8 +217,9 @@ _Appears in:_
| `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.<br />It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field<br />that allows overriding the main container configuration. | | |
| `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. | | |
| `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. | | |
| `replicas` _integer_ | Replicas is the desired number of Pods for this component when autoscaling is not used. | | |
| `replicas` _integer_ | Replicas is the desired number of Pods for this component.<br />When scalingAdapter is enabled (default), this field is managed by the<br />DynamoGraphDeploymentScalingAdapter and should not be modified directly. | | Minimum: 0 <br /> |
| `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. | | |
| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled (default), replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. | | |
#### DynamoGraphDeployment
......@@ -314,6 +319,83 @@ _Appears in:_
| `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. | | Optional: \{\} <br /> |
#### DynamoGraphDeploymentScalingAdapter
DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services
within a DynamoGraphDeployment. It implements the Kubernetes scale
subresource, enabling integration with HPA, KEDA, and custom autoscalers.
The adapter acts as an intermediary between autoscalers and the DGD,
ensuring that only the adapter controller modifies the DGD's service replicas.
This prevents conflicts when multiple autoscaling mechanisms are in play.
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
| `kind` _string_ | `DynamoGraphDeploymentScalingAdapter` | | |
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
| `spec` _[DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)_ | | | |
| `status` _[DynamoGraphDeploymentScalingAdapterStatus](#dynamographdeploymentscalingadapterstatus)_ | | | |
#### DynamoGraphDeploymentScalingAdapterSpec
DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter
_Appears in:_
- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `replicas` _integer_ | Replicas is the desired number of replicas for the target service.<br />This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. | | Minimum: 0 <br />Required: \{\} <br /> |
| `dgdRef` _[DynamoGraphDeploymentServiceRef](#dynamographdeploymentserviceref)_ | DGDRef references the DynamoGraphDeployment and the specific service to scale. | | Required: \{\} <br /> |
#### DynamoGraphDeploymentScalingAdapterStatus
DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter
_Appears in:_
- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `replicas` _integer_ | Replicas is the current number of replicas for the target service.<br />This is synced from the DGD's service replicas and is required for the scale subresource. | | |
| `selector` _string_ | Selector is a label selector string for the pods managed by this adapter.<br />Required for HPA compatibility via the scale subresource. | | |
| `lastScaleTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | LastScaleTime is the last time the adapter scaled the target service. | | |
#### DynamoGraphDeploymentServiceRef
DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment
_Appears in:_
- [DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `name` _string_ | Name of the DynamoGraphDeployment | | MinLength: 1 <br />Required: \{\} <br /> |
| `serviceName` _string_ | ServiceName is the key name of the service within the DGD's spec.services map to scale | | MinLength: 1 <br />Required: \{\} <br /> |
#### DynamoGraphDeploymentSpec
......@@ -638,6 +720,25 @@ _Appears in:_
| `claims` _[ResourceClaim](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourceclaim-v1-core) array_ | Claims specifies resource claims for dynamic resource allocation | | |
#### ScalingAdapter
ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter
for replica management. When enabled (default), the DGDSA owns the replicas field and
external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource.
_Appears in:_
- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `disable` _boolean_ | Disable indicates whether the ScalingAdapter should be disabled for this service.<br />When false (default), a DGDSA is created and owns the replicas field.<br />When true, no DGDSA is created and replicas can be modified directly in the DGD. | false | |
#### SharedMemorySpec
......
# Autoscaling
This guide explains how to configure autoscaling for DynamoGraphDeployment (DGD) services using the `sglang-agg` example from `examples/backends/sglang/deploy/agg.yaml`.
## Example DGD
All examples in this guide use the following DGD:
```yaml
# examples/backends/sglang/deploy/agg.yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: sglang-agg
namespace: default
spec:
services:
Frontend:
dynamoNamespace: sglang-agg
componentType: frontend
replicas: 1
decode:
dynamoNamespace: sglang-agg
componentType: worker
replicas: 1
resources:
limits:
gpu: "1"
```
**Key identifiers:**
- **DGD name**: `sglang-agg`
- **Namespace**: `default`
- **Services**: `Frontend`, `decode`
- **dynamo_namespace label**: `default-sglang-agg` (used for metric filtering)
## Overview
Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAdapter` (DGDSA) resource. When you deploy a DGD, the operator automatically creates one adapter per service (unless explicitly disabled). These adapters implement the Kubernetes [Scale subresource](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#scale-subresource), enabling integration with:
| Autoscaler | Description | Best For |
|------------|-------------|----------|
| **KEDA** | Event-driven autoscaling (recommended) | Most use cases |
| **Kubernetes HPA** | Native horizontal pod autoscaling | Simple CPU/memory-based scaling |
| **Dynamo Planner** | LLM-aware autoscaling with SLA optimization | Production LLM workloads |
| **Custom Controllers** | Any scale-subresource-compatible controller | Custom requirements |
> **⚠️ Deprecation Notice**: The `spec.services[X].autoscaling` field in DGD is **deprecated and ignored**. Use DGDSA with HPA, KEDA, or Planner instead. If you have existing DGDs with `autoscaling` configured, you'll see a warning. Remove the field to silence the warning.
## Architecture
```
┌──────────────────────────────────┐ ┌─────────────────────────────────────┐
│ DynamoGraphDeployment │ │ Scaling Adapters (auto-created) │
│ "sglang-agg" │ │ (one per service) │
├──────────────────────────────────┤ ├─────────────────────────────────────┤
│ │ │ │
│ spec.services: │ │ ┌─────────────────────────────┐ │ ┌──────────────────┐
│ │ │ │ sglang-agg-frontend │◄───┼──────│ Autoscalers │
│ ┌────────────────────────┐◄───┼──────────┼──│ spec.replicas: 1 │ │ │ │
│ │ Frontend: 1 replica │ │ │ └─────────────────────────────┘ │ │ • KEDA │
│ └────────────────────────┘ │ │ │ │ • HPA │
│ │ │ ┌─────────────────────────────┐ │ │ • Planner │
│ ┌────────────────────────┐◄───┼──────────┼──│ sglang-agg-decode │◄───┼──────│ • Custom │
│ │ decode: 1 replica │ │ │ │ spec.replicas: 1 │ │ │ │
│ └────────────────────────┘ │ │ └─────────────────────────────┘ │ └──────────────────┘
│ │ │ │
└──────────────────────────────────┘ └─────────────────────────────────────┘
```
**How it works:**
1. You deploy a DGD with services (Frontend, decode)
2. The operator auto-creates one DGDSA per service
3. Autoscalers (KEDA, HPA, Planner) target the adapters via `/scale` subresource
4. Adapter controller syncs replica changes to the DGD
5. DGD controller reconciles the underlying pods
## Viewing Scaling Adapters
After deploying the `sglang-agg` DGD, verify the auto-created adapters:
```bash
kubectl get dgdsa -n default
# Example output:
# NAME DGD SERVICE REPLICAS AGE
# sglang-agg-frontend sglang-agg Frontend 1 5m
# sglang-agg-decode sglang-agg decode 1 5m
```
## Replica Ownership Model
When DGDSA is enabled (the default), it becomes the **source of truth** for replica counts. This follows the same pattern as Kubernetes Deployments owning ReplicaSets.
### How It Works
1. **DGDSA owns replicas**: Autoscalers (HPA, KEDA, Planner) update the DGDSA's `spec.replicas`
2. **DGDSA syncs to DGD**: The DGDSA controller writes the replica count to the DGD's service
3. **Direct DGD edits blocked**: A validating webhook prevents users from directly editing `spec.services[X].replicas` in the DGD
4. **Controllers allowed**: Only authorized controllers (operator, Planner) can modify DGD replicas
### Manual Scaling with DGDSA Enabled
When DGDSA is enabled, use `kubectl scale` on the adapter (not the DGD):
```bash
# ✅ Correct - scale via DGDSA
kubectl scale dgdsa sglang-agg-decode --replicas=3
# ❌ Blocked - direct DGD edit rejected by webhook
kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}'
# Error: spec.services[decode].replicas cannot be modified directly when scaling adapter is enabled;
# use 'kubectl scale dgdsa/sglang-agg-decode --replicas=3' or update the DynamoGraphDeploymentScalingAdapter instead
```
## Disabling DGDSA for a Service
If you want to manage replicas directly in the DGD (without autoscaling), you can disable the scaling adapter per service:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: sglang-agg
spec:
services:
Frontend:
replicas: 2
scalingAdapter:
disable: true # ← No DGDSA created, direct edits allowed
decode:
replicas: 1 # ← DGDSA created by default, managed via adapter
```
**When to disable DGDSA:**
- You want simple, manual replica management
- You don't need autoscaling for that service
- You prefer direct DGD edits over adapter-based scaling
**When to keep DGDSA enabled (default):**
- You want to use HPA, KEDA, or Planner for autoscaling
- You want a clear separation between "desired scale" (adapter) and "deployment config" (DGD)
- You want protection against accidental direct replica edits
## Autoscaling with Dynamo Planner
The Dynamo Planner is an LLM-aware autoscaler that optimizes scaling decisions based on inference-specific metrics like Time To First Token (TTFT), Inter-Token Latency (ITL), and KV cache utilization.
**When to use Planner:**
- You want LLM-optimized autoscaling out of the box
- You need coordinated scaling across prefill/decode services
- You want SLA-driven scaling (e.g., target TTFT < 500ms)
**How Planner works:**
Planner is deployed as a service component within your DGD. It:
1. Queries Prometheus for frontend metrics (request rate, latency, etc.)
2. Uses profiling data to predict optimal replica counts
3. Scales prefill/decode workers to meet SLA targets
**Deployment:**
The recommended way to deploy Planner is via `DynamoGraphDeploymentRequest` (DGDR). See the [SLA Planner Quick Start](../planner/sla_planner_quickstart.md) for complete instructions.
Example configurations with Planner:
- `examples/backends/vllm/deploy/disagg_planner.yaml`
- `examples/backends/sglang/deploy/disagg_planner.yaml`
- `examples/backends/trtllm/deploy/disagg_planner.yaml`
For more details, see the [SLA Planner documentation](../planner/sla_planner.md).
## Autoscaling with Kubernetes HPA
The Horizontal Pod Autoscaler (HPA) is Kubernetes' native autoscaling solution.
**When to use HPA:**
- You have simple, predictable scaling requirements
- You want to use standard Kubernetes tooling
- You need CPU or memory-based scaling
> **Note**: For custom metrics (like TTFT or queue depth), consider using [KEDA](#autoscaling-with-keda-recommended) instead - it's simpler to configure.
### Basic HPA (CPU-based)
```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: sglang-agg-frontend-hpa
namespace: default
spec:
scaleTargetRef:
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentScalingAdapter
name: sglang-agg-frontend
minReplicas: 1
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
behavior:
scaleDown:
stabilizationWindowSeconds: 300
scaleUp:
stabilizationWindowSeconds: 0
```
### HPA with Dynamo Metrics
Dynamo exports several metrics useful for autoscaling. These are available at the `/metrics` endpoint on each frontend pod.
> **See also**: For a complete list of all Dynamo metrics, see the [Metrics Reference](../observability/metrics.md). For Prometheus and Grafana setup, see the [Prometheus and Grafana Setup Guide](../observability/prometheus-grafana.md).
#### Available Dynamo Metrics
| Metric | Type | Description | Good for scaling |
|--------|------|-------------|------------------|
| `dynamo_frontend_queued_requests` | Gauge | Requests waiting in HTTP queue | ✅ Workers |
| `dynamo_frontend_inflight_requests` | Gauge | Concurrent requests to engine | ✅ All services |
| `dynamo_frontend_time_to_first_token_seconds` | Histogram | TTFT latency | ✅ Workers |
| `dynamo_frontend_inter_token_latency_seconds` | Histogram | ITL latency | ✅ Decode |
| `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration | ⚠️ General |
| `kvstats_gpu_cache_usage_percent` | Gauge | GPU KV cache usage (0-1) | ✅ Decode |
#### Metric Labels
Dynamo metrics include these labels for filtering:
| Label | Description | Example |
|-------|-------------|---------|
| `dynamo_namespace` | Unique DGD identifier (`{k8s-namespace}-{dynamoNamespace}`) | `default-sglang-agg` |
| `model` | Model being served | `Qwen/Qwen3-0.6B` |
> **Note**: When you have multiple DGDs in the same namespace, use `dynamo_namespace` to filter metrics for a specific DGD.
#### Example: Scale Decode Service Based on TTFT
Using HPA with Prometheus Adapter requires configuring external metrics.
**Step 1: Configure Prometheus Adapter**
Add this to your Helm values file (e.g., `prometheus-adapter-values.yaml`):
```yaml
# prometheus-adapter-values.yaml
prometheus:
url: http://prometheus-kube-prometheus-prometheus.monitoring.svc
port: 9090
rules:
external:
# TTFT p95 from frontend - used to scale decode
- seriesQuery: 'dynamo_frontend_time_to_first_token_seconds_bucket{namespace!=""}'
resources:
overrides:
namespace: {resource: "namespace"}
name:
as: "dynamo_ttft_p95_seconds"
metricsQuery: |
histogram_quantile(0.95,
sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m]))
by (le, namespace, dynamo_namespace)
)
```
**Step 2: Install Prometheus Adapter**
```bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
helm upgrade --install prometheus-adapter prometheus-community/prometheus-adapter \
-n monitoring --create-namespace \
-f prometheus-adapter-values.yaml
```
**Step 3: Verify the metric is available**
```bash
kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1/namespaces/<your-namespace>/dynamo_ttft_p95_seconds" | jq
```
**Step 4: Create the HPA**
```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: sglang-agg-decode-hpa
spec:
scaleTargetRef:
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentScalingAdapter
name: sglang-agg-decode # ← DGD name + service name (lowercase)
minReplicas: 1
maxReplicas: 10
metrics:
- type: External
external:
metric:
name: dynamo_ttft_p95_seconds
selector:
matchLabels:
dynamo_namespace: "default-sglang-agg" # ← {namespace}-{dynamoNamespace}
target:
type: Value
value: "500m" # Scale up when TTFT p95 > 500ms
behavior:
scaleDown:
stabilizationWindowSeconds: 60 # Wait 1 min before scaling down
policies:
- type: Pods
value: 1
periodSeconds: 30
scaleUp:
stabilizationWindowSeconds: 0 # Scale up immediately
policies:
- type: Pods
value: 2
periodSeconds: 30
```
**How it works:**
1. Frontend pods export `dynamo_frontend_time_to_first_token_seconds` histogram
2. Prometheus Adapter calculates p95 TTFT per `dynamo_namespace`
3. HPA monitors this metric filtered by `dynamo_namespace: "default-sglang-agg"`
4. When TTFT p95 > 500ms, HPA scales up the `sglang-agg-decode` adapter
5. Adapter controller syncs the replica count to the DGD's `decode` service
6. More decode workers are created, reducing TTFT
#### Example: Scale Based on Queue Depth
Add this rule to your `prometheus-adapter-values.yaml` (alongside the TTFT rule):
```yaml
# Add to rules.external in prometheus-adapter-values.yaml
- seriesQuery: 'dynamo_frontend_queued_requests{namespace!=""}'
resources:
overrides:
namespace: {resource: "namespace"}
name:
as: "dynamo_queued_requests"
metricsQuery: |
sum(<<.Series>>{<<.LabelMatchers>>}) by (namespace, dynamo_namespace)
```
Then create the HPA:
```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: sglang-agg-decode-queue-hpa
namespace: default
spec:
scaleTargetRef:
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentScalingAdapter
name: sglang-agg-decode
minReplicas: 1
maxReplicas: 10
metrics:
- type: External
external:
metric:
name: dynamo_queued_requests
selector:
matchLabels:
dynamo_namespace: "default-sglang-agg"
target:
type: Value
value: "10" # Scale up when queue > 10 requests
```
## Autoscaling with KEDA (Recommended)
KEDA (Kubernetes Event-driven Autoscaling) extends Kubernetes with event-driven autoscaling, supporting 50+ scalers including Prometheus.
**Advantages over HPA + Prometheus Adapter:**
- No Prometheus Adapter configuration needed
- PromQL queries are defined in the ScaledObject itself (declarative, per-deployment)
- Easy to update - just `kubectl apply` the ScaledObject
- Can scale to zero when idle
- Supports multiple triggers per object
**When to use KEDA:**
- You want simpler configuration (no Prometheus Adapter to manage)
- You need event-driven scaling (e.g., queue depth, Kafka, etc.)
- You want to scale to zero when idle
### Installing KEDA
```bash
# Add KEDA Helm repo
helm repo add kedacore https://kedacore.github.io/charts
helm repo update
# Install KEDA
helm install keda kedacore/keda \
--namespace keda \
--create-namespace
# Verify installation
kubectl get pods -n keda
```
> **Note**: If you have Prometheus Adapter installed, either uninstall it first (`helm uninstall prometheus-adapter -n monitoring`) or install KEDA with `--set metricsServer.enabled=false` to avoid API conflicts.
### Example: Scale Decode Based on TTFT
Using the `sglang-agg` DGD from `examples/backends/sglang/deploy/agg.yaml`:
```yaml
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
name: sglang-agg-decode-scaler
namespace: default
spec:
scaleTargetRef:
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentScalingAdapter
name: sglang-agg-decode
minReplicaCount: 1
maxReplicaCount: 10
pollingInterval: 15 # Check metrics every 15 seconds
cooldownPeriod: 60 # Wait 60s before scaling down
triggers:
- type: prometheus
metadata:
# Update this URL to match your Prometheus service
serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090
metricName: dynamo_ttft_p95
query: |
histogram_quantile(0.95,
sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m]))
by (le)
)
threshold: "0.5" # Scale up when TTFT p95 > 500ms (0.5 seconds)
activationThreshold: "0.1" # Start scaling when TTFT > 100ms
```
Apply it:
```bash
kubectl apply -f sglang-agg-decode-scaler.yaml
```
### Verify KEDA Scaling
```bash
# Check ScaledObject status
kubectl get scaledobject -n default
# KEDA creates an HPA under the hood - you can see it
kubectl get hpa -n default
# Example output:
# NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS
# keda-hpa-sglang-agg-decode-scaler DynamoGraphDeploymentScalingAdapter/sglang-agg-decode 45m/500m 1 10 1
# Get detailed status
kubectl describe scaledobject sglang-agg-decode-scaler -n default
```
### Example: Scale Based on Queue Depth
```yaml
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
name: sglang-agg-decode-queue-scaler
namespace: default
spec:
scaleTargetRef:
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentScalingAdapter
name: sglang-agg-decode
minReplicaCount: 1
maxReplicaCount: 10
pollingInterval: 15
cooldownPeriod: 60
triggers:
- type: prometheus
metadata:
serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090
metricName: dynamo_queued_requests
query: |
sum(dynamo_frontend_queued_requests{dynamo_namespace="default-sglang-agg"})
threshold: "10" # Scale up when queue > 10 requests
```
### How KEDA Works
KEDA creates and manages an HPA under the hood:
```
┌──────────────────────────────────────────────────────────────────────┐
│ You create: ScaledObject │
│ - scaleTargetRef: sglang-agg-decode │
│ - triggers: prometheus query │
└──────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────┐
│ KEDA Operator automatically creates: HPA │
│ - name: keda-hpa-sglang-agg-decode-scaler │
│ - scaleTargetRef: sglang-agg-decode │
│ - metrics: External (from KEDA metrics server) │
└──────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────┐
│ DynamoGraphDeploymentScalingAdapter: sglang-agg-decode │
│ - spec.replicas: updated by HPA │
└──────────────────────────────────────────────────────────────────────┘
┌──────────────────────────────────────────────────────────────────────┐
│ DynamoGraphDeployment: sglang-agg │
│ - spec.services.decode.replicas: synced from adapter │
└──────────────────────────────────────────────────────────────────────┘
```
## Mixed Autoscaling
For disaggregated deployments (prefill + decode), you can use different autoscaling strategies for different services:
```yaml
---
# HPA for Frontend (CPU-based)
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: sglang-agg-frontend-hpa
namespace: default
spec:
scaleTargetRef:
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentScalingAdapter
name: sglang-agg-frontend
minReplicas: 1
maxReplicas: 5
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
---
# KEDA for Decode (TTFT-based)
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
name: sglang-agg-decode-scaler
namespace: default
spec:
scaleTargetRef:
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeploymentScalingAdapter
name: sglang-agg-decode
minReplicaCount: 1
maxReplicaCount: 10
triggers:
- type: prometheus
metadata:
serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090
query: |
histogram_quantile(0.95,
sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m]))
by (le)
)
threshold: "0.5"
```
## Manual Scaling
### With DGDSA Enabled (Default)
When DGDSA is enabled (the default), scale via the adapter:
```bash
kubectl scale dgdsa sglang-agg-decode -n default --replicas=3
```
Verify the scaling:
```bash
kubectl get dgdsa sglang-agg-decode -n default
# Output:
# NAME DGD SERVICE REPLICAS AGE
# sglang-agg-decode sglang-agg decode 3 10m
```
> **Note**: If an autoscaler (KEDA, HPA, Planner) is managing the adapter, your change will be overwritten on the next evaluation cycle.
### With DGDSA Disabled
If you've disabled the scaling adapter for a service, edit the DGD directly:
```bash
kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}'
```
Or edit the YAML:
```yaml
spec:
services:
decode:
replicas: 3
scalingAdapter:
disable: true
```
## Best Practices
### 1. Choose One Autoscaler Per Service
Avoid configuring multiple autoscalers for the same service:
| Configuration | Status |
|---------------|--------|
| HPA for frontend, Planner for prefill/decode | ✅ Good |
| KEDA for all services | ✅ Good |
| Planner only (default) | ✅ Good |
| HPA + Planner both targeting decode | ❌ Bad - they will fight |
### 2. Use Appropriate Metrics
| Service Type | Recommended Metrics | Dynamo Metric |
|--------------|---------------------|---------------|
| Frontend | CPU utilization, request rate | `dynamo_frontend_requests_total` |
| Prefill | Queue depth, TTFT | `dynamo_frontend_queued_requests`, `dynamo_frontend_time_to_first_token_seconds` |
| Decode | KV cache utilization, ITL | `kvstats_gpu_cache_usage_percent`, `dynamo_frontend_inter_token_latency_seconds` |
### 3. Configure Stabilization Windows
Prevent thrashing with appropriate stabilization:
```yaml
# HPA
behavior:
scaleDown:
stabilizationWindowSeconds: 300 # Wait 5 min before scaling down
scaleUp:
stabilizationWindowSeconds: 0 # Scale up immediately
# KEDA
spec:
cooldownPeriod: 300
```
### 4. Set Sensible Min/Max Replicas
Always configure minimum and maximum replicas in your HPA/KEDA to prevent:
- Scaling to zero (unless intentional)
- Unbounded scaling that exhausts cluster resources
## Troubleshooting
### Adapters Not Created
```bash
# Check DGD status
kubectl describe dgd sglang-agg -n default
# Check operator logs
kubectl logs -n dynamo-system deployment/dynamo-operator
```
### Scaling Not Working
```bash
# Check adapter status
kubectl describe dgdsa sglang-agg-decode -n default
# Check HPA/KEDA status
kubectl describe hpa sglang-agg-decode-hpa -n default
kubectl describe scaledobject sglang-agg-decode-scaler -n default
# Verify metrics are available in Kubernetes metrics API
kubectl get --raw /apis/external.metrics.k8s.io/v1beta1
```
### Metrics Not Available
If HPA/KEDA shows `<unknown>` for metrics:
```bash
# Check if Dynamo metrics are being scraped
kubectl port-forward -n default svc/sglang-agg-frontend 8000:8000
curl http://localhost:8000/metrics | grep dynamo_frontend
# Example output:
# dynamo_frontend_queued_requests{model="Qwen/Qwen3-0.6B"} 2
# dynamo_frontend_inflight_requests{model="Qwen/Qwen3-0.6B"} 5
# Verify Prometheus is scraping the metrics
kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
# Then query: dynamo_frontend_time_to_first_token_seconds_bucket
# Check KEDA operator logs
kubectl logs -n keda deployment/keda-operator
```
### Rapid Scaling Up and Down
If you see unstable scaling:
1. Check if multiple autoscalers are targeting the same adapter
2. Increase `cooldownPeriod` in KEDA ScaledObject
3. Increase `stabilizationWindowSeconds` in HPA behavior
## References
- [Kubernetes HPA Documentation](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
- [KEDA Documentation](https://keda.sh/)
- [Prometheus Adapter](https://github.com/kubernetes-sigs/prometheus-adapter)
- [Planner Documentation](../planner/sla_planner.md)
- [Dynamo Metrics Reference](../observability/metrics.md)
- [Prometheus and Grafana Setup](../observability/prometheus-grafana.md)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment