feat: add scaling adapter (#4699)

Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>

feat: add scaling adapter (#4699)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
09f2314d · Julien Mancuso · GitHub · 1f9b69b0 · 09f2314d · 09f2314d
Unverified Commit 09f2314d authored Dec 09, 2025 by Julien Mancuso Committed by GitHub Dec 09, 2025
13 changed files
--- a/deploy/cloud/operator/internal/dynamo/graph.go
+++ b/deploy/cloud/operator/internal/dynamo/graph.go
@@ -1034,7 +1034,7 @@ func GenerateGrovePodCliqueSet(
 					PodSpec:      *podSpec,
 				},
 			}
-			labels, err := generateLabels(component, dynamoDeployment, r.Name)
+			labels, err := generateLabels(component, dynamoDeployment, serviceName)
 			if err != nil {
 				return nil, fmt.Errorf("failed to generate labels: %w", err)
 			}
@@ -1075,6 +1075,7 @@ func generateLabels(component *v1alpha1.DynamoComponentDeploymentSharedSpec, dyn
 	labels := make(map[string]string)
 	labels[commonconsts.KubeLabelDynamoSelector] = GetDynamoComponentName(dynamoDeployment, componentName)
 	labels[commonconsts.KubeLabelDynamoGraphDeploymentName] = dynamoDeployment.Name
+	labels[commonconsts.KubeLabelDynamoComponent] = componentName
 	if component.DynamoNamespace != nil {
 		labels[commonconsts.KubeLabelDynamoNamespace] = *component.DynamoNamespace
 	}

--- a/deploy/cloud/operator/internal/dynamo/graph_test.go
+++ b/deploy/cloud/operator/internal/dynamo/graph_test.go
@@ -121,7 +121,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
 								commonconsts.KubeLabelDynamoNamespace:           "default-test-dynamographdeployment",
 								commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment",
 							},
-							Autoscaling: nil,
 						},
 					},
 				},
@@ -153,7 +152,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
 									Custom: map[string]string{},
 								},
 							},
-							Autoscaling: nil,
 						},
 					},
 				},
@@ -229,7 +227,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
 								commonconsts.KubeLabelDynamoNamespace:           "default-test-dynamographdeployment",
 								commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment",
 							},
-							Autoscaling: nil,
 						},
 					},
 				},
@@ -261,7 +258,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
 									Custom: map[string]string{},
 								},
 							},
-							Autoscaling: nil,
 						},
 					},
 				},
@@ -341,7 +337,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
 								commonconsts.KubeLabelDynamoNamespace:           "default-test-dynamographdeployment",
 								commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment",
 							},
-							Autoscaling: nil,
 							Ingress: &v1alpha1.IngressSpec{
 								Enabled: true,
 								Host:    "test-dynamographdeployment",
@@ -377,7 +372,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
 									Custom: map[string]string{},
 								},
 							},
-							Autoscaling: nil,
 						},
 					},
 				},
@@ -465,7 +459,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
 								commonconsts.KubeLabelDynamoNamespace:           "default-test-dynamographdeployment",
 								commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment",
 							},
-							Autoscaling: nil,
 							Envs: []corev1.EnvVar{
 								{
 									Name:  "DYN_DEPLOYMENT_CONFIG",
@@ -503,7 +496,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
 									Custom: map[string]string{},
 								},
 							},
-							Autoscaling: nil,
 							Envs: []corev1.EnvVar{
 								{
 									Name:  "DYN_DEPLOYMENT_CONFIG",
@@ -599,7 +591,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
 								commonconsts.KubeLabelDynamoNamespace:           "default-test-dynamographdeployment",
 								commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamographdeployment",
 							},
-							Autoscaling: nil,
 							ExtraPodSpec: &v1alpha1.ExtraPodSpec{
 								MainContainer: &corev1.Container{
 									Command: []string{"sh", "-c"},
@@ -644,7 +635,6 @@ func TestGenerateDynamoComponentsDeployments(t *testing.T) {
 									Custom: map[string]string{},
 								},
 							},
-							Autoscaling: nil,
 							Envs: []corev1.EnvVar{
 								{
 									Name:  "TEST_ENV",
@@ -1307,6 +1297,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 								Name: "frontend",
 								Labels: map[string]string{
 									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-frontend",
+									commonconsts.KubeLabelDynamoComponent:           "Frontend",
 									commonconsts.KubeLabelMetricsEnabled:            commonconsts.KubeLabelValueTrue,
 									commonconsts.KubeLabelDynamoComponentType:       commonconsts.ComponentTypeFrontend,
 									commonconsts.KubeLabelDynamoSubComponentType:    "test-sub-component",
@@ -1483,6 +1474,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 								Labels: map[string]string{
 									commonconsts.KubeLabelMetricsEnabled:            commonconsts.KubeLabelValueTrue,
 									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-planner",
+									commonconsts.KubeLabelDynamoComponent:           "Planner",
 									commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
 									commonconsts.KubeLabelDynamoComponentType:       commonconsts.ComponentTypePlanner,
 									commonconsts.KubeLabelDynamoNamespace:           "test-namespace-test-dynamo-graph-deployment",
@@ -1884,8 +1876,9 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 									commonconsts.KubeLabelDynamoComponentType:       commonconsts.ComponentTypeWorker,
 									commonconsts.KubeLabelDynamoSubComponentType:    "test-sub-component",
 									commonconsts.KubeLabelMetricsEnabled:            commonconsts.KubeLabelValueTrue,
-									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-worker-ldr",
+									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-worker",
 									commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
+									commonconsts.KubeLabelDynamoComponent:           "worker",
 									commonconsts.KubeLabelDynamoNamespace:           "test-namespace-test-dynamo-graph-deployment",
 									"nvidia.com/label1":                             "label1",
 									"nvidia.com/label2":                             "label2",
@@ -2059,8 +2052,9 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 									commonconsts.KubeLabelDynamoComponentType:       commonconsts.ComponentTypeWorker,
 									commonconsts.KubeLabelDynamoSubComponentType:    "test-sub-component",
 									commonconsts.KubeLabelMetricsEnabled:            commonconsts.KubeLabelValueTrue,
-									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-worker-wkr",
+									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-worker",
 									commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
+									commonconsts.KubeLabelDynamoComponent:           "worker",
 									commonconsts.KubeLabelDynamoNamespace:           "test-namespace-test-dynamo-graph-deployment",
 									"nvidia.com/label1":                             "label1",
 									"nvidia.com/label2":                             "label2",
@@ -2200,6 +2194,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-frontend",
 									commonconsts.KubeLabelDynamoComponentType:       commonconsts.ComponentTypeFrontend,
 									commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
+									commonconsts.KubeLabelDynamoComponent:           "Frontend",
 									commonconsts.KubeLabelDynamoNamespace:           "test-namespace-test-dynamo-graph-deployment",
 								},
 								Annotations: map[string]string{},
@@ -2358,6 +2353,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 								Name: "planner",
 								Labels: map[string]string{
 									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-planner",
+									commonconsts.KubeLabelDynamoComponent:           "Planner",
 									commonconsts.KubeLabelMetricsEnabled:            commonconsts.KubeLabelValueTrue,
 									commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
 									commonconsts.KubeLabelDynamoComponentType:       commonconsts.ComponentTypePlanner,
@@ -2779,7 +2775,8 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 							{
 								Name: "worker-ldr",
 								Labels: map[string]string{
-									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-worker-ldr",
+									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-worker",
+									commonconsts.KubeLabelDynamoComponent:           "worker",
 									commonconsts.KubeLabelMetricsEnabled:            commonconsts.KubeLabelValueTrue,
 									commonconsts.KubeLabelDynamoComponentType:       commonconsts.ComponentTypeWorker,
 									commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
@@ -2943,7 +2940,8 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 								Labels: map[string]string{
 									commonconsts.KubeLabelDynamoComponentType:       commonconsts.ComponentTypeWorker,
 									commonconsts.KubeLabelMetricsEnabled:            commonconsts.KubeLabelValueTrue,
-									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-worker-wkr",
+									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-worker",
+									commonconsts.KubeLabelDynamoComponent:           "worker",
 									commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
 									commonconsts.KubeLabelDynamoNamespace:           "test-namespace-test-dynamo-graph-deployment",
 									"nvidia.com/label1":                             "label1",
@@ -3084,6 +3082,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 									commonconsts.KubeLabelMetricsEnabled:            commonconsts.KubeLabelValueTrue,
 									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-frontend",
 									commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
+									commonconsts.KubeLabelDynamoComponent:           "Frontend",
 									commonconsts.KubeLabelDynamoNamespace:           "test-namespace-test-dynamo-graph-deployment",
 								},
 								Annotations: map[string]string{},
@@ -3243,6 +3242,7 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
 								Labels: map[string]string{
 									commonconsts.KubeLabelMetricsEnabled:            commonconsts.KubeLabelValueTrue,
 									commonconsts.KubeLabelDynamoSelector:            "test-dynamo-graph-deployment-planner",
+									commonconsts.KubeLabelDynamoComponent:           "Planner",
 									commonconsts.KubeLabelDynamoGraphDeploymentName: "test-dynamo-graph-deployment",
 									commonconsts.KubeLabelDynamoComponentType:       commonconsts.ComponentTypePlanner,
 									commonconsts.KubeLabelDynamoNamespace:           "test-namespace-test-dynamo-graph-deployment",

--- a/deploy/cloud/operator/internal/webhook/common.go
+++ b/deploy/cloud/operator/internal/webhook/common.go
@@ -19,7 +19,9 @@ package webhook

 import (
 	"context"
+	"strings"

+	authenticationv1 "k8s.io/api/authentication/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	logf "sigs.k8s.io/controller-runtime/pkg/log"
@@ -118,3 +120,54 @@ func (v *LeaseAwareValidator) shouldSkipValidation(obj runtime.Object) bool {

 	return false
 }
+
+// DGDReplicasModifierSuffixes defines suffixes for service accounts that are authorized
+// to modify DGD replicas when scaling adapter is enabled.
+// Service accounts matching any of these suffixes are allowed regardless of namespace.
+var DGDReplicasModifierSuffixes = []string{
+	// Dynamo operator controller manager (handles DGDSA reconciliation)
+	// Example: "dynamo-platform-dynamo-operator-controller-manager"
+	"-dynamo-operator-controller-manager",
+
+	// Planner service account (manages DGD replicas for autoscaling)
+	// Example: "planner-serviceaccount"
+	"planner-serviceaccount",
+}
+
+// CanModifyDGDReplicas checks if the request comes from a service account authorized
+// to modify DGD replicas when scaling adapter is enabled.
+// Service accounts are identified by username format: system:serviceaccount:<namespace>:<name>
+//
+// Authorized service accounts (by suffix):
+// - *-dynamo-operator-controller-manager (for DGDSA reconciliation)
+// - *planner-serviceaccount (for Planner autoscaling)
+func CanModifyDGDReplicas(userInfo authenticationv1.UserInfo) bool {
+	username := userInfo.Username
+
+	// Service accounts have username format: system:serviceaccount:<namespace>:<name>
+	if !strings.HasPrefix(username, "system:serviceaccount:") {
+		return false
+	}
+
+	// Parse: system:serviceaccount:<namespace>:<name>
+	parts := strings.Split(username, ":")
+	if len(parts) != 4 {
+		return false
+	}
+
+	namespace := parts[2]
+	saName := parts[3]
+
+	// Check against authorized suffixes
+	for _, suffix := range DGDReplicasModifierSuffixes {
+		if strings.HasSuffix(saName, suffix) {
+			webhookCommonLog.V(1).Info("allowing DGD replicas modification",
+				"serviceAccount", saName,
+				"namespace", namespace,
+				"matchedSuffix", suffix)
+			return true
+		}
+	}
+
+	return false
+}
--- a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go
+++ b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment.go
@@ -42,13 +42,10 @@ func NewDynamoComponentDeploymentValidator(deployment *nvidiacomv1alpha1.DynamoC
 func (v *DynamoComponentDeploymentValidator) Validate() (admission.Warnings, error) {
 	// Validate shared spec fields using SharedSpecValidator
 	sharedValidator := NewSharedSpecValidator(&v.deployment.Spec.DynamoComponentDeploymentSharedSpec, "spec")
-	if err := sharedValidator.Validate(); err != nil {
-		return nil, err
-	}

 	// DCD-specific validation would go here (currently none)

-	return nil, nil
+	return sharedValidator.Validate()
 }

 // ValidateUpdate performs stateful validation comparing old and new DynamoComponentDeployment.

--- a/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment_test.go
+++ b/deploy/cloud/operator/internal/webhook/validation/dynamocomponentdeployment_test.go
@@ -47,11 +47,6 @@ func TestDynamoComponentDeploymentValidator_Validate(t *testing.T) {
 				Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
 					DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
 						Replicas: &validReplicas,
-						Autoscaling: &nvidiacomv1alpha1.Autoscaling{
-							Enabled:     true,
-							MinReplicas: 1,
-							MaxReplicas: 10,
-						},
 					},
 					BackendFramework: "sglang",
 				},
@@ -74,26 +69,6 @@ func TestDynamoComponentDeploymentValidator_Validate(t *testing.T) {
 			wantErr: true,
 			errMsg:  "spec.replicas must be non-negative",
 		},
-		{
-			name: "invalid autoscaling",
-			deployment: &nvidiacomv1alpha1.DynamoComponentDeployment{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      "test-deployment",
-					Namespace: "default",
-				},
-				Spec: nvidiacomv1alpha1.DynamoComponentDeploymentSpec{
-					DynamoComponentDeploymentSharedSpec: nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
-						Autoscaling: &nvidiacomv1alpha1.Autoscaling{
-							Enabled:     true,
-							MinReplicas: 5,
-							MaxReplicas: 3,
-						},
-					},
-				},
-			},
-			wantErr: true,
-			errMsg:  "spec.autoscaling.maxReplicas must be > minReplicas",
-		},
 		{
 			name: "invalid ingress",
 			deployment: &nvidiacomv1alpha1.DynamoComponentDeployment{

--- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go
+++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment.go
@@ -22,6 +22,8 @@ import (
 	"fmt"

 	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
+	internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
+	authenticationv1 "k8s.io/api/authentication/v1"
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 )

@@ -51,30 +53,106 @@ func (v *DynamoGraphDeploymentValidator) Validate() (admission.Warnings, error)
 		return nil, err
 	}

+	var allWarnings admission.Warnings
+
 	// Validate each service
 	for serviceName, service := range v.deployment.Spec.Services {
-		if err := v.validateService(serviceName, service); err != nil {
+		warnings, err := v.validateService(serviceName, service)
+		if err != nil {
 			return nil, err
 		}
+		allWarnings = append(allWarnings, warnings...)
 	}

-	return nil, nil
+	return allWarnings, nil
 }

 // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeployment.
+// userInfo is used for identity-based validation (replica protection).
+// If userInfo is nil, replica changes for DGDSA-enabled services are rejected (fail closed).
 // Returns warnings and error.
-func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment) (admission.Warnings, error) {
-	// Validate that BackendFramework is not changed (immutable)
+func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) (admission.Warnings, error) {
+	var warnings admission.Warnings
+
+	// Validate immutable fields
+	if err := v.validateImmutableFields(old, &warnings); err != nil {
+		return warnings, err
+	}
+
+	// Validate replicas changes for services with scaling adapter enabled
+	// Pass userInfo (may be nil - will fail closed for DGDSA-enabled services)
+	if err := v.validateReplicasChanges(old, userInfo); err != nil {
+		return warnings, err
+	}
+
+	return warnings, nil
+}
+
+// validateImmutableFields checks that immutable fields have not been changed.
+// Appends warnings to the provided slice.
+func (v *DynamoGraphDeploymentValidator) validateImmutableFields(old *nvidiacomv1alpha1.DynamoGraphDeployment, warnings *admission.Warnings) error {
 	if v.deployment.Spec.BackendFramework != old.Spec.BackendFramework {
-		warning := "Changing spec.backendFramework may cause unexpected behavior"
-		return admission.Warnings{warning}, fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation")
+		*warnings = append(*warnings, "Changing spec.backendFramework may cause unexpected behavior")
+		return fmt.Errorf("spec.backendFramework is immutable and cannot be changed after creation")
+	}
+	return nil
+}
+
+// validateReplicasChanges checks if replicas were changed for services with scaling adapter enabled.
+// Only authorized service accounts (operator controller, planner) can modify these fields.
+// If userInfo is nil, all replica changes for DGDSA-enabled services are rejected (fail closed).
+func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) error {
+	// If the request comes from an authorized service account, allow the change
+	if userInfo != nil && internalwebhook.CanModifyDGDReplicas(*userInfo) {
+		return nil
+	}
+
+	var errs []error
+
+	for serviceName, newService := range v.deployment.Spec.Services {
+		// Check if scaling adapter is enabled for this service (enabled by default)
+		scalingAdapterEnabled := true
+		if newService.ScalingAdapter != nil && newService.ScalingAdapter.Disable {
+			scalingAdapterEnabled = false
+		}
+
+		if !scalingAdapterEnabled {
+			// Scaling adapter is disabled, users can modify replicas directly
+			continue
+		}
+
+		// Get old service (if exists)
+		oldService, exists := old.Spec.Services[serviceName]
+		if !exists {
+			// New service, no comparison needed
+			continue
+		}
+
+		// Check if replicas changed
+		oldReplicas := int32(1) // default
+		if oldService.Replicas != nil {
+			oldReplicas = *oldService.Replicas
+		}
+
+		newReplicas := int32(1) // default
+		if newService.Replicas != nil {
+			newReplicas = *newService.Replicas
 		}

-	return nil, nil
+		if oldReplicas != newReplicas {
+			errs = append(errs, fmt.Errorf(
+				"spec.services[%s].replicas cannot be modified directly when scaling adapter is enabled; "+
+					"scale or update the related DynamoGraphDeploymentScalingAdapter instead",
+				serviceName))
+		}
+	}
+
+	return errors.Join(errs...)
 }

 // validateService validates a single service configuration using SharedSpecValidator.
-func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) error {
+// Returns warnings and error.
+func (v *DynamoGraphDeploymentValidator) validateService(serviceName string, service *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec) (admission.Warnings, error) {
 	// Use SharedSpecValidator to validate service spec (which is a DynamoComponentDeploymentSharedSpec)
 	fieldPath := fmt.Sprintf("spec.services[%s]", serviceName)
 	sharedValidator := NewSharedSpecValidator(service, fieldPath)

--- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go
+++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_handler.go
@@ -23,6 +23,7 @@ import (

 	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
 	internalwebhook "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/webhook"
+	authenticationv1 "k8s.io/api/authentication/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/controller-runtime/pkg/manager"
@@ -91,9 +92,24 @@ func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldOb
 		return warnings, err
 	}

-	// Validate stateful rules (immutability)
-	updateWarnings, err := validator.ValidateUpdate(oldDeployment)
+	// Get user info from admission request context for identity-based validation
+	var userInfo *authenticationv1.UserInfo
+	req, err := admission.RequestFromContext(ctx)
 	if err != nil {
+		logger.Error(err, "failed to get admission request from context, replica changes for DGDSA-enabled services will be rejected")
+		// userInfo remains nil - validateReplicasChanges will fail closed
+	} else {
+		userInfo = &req.UserInfo
+	}
+
+	// Validate stateful rules (immutability + replicas protection)
+	updateWarnings, err := validator.ValidateUpdate(oldDeployment, userInfo)
+	if err != nil {
+		username := "<unknown>"
+		if userInfo != nil {
+			username = userInfo.Username
+		}
+		logger.Info("validation failed", "error", err.Error(), "user", username)
 		return updateWarnings, err
 	}


--- a/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go
+++ b/deploy/cloud/operator/internal/webhook/validation/dynamographdeployment_test.go
@@ -93,28 +93,6 @@ func TestDynamoGraphDeploymentValidator_Validate(t *testing.T) {
 			wantErr: true,
 			errMsg:  "spec.services[main].replicas must be non-negative",
 		},
-		{
-			name: "service with invalid autoscaling",
-			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
-				ObjectMeta: metav1.ObjectMeta{
-					Name:      "test-graph",
-					Namespace: "default",
-				},
-				Spec: nvidiacomv1alpha1.DynamoGraphDeploymentSpec{
-					Services: map[string]*nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
-						"prefill": {
-							Autoscaling: &nvidiacomv1alpha1.Autoscaling{
-								Enabled:     true,
-								MinReplicas: 10,
-								MaxReplicas: 5,
-							},
-						},
-					},
-				},
-			},
-			wantErr: true,
-			errMsg:  "spec.services[prefill].autoscaling.maxReplicas must be > minReplicas",
-		},
 		{
 			name: "service with invalid ingress",
 			deployment: &nvidiacomv1alpha1.DynamoGraphDeployment{
@@ -441,7 +419,8 @@ func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			validator := NewDynamoGraphDeploymentValidator(tt.newDeployment)
-			warnings, err := validator.ValidateUpdate(tt.oldDeployment)
+			// Pass nil userInfo - these tests don't modify replicas, so it's safe
+			warnings, err := validator.ValidateUpdate(tt.oldDeployment, nil)

 			if (err != nil) != tt.wantErr {
 				t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr)

--- a/deploy/cloud/operator/internal/webhook/validation/shared.go
+++ b/deploy/cloud/operator/internal/webhook/validation/shared.go
@@ -21,6 +21,7 @@ import (
 	"fmt"

 	nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 )

 // SharedSpecValidator validates DynamoComponentDeploymentSharedSpec fields.
@@ -41,61 +42,45 @@ func NewSharedSpecValidator(spec *nvidiacomv1alpha1.DynamoComponentDeploymentSha
 }

 // Validate performs validation on the shared spec fields.
-// Returns an error if validation fails.
-func (v *SharedSpecValidator) Validate() error {
+// Returns warnings (e.g., deprecation notices) and error if validation fails.
+func (v *SharedSpecValidator) Validate() (admission.Warnings, error) {
 	// Validate replicas if specified
 	if v.spec.Replicas != nil && *v.spec.Replicas < 0 {
-		return fmt.Errorf("%s.replicas must be non-negative", v.fieldPath)
-	}
-
-	// Validate autoscaling configuration if specified
-	if v.spec.Autoscaling != nil {
-		if err := v.validateAutoscaling(); err != nil {
-			return err
-		}
+		return nil, fmt.Errorf("%s.replicas must be non-negative", v.fieldPath)
 	}

 	// Validate ingress configuration if enabled
 	if v.spec.Ingress != nil && v.spec.Ingress.Enabled {
 		if err := v.validateIngress(); err != nil {
-			return err
+			return nil, err
 		}
 	}

 	// Validate volume mounts
 	if err := v.validateVolumeMounts(); err != nil {
-		return err
+		return nil, err
 	}

 	// Validate shared memory
 	if v.spec.SharedMemory != nil {
 		if err := v.validateSharedMemory(); err != nil {
-			return err
-		}
+			return nil, err
 		}
-
-	return nil
-}
-
-// validateAutoscaling validates the autoscaling configuration.
-func (v *SharedSpecValidator) validateAutoscaling() error {
-	autoscaling := v.spec.Autoscaling
-
-	if !autoscaling.Enabled {
-		return nil
 	}

-	// Validate minReplicas
-	if autoscaling.MinReplicas < 1 {
-		return fmt.Errorf("%s.autoscaling.minReplicas must be >= 1", v.fieldPath)
-	}
+	// Collect warnings (e.g., deprecation notices)
+	var warnings admission.Warnings

-	// Validate maxReplicas
-	if autoscaling.MaxReplicas <= autoscaling.MinReplicas {
-		return fmt.Errorf("%s.autoscaling.maxReplicas must be > minReplicas", v.fieldPath)
+	// Check for deprecated autoscaling field
+	//nolint:staticcheck // SA1019: Intentionally checking deprecated field to warn users
+	if v.spec.Autoscaling != nil {
+		warnings = append(warnings, fmt.Sprintf(
+			"%s.autoscaling is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter "+
+				"with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md",
+			v.fieldPath))
 	}

-	return nil
+	return warnings, nil
 }

 // validateIngress validates the ingress configuration.

--- a/deploy/cloud/operator/internal/webhook/validation/shared_test.go
+++ b/deploy/cloud/operator/internal/webhook/validation/shared_test.go
@@ -41,11 +41,6 @@ func TestSharedSpecValidator_Validate(t *testing.T) {
 			name: "valid spec with all fields",
 			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
 				Replicas: &validReplicas,
-				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
-					Enabled:     true,
-					MinReplicas: 1,
-					MaxReplicas: 10,
-				},
 				Ingress: &nvidiacomv1alpha1.IngressSpec{
 					Enabled: true,
 					Host:    "example.com",
@@ -77,44 +72,6 @@ func TestSharedSpecValidator_Validate(t *testing.T) {
 			wantErr:   true,
 			errMsg:    "spec.replicas must be non-negative",
 		},
-		{
-			name: "autoscaling minReplicas too low",
-			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
-				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
-					Enabled:     true,
-					MinReplicas: 0,
-					MaxReplicas: 10,
-				},
-			},
-			fieldPath: "spec",
-			wantErr:   true,
-			errMsg:    "spec.autoscaling.minReplicas must be >= 1",
-		},
-		{
-			name: "autoscaling maxReplicas less than minReplicas",
-			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
-				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
-					Enabled:     true,
-					MinReplicas: 5,
-					MaxReplicas: 3,
-				},
-			},
-			fieldPath: "spec",
-			wantErr:   true,
-			errMsg:    "spec.autoscaling.maxReplicas must be > minReplicas",
-		},
-		{
-			name: "autoscaling disabled - no validation",
-			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
-				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
-					Enabled:     false,
-					MinReplicas: 0,
-					MaxReplicas: 0,
-				},
-			},
-			fieldPath: "spec",
-			wantErr:   false,
-		},
 		{
 			name: "ingress enabled without host",
 			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
@@ -227,7 +184,7 @@ func TestSharedSpecValidator_Validate(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			validator := NewSharedSpecValidator(tt.spec, tt.fieldPath)
-			err := validator.Validate()
+			_, err := validator.Validate()

 			if (err != nil) != tt.wantErr {
 				t.Errorf("SharedSpecValidator.Validate() error = %v, wantErr %v", err, tt.wantErr)
@@ -240,3 +197,53 @@ func TestSharedSpecValidator_Validate(t *testing.T) {
 		})
 	}
 }
+
+func TestSharedSpecValidator_Validate_Warnings(t *testing.T) {
+	validReplicas := int32(3)
+
+	tests := []struct {
+		name         string
+		spec         *nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec
+		fieldPath    string
+		wantWarnings int
+	}{
+		{
+			name: "no warnings for spec without autoscaling",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				Replicas: &validReplicas,
+			},
+			fieldPath:    "spec",
+			wantWarnings: 0,
+		},
+		{
+			name: "warning for deprecated autoscaling field enabled",
+			spec: &nvidiacomv1alpha1.DynamoComponentDeploymentSharedSpec{
+				Replicas: &validReplicas,
+				//nolint:staticcheck // SA1019: Intentionally testing deprecated field
+				Autoscaling: &nvidiacomv1alpha1.Autoscaling{
+					Enabled:     true,
+					MinReplicas: 1,
+					MaxReplicas: 10,
+				},
+			},
+			fieldPath:    "spec",
+			wantWarnings: 1,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			validator := NewSharedSpecValidator(tt.spec, tt.fieldPath)
+			warnings, err := validator.Validate()
+
+			if err != nil {
+				t.Errorf("SharedSpecValidator.Validate() unexpected error = %v", err)
+				return
+			}
+
+			if len(warnings) != tt.wantWarnings {
+				t.Errorf("SharedSpecValidator.Validate() warnings count = %d, want %d", len(warnings), tt.wantWarnings)
+			}
+		})
+	}
+}
--- a/docs/_sections/k8s_deployment.rst
+++ b/docs/_sections/k8s_deployment.rst
@@ -10,3 +10,4 @@ Deployment Guide
   Webhooks <../kubernetes/webhooks>
   Minikube Setup <../kubernetes/deployment/minikube>
   Managing Models with DynamoModel <../kubernetes/deployment/dynamomodel-guide>
+   Autoscaling <../kubernetes/autoscaling>
--- a/docs/kubernetes/api_reference.md
+++ b/docs/kubernetes/api_reference.md
@@ -37,6 +37,7 @@ Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API
 - [DynamoComponentDeployment](#dynamocomponentdeployment)
 - [DynamoGraphDeployment](#dynamographdeployment)
 - [DynamoGraphDeploymentRequest](#dynamographdeploymentrequest)
+- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
 - [DynamoModel](#dynamomodel)


@@ -45,7 +46,9 @@ Package v1alpha1 contains API Schema definitions for the nvidia.com v1alpha1 API



-
+Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter
+with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md
+for migration guidance. This field will be removed in a future API version.



@@ -55,11 +58,11 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `enabled` _boolean_ |  |  |  |
-| `minReplicas` _integer_ |  |  |  |
-| `maxReplicas` _integer_ |  |  |  |
-| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ |  |  |  |
-| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ |  |  |  |
+| `enabled` _boolean_ | Deprecated: This field is ignored. |  |  |
+| `minReplicas` _integer_ | Deprecated: This field is ignored. |  |  |
+| `maxReplicas` _integer_ | Deprecated: This field is ignored. |  |  |
+| `behavior` _[HorizontalPodAutoscalerBehavior](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#horizontalpodautoscalerbehavior-v2-autoscaling)_ | Deprecated: This field is ignored. |  |  |
+| `metrics` _[MetricSpec](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#metricspec-v2-autoscaling) array_ | Deprecated: This field is ignored. |  |  |



@@ -165,7 +168,7 @@ _Appears in:_
 | `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.<br />The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component |  | Optional: \{\} <br /> |
 | `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace |  |  |
 | `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,<br />GPUs/devices, and any runtime-specific resources. |  |  |
-| `autoscaling` _[Autoscaling](#autoscaling)_ | Autoscaling config for this component (replica range, target utilization, etc.). |  |  |
+| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter<br />with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md<br />for migration guidance. This field will be removed in a future API version. |  |  |
 | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. |  |  |
 | `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as<br />environment variables in the component containers. |  |  |
 | `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. |  |  |
@@ -176,8 +179,9 @@ _Appears in:_
 | `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.<br />It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field<br />that allows overriding the main container configuration. |  |  |
 | `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. |  |  |
 | `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. |  |  |
-| `replicas` _integer_ | Replicas is the desired number of Pods for this component when autoscaling is not used. |  |  |
+| `replicas` _integer_ | Replicas is the desired number of Pods for this component.<br />When scalingAdapter is enabled (default), this field is managed by the<br />DynamoGraphDeploymentScalingAdapter and should not be modified directly. |  | Minimum: 0 <br /> |
 | `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. |  |  |
+| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled (default), replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. |  |  |


 #### DynamoComponentDeploymentSpec
@@ -202,7 +206,7 @@ _Appears in:_
 | `dynamoNamespace` _string_ | DynamoNamespace is deprecated and will be removed in a future version.<br />The DGD Kubernetes namespace and DynamoGraphDeployment name are used to construct the Dynamo namespace for each component |  | Optional: \{\} <br /> |
 | `globalDynamoNamespace` _boolean_ | GlobalDynamoNamespace indicates that the Component will be placed in the global Dynamo namespace |  |  |
 | `resources` _[Resources](#resources)_ | Resources requested and limits for this component, including CPU, memory,<br />GPUs/devices, and any runtime-specific resources. |  |  |
-| `autoscaling` _[Autoscaling](#autoscaling)_ | Autoscaling config for this component (replica range, target utilization, etc.). |  |  |
+| `autoscaling` _[Autoscaling](#autoscaling)_ | Deprecated: This field is deprecated and ignored. Use DynamoGraphDeploymentScalingAdapter<br />with HPA, KEDA, or Planner for autoscaling instead. See docs/kubernetes/autoscaling.md<br />for migration guidance. This field will be removed in a future API version. |  |  |
 | `envs` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#envvar-v1-core) array_ | Envs defines additional environment variables to inject into the component containers. |  |  |
 | `envFromSecret` _string_ | EnvFromSecret references a Secret whose key/value pairs will be exposed as<br />environment variables in the component containers. |  |  |
 | `volumeMounts` _[VolumeMount](#volumemount) array_ | VolumeMounts references PVCs defined at the top level for volumes to be mounted by the component. |  |  |
@@ -213,8 +217,9 @@ _Appears in:_
 | `extraPodSpec` _[ExtraPodSpec](#extrapodspec)_ | ExtraPodSpec allows to override the main pod spec configuration.<br />It is a k8s standard PodSpec. It also contains a MainContainer (standard k8s Container) field<br />that allows overriding the main container configuration. |  |  |
 | `livenessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | LivenessProbe to detect and restart unhealthy containers. |  |  |
 | `readinessProbe` _[Probe](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#probe-v1-core)_ | ReadinessProbe to signal when the container is ready to receive traffic. |  |  |
-| `replicas` _integer_ | Replicas is the desired number of Pods for this component when autoscaling is not used. |  |  |
+| `replicas` _integer_ | Replicas is the desired number of Pods for this component.<br />When scalingAdapter is enabled (default), this field is managed by the<br />DynamoGraphDeploymentScalingAdapter and should not be modified directly. |  | Minimum: 0 <br /> |
 | `multinode` _[MultinodeSpec](#multinodespec)_ | Multinode is the configuration for multinode components. |  |  |
+| `scalingAdapter` _[ScalingAdapter](#scalingadapter)_ | ScalingAdapter configures whether this service uses the DynamoGraphDeploymentScalingAdapter.<br />When enabled (default), replicas are managed via DGDSA and external autoscalers can scale<br />the service using the Scale subresource. When disabled, replicas can be modified directly. |  |  |


 #### DynamoGraphDeployment
@@ -314,6 +319,83 @@ _Appears in:_
 | `deployment` _[DeploymentStatus](#deploymentstatus)_ | Deployment tracks the auto-created DGD when AutoApply is true.<br />Contains name, namespace, state, and creation status of the managed DGD. |  | Optional: \{\} <br /> |


+#### DynamoGraphDeploymentScalingAdapter
+
+
+
+DynamoGraphDeploymentScalingAdapter provides a scaling interface for individual services
+within a DynamoGraphDeployment. It implements the Kubernetes scale
+subresource, enabling integration with HPA, KEDA, and custom autoscalers.
+
+The adapter acts as an intermediary between autoscalers and the DGD,
+ensuring that only the adapter controller modifies the DGD's service replicas.
+This prevents conflicts when multiple autoscaling mechanisms are in play.
+
+
+
+
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `apiVersion` _string_ | `nvidia.com/v1alpha1` | | |
+| `kind` _string_ | `DynamoGraphDeploymentScalingAdapter` | | |
+| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
+| `spec` _[DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)_ |  |  |  |
+| `status` _[DynamoGraphDeploymentScalingAdapterStatus](#dynamographdeploymentscalingadapterstatus)_ |  |  |  |
+
+
+#### DynamoGraphDeploymentScalingAdapterSpec
+
+
+
+DynamoGraphDeploymentScalingAdapterSpec defines the desired state of DynamoGraphDeploymentScalingAdapter
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `replicas` _integer_ | Replicas is the desired number of replicas for the target service.<br />This field is modified by external autoscalers (HPA/KEDA/Planner) or manually by users. |  | Minimum: 0 <br />Required: \{\} <br /> |
+| `dgdRef` _[DynamoGraphDeploymentServiceRef](#dynamographdeploymentserviceref)_ | DGDRef references the DynamoGraphDeployment and the specific service to scale. |  | Required: \{\} <br /> |
+
+
+#### DynamoGraphDeploymentScalingAdapterStatus
+
+
+
+DynamoGraphDeploymentScalingAdapterStatus defines the observed state of DynamoGraphDeploymentScalingAdapter
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentScalingAdapter](#dynamographdeploymentscalingadapter)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `replicas` _integer_ | Replicas is the current number of replicas for the target service.<br />This is synced from the DGD's service replicas and is required for the scale subresource. |  |  |
+| `selector` _string_ | Selector is a label selector string for the pods managed by this adapter.<br />Required for HPA compatibility via the scale subresource. |  |  |
+| `lastScaleTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#time-v1-meta)_ | LastScaleTime is the last time the adapter scaled the target service. |  |  |
+
+
+#### DynamoGraphDeploymentServiceRef
+
+
+
+DynamoGraphDeploymentServiceRef identifies a specific service within a DynamoGraphDeployment
+
+
+
+_Appears in:_
+- [DynamoGraphDeploymentScalingAdapterSpec](#dynamographdeploymentscalingadapterspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `name` _string_ | Name of the DynamoGraphDeployment |  | MinLength: 1 <br />Required: \{\} <br /> |
+| `serviceName` _string_ | ServiceName is the key name of the service within the DGD's spec.services map to scale |  | MinLength: 1 <br />Required: \{\} <br /> |
+
+
 #### DynamoGraphDeploymentSpec


@@ -638,6 +720,25 @@ _Appears in:_
 | `claims` _[ResourceClaim](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#resourceclaim-v1-core) array_ | Claims specifies resource claims for dynamic resource allocation |  |  |


+#### ScalingAdapter
+
+
+
+ScalingAdapter configures whether a service uses the DynamoGraphDeploymentScalingAdapter
+for replica management. When enabled (default), the DGDSA owns the replicas field and
+external autoscalers (HPA, KEDA, Planner) can control scaling via the Scale subresource.
+
+
+
+_Appears in:_
+- [DynamoComponentDeploymentSharedSpec](#dynamocomponentdeploymentsharedspec)
+- [DynamoComponentDeploymentSpec](#dynamocomponentdeploymentspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `disable` _boolean_ | Disable indicates whether the ScalingAdapter should be disabled for this service.<br />When false (default), a DGDSA is created and owns the replicas field.<br />When true, no DGDSA is created and replicas can be modified directly in the DGD. | false |  |
+
+
 #### SharedMemorySpec



--- a/docs/kubernetes/autoscaling.md
+++ b/docs/kubernetes/autoscaling.md
+# Autoscaling
+
+This guide explains how to configure autoscaling for DynamoGraphDeployment (DGD) services using the `sglang-agg` example from `examples/backends/sglang/deploy/agg.yaml`.
+
+## Example DGD
+
+All examples in this guide use the following DGD:
+
+```yaml
+# examples/backends/sglang/deploy/agg.yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: sglang-agg
+  namespace: default
+spec:
+  services:
+    Frontend:
+      dynamoNamespace: sglang-agg
+      componentType: frontend
+      replicas: 1
+
+    decode:
+      dynamoNamespace: sglang-agg
+      componentType: worker
+      replicas: 1
+      resources:
+        limits:
+          gpu: "1"
+```
+
+**Key identifiers:**
+- **DGD name**: `sglang-agg`
+- **Namespace**: `default`
+- **Services**: `Frontend`, `decode`
+- **dynamo_namespace label**: `default-sglang-agg` (used for metric filtering)
+
+## Overview
+
+Dynamo provides flexible autoscaling through the `DynamoGraphDeploymentScalingAdapter` (DGDSA) resource. When you deploy a DGD, the operator automatically creates one adapter per service (unless explicitly disabled). These adapters implement the Kubernetes [Scale subresource](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#scale-subresource), enabling integration with:
+
+| Autoscaler | Description | Best For |
+|------------|-------------|----------|
+| **KEDA** | Event-driven autoscaling (recommended) | Most use cases |
+| **Kubernetes HPA** | Native horizontal pod autoscaling | Simple CPU/memory-based scaling |
+| **Dynamo Planner** | LLM-aware autoscaling with SLA optimization | Production LLM workloads |
+| **Custom Controllers** | Any scale-subresource-compatible controller | Custom requirements |
+
+> **⚠️ Deprecation Notice**: The `spec.services[X].autoscaling` field in DGD is **deprecated and ignored**. Use DGDSA with HPA, KEDA, or Planner instead. If you have existing DGDs with `autoscaling` configured, you'll see a warning. Remove the field to silence the warning.
+
+## Architecture
+
+```
+┌──────────────────────────────────┐          ┌─────────────────────────────────────┐
+│   DynamoGraphDeployment          │          │   Scaling Adapters (auto-created)   │
+│   "sglang-agg"                   │          │   (one per service)                 │
+├──────────────────────────────────┤          ├─────────────────────────────────────┤
+│                                  │          │                                     │
+│  spec.services:                  │          │  ┌─────────────────────────────┐    │      ┌──────────────────┐
+│                                  │          │  │ sglang-agg-frontend         │◄───┼──────│   Autoscalers    │
+│    ┌────────────────────────┐◄───┼──────────┼──│ spec.replicas: 1            │    │      │                  │
+│    │ Frontend: 1 replica    │    │          │  └─────────────────────────────┘    │      │  • KEDA          │
+│    └────────────────────────┘    │          │                                     │      │  • HPA           │
+│                                  │          │  ┌─────────────────────────────┐    │      │  • Planner       │
+│    ┌────────────────────────┐◄───┼──────────┼──│ sglang-agg-decode           │◄───┼──────│  • Custom        │
+│    │ decode:   1 replica    │    │          │  │ spec.replicas: 1            │    │      │                  │
+│    └────────────────────────┘    │          │  └─────────────────────────────┘    │      └──────────────────┘
+│                                  │          │                                     │
+└──────────────────────────────────┘          └─────────────────────────────────────┘
+```
+
+**How it works:**
+
+1. You deploy a DGD with services (Frontend, decode)
+2. The operator auto-creates one DGDSA per service
+3. Autoscalers (KEDA, HPA, Planner) target the adapters via `/scale` subresource
+4. Adapter controller syncs replica changes to the DGD
+5. DGD controller reconciles the underlying pods
+
+## Viewing Scaling Adapters
+
+After deploying the `sglang-agg` DGD, verify the auto-created adapters:
+
+```bash
+kubectl get dgdsa -n default
+
+# Example output:
+# NAME                  DGD         SERVICE    REPLICAS   AGE
+# sglang-agg-frontend   sglang-agg  Frontend   1          5m
+# sglang-agg-decode     sglang-agg  decode     1          5m
+```
+
+## Replica Ownership Model
+
+When DGDSA is enabled (the default), it becomes the **source of truth** for replica counts. This follows the same pattern as Kubernetes Deployments owning ReplicaSets.
+
+### How It Works
+
+1. **DGDSA owns replicas**: Autoscalers (HPA, KEDA, Planner) update the DGDSA's `spec.replicas`
+2. **DGDSA syncs to DGD**: The DGDSA controller writes the replica count to the DGD's service
+3. **Direct DGD edits blocked**: A validating webhook prevents users from directly editing `spec.services[X].replicas` in the DGD
+4. **Controllers allowed**: Only authorized controllers (operator, Planner) can modify DGD replicas
+
+### Manual Scaling with DGDSA Enabled
+
+When DGDSA is enabled, use `kubectl scale` on the adapter (not the DGD):
+
+```bash
+# ✅ Correct - scale via DGDSA
+kubectl scale dgdsa sglang-agg-decode --replicas=3
+
+# ❌ Blocked - direct DGD edit rejected by webhook
+kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}'
+# Error: spec.services[decode].replicas cannot be modified directly when scaling adapter is enabled;
+#        use 'kubectl scale dgdsa/sglang-agg-decode --replicas=3' or update the DynamoGraphDeploymentScalingAdapter instead
+```
+
+## Disabling DGDSA for a Service
+
+If you want to manage replicas directly in the DGD (without autoscaling), you can disable the scaling adapter per service:
+
+```yaml
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: sglang-agg
+spec:
+  services:
+    Frontend:
+      replicas: 2
+      scalingAdapter:
+        disable: true    # ← No DGDSA created, direct edits allowed
+
+    decode:
+      replicas: 1        # ← DGDSA created by default, managed via adapter
+```
+
+**When to disable DGDSA:**
+- You want simple, manual replica management
+- You don't need autoscaling for that service
+- You prefer direct DGD edits over adapter-based scaling
+
+**When to keep DGDSA enabled (default):**
+- You want to use HPA, KEDA, or Planner for autoscaling
+- You want a clear separation between "desired scale" (adapter) and "deployment config" (DGD)
+- You want protection against accidental direct replica edits
+
+## Autoscaling with Dynamo Planner
+
+The Dynamo Planner is an LLM-aware autoscaler that optimizes scaling decisions based on inference-specific metrics like Time To First Token (TTFT), Inter-Token Latency (ITL), and KV cache utilization.
+
+**When to use Planner:**
+- You want LLM-optimized autoscaling out of the box
+- You need coordinated scaling across prefill/decode services
+- You want SLA-driven scaling (e.g., target TTFT < 500ms)
+
+**How Planner works:**
+
+Planner is deployed as a service component within your DGD. It:
+1. Queries Prometheus for frontend metrics (request rate, latency, etc.)
+2. Uses profiling data to predict optimal replica counts
+3. Scales prefill/decode workers to meet SLA targets
+
+**Deployment:**
+
+The recommended way to deploy Planner is via `DynamoGraphDeploymentRequest` (DGDR). See the [SLA Planner Quick Start](../planner/sla_planner_quickstart.md) for complete instructions.
+
+Example configurations with Planner:
+- `examples/backends/vllm/deploy/disagg_planner.yaml`
+- `examples/backends/sglang/deploy/disagg_planner.yaml`
+- `examples/backends/trtllm/deploy/disagg_planner.yaml`
+
+For more details, see the [SLA Planner documentation](../planner/sla_planner.md).
+
+## Autoscaling with Kubernetes HPA
+
+The Horizontal Pod Autoscaler (HPA) is Kubernetes' native autoscaling solution.
+
+**When to use HPA:**
+- You have simple, predictable scaling requirements
+- You want to use standard Kubernetes tooling
+- You need CPU or memory-based scaling
+
+> **Note**: For custom metrics (like TTFT or queue depth), consider using [KEDA](#autoscaling-with-keda-recommended) instead - it's simpler to configure.
+
+### Basic HPA (CPU-based)
+
+```yaml
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: sglang-agg-frontend-hpa
+  namespace: default
+spec:
+  scaleTargetRef:
+    apiVersion: nvidia.com/v1alpha1
+    kind: DynamoGraphDeploymentScalingAdapter
+    name: sglang-agg-frontend
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      target:
+        type: Utilization
+        averageUtilization: 70
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 300
+    scaleUp:
+      stabilizationWindowSeconds: 0
+```
+
+### HPA with Dynamo Metrics
+
+Dynamo exports several metrics useful for autoscaling. These are available at the `/metrics` endpoint on each frontend pod.
+
+> **See also**: For a complete list of all Dynamo metrics, see the [Metrics Reference](../observability/metrics.md). For Prometheus and Grafana setup, see the [Prometheus and Grafana Setup Guide](../observability/prometheus-grafana.md).
+
+#### Available Dynamo Metrics
+
+| Metric | Type | Description | Good for scaling |
+|--------|------|-------------|------------------|
+| `dynamo_frontend_queued_requests` | Gauge | Requests waiting in HTTP queue | ✅ Workers |
+| `dynamo_frontend_inflight_requests` | Gauge | Concurrent requests to engine | ✅ All services |
+| `dynamo_frontend_time_to_first_token_seconds` | Histogram | TTFT latency | ✅ Workers |
+| `dynamo_frontend_inter_token_latency_seconds` | Histogram | ITL latency | ✅ Decode |
+| `dynamo_frontend_request_duration_seconds` | Histogram | Total request duration | ⚠️ General |
+| `kvstats_gpu_cache_usage_percent` | Gauge | GPU KV cache usage (0-1) | ✅ Decode |
+
+#### Metric Labels
+
+Dynamo metrics include these labels for filtering:
+
+| Label | Description | Example |
+|-------|-------------|---------|
+| `dynamo_namespace` | Unique DGD identifier (`{k8s-namespace}-{dynamoNamespace}`) | `default-sglang-agg` |
+| `model` | Model being served | `Qwen/Qwen3-0.6B` |
+
+> **Note**: When you have multiple DGDs in the same namespace, use `dynamo_namespace` to filter metrics for a specific DGD.
+
+#### Example: Scale Decode Service Based on TTFT
+
+Using HPA with Prometheus Adapter requires configuring external metrics.
+
+**Step 1: Configure Prometheus Adapter**
+
+Add this to your Helm values file (e.g., `prometheus-adapter-values.yaml`):
+
+```yaml
+# prometheus-adapter-values.yaml
+prometheus:
+  url: http://prometheus-kube-prometheus-prometheus.monitoring.svc
+  port: 9090
+
+rules:
+  external:
+  # TTFT p95 from frontend - used to scale decode
+  - seriesQuery: 'dynamo_frontend_time_to_first_token_seconds_bucket{namespace!=""}'
+    resources:
+      overrides:
+        namespace: {resource: "namespace"}
+    name:
+      as: "dynamo_ttft_p95_seconds"
+    metricsQuery: |
+      histogram_quantile(0.95,
+        sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{<<.LabelMatchers>>}[5m]))
+        by (le, namespace, dynamo_namespace)
+      )
+```
+
+**Step 2: Install Prometheus Adapter**
+
+```bash
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+
+helm upgrade --install prometheus-adapter prometheus-community/prometheus-adapter \
+  -n monitoring --create-namespace \
+  -f prometheus-adapter-values.yaml
+```
+
+**Step 3: Verify the metric is available**
+
+```bash
+kubectl get --raw "/apis/external.metrics.k8s.io/v1beta1/namespaces/<your-namespace>/dynamo_ttft_p95_seconds" | jq
+```
+
+**Step 4: Create the HPA**
+
+```yaml
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: sglang-agg-decode-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: nvidia.com/v1alpha1
+    kind: DynamoGraphDeploymentScalingAdapter
+    name: sglang-agg-decode              # ← DGD name + service name (lowercase)
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: External
+    external:
+      metric:
+        name: dynamo_ttft_p95_seconds
+        selector:
+          matchLabels:
+            dynamo_namespace: "default-sglang-agg"  # ← {namespace}-{dynamoNamespace}
+      target:
+        type: Value
+        value: "500m"  # Scale up when TTFT p95 > 500ms
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 60    # Wait 1 min before scaling down
+      policies:
+      - type: Pods
+        value: 1
+        periodSeconds: 30
+    scaleUp:
+      stabilizationWindowSeconds: 0      # Scale up immediately
+      policies:
+      - type: Pods
+        value: 2
+        periodSeconds: 30
+```
+
+**How it works:**
+1. Frontend pods export `dynamo_frontend_time_to_first_token_seconds` histogram
+2. Prometheus Adapter calculates p95 TTFT per `dynamo_namespace`
+3. HPA monitors this metric filtered by `dynamo_namespace: "default-sglang-agg"`
+4. When TTFT p95 > 500ms, HPA scales up the `sglang-agg-decode` adapter
+5. Adapter controller syncs the replica count to the DGD's `decode` service
+6. More decode workers are created, reducing TTFT
+
+#### Example: Scale Based on Queue Depth
+
+Add this rule to your `prometheus-adapter-values.yaml` (alongside the TTFT rule):
+
+```yaml
+# Add to rules.external in prometheus-adapter-values.yaml
+- seriesQuery: 'dynamo_frontend_queued_requests{namespace!=""}'
+  resources:
+    overrides:
+      namespace: {resource: "namespace"}
+  name:
+    as: "dynamo_queued_requests"
+  metricsQuery: |
+    sum(<<.Series>>{<<.LabelMatchers>>}) by (namespace, dynamo_namespace)
+```
+
+Then create the HPA:
+
+```yaml
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: sglang-agg-decode-queue-hpa
+  namespace: default
+spec:
+  scaleTargetRef:
+    apiVersion: nvidia.com/v1alpha1
+    kind: DynamoGraphDeploymentScalingAdapter
+    name: sglang-agg-decode
+  minReplicas: 1
+  maxReplicas: 10
+  metrics:
+  - type: External
+    external:
+      metric:
+        name: dynamo_queued_requests
+        selector:
+          matchLabels:
+            dynamo_namespace: "default-sglang-agg"
+      target:
+        type: Value
+        value: "10"  # Scale up when queue > 10 requests
+```
+
+## Autoscaling with KEDA (Recommended)
+
+KEDA (Kubernetes Event-driven Autoscaling) extends Kubernetes with event-driven autoscaling, supporting 50+ scalers including Prometheus.
+
+**Advantages over HPA + Prometheus Adapter:**
+- No Prometheus Adapter configuration needed
+- PromQL queries are defined in the ScaledObject itself (declarative, per-deployment)
+- Easy to update - just `kubectl apply` the ScaledObject
+- Can scale to zero when idle
+- Supports multiple triggers per object
+
+**When to use KEDA:**
+- You want simpler configuration (no Prometheus Adapter to manage)
+- You need event-driven scaling (e.g., queue depth, Kafka, etc.)
+- You want to scale to zero when idle
+
+### Installing KEDA
+
+```bash
+# Add KEDA Helm repo
+helm repo add kedacore https://kedacore.github.io/charts
+helm repo update
+
+# Install KEDA
+helm install keda kedacore/keda \
+  --namespace keda \
+  --create-namespace
+
+# Verify installation
+kubectl get pods -n keda
+```
+
+> **Note**: If you have Prometheus Adapter installed, either uninstall it first (`helm uninstall prometheus-adapter -n monitoring`) or install KEDA with `--set metricsServer.enabled=false` to avoid API conflicts.
+
+### Example: Scale Decode Based on TTFT
+
+Using the `sglang-agg` DGD from `examples/backends/sglang/deploy/agg.yaml`:
+
+```yaml
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: sglang-agg-decode-scaler
+  namespace: default
+spec:
+  scaleTargetRef:
+    apiVersion: nvidia.com/v1alpha1
+    kind: DynamoGraphDeploymentScalingAdapter
+    name: sglang-agg-decode
+  minReplicaCount: 1
+  maxReplicaCount: 10
+  pollingInterval: 15      # Check metrics every 15 seconds
+  cooldownPeriod: 60       # Wait 60s before scaling down
+  triggers:
+  - type: prometheus
+    metadata:
+      # Update this URL to match your Prometheus service
+      serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090
+      metricName: dynamo_ttft_p95
+      query: |
+        histogram_quantile(0.95,
+          sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m]))
+          by (le)
+        )
+      threshold: "0.5"              # Scale up when TTFT p95 > 500ms (0.5 seconds)
+      activationThreshold: "0.1"    # Start scaling when TTFT > 100ms
+```
+
+Apply it:
+
+```bash
+kubectl apply -f sglang-agg-decode-scaler.yaml
+```
+
+### Verify KEDA Scaling
+
+```bash
+# Check ScaledObject status
+kubectl get scaledobject -n default
+
+# KEDA creates an HPA under the hood - you can see it
+kubectl get hpa -n default
+
+# Example output:
+# NAME                                REFERENCE                                              TARGETS      MINPODS   MAXPODS   REPLICAS
+# keda-hpa-sglang-agg-decode-scaler   DynamoGraphDeploymentScalingAdapter/sglang-agg-decode  45m/500m     1         10        1
+
+# Get detailed status
+kubectl describe scaledobject sglang-agg-decode-scaler -n default
+```
+
+### Example: Scale Based on Queue Depth
+
+```yaml
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: sglang-agg-decode-queue-scaler
+  namespace: default
+spec:
+  scaleTargetRef:
+    apiVersion: nvidia.com/v1alpha1
+    kind: DynamoGraphDeploymentScalingAdapter
+    name: sglang-agg-decode
+  minReplicaCount: 1
+  maxReplicaCount: 10
+  pollingInterval: 15
+  cooldownPeriod: 60
+  triggers:
+  - type: prometheus
+    metadata:
+      serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090
+      metricName: dynamo_queued_requests
+      query: |
+        sum(dynamo_frontend_queued_requests{dynamo_namespace="default-sglang-agg"})
+      threshold: "10"    # Scale up when queue > 10 requests
+```
+
+### How KEDA Works
+
+KEDA creates and manages an HPA under the hood:
+
+```
+┌──────────────────────────────────────────────────────────────────────┐
+│  You create: ScaledObject                                            │
+│    - scaleTargetRef: sglang-agg-decode                               │
+│    - triggers: prometheus query                                      │
+└──────────────────────────────────────────────────────────────────────┘
+                                │
+                                ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│  KEDA Operator automatically creates: HPA                            │
+│    - name: keda-hpa-sglang-agg-decode-scaler                         │
+│    - scaleTargetRef: sglang-agg-decode                               │
+│    - metrics: External (from KEDA metrics server)                    │
+└──────────────────────────────────────────────────────────────────────┘
+                                │
+                                ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│  DynamoGraphDeploymentScalingAdapter: sglang-agg-decode              │
+│    - spec.replicas: updated by HPA                                   │
+└──────────────────────────────────────────────────────────────────────┘
+                                │
+                                ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│  DynamoGraphDeployment: sglang-agg                                   │
+│    - spec.services.decode.replicas: synced from adapter              │
+└──────────────────────────────────────────────────────────────────────┘
+```
+
+## Mixed Autoscaling
+
+For disaggregated deployments (prefill + decode), you can use different autoscaling strategies for different services:
+
+```yaml
+---
+# HPA for Frontend (CPU-based)
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: sglang-agg-frontend-hpa
+  namespace: default
+spec:
+  scaleTargetRef:
+    apiVersion: nvidia.com/v1alpha1
+    kind: DynamoGraphDeploymentScalingAdapter
+    name: sglang-agg-frontend
+  minReplicas: 1
+  maxReplicas: 5
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      target:
+        type: Utilization
+        averageUtilization: 70
+
+---
+# KEDA for Decode (TTFT-based)
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: sglang-agg-decode-scaler
+  namespace: default
+spec:
+  scaleTargetRef:
+    apiVersion: nvidia.com/v1alpha1
+    kind: DynamoGraphDeploymentScalingAdapter
+    name: sglang-agg-decode
+  minReplicaCount: 1
+  maxReplicaCount: 10
+  triggers:
+  - type: prometheus
+    metadata:
+      serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc:9090
+      query: |
+        histogram_quantile(0.95,
+          sum(rate(dynamo_frontend_time_to_first_token_seconds_bucket{dynamo_namespace="default-sglang-agg"}[5m]))
+          by (le)
+        )
+      threshold: "0.5"
+```
+
+## Manual Scaling
+
+### With DGDSA Enabled (Default)
+
+When DGDSA is enabled (the default), scale via the adapter:
+
+```bash
+kubectl scale dgdsa sglang-agg-decode -n default --replicas=3
+```
+
+Verify the scaling:
+
+```bash
+kubectl get dgdsa sglang-agg-decode -n default
+
+# Output:
+# NAME                DGD         SERVICE   REPLICAS   AGE
+# sglang-agg-decode   sglang-agg  decode    3          10m
+```
+
+> **Note**: If an autoscaler (KEDA, HPA, Planner) is managing the adapter, your change will be overwritten on the next evaluation cycle.
+
+### With DGDSA Disabled
+
+If you've disabled the scaling adapter for a service, edit the DGD directly:
+
+```bash
+kubectl patch dgd sglang-agg --type=merge -p '{"spec":{"services":{"decode":{"replicas":3}}}}'
+```
+
+Or edit the YAML:
+
+```yaml
+spec:
+  services:
+    decode:
+      replicas: 3
+      scalingAdapter:
+        disable: true
+```
+
+## Best Practices
+
+### 1. Choose One Autoscaler Per Service
+
+Avoid configuring multiple autoscalers for the same service:
+
+| Configuration | Status |
+|---------------|--------|
+| HPA for frontend, Planner for prefill/decode | ✅ Good |
+| KEDA for all services | ✅ Good |
+| Planner only (default) | ✅ Good |
+| HPA + Planner both targeting decode | ❌ Bad - they will fight |
+
+### 2. Use Appropriate Metrics
+
+| Service Type | Recommended Metrics | Dynamo Metric |
+|--------------|---------------------|---------------|
+| Frontend | CPU utilization, request rate | `dynamo_frontend_requests_total` |
+| Prefill | Queue depth, TTFT | `dynamo_frontend_queued_requests`, `dynamo_frontend_time_to_first_token_seconds` |
+| Decode | KV cache utilization, ITL | `kvstats_gpu_cache_usage_percent`, `dynamo_frontend_inter_token_latency_seconds` |
+
+### 3. Configure Stabilization Windows
+
+Prevent thrashing with appropriate stabilization:
+
+```yaml
+# HPA
+behavior:
+  scaleDown:
+    stabilizationWindowSeconds: 300  # Wait 5 min before scaling down
+  scaleUp:
+    stabilizationWindowSeconds: 0    # Scale up immediately
+
+# KEDA
+spec:
+  cooldownPeriod: 300
+```
+
+### 4. Set Sensible Min/Max Replicas
+
+Always configure minimum and maximum replicas in your HPA/KEDA to prevent:
+- Scaling to zero (unless intentional)
+- Unbounded scaling that exhausts cluster resources
+
+## Troubleshooting
+
+### Adapters Not Created
+
+```bash
+# Check DGD status
+kubectl describe dgd sglang-agg -n default
+
+# Check operator logs
+kubectl logs -n dynamo-system deployment/dynamo-operator
+```
+
+### Scaling Not Working
+
+```bash
+# Check adapter status
+kubectl describe dgdsa sglang-agg-decode -n default
+
+# Check HPA/KEDA status
+kubectl describe hpa sglang-agg-decode-hpa -n default
+kubectl describe scaledobject sglang-agg-decode-scaler -n default
+
+# Verify metrics are available in Kubernetes metrics API
+kubectl get --raw /apis/external.metrics.k8s.io/v1beta1
+```
+
+### Metrics Not Available
+
+If HPA/KEDA shows `<unknown>` for metrics:
+
+```bash
+# Check if Dynamo metrics are being scraped
+kubectl port-forward -n default svc/sglang-agg-frontend 8000:8000
+curl http://localhost:8000/metrics | grep dynamo_frontend
+
+# Example output:
+# dynamo_frontend_queued_requests{model="Qwen/Qwen3-0.6B"} 2
+# dynamo_frontend_inflight_requests{model="Qwen/Qwen3-0.6B"} 5
+
+# Verify Prometheus is scraping the metrics
+kubectl port-forward -n monitoring svc/prometheus-kube-prometheus-prometheus 9090:9090
+# Then query: dynamo_frontend_time_to_first_token_seconds_bucket
+
+# Check KEDA operator logs
+kubectl logs -n keda deployment/keda-operator
+```
+
+### Rapid Scaling Up and Down
+
+If you see unstable scaling:
+
+1. Check if multiple autoscalers are targeting the same adapter
+2. Increase `cooldownPeriod` in KEDA ScaledObject
+3. Increase `stabilizationWindowSeconds` in HPA behavior
+
+## References
+
+- [Kubernetes HPA Documentation](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/)
+- [KEDA Documentation](https://keda.sh/)
+- [Prometheus Adapter](https://github.com/kubernetes-sigs/prometheus-adapter)
+- [Planner Documentation](../planner/sla_planner.md)
+- [Dynamo Metrics Reference](../observability/metrics.md)
+- [Prometheus and Grafana Setup](../observability/prometheus-grafana.md)
+