Unverified Commit a96d9c51 authored by Julien Mancuso's avatar Julien Mancuso Committed by GitHub
Browse files

fix: Replace hardcoded SA suffix matching with config-driven allow-list for...


fix: Replace hardcoded SA suffix matching with config-driven allow-list for DGD replicas webhook (#7656) (#7682)
Signed-off-by: default avatarJulien Mancuso <jmancuso@nvidia.com>
parent 6dc85fbc
...@@ -81,6 +81,14 @@ spec: ...@@ -81,6 +81,14 @@ spec:
{{- end }} {{- end }}
- name: KUBERNETES_CLUSTER_DOMAIN - name: KUBERNETES_CLUSTER_DOMAIN
value: {{ quote .Values.kubernetesClusterDomain }} value: {{ quote .Values.kubernetesClusterDomain }}
- name: POD_SERVICE_ACCOUNT
valueFrom:
fieldRef:
fieldPath: spec.serviceAccountName
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
imagePullPolicy: {{ .Values.controllerManager.manager.image.pullPolicy | quote }} imagePullPolicy: {{ .Values.controllerManager.manager.image.pullPolicy | quote }}
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag | default .Chart.AppVersion }} image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag | default .Chart.AppVersion }}
livenessProbe: livenessProbe:
......
...@@ -686,6 +686,14 @@ func registerWebhooks( ...@@ -686,6 +686,14 @@ func registerWebhooks(
internalwebhook.SetExcludedNamespaces(nil) internalwebhook.SetExcludedNamespaces(nil)
} }
var operatorPrincipal string
if sa, ns := os.Getenv("POD_SERVICE_ACCOUNT"), os.Getenv("POD_NAMESPACE"); sa != "" && ns != "" {
operatorPrincipal = fmt.Sprintf("system:serviceaccount:%s:%s", ns, sa)
setupLog.Info("Detected operator principal from downward API", "principal", operatorPrincipal)
} else {
setupLog.Info("POD_SERVICE_ACCOUNT/POD_NAMESPACE not set; operator SA self-identification disabled")
}
setupLog.Info("Registering validation webhooks") setupLog.Info("Registering validation webhooks")
dcdHandler := webhookvalidation.NewDynamoComponentDeploymentHandler() dcdHandler := webhookvalidation.NewDynamoComponentDeploymentHandler()
...@@ -693,7 +701,7 @@ func registerWebhooks( ...@@ -693,7 +701,7 @@ func registerWebhooks(
return fmt.Errorf("unable to register DynamoComponentDeployment webhook: %w", err) return fmt.Errorf("unable to register DynamoComponentDeployment webhook: %w", err)
} }
dgdHandler := webhookvalidation.NewDynamoGraphDeploymentHandler(mgr) dgdHandler := webhookvalidation.NewDynamoGraphDeploymentHandler(mgr, operatorPrincipal)
if err := dgdHandler.RegisterWithManager(mgr); err != nil { if err := dgdHandler.RegisterWithManager(mgr); err != nil {
return fmt.Errorf("unable to register DynamoGraphDeployment webhook: %w", err) return fmt.Errorf("unable to register DynamoGraphDeployment webhook: %w", err)
} }
......
...@@ -21,6 +21,7 @@ import ( ...@@ -21,6 +21,7 @@ import (
"context" "context"
"strings" "strings"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
authenticationv1 "k8s.io/api/authentication/v1" authenticationv1 "k8s.io/api/authentication/v1"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client"
...@@ -121,52 +122,39 @@ func (v *LeaseAwareValidator) shouldSkipValidation(obj runtime.Object) bool { ...@@ -121,52 +122,39 @@ func (v *LeaseAwareValidator) shouldSkipValidation(obj runtime.Object) bool {
return false return false
} }
// DGDReplicasModifierSuffixes defines suffixes for service accounts that are authorized
// to modify DGD replicas when scaling adapter is enabled.
// Service accounts matching any of these suffixes are allowed regardless of namespace.
var DGDReplicasModifierSuffixes = []string{
// Dynamo operator controller manager (handles DGDSA reconciliation)
// Example: "dynamo-platform-dynamo-operator-controller-manager"
"-dynamo-operator-controller-manager",
// Planner service account (manages DGD replicas for autoscaling)
// Example: "planner-serviceaccount"
"planner-serviceaccount",
}
// CanModifyDGDReplicas checks if the request comes from a service account authorized // CanModifyDGDReplicas checks if the request comes from a service account authorized
// to modify DGD replicas when scaling adapter is enabled. // to modify DGD replicas when scaling adapter is enabled.
// Service accounts are identified by username format: system:serviceaccount:<namespace>:<name>
// //
// Authorized service accounts (by suffix): // operatorPrincipal is the full Kubernetes username
// - *-dynamo-operator-controller-manager (for DGDSA reconciliation) // (system:serviceaccount:<namespace>:<name>) of the operator's own service account,
// - *planner-serviceaccount (for Planner autoscaling) // auto-detected at startup via the Kubernetes Downward API. It may be empty if
func CanModifyDGDReplicas(userInfo authenticationv1.UserInfo) bool { // the Downward API env vars were not set.
//
// Authorization is checked in two ways:
// 1. Exact match against operatorPrincipal.
// 2. Name-only match for the planner SA, which the operator creates in every DGD
// namespace with a well-known constant name. Because the namespace is only known
// at runtime, it cannot be enumerated statically.
func CanModifyDGDReplicas(operatorPrincipal string, userInfo authenticationv1.UserInfo) bool {
username := userInfo.Username username := userInfo.Username
// Service accounts have username format: system:serviceaccount:<namespace>:<name>
if !strings.HasPrefix(username, "system:serviceaccount:") { if !strings.HasPrefix(username, "system:serviceaccount:") {
return false return false
} }
// Parse: system:serviceaccount:<namespace>:<name> if operatorPrincipal != "" && username == operatorPrincipal {
parts := strings.Split(username, ":") webhookCommonLog.V(1).Info("allowing DGD replicas modification",
if len(parts) != 4 { "username", username,
return false "matchType", "operatorPrincipal")
return true
} }
namespace := parts[2] parts := strings.Split(username, ":")
saName := parts[3] if len(parts) == 4 && parts[3] == consts.PlannerServiceAccountName {
webhookCommonLog.V(1).Info("allowing DGD replicas modification",
// Check against authorized suffixes "username", username,
for _, suffix := range DGDReplicasModifierSuffixes { "matchType", "plannerSA")
if strings.HasSuffix(saName, suffix) { return true
webhookCommonLog.V(1).Info("allowing DGD replicas modification",
"serviceAccount", saName,
"namespace", namespace,
"matchedSuffix", suffix)
return true
}
} }
return false return false
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package webhook
import (
"testing"
authenticationv1 "k8s.io/api/authentication/v1"
)
func TestCanModifyDGDReplicas(t *testing.T) {
tests := []struct {
name string
principal string
username string
expectAllowed bool
}{
{
name: "operator SA with standard Helm release (dynamo-platform)",
principal: "system:serviceaccount:dynamo-system:dynamo-platform-dynamo-operator-controller-manager",
username: "system:serviceaccount:dynamo-system:dynamo-platform-dynamo-operator-controller-manager",
expectAllowed: true,
},
{
name: "operator SA with collapsed Helm release (dynamo-operator) — the bug scenario",
principal: "system:serviceaccount:dynamo-system:dynamo-operator-controller-manager",
username: "system:serviceaccount:dynamo-system:dynamo-operator-controller-manager",
expectAllowed: true,
},
{
name: "operator SA auto-detected from downward API",
principal: "system:serviceaccount:custom-ns:my-release-controller-manager",
username: "system:serviceaccount:custom-ns:my-release-controller-manager",
expectAllowed: true,
},
{
name: "operator SA wrong namespace is rejected",
principal: "system:serviceaccount:dynamo-system:dynamo-operator-controller-manager",
username: "system:serviceaccount:other-ns:dynamo-operator-controller-manager",
expectAllowed: false,
},
{
name: "planner SA allowed in any namespace (well-known name)",
principal: "system:serviceaccount:dynamo-system:dynamo-operator-controller-manager",
username: "system:serviceaccount:user-ns:planner-serviceaccount",
expectAllowed: true,
},
{
name: "planner SA allowed with no operator principal set",
principal: "",
username: "system:serviceaccount:other-ns:planner-serviceaccount",
expectAllowed: true,
},
{
name: "unauthorized SA rejected",
principal: "system:serviceaccount:dynamo-system:dynamo-operator-controller-manager",
username: "system:serviceaccount:user-ns:some-random-sa",
expectAllowed: false,
},
{
name: "non-SA user rejected",
principal: "system:serviceaccount:dynamo-system:dynamo-operator-controller-manager",
username: "admin@example.com",
expectAllowed: false,
},
{
name: "malformed SA username rejected",
principal: "system:serviceaccount:dynamo-system:dynamo-operator-controller-manager",
username: "system:serviceaccount:only-three-parts",
expectAllowed: false,
},
{
name: "empty operator principal still permits planner",
principal: "",
username: "system:serviceaccount:ns:planner-serviceaccount",
expectAllowed: true,
},
{
name: "empty operator principal rejects other SA",
principal: "",
username: "system:serviceaccount:ns:dynamo-operator-controller-manager",
expectAllowed: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
userInfo := authenticationv1.UserInfo{Username: tt.username}
got := CanModifyDGDReplicas(tt.principal, userInfo)
if got != tt.expectAllowed {
t.Errorf("CanModifyDGDReplicas() = %v, want %v", got, tt.expectAllowed)
}
})
}
}
...@@ -115,8 +115,9 @@ func (v *DynamoGraphDeploymentValidator) Validate(ctx context.Context) (admissio ...@@ -115,8 +115,9 @@ func (v *DynamoGraphDeploymentValidator) Validate(ctx context.Context) (admissio
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeployment. // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeployment.
// userInfo is used for identity-based validation (replica protection). // userInfo is used for identity-based validation (replica protection).
// If userInfo is nil, replica changes for DGDSA-enabled services are rejected (fail closed). // If userInfo is nil, replica changes for DGDSA-enabled services are rejected (fail closed).
// operatorPrincipal is the full Kubernetes SA username of the operator for authorization.
// Returns warnings and error. // Returns warnings and error.
func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) (admission.Warnings, error) { func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo, operatorPrincipal string) (admission.Warnings, error) {
var warnings admission.Warnings var warnings admission.Warnings
// Validate immutable fields // Validate immutable fields
...@@ -131,7 +132,7 @@ func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.D ...@@ -131,7 +132,7 @@ func (v *DynamoGraphDeploymentValidator) ValidateUpdate(old *nvidiacomv1alpha1.D
// Validate replicas changes for services with scaling adapter enabled // Validate replicas changes for services with scaling adapter enabled
// Pass userInfo (may be nil - will fail closed for DGDSA-enabled services) // Pass userInfo (may be nil - will fail closed for DGDSA-enabled services)
if err := v.validateReplicasChanges(old, userInfo); err != nil { if err := v.validateReplicasChanges(old, userInfo, operatorPrincipal); err != nil {
return warnings, err return warnings, err
} }
...@@ -224,9 +225,9 @@ func (v *DynamoGraphDeploymentValidator) validateServiceTopology(old *nvidiacomv ...@@ -224,9 +225,9 @@ func (v *DynamoGraphDeploymentValidator) validateServiceTopology(old *nvidiacomv
// validateReplicasChanges checks if replicas were changed for services with scaling adapter enabled. // validateReplicasChanges checks if replicas were changed for services with scaling adapter enabled.
// Only authorized service accounts (operator controller, planner) can modify these fields. // Only authorized service accounts (operator controller, planner) can modify these fields.
// If userInfo is nil, all replica changes for DGDSA-enabled services are rejected (fail closed). // If userInfo is nil, all replica changes for DGDSA-enabled services are rejected (fail closed).
func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo) error { func (v *DynamoGraphDeploymentValidator) validateReplicasChanges(old *nvidiacomv1alpha1.DynamoGraphDeployment, userInfo *authenticationv1.UserInfo, operatorPrincipal string) error {
// If the request comes from an authorized service account, allow the change // If the request comes from an authorized service account, allow the change
if userInfo != nil && internalwebhook.CanModifyDGDReplicas(*userInfo) { if userInfo != nil && internalwebhook.CanModifyDGDReplicas(operatorPrincipal, *userInfo) {
return nil return nil
} }
......
...@@ -41,13 +41,17 @@ const ( ...@@ -41,13 +41,17 @@ const (
// DynamoGraphDeploymentHandler is a handler for validating DynamoGraphDeployment resources. // DynamoGraphDeploymentHandler is a handler for validating DynamoGraphDeployment resources.
// It is a thin wrapper around DynamoGraphDeploymentValidator. // It is a thin wrapper around DynamoGraphDeploymentValidator.
type DynamoGraphDeploymentHandler struct { type DynamoGraphDeploymentHandler struct {
mgr manager.Manager mgr manager.Manager
operatorPrincipal string
} }
// NewDynamoGraphDeploymentHandler creates a new handler for DynamoGraphDeployment Webhook. // NewDynamoGraphDeploymentHandler creates a new handler for DynamoGraphDeployment Webhook.
func NewDynamoGraphDeploymentHandler(mgr manager.Manager) *DynamoGraphDeploymentHandler { // operatorPrincipal is the full Kubernetes SA username of the operator, used to authorize
// replica changes on scaling-adapter-enabled services (#7656).
func NewDynamoGraphDeploymentHandler(mgr manager.Manager, operatorPrincipal string) *DynamoGraphDeploymentHandler {
return &DynamoGraphDeploymentHandler{ return &DynamoGraphDeploymentHandler{
mgr: mgr, mgr: mgr,
operatorPrincipal: operatorPrincipal,
} }
} }
...@@ -107,7 +111,7 @@ func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldOb ...@@ -107,7 +111,7 @@ func (h *DynamoGraphDeploymentHandler) ValidateUpdate(ctx context.Context, oldOb
} }
// Validate stateful rules (immutability + replicas protection) // Validate stateful rules (immutability + replicas protection)
updateWarnings, err := validator.ValidateUpdate(oldDeployment, userInfo) updateWarnings, err := validator.ValidateUpdate(oldDeployment, userInfo, h.operatorPrincipal)
if err != nil { if err != nil {
username := "<unknown>" username := "<unknown>"
if userInfo != nil { if userInfo != nil {
......
...@@ -1933,8 +1933,8 @@ func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) { ...@@ -1933,8 +1933,8 @@ func TestDynamoGraphDeploymentValidator_ValidateUpdate(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentValidator(tt.newDeployment) validator := NewDynamoGraphDeploymentValidator(tt.newDeployment)
// Pass nil userInfo - these tests don't modify replicas, so it's safe // Pass nil userInfo and empty operatorPrincipal - these tests don't modify replicas, so it's safe
warnings, err := validator.ValidateUpdate(tt.oldDeployment, nil) warnings, err := validator.ValidateUpdate(tt.oldDeployment, nil, "")
if (err != nil) != tt.wantErr { if (err != nil) != tt.wantErr {
t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr) t.Errorf("DynamoGraphDeploymentValidator.ValidateUpdate() error = %v, wantErr %v", err, tt.wantErr)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment