Unverified Commit 06fc5d5e authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: Make the image field optional in DGDRs (#6557)


Signed-off-by: Jont828 <jt572@cornell.edu>
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Jont828 <jt572@cornell.edu>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>
parent 03360b84
...@@ -247,13 +247,20 @@ spec: ...@@ -247,13 +247,20 @@ spec:
]" ]"
echo "📝 Patching MutatingWebhookConfiguration..." echo "📝 Patching MutatingWebhookConfiguration..."
# Patch mutating webhook (DynamoGraphDeployment defaulting) # Patch all mutating webhooks:
# 0: mdynamographdeployment.kb.io (DGD defaulting)
# 1: mdynamographdeploymentrequestv1beta1.kb.io (DGDR defaulting)
kubectl patch mutatingwebhookconfiguration ${MUTATING_WEBHOOK_NAME} \ kubectl patch mutatingwebhookconfiguration ${MUTATING_WEBHOOK_NAME} \
--type='json' -p="[ --type='json' -p="[
{ {
\"op\": \"add\", \"op\": \"add\",
\"path\": \"/webhooks/0/clientConfig/caBundle\", \"path\": \"/webhooks/0/clientConfig/caBundle\",
\"value\": \"${CA_BUNDLE}\" \"value\": \"${CA_BUNDLE}\"
},
{
\"op\": \"add\",
\"path\": \"/webhooks/1/clientConfig/caBundle\",
\"value\": \"${CA_BUNDLE}\"
} }
]" ]"
......
...@@ -221,4 +221,37 @@ webhooks: ...@@ -221,4 +221,37 @@ webhooks:
- dynamographdeployments - dynamographdeployments
sideEffects: None sideEffects: None
timeoutSeconds: {{ .Values.webhook.timeoutSeconds }} timeoutSeconds: {{ .Values.webhook.timeoutSeconds }}
- admissionReviewVersions:
- v1
clientConfig:
{{- if and (not .Values.webhook.certManager.enabled) .Values.webhook.certificateSecret.external }}
{{- if .Values.webhook.caBundle }}
caBundle: {{ .Values.webhook.caBundle }}
{{- end }}
{{- end }}
service:
name: {{ include "dynamo-operator.fullname" . }}-webhook-service
namespace: {{ .Release.Namespace }}
path: /mutate-nvidia-com-v1beta1-dynamographdeploymentrequest
failurePolicy: {{ .Values.webhook.failurePolicy }}
name: mdynamographdeploymentrequestv1beta1.kb.io
{{- if .Values.webhook.namespaceSelector }}
namespaceSelector:
{{- toYaml .Values.webhook.namespaceSelector | nindent 4 }}
{{- else if .Values.namespaceRestriction.enabled }}
namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: {{ .Release.Namespace }}
{{- end }}
rules:
- apiGroups:
- nvidia.com
apiVersions:
- v1beta1
operations:
- CREATE
resources:
- dynamographdeploymentrequests
sideEffects: None
timeoutSeconds: {{ .Values.webhook.timeoutSeconds }}
...@@ -538,6 +538,13 @@ func (d *DynamoGraphDeploymentRequest) GetPhase() DGDRPhase { ...@@ -538,6 +538,13 @@ func (d *DynamoGraphDeploymentRequest) GetPhase() DGDRPhase {
return d.Status.Phase return d.Status.Phase
} }
// GetState reports the DGDR's current status phase in string form. It satisfies
// the observability.StateProvider interface, which lets the resource counter
// tally v1beta1 DGDRs without registering a v1alpha1 cache informer.
func (d *DynamoGraphDeploymentRequest) GetState() string {
	phase := d.Status.Phase
	return string(phase)
}
// SetProfilingPhase updates the profiling sub-phase. // SetProfilingPhase updates the profiling sub-phase.
func (d *DynamoGraphDeploymentRequest) SetProfilingPhase(phase ProfilingPhase) { func (d *DynamoGraphDeploymentRequest) SetProfilingPhase(phase ProfilingPhase) {
d.Status.ProfilingPhase = phase d.Status.ProfilingPhase = phase
......
...@@ -552,6 +552,7 @@ func main() { ...@@ -552,6 +552,7 @@ func main() {
if err = (&controller.DynamoGraphDeploymentRequestReconciler{ if err = (&controller.DynamoGraphDeploymentRequestReconciler{
Client: mgr.GetClient(), Client: mgr.GetClient(),
APIReader: mgr.GetAPIReader(),
Recorder: mgr.GetEventRecorderFor("dynamographdeploymentrequest"), Recorder: mgr.GetEventRecorderFor("dynamographdeploymentrequest"),
Config: operatorCfg, Config: operatorCfg,
RuntimeConfig: runtimeConfig, RuntimeConfig: runtimeConfig,
...@@ -622,8 +623,9 @@ func main() { ...@@ -622,8 +623,9 @@ func main() {
os.Exit(1) os.Exit(1)
} }
// Register the DGDR conversion webhook using the hub version (v1beta1).
if err = ctrl.NewWebhookManagedBy(mgr). if err = ctrl.NewWebhookManagedBy(mgr).
For(&nvidiacomv1alpha1.DynamoGraphDeploymentRequest{}). For(&nvidiacomv1beta1.DynamoGraphDeploymentRequest{}).
Complete(); err != nil { Complete(); err != nil {
setupLog.Error(err, "unable to register conversion webhook", "webhook", "DynamoGraphDeploymentRequest-conversion") setupLog.Error(err, "unable to register conversion webhook", "webhook", "DynamoGraphDeploymentRequest-conversion")
os.Exit(1) os.Exit(1)
...@@ -640,6 +642,12 @@ func main() { ...@@ -640,6 +642,12 @@ func main() {
os.Exit(1) os.Exit(1)
} }
dgdrDefaulter := webhookdefaulting.NewDGDRDefaulter(operatorVersion)
if err = dgdrDefaulter.RegisterWithManager(mgr); err != nil {
setupLog.Error(err, "unable to register webhook", "webhook", "DynamoGraphDeploymentRequest-defaulting")
os.Exit(1)
}
setupLog.Info("Defaulting webhooks registered successfully") setupLog.Info("Defaulting webhooks registered successfully")
//+kubebuilder:scaffold:builder //+kubebuilder:scaffold:builder
......
...@@ -92,8 +92,7 @@ const ( ...@@ -92,8 +92,7 @@ const (
// Volume paths // Volume paths
ProfilingOutputPath = "/data" ProfilingOutputPath = "/data"
ProfilingOutputFile = "config_with_planner.yaml" ProfilingOutputFile = "final_config.yaml"
ProfilingOutputFileMocker = "mocker_config_with_planner.yaml"
ProfilingConfigMountPath = "/config" ProfilingConfigMountPath = "/config"
ProfilingConfigDefaultKey = "disagg.yaml" ProfilingConfigDefaultKey = "disagg.yaml"
DefaultModelCacheMountPath = "/opt/model-cache" DefaultModelCacheMountPath = "/opt/model-cache"
...@@ -111,7 +110,7 @@ const ( ...@@ -111,7 +110,7 @@ const (
MessageAICProfilingJobCreated = "AIC profiling job created" MessageAICProfilingJobCreated = "AIC profiling job created"
MessageProfilingInProgress = "Profiling is in progress" MessageProfilingInProgress = "Profiling is in progress"
MessageSpecGenerated = "DynamoGraphDeployment spec generated successfully" MessageSpecGenerated = "DynamoGraphDeployment spec generated successfully"
MessageSpecAvailable = "Generated spec is available in status.generatedDeployment" MessageSpecAvailable = "Generated spec is available in annotation nvidia.com/generated-dgd-spec"
MessageDeploymentCreated = "DynamoGraphDeployment %s created successfully" MessageDeploymentCreated = "DynamoGraphDeployment %s created successfully"
MessageDeploymentReady = "DynamoGraphDeployment %s is ready" MessageDeploymentReady = "DynamoGraphDeployment %s is ready"
MessageDeploymentDegraded = "DynamoGraphDeployment %s degraded from Ready to %s" MessageDeploymentDegraded = "DynamoGraphDeployment %s degraded from Ready to %s"
...@@ -222,13 +221,6 @@ data: ...@@ -222,13 +221,6 @@ data:
EOF EOF
sed 's/^/ /' {{.OutputPath}}/{{.OutputFile}} >> /tmp/cm.yaml sed 's/^/ /' {{.OutputPath}}/{{.OutputFile}} >> /tmp/cm.yaml
# Add mocker config (profiler always generates both real and mocker configs)
if [ -f {{.OutputPath}}/{{.MockerOutputFile}} ]; then
echo " {{.MockerOutputFile}}: |" >> /tmp/cm.yaml
sed 's/^/ /' {{.OutputPath}}/{{.MockerOutputFile}} >> /tmp/cm.yaml
echo "Added mocker config to ConfigMap"
fi
# Add profiler status file for debugging # Add profiler status file for debugging
if [ -f {{.OutputPath}}/profiler_status.yaml ]; then if [ -f {{.OutputPath}}/profiler_status.yaml ]; then
echo " profiler_status.yaml: |" >> /tmp/cm.yaml echo " profiler_status.yaml: |" >> /tmp/cm.yaml
...@@ -245,6 +237,7 @@ echo "Saved profiling output to ConfigMap {{.ConfigMapName}}" ...@@ -245,6 +237,7 @@ echo "Saved profiling output to ConfigMap {{.ConfigMapName}}"
// DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object // DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object
type DynamoGraphDeploymentRequestReconciler struct { type DynamoGraphDeploymentRequestReconciler struct {
client.Client client.Client
APIReader client.Reader
Recorder record.EventRecorder Recorder record.EventRecorder
Config *configv1alpha1.OperatorConfiguration Config *configv1alpha1.OperatorConfiguration
RuntimeConfig *commonController.RuntimeConfig RuntimeConfig *commonController.RuntimeConfig
...@@ -794,14 +787,7 @@ func isOnlineProfiling(_ *nvidiacomv1beta1.DynamoGraphDeploymentRequest) bool { ...@@ -794,14 +787,7 @@ func isOnlineProfiling(_ *nvidiacomv1beta1.DynamoGraphDeploymentRequest) bool {
func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error { func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Context, dgdr *nvidiacomv1beta1.DynamoGraphDeploymentRequest) error {
var errs []error var errs []error
// Validate image is specified (required for the profiling job container).
// Mirrors the webhook admission check so controller-side writes cannot bypass it.
if dgdr.Spec.Image == "" {
errs = append(errs, fmt.Errorf("spec.image is required"))
}
// Disallow searchStrategy: thorough with backend: auto. // Disallow searchStrategy: thorough with backend: auto.
// Mirrors the webhook admission check so controller-side writes cannot bypass it.
if dgdr.Spec.SearchStrategy == nvidiacomv1beta1.SearchStrategyThorough && if dgdr.Spec.SearchStrategy == nvidiacomv1beta1.SearchStrategyThorough &&
dgdr.Spec.Backend == nvidiacomv1beta1.BackendTypeAuto { dgdr.Spec.Backend == nvidiacomv1beta1.BackendTypeAuto {
errs = append(errs, fmt.Errorf( errs = append(errs, fmt.Errorf(
...@@ -850,7 +836,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con ...@@ -850,7 +836,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
return nil return nil
} }
_, err := gpu.DiscoverGPUs(ctx, r.Client) _, err := gpu.DiscoverGPUs(ctx, r.APIReader)
if err == nil { if err == nil {
// GPU discovery is available, validation passes // GPU discovery is available, validation passes
return nil return nil
...@@ -996,10 +982,15 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -996,10 +982,15 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
} }
// Profiler args: pass the DGDR spec as JSON via --config // Profiler args: pass the DGDR spec as JSON via --config
profilerArgs := []string{"--config", specJSON} // --output-dir must match ProfilingOutputPath so the sidecar can find profiler_status.yaml
profilerArgs := []string{"--config", specJSON, "--output-dir", ProfilingOutputPath}
// Use image from spec // Use image from spec; the defaulting webhook fills this in for production builds.
// Guard against empty image in case the webhook didn't run (e.g. local dev builds).
imageName := dgdr.Spec.Image imageName := dgdr.Spec.Image
if imageName == "" {
return nil, false, fmt.Errorf("spec.image is required but not set; ensure the defaulting webhook ran or set spec.image explicitly")
}
logger.Info("Using profiler image", "image", imageName) logger.Info("Using profiler image", "image", imageName)
profilerContainer := corev1.Container{ profilerContainer := corev1.Container{
...@@ -1009,6 +1000,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1009,6 +1000,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
Args: profilerArgs, Args: profilerArgs,
Env: profilerEnv, Env: profilerEnv,
VolumeMounts: volumeMounts, VolumeMounts: volumeMounts,
WorkingDir: "/workspace",
} }
// Generate sidecar script from template // Generate sidecar script from template
...@@ -1019,12 +1011,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context. ...@@ -1019,12 +1011,11 @@ func (r *DynamoGraphDeploymentRequestReconciler) createProfilingJob(ctx context.
var scriptBuf bytes.Buffer var scriptBuf bytes.Buffer
err = tmpl.Execute(&scriptBuf, map[string]string{ err = tmpl.Execute(&scriptBuf, map[string]string{
"OutputPath": ProfilingOutputPath, "OutputPath": ProfilingOutputPath,
"OutputFile": ProfilingOutputFile, "OutputFile": ProfilingOutputFile,
"MockerOutputFile": ProfilingOutputFileMocker, "ConfigMapName": outputConfigMapName,
"ConfigMapName": outputConfigMapName, "Namespace": dgdr.Namespace,
"Namespace": dgdr.Namespace, "DGDRName": dgdr.Name,
"DGDRName": dgdr.Name,
}) })
if err != nil { if err != nil {
return nil, false, fmt.Errorf("failed to execute sidecar script template: %w", err) return nil, false, fmt.Errorf("failed to execute sidecar script template: %w", err)
...@@ -1222,7 +1213,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx ...@@ -1222,7 +1213,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
return nil // all fields already set by user return nil // all fields already set by user
} }
gpuInfo, err := gpu.DiscoverGPUs(ctx, r.Client) gpuInfo, err := gpu.DiscoverGPUs(ctx, r.APIReader)
if err != nil { if err != nil {
return err return err
} }
...@@ -1392,14 +1383,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con ...@@ -1392,14 +1383,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) generateDGDSpec(ctx context.Con
} }
// Select the right config file based on mocker feature flag // Select the right config file based on mocker feature flag
// Profiler always generates both real and mocker configs // Profiler writes the selected config (real or mocker) to a single output file
var outputFile string outputFile := ProfilingOutputFile
if dgdr.Spec.Features != nil && dgdr.Spec.Features.Mocker != nil && dgdr.Spec.Features.Mocker.Enabled {
outputFile = ProfilingOutputFileMocker
logger.Info("Using mocker deployment config")
} else {
outputFile = ProfilingOutputFile
}
// Get YAML content from ConfigMap // Get YAML content from ConfigMap
yamlContent, exists := cm.Data[outputFile] yamlContent, exists := cm.Data[outputFile]
......
...@@ -94,9 +94,10 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -94,9 +94,10 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
Image: "test-profiler:latest", Image: "test-profiler:latest",
AutoApply: true,
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
...@@ -1060,8 +1061,9 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1060,8 +1061,9 @@ var _ = Describe("DGDR Error Handling", func() {
BeforeEach(func() { BeforeEach(func() {
recorder = record.NewFakeRecorder(100) recorder = record.NewFakeRecorder(100)
reconciler = &DynamoGraphDeploymentRequestReconciler{ reconciler = &DynamoGraphDeploymentRequestReconciler{
Client: k8sClient, Client: k8sClient,
Recorder: recorder, APIReader: k8sClient,
Recorder: recorder,
Config: &configv1alpha1.OperatorConfiguration{ Config: &configv1alpha1.OperatorConfiguration{
Namespace: configv1alpha1.NamespaceConfiguration{ Namespace: configv1alpha1.NamespaceConfiguration{
Restricted: "", Restricted: "",
...@@ -1859,9 +1861,10 @@ spec: ...@@ -1859,9 +1861,10 @@ spec:
Namespace: namespace, Namespace: namespace,
}, },
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "test-model", Model: "test-model",
Backend: "vllm", Backend: "vllm",
Image: "test-profiler:latest", Image: "test-profiler:latest",
AutoApply: true,
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
...@@ -1922,7 +1925,7 @@ spec: ...@@ -1922,7 +1925,7 @@ spec:
Namespace: namespace, Namespace: namespace,
}, },
Data: map[string]string{ Data: map[string]string{
ProfilingOutputFileMocker: dgdYAML, ProfilingOutputFile: dgdYAML,
}, },
} }
Expect(k8sClient.Create(ctx, cm)).Should(Succeed()) Expect(k8sClient.Create(ctx, cm)).Should(Succeed())
......
...@@ -50,7 +50,7 @@ type GPUInfo struct { ...@@ -50,7 +50,7 @@ type GPUInfo struct {
// //
// This function requires cluster-wide node read permissions and expects nodes // This function requires cluster-wide node read permissions and expects nodes
// to have GFD labels. If no nodes with GPU labels are found, it returns an error. // to have GFD labels. If no nodes with GPU labels are found, it returns an error.
func DiscoverGPUs(ctx context.Context, k8sClient client.Client) (*GPUInfo, error) { func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("Starting GPU discovery from cluster nodes") logger.Info("Starting GPU discovery from cluster nodes")
......
...@@ -26,6 +26,7 @@ import ( ...@@ -26,6 +26,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1" "github.com/ai-dynamo/dynamo/deploy/operator/api/v1alpha1"
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"github.com/ai-dynamo/dynamo/deploy/operator/internal/consts" "github.com/ai-dynamo/dynamo/deploy/operator/internal/consts"
) )
...@@ -159,7 +160,7 @@ func updateDynamoModelCounts(ctx context.Context, c client.Client, excludedNames ...@@ -159,7 +160,7 @@ func updateDynamoModelCounts(ctx context.Context, c client.Client, excludedNames
} }
func updateDynamoGraphDeploymentRequestCounts(ctx context.Context, c client.Client, excludedNamespaces ExcludedNamespaces, logger logr.Logger) { func updateDynamoGraphDeploymentRequestCounts(ctx context.Context, c client.Client, excludedNamespaces ExcludedNamespaces, logger logr.Logger) {
dgdrList := &v1alpha1.DynamoGraphDeploymentRequestList{} dgdrList := &v1beta1.DynamoGraphDeploymentRequestList{}
if err := c.List(ctx, dgdrList); err != nil { if err := c.List(ctx, dgdrList); err != nil {
logger.Error(err, "failed to list DynamoGraphDeploymentRequests") logger.Error(err, "failed to list DynamoGraphDeploymentRequests")
return return
...@@ -168,7 +169,7 @@ func updateDynamoGraphDeploymentRequestCounts(ctx context.Context, c client.Clie ...@@ -168,7 +169,7 @@ func updateDynamoGraphDeploymentRequestCounts(ctx context.Context, c client.Clie
dgdrList.Items, dgdrList.Items,
excludedNamespaces, excludedNamespaces,
consts.ResourceTypeDynamoGraphDeploymentRequest, consts.ResourceTypeDynamoGraphDeploymentRequest,
func(d *v1alpha1.DynamoGraphDeploymentRequest) *v1alpha1.DynamoGraphDeploymentRequest { return d }, func(d *v1beta1.DynamoGraphDeploymentRequest) *v1beta1.DynamoGraphDeploymentRequest { return d },
) )
} }
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package defaulting
import (
"context"
"fmt"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
admissionv1 "k8s.io/api/admission/v1"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
const (
	// dgdrDefaultingWebhookName is the name given to the logger used by the
	// DGDR defaulting handler.
	dgdrDefaultingWebhookName = "dynamographdeploymentrequest-defaulting-webhook"

	// dgdrDefaultingWebhookPath is the path under which the DGDR defaulting
	// handler is registered on the manager's webhook server.
	dgdrDefaultingWebhookPath = "/mutate-nvidia-com-v1beta1-dynamographdeploymentrequest"

	// defaultImage is the untagged profiler image reference applied when
	// spec.image is left empty; the operator version is appended as the tag.
	// Default image derivation is only supported for public release versions (1.0.0+).
	defaultImage = "nvcr.io/nvidia/ai-dynamo/dynamo-frontend"
)
// DGDRDefaulter is the mutating admission handler that applies defaults to
// DynamoGraphDeploymentRequest resources when they are created.
//
// When spec.image is empty it is set to:
//
//	nvcr.io/nvidia/ai-dynamo/dynamo-frontend:<operatorVersion>
//
// A default can only be derived when the operator version is known, and
// defaulting is only supported for operator versions 1.0.0 and later.
type DGDRDefaulter struct {
	// OperatorVersion is appended as the tag of the default image. An empty
	// value or "unknown" disables image defaulting.
	OperatorVersion string
}
// NewDGDRDefaulter returns a DGDRDefaulter that tags default images with
// operatorVersion.
func NewDGDRDefaulter(operatorVersion string) *DGDRDefaulter {
	defaulter := new(DGDRDefaulter)
	defaulter.OperatorVersion = operatorVersion
	return defaulter
}
// Default implements admission.CustomDefaulter.
//
// It fills in spec.image with the image derived from the operator version, but
// only when the admission operation is CREATE and the image is still empty.
// Any other operation, or an image that is already set, leaves the object
// untouched.
//
// An error is returned only when obj is not a DynamoGraphDeploymentRequest.
// If the admission request cannot be read from ctx, the failure is logged and
// defaulting is skipped so that admission is not blocked.
func (d *DGDRDefaulter) Default(ctx context.Context, obj runtime.Object) error {
	dgdr, isDGDR := obj.(*nvidiacomv1beta1.DynamoGraphDeploymentRequest)
	if !isDGDR {
		return fmt.Errorf("expected DynamoGraphDeploymentRequest but got %T", obj)
	}

	logger := log.FromContext(ctx).WithName(dgdrDefaultingWebhookName)

	req, err := admission.RequestFromContext(ctx)
	if err != nil {
		logger.Error(err, "failed to get admission request from context, skipping defaulting")
		return nil
	}

	// Nothing to do unless this is a CREATE with no image supplied.
	if req.Operation != admissionv1.Create || dgdr.Spec.Image != "" {
		return nil
	}

	// An empty result means no default can be derived; leave the image unset.
	img := d.defaultImageFor()
	if img == "" {
		return nil
	}

	dgdr.Spec.Image = img
	logger.Info("defaulted spec.image from operator version",
		"name", dgdr.Name,
		"namespace", dgdr.Namespace,
		"image", img,
	)
	return nil
}
// defaultImageFor builds the default image reference "<defaultImage>:<OperatorVersion>".
// It returns the empty string when the operator version is empty or "unknown"
// (e.g. local dev builds); in that case the user must set spec.image explicitly.
func (d *DGDRDefaulter) defaultImageFor() string {
	switch d.OperatorVersion {
	case "", "unknown":
		return ""
	default:
		return defaultImage + ":" + d.OperatorVersion
	}
}
// RegisterWithManager installs the DGDR defaulting handler on the manager's
// webhook server at dgdrDefaultingWebhookPath, with panic recovery enabled.
// It always returns nil.
func (d *DGDRDefaulter) RegisterWithManager(mgr manager.Manager) error {
	target := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}
	handler := admission.WithCustomDefaulter(mgr.GetScheme(), target, d).WithRecoverPanic(true)
	mgr.GetWebhookServer().Register(dgdrDefaultingWebhookPath, handler)
	return nil
}
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package defaulting
import (
"context"
"testing"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
admissionv1 "k8s.io/api/admission/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)
// TestDGDRDefaulter_defaultImageFor checks the image reference derived from
// each operator version, including the versions that yield no default.
func TestDGDRDefaulter_defaultImageFor(t *testing.T) {
	type testCase struct {
		name    string
		version string
		want    string
	}

	cases := []testCase{
		{
			name:    "known version produces default image",
			version: "1.0.0",
			want:    "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0",
		},
		{
			name:    "pre-release version is valid",
			version: "1.0.0-rc1",
			want:    "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0-rc1",
		},
		{
			name:    "unknown operator version cannot be defaulted",
			version: "unknown",
			want:    "",
		},
		{
			name:    "empty operator version cannot be defaulted",
			version: "",
			want:    "",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := NewDGDRDefaulter(tc.version).defaultImageFor(); got != tc.want {
				t.Errorf("defaultImageFor() = %q, want %q", got, tc.want)
			}
		})
	}
}
// makeAdmissionCtx returns a background context carrying an admission request
// whose only populated field is the given operation.
func makeAdmissionCtx(op admissionv1.Operation) context.Context {
	var req admission.Request
	req.Operation = op
	return admission.NewContextWithRequest(context.Background(), req)
}
// TestDGDRDefaulter_Default drives Default through CREATE and UPDATE admission
// requests and verifies the resulting spec.image for each combination of
// operator version and initial image.
func TestDGDRDefaulter_Default(t *testing.T) {
	type testCase struct {
		name    string
		version string
		op      admissionv1.Operation
		initial string
		want    string
	}

	cases := []testCase{
		{
			name:    "CREATE with empty image defaults to operator version",
			version: "1.0.0",
			op:      admissionv1.Create,
			initial: "",
			want:    "nvcr.io/nvidia/ai-dynamo/dynamo-frontend:1.0.0",
		},
		{
			name:    "CREATE with preset image is not overwritten",
			version: "1.0.0",
			op:      admissionv1.Create,
			initial: "my-registry/my-image:custom",
			want:    "my-registry/my-image:custom",
		},
		{
			name:    "CREATE with unknown operator version leaves image empty",
			version: "unknown",
			op:      admissionv1.Create,
			initial: "",
			want:    "",
		},
		{
			name:    "UPDATE does not default image",
			version: "1.0.0",
			op:      admissionv1.Update,
			initial: "",
			want:    "",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
				ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default"},
			}
			dgdr.Spec.Image = tc.initial

			err := NewDGDRDefaulter(tc.version).Default(makeAdmissionCtx(tc.op), dgdr)
			if err != nil {
				t.Fatalf("Default() unexpected error: %v", err)
			}

			if got := dgdr.Spec.Image; got != tc.want {
				t.Errorf("after Default(): spec.image = %q, want %q", got, tc.want)
			}
		})
	}
}
...@@ -50,11 +50,6 @@ func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1beta1.DynamoGr ...@@ -50,11 +50,6 @@ func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1beta1.DynamoGr
func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) { func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, error) {
var err error var err error
// Validate image is specified (required for the profiling job container).
if v.request.Spec.Image == "" {
err = errors.Join(err, errors.New("spec.image is required"))
}
// Disallow searchStrategy: thorough with backend: auto. // Disallow searchStrategy: thorough with backend: auto.
// "thorough" sweeps more configurations and requires a concrete backend to be selected; // "thorough" sweeps more configurations and requires a concrete backend to be selected;
// "auto" defers backend selection and is only compatible with the "rapid" search strategy. // "auto" defers backend selection and is only compatible with the "rapid" search strategy.
...@@ -77,6 +72,7 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, ...@@ -77,6 +72,7 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling. // validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided. // Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
// Also validates consistency of GPU range fields.
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error { func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
// Check if manual hardware config is provided via typed spec.hardware fields. // Check if manual hardware config is provided via typed spec.hardware fields.
var hasManualHardwareConfig bool var hasManualHardwareConfig bool
......
...@@ -94,22 +94,7 @@ func (h *DynamoGraphDeploymentRequestHandler) ValidateUpdate(ctx context.Context ...@@ -94,22 +94,7 @@ func (h *DynamoGraphDeploymentRequestHandler) ValidateUpdate(ctx context.Context
// Create validator and perform validation // Create validator and perform validation
validator := NewDynamoGraphDeploymentRequestValidator(newRequest, h.isClusterWideOperator, h.gpuDiscoveryEnabled) validator := NewDynamoGraphDeploymentRequestValidator(newRequest, h.isClusterWideOperator, h.gpuDiscoveryEnabled)
return validator.ValidateUpdate(oldRequest)
// Validate stateless rules
warnings, err := validator.Validate()
if err != nil {
return warnings, err
}
// Validate stateful rules (immutability)
updateWarnings, err := validator.ValidateUpdate(oldRequest)
if err != nil {
return updateWarnings, err
}
// Combine warnings
warnings = append(warnings, updateWarnings...)
return warnings, nil
} }
// ValidateDelete validates a DynamoGraphDeploymentRequest delete request. // ValidateDelete validates a DynamoGraphDeploymentRequest delete request.
......
...@@ -49,19 +49,7 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -49,19 +49,7 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
isClusterWide: true, isClusterWide: true,
}, },
{
name: "missing image",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: nvidiacomv1beta1.BackendTypeVllm,
Image: "",
},
},
isClusterWide: true,
errMsg: "spec.image is required",
},
{ {
name: "thorough + auto is invalid", name: "thorough + auto is invalid",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
...@@ -174,7 +162,7 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -174,7 +162,7 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
errMsg: "GPU hardware configuration required: GPU discovery is disabled", errMsg: "GPU hardware configuration required: GPU discovery is disabled",
}, },
{ {
name: "multiple errors (missing image and thorough+auto)", name: "thorough+auto is invalid regardless of image",
request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"}, ObjectMeta: metav1.ObjectMeta{Name: "test-dgdr", Namespace: "default"},
Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{ Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
...@@ -185,7 +173,7 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -185,7 +173,7 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
}, },
isClusterWide: true, isClusterWide: true,
errMsg: "spec.image is required\nspec.searchStrategy", errMsg: "spec.searchStrategy",
}, },
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment