Unverified Commit 6a3f2002 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: Optional GPU Discovery for Namespace-Scoped Operators (#6343)


Signed-off-by: default avatarHannah Zhang <hannahz@nvidia.com>
parent 44a76f96
aiconfigurator @ 7c08d2f2
Subproject commit 7c08d2f2c4e289afe49f48e56d392a7d7221155d
...@@ -142,6 +142,7 @@ spec: ...@@ -142,6 +142,7 @@ spec:
- --namespace-scope-lease-duration={{ .Values.namespaceRestriction.lease.duration }} - --namespace-scope-lease-duration={{ .Values.namespaceRestriction.lease.duration }}
- --namespace-scope-lease-renew-interval={{ .Values.namespaceRestriction.lease.renewInterval }} - --namespace-scope-lease-renew-interval={{ .Values.namespaceRestriction.lease.renewInterval }}
{{- end }} {{- end }}
- --gpu-discovery-enabled={{ .Values.gpuDiscovery.enabled }}
{{- end }} {{- end }}
- --operator-version={{ .Chart.AppVersion }} - --operator-version={{ .Chart.AppVersion }}
{{- if .Values.webhook.enabled }} {{- if .Values.webhook.enabled }}
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Pre-install/pre-upgrade check: verifies the installer has permission to create the GPU discovery ClusterRole.
# Runs only when both namespaceRestriction.enabled=true and gpuDiscovery.enabled=true (gpuDiscovery.enabled defaults to true).
# If the check fails, the install or upgrade is aborted with a clear error message explaining the options.
{{- if and .Values.namespaceRestriction.enabled .Values.gpuDiscovery.enabled }}
{{- /* All four preflight hook resources share one name; compute it once. */}}
{{- $preflightName := printf "%s-gpu-discovery-preflight" (include "dynamo-operator.fullname" .) }}
---
# ServiceAccount the preflight Job pod runs as.
# Hook weight -10 orders it (and the RBAC objects below) before the Job (weight -5).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $preflightName }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-10"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
---
# Allows the preflight ServiceAccount to create SelfSubjectAccessReviews,
# the API that `kubectl auth can-i` relies on.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ $preflightName }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-10"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
rules:
  - apiGroups:
      - authorization.k8s.io
    resources:
      - selfsubjectaccessreviews
    verbs:
      - create
---
# Binds the ClusterRole above to the preflight ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ $preflightName }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-10"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ $preflightName }}
subjects:
  - kind: ServiceAccount
    name: {{ $preflightName }}
    namespace: {{ .Release.Namespace }}
---
# One-shot Job that runs `kubectl auth can-i create clusterroles` and exits
# non-zero, printing remediation options, when the answer is negative.
# backoffLimit 0 disables retries; activeDeadlineSeconds caps the run at 60s.
# NOTE(review): kubectl inside the pod presumably authenticates as the hook
# ServiceAccount, so this appears to test that account's permissions rather
# than the installing user's -- confirm the check measures what is intended.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ $preflightName }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-5"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
  backoffLimit: 0
  activeDeadlineSeconds: 60
  template:
    spec:
      restartPolicy: Never
      serviceAccountName: {{ $preflightName }}
      {{- /* Reuse the controller manager's tolerations, when any are set. */}}
      {{- with .Values.controllerManager.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      containers:
        - name: check
          image: alpine/k8s:1.34.1
          command:
            - /bin/sh
            - -c
            - |
              if kubectl auth can-i create clusterroles --all-namespaces > /dev/null 2>&1; then
              echo "GPU discovery preflight check passed: installer has ClusterRole creation permissions."
              exit 0
              else
              echo ""
              echo "ERROR: GPU discovery requires ClusterRole creation permissions, but the installer"
              echo "does not have them. This is needed to grant the namespace-scoped operator"
              echo "read-only node access for automatic GPU hardware discovery."
              echo ""
              echo "Options:"
              echo " 1. Ask your cluster admin to grant ClusterRole creation permissions and re-run."
              echo ""
              echo " 2. Disable GPU discovery and provide hardware config manually in each DGDR:"
              echo " helm install ... --set dynamo-operator.gpuDiscovery.enabled=false"
              echo ""
              echo " Then in your DynamoGraphDeploymentRequest:"
              echo " spec:"
              echo " profilingConfig:"
              echo " config:"
              echo " hardware:"
              echo " numGpusPerNode: 8"
              echo " gpuModel: \"H100-SXM5-80GB\""
              echo " gpuVramMib: 81920"
              echo ""
              exit 1
              fi
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- if and .Values.namespaceRestriction.enabled .Values.gpuDiscovery.enabled }}
{{- /* The ClusterRole and its binding share one name; compute it once. */}}
{{- $gpuDiscoveryName := printf "%s-gpu-discovery" (include "dynamo-operator.fullname" .) }}
---
# Read-only access to Node objects so the namespace-scoped operator can
# discover GPU hardware. Cluster-wide operators already have this ability;
# namespace-scoped operators do not have it by default.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ $gpuDiscoveryName }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
---
# Grants the ClusterRole above to the operator's controller-manager
# ServiceAccount in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ $gpuDiscoveryName }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ $gpuDiscoveryName }}
subjects:
  - kind: ServiceAccount
    name: {{ include "dynamo-operator.fullname" . }}-controller-manager
    namespace: {{ .Release.Namespace }}
{{- end }}
...@@ -31,6 +31,13 @@ namespaceRestriction: ...@@ -31,6 +31,13 @@ namespaceRestriction:
duration: 30s duration: 30s
# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running. # Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
renewInterval: 10s renewInterval: 10s
# -- GPU discovery configuration (only applies when namespaceRestriction.enabled=true)
gpuDiscovery:
# -- Whether to provision a ClusterRole for the namespace-scoped operator to read GPU node labels.
# When true (default), Helm creates a ClusterRole/ClusterRoleBinding granting node read access.
# Set to false if your installer lacks ClusterRole creation permissions; you must then provide
# hardware config manually in each DynamoGraphDeploymentRequest.
enabled: true
controllerManager: controllerManager:
tolerations: [] tolerations: []
affinity: {} affinity: {}
......
...@@ -13,6 +13,14 @@ ...@@ -13,6 +13,14 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
{{- $operatorValues := index .Values "dynamo-operator" }}
{{- if and $operatorValues.namespaceRestriction.enabled $operatorValues.gpuDiscovery.enabled }}
GPU Discovery: ENABLED — the operator will automatically detect GPU hardware from cluster nodes.
{{- else if $operatorValues.namespaceRestriction.enabled }}
GPU Discovery: DISABLED — DynamoGraphDeploymentRequests must include hardware configuration in
spec.profilingConfig.config, or set dynamo-operator.gpuDiscovery.enabled=true and reinstall.
{{- end }}
{{- if (index .Values "dynamo-api-store").enabled }} {{- if (index .Values "dynamo-api-store").enabled }}
{{- if (index .Values "dynamo-api-store").ingress.enabled }} {{- if (index .Values "dynamo-api-store").ingress.enabled }}
Your service is available at: Your service is available at:
......
...@@ -46,6 +46,13 @@ dynamo-operator: ...@@ -46,6 +46,13 @@ dynamo-operator:
# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running. # Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
renewInterval: 10s renewInterval: 10s
# -- GPU discovery configuration (only applies when namespaceRestriction.enabled=true)
gpuDiscovery:
# -- Whether to provision a ClusterRole for the namespace-scoped operator to read GPU node labels.
# When true (default), Helm creates a ClusterRole/ClusterRoleBinding granting node read access.
# Set to false if your installer lacks ClusterRole creation permissions.
enabled: true
# -- The Dynamo discovery backend to use. Default is "kubernetes" for Kubernetes API service discovery. Set to "etcd" to use ETCD for discovery. -- # -- The Dynamo discovery backend to use. Default is "kubernetes" for Kubernetes API service discovery. Set to "etcd" to use ETCD for discovery. --
discoveryBackend: "kubernetes" discoveryBackend: "kubernetes"
......
...@@ -156,6 +156,7 @@ func main() { ...@@ -156,6 +156,7 @@ func main() {
var operatorVersion string var operatorVersion string
var discoveryBackend string var discoveryBackend string
var enableWebhooks bool var enableWebhooks bool
var gpuDiscoveryEnabled bool
// Checkpoint configuration // Checkpoint configuration
var checkpointEnabled bool var checkpointEnabled bool
var checkpointStorageType string var checkpointStorageType string
...@@ -181,6 +182,9 @@ func main() { ...@@ -181,6 +182,9 @@ func main() {
flag.BoolVar(&enableWebhooks, "enable-webhooks", false, flag.BoolVar(&enableWebhooks, "enable-webhooks", false,
"Enable admission webhooks for validation. When enabled, controllers skip validation "+ "Enable admission webhooks for validation. When enabled, controllers skip validation "+
"(webhooks handle it). When disabled, controllers perform validation.") "(webhooks handle it). When disabled, controllers perform validation.")
flag.BoolVar(&gpuDiscoveryEnabled, "gpu-discovery-enabled", true,
"Whether GPU discovery is enabled for namespace-scoped operators. When true (default), "+
"the Helm chart has provisioned a ClusterRole granting node read access for GPU hardware discovery.")
flag.StringVar(&restrictedNamespace, "restrictedNamespace", "", flag.StringVar(&restrictedNamespace, "restrictedNamespace", "",
"Enable resources filtering, only the resources belonging to the given namespace will be handled.") "Enable resources filtering, only the resources belonging to the given namespace will be handled.")
flag.StringVar(&leaderElectionID, "leader-election-id", "", "Leader election id"+ flag.StringVar(&leaderElectionID, "leader-election-id", "", "Leader election id"+
...@@ -688,6 +692,7 @@ func main() { ...@@ -688,6 +692,7 @@ func main() {
// Set webhooks enabled flag in config // Set webhooks enabled flag in config
ctrlConfig.WebhooksEnabled = enableWebhooks ctrlConfig.WebhooksEnabled = enableWebhooks
ctrlConfig.GPUDiscoveryEnabled = gpuDiscoveryEnabled
if enableWebhooks { if enableWebhooks {
setupLog.Info("Webhooks are enabled - webhooks will validate, controllers will skip validation") setupLog.Info("Webhooks are enabled - webhooks will validate, controllers will skip validation")
...@@ -733,7 +738,7 @@ func main() { ...@@ -733,7 +738,7 @@ func main() {
} }
isClusterWide := ctrlConfig.RestrictedNamespace == "" isClusterWide := ctrlConfig.RestrictedNamespace == ""
dgdrHandler := webhookvalidation.NewDynamoGraphDeploymentRequestHandler(isClusterWide) dgdrHandler := webhookvalidation.NewDynamoGraphDeploymentRequestHandler(isClusterWide, gpuDiscoveryEnabled)
if err = dgdrHandler.RegisterWithManager(mgr); err != nil { if err = dgdrHandler.RegisterWithManager(mgr); err != nil {
setupLog.Error(err, "unable to register webhook", "webhook", "DynamoGraphDeploymentRequest") setupLog.Error(err, "unable to register webhook", "webhook", "DynamoGraphDeploymentRequest")
os.Exit(1) os.Exit(1)
......
...@@ -893,7 +893,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex ...@@ -893,7 +893,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
// Use the validator for simple validation (defense in depth - only when webhooks are disabled) // Use the validator for simple validation (defense in depth - only when webhooks are disabled)
if !r.Config.WebhooksEnabled { if !r.Config.WebhooksEnabled {
isClusterWide := r.Config.RestrictedNamespace == "" isClusterWide := r.Config.RestrictedNamespace == ""
validator := webhookvalidation.NewDynamoGraphDeploymentRequestValidator(dgdr, isClusterWide) validator := webhookvalidation.NewDynamoGraphDeploymentRequestValidator(dgdr, isClusterWide, r.Config.GPUDiscoveryEnabled)
warnings, err := validator.Validate() warnings, err := validator.Validate()
if err != nil { if err != nil {
return err return err
...@@ -1036,18 +1036,31 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con ...@@ -1036,18 +1036,31 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
isNamespaceScoped := r.Config.RestrictedNamespace != "" isNamespaceScoped := r.Config.RestrictedNamespace != ""
if isNamespaceScoped { if isNamespaceScoped {
return fmt.Errorf(`GPU hardware info required but cannot be auto-discovered (namespace-scoped operator lacks node read permissions). tmpl := template.Must(template.New("nsGPUErr").Parse(
`GPU hardware info required but cannot be auto-discovered.` +
Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s. "\n\nOptions to resolve:" +
"\n\n1. Re-enable GPU discovery (if it was disabled during Helm install):" +
See: https://github.com/ai-dynamo/dynamo/issues/6257`, "\n helm upgrade ... --set dynamo-operator.gpuDiscovery.enabled=true" +
ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib, "\n\n2. Add hardware config to profilingConfig.config.{{.Hardware}}:" +
ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng) "\n {{.NumGPUs}}: 8" +
"\n {{.GPUModel}}: \"H100-SXM5-80GB\"" +
"\n {{.GPUVram}}: 81920" +
"\n\n3. Or specify {{.Engine}}.{{.MinGPUs}} and {{.Engine}}.{{.MaxGPUs}} for explicit GPU search ranges.",
))
var buf bytes.Buffer
_ = tmpl.Execute(&buf, map[string]string{
"Hardware": ConfigKeyHardware,
"NumGPUs": ConfigKeyNumGpusPerNode,
"GPUModel": ConfigKeyGPUModel,
"GPUVram": ConfigKeyGPUVramMib,
"Engine": ConfigKeyEngine,
"MinGPUs": ConfigKeyMinNumGpusPerEng,
"MaxGPUs": ConfigKeyMaxNumGpusPerEng,
})
return fmt.Errorf("%s", buf.String())
} }
return fmt.Errorf(`GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s. return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s",
See profiling documentation for configuration details.`,
ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib, ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng) ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
} }
......
...@@ -85,6 +85,10 @@ type Config struct { ...@@ -85,6 +85,10 @@ type Config struct {
// When false, controllers perform validation (defense in depth) // When false, controllers perform validation (defense in depth)
WebhooksEnabled bool WebhooksEnabled bool
// GPUDiscoveryEnabled indicates whether Helm provisioned node read access for the namespace-scoped operator.
// Only relevant for namespace-scoped operators (RestrictedNamespace != "").
GPUDiscoveryEnabled bool
// Checkpoint configuration for checkpoint/restore functionality // Checkpoint configuration for checkpoint/restore functionality
Checkpoint CheckpointConfig Checkpoint CheckpointConfig
} }
......
...@@ -28,7 +28,7 @@ import ( ...@@ -28,7 +28,7 @@ import (
// toFloat64 converts a numeric value (int or float64) to float64. // toFloat64 converts a numeric value (int or float64) to float64.
// Returns 0 if the value is neither int nor float64. // Returns 0 if the value is neither int nor float64.
func toFloat64(val interface{}) float64 { func toFloat64(val any) float64 {
switch v := val.(type) { switch v := val.(type) {
case float64: case float64:
return v return v
...@@ -44,14 +44,17 @@ func toFloat64(val interface{}) float64 { ...@@ -44,14 +44,17 @@ func toFloat64(val interface{}) float64 {
type DynamoGraphDeploymentRequestValidator struct { type DynamoGraphDeploymentRequestValidator struct {
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
isClusterWideOperator bool isClusterWideOperator bool
gpuDiscoveryEnabled bool
} }
// NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest. // NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest.
// The isClusterWide parameter indicates whether the operator is running in cluster-wide or namespace-restricted mode. // isClusterWide indicates whether the operator has cluster-wide permissions.
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool) *DynamoGraphDeploymentRequestValidator { // gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator.
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
return &DynamoGraphDeploymentRequestValidator{ return &DynamoGraphDeploymentRequestValidator{
request: request, request: request,
isClusterWideOperator: isClusterWide, isClusterWideOperator: isClusterWide,
gpuDiscoveryEnabled: gpuDiscoveryEnabled,
} }
} }
...@@ -81,17 +84,17 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, ...@@ -81,17 +84,17 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
// Parse config to validate structure (only if config is present) // Parse config to validate structure (only if config is present)
if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 { if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 {
var config map[string]interface{} var config map[string]any
if parseErr := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); parseErr != nil { if parseErr := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); parseErr != nil {
err = errors.Join(err, fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr)) err = errors.Join(err, fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr))
} else { } else {
// Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields) // Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields)
if engineConfig, ok := config["engine"].(map[string]interface{}); ok { if engineConfig, ok := config["engine"].(map[string]any); ok {
if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != v.request.Spec.Backend { if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != v.request.Spec.Backend {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, v.request.Spec.Backend)) warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, v.request.Spec.Backend))
} }
} }
if deployment, ok := config["deployment"].(map[string]interface{}); ok { if deployment, ok := config["deployment"].(map[string]any); ok {
if model, ok := deployment["model"].(string); ok && model != "" && model != v.request.Spec.Model { if model, ok := deployment["model"].(string); ok && model != "" && model != v.request.Spec.Model {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, v.request.Spec.Model)) warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, v.request.Spec.Model))
} }
...@@ -108,24 +111,24 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings, ...@@ -108,24 +111,24 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
} }
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling. // validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// This validation happens at admission time to fail fast before the DGDR is persisted to etcd. // Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error { func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
// Parse profiling config // Parse profiling config
var config map[string]interface{} var config map[string]any
if v.request.Spec.ProfilingConfig.Config != nil { if v.request.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil { if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught by other validators // Config parse errors will be caught by other validators
return nil return nil
} }
} else { } else {
config = make(map[string]interface{}) config = make(map[string]any)
} }
// Check if manual hardware config is provided // Check if manual hardware config is provided
hardwareVal, hasHardware := config["hardware"] hardwareVal, hasHardware := config["hardware"]
var hasManualHardwareConfig bool var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil { if hasHardware && hardwareVal != nil {
if hardwareConfig, ok := hardwareVal.(map[string]interface{}); ok { if hardwareConfig, ok := hardwareVal.(map[string]any); ok {
// Check if essential hardware fields are provided // Check if essential hardware fields are provided
_, hasGPUModel := hardwareConfig["gpuModel"] _, hasGPUModel := hardwareConfig["gpuModel"]
_, hasGPUVram := hardwareConfig["gpuVramMib"] _, hasGPUVram := hardwareConfig["gpuVramMib"]
...@@ -137,7 +140,7 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error ...@@ -137,7 +140,7 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
// Check if explicit GPU ranges are provided // Check if explicit GPU ranges are provided
var hasExplicitGPURanges bool var hasExplicitGPURanges bool
if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil { if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]interface{}); ok { if engineConfig, ok := engineVal.(map[string]any); ok {
minGPUs, hasMin := engineConfig["minNumGpusPerEngine"] minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"] maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
// Validate explicit GPU ranges // Validate explicit GPU ranges
...@@ -156,42 +159,17 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error ...@@ -156,42 +159,17 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
} }
} }
// If manual config or explicit ranges provided, validation passes
if hasManualHardwareConfig || hasExplicitGPURanges { if hasManualHardwareConfig || hasExplicitGPURanges {
return nil return nil
} }
// Neither manual config nor explicit ranges provided // No manual hardware config provided. Cluster-wide operators always have GPU discovery via node
// GPU discovery will be attempted at reconcile time, but if it's unavailable // permissions. Namespace-scoped operators rely on Helm-provisioned GPU discovery (gpuDiscovery.enabled).
// (e.g., namespace-scoped operator), the DGDR will fail if v.isClusterWideOperator || v.gpuDiscoveryEnabled {
//
// Fail at admission time to give users immediate feedback
if v.isClusterWideOperator {
// Cluster-wide operator should have GPU discovery available
// Allow DGDR to be created - GPU discovery will provide hardware info
return nil return nil
} }
// Namespace-scoped operator likely doesn't have node read permissions return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.profilingConfig.config)")
// Require manual hardware config or explicit GPU ranges
return errors.New(`GPU hardware configuration required for namespace-scoped operators.
Namespace-scoped operators typically lack node read permissions for GPU auto-discovery.
Provide hardware configuration in one of these ways:
1. Add hardware config in spec.profilingConfig.config:
hardware:
numGpusPerNode: 8
gpuModel: "H100-SXM5-80GB"
gpuVramMib: 81920
2. Or specify explicit GPU search ranges:
engine:
minNumGpusPerEngine: 2
maxNumGpusPerEngine: 8
See: https://github.com/ai-dynamo/dynamo/issues/6257`)
} }
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest. // ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
......
...@@ -41,13 +41,16 @@ const ( ...@@ -41,13 +41,16 @@ const (
// It is a thin wrapper around DynamoGraphDeploymentRequestValidator. // It is a thin wrapper around DynamoGraphDeploymentRequestValidator.
type DynamoGraphDeploymentRequestHandler struct { type DynamoGraphDeploymentRequestHandler struct {
isClusterWideOperator bool isClusterWideOperator bool
gpuDiscoveryEnabled bool
} }
// NewDynamoGraphDeploymentRequestHandler creates a new handler for DynamoGraphDeploymentRequest Webhook. // NewDynamoGraphDeploymentRequestHandler creates a new handler for DynamoGraphDeploymentRequest Webhook.
// The isClusterWide parameter indicates whether the operator is running in cluster-wide or namespace-restricted mode. // isClusterWide indicates whether the operator has cluster-wide permissions.
func NewDynamoGraphDeploymentRequestHandler(isClusterWide bool) *DynamoGraphDeploymentRequestHandler { // gpuDiscoveryEnabled indicates whether a ClusterRole for node read access was provisioned by Helm.
func NewDynamoGraphDeploymentRequestHandler(isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestHandler {
return &DynamoGraphDeploymentRequestHandler{ return &DynamoGraphDeploymentRequestHandler{
isClusterWideOperator: isClusterWide, isClusterWideOperator: isClusterWide,
gpuDiscoveryEnabled: gpuDiscoveryEnabled,
} }
} }
...@@ -63,7 +66,7 @@ func (h *DynamoGraphDeploymentRequestHandler) ValidateCreate(ctx context.Context ...@@ -63,7 +66,7 @@ func (h *DynamoGraphDeploymentRequestHandler) ValidateCreate(ctx context.Context
logger.Info("validate create", "name", request.Name, "namespace", request.Namespace) logger.Info("validate create", "name", request.Name, "namespace", request.Namespace)
// Create validator and perform validation // Create validator and perform validation
validator := NewDynamoGraphDeploymentRequestValidator(request, h.isClusterWideOperator) validator := NewDynamoGraphDeploymentRequestValidator(request, h.isClusterWideOperator, h.gpuDiscoveryEnabled)
return validator.Validate() return validator.Validate()
} }
...@@ -90,7 +93,7 @@ func (h *DynamoGraphDeploymentRequestHandler) ValidateUpdate(ctx context.Context ...@@ -90,7 +93,7 @@ func (h *DynamoGraphDeploymentRequestHandler) ValidateUpdate(ctx context.Context
} }
// Create validator and perform validation // Create validator and perform validation
validator := NewDynamoGraphDeploymentRequestValidator(newRequest, h.isClusterWideOperator) validator := NewDynamoGraphDeploymentRequestValidator(newRequest, h.isClusterWideOperator, h.gpuDiscoveryEnabled)
// Validate stateless rules // Validate stateless rules
warnings, err := validator.Validate() warnings, err := validator.Validate()
......
...@@ -29,19 +29,20 @@ import ( ...@@ -29,19 +29,20 @@ import (
func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}` validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}`
validConfigWithHardware := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}, "hardware": {"numGpusPerNode": 8, "gpuModel": "H100-SXM5-80GB", "gpuVramMib": 81920}}` validConfigWithHardware := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}, "hardware": {"numGpusPerNode": 8, "gpuModel": "H100-SXM5-80GB", "gpuVramMib": 81920}}`
minimalConfig := `{"sla": {"ttft": 200.0}}`
configWithDifferentBackend := `{"engine": {"backend": "sglang"}}` configWithDifferentBackend := `{"engine": {"backend": "sglang"}}`
configWithDifferentModel := `{"deployment": {"model": "different-model"}}` configWithDifferentModel := `{"deployment": {"model": "different-model"}}`
invalidYAML := `{invalid yaml` invalidYAML := `{invalid yaml`
// errMsg: if non-empty, an error is expected and each newline-separated substring must appear in it.
// expectedWarning: if non-empty, at least one warning must contain this substring.
tests := []struct { tests := []struct {
name string name string
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
isClusterWide bool isClusterWide bool
wantErr bool gpuDiscoveryEnabled bool
errMsg string errMsg string
wantWarnings bool expectedWarning string
expectedWarning string
errContains bool
}{ }{
{ {
name: "valid request", name: "valid request",
...@@ -62,7 +63,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -62,7 +63,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
}, },
isClusterWide: true, isClusterWide: true,
wantErr: false,
}, },
{ {
name: "missing profiler image", name: "missing profiler image",
...@@ -83,7 +83,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -83,7 +83,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
}, },
isClusterWide: true, isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.profilerImage is required", errMsg: "spec.profilingConfig.profilerImage is required",
}, },
{ {
...@@ -103,7 +102,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -103,7 +102,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
}, },
isClusterWide: true, isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.config is required and must not be empty", errMsg: "spec.profilingConfig.config is required and must not be empty",
}, },
{ {
...@@ -125,11 +123,10 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -125,11 +123,10 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
}, },
isClusterWide: true, isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.config is required and must not be empty", errMsg: "spec.profilingConfig.config is required and must not be empty",
}, },
{ {
name: "namespace-restricted operator (GPU discovery will fail gracefully)", name: "namespace-scoped operator with manual hardware config (should pass)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{ request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr", Name: "test-dgdr",
...@@ -146,8 +143,51 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -146,8 +143,51 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
}, },
}, },
isClusterWide: false, isClusterWide: false,
wantErr: false, gpuDiscoveryEnabled: false,
},
{
name: "namespace-scoped operator with GPU discovery enabled (should pass without manual config)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(minimalConfig),
},
},
},
},
isClusterWide: false,
gpuDiscoveryEnabled: true,
},
{
name: "namespace-scoped operator with GPU discovery disabled and no hardware config (should error)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(minimalConfig),
},
},
},
},
isClusterWide: false,
gpuDiscoveryEnabled: false,
errMsg: "GPU hardware configuration required: GPU discovery is disabled",
}, },
{ {
name: "invalid config YAML", name: "invalid config YAML",
...@@ -168,8 +208,7 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -168,8 +208,7 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
}, },
isClusterWide: true, isClusterWide: true,
wantErr: true, errMsg: "failed to parse spec.profilingConfig.config",
errMsg: "failed to parse spec.profilingConfig.config: error converting YAML to JSON: yaml: line 1: did not find expected ',' or '}'",
}, },
{ {
name: "warning for different backend in config", name: "warning for different backend in config",
...@@ -190,8 +229,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -190,8 +229,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
}, },
isClusterWide: true, isClusterWide: true,
wantErr: false,
wantWarnings: true,
expectedWarning: "spec.profilingConfig.config.engine.backend (sglang) will be overwritten by spec.backend (vllm)", expectedWarning: "spec.profilingConfig.config.engine.backend (sglang) will be overwritten by spec.backend (vllm)",
}, },
{ {
...@@ -213,8 +250,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -213,8 +250,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
}, },
isClusterWide: true, isClusterWide: true,
wantErr: false,
wantWarnings: true,
expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)", expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)",
}, },
{ {
...@@ -234,47 +269,34 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) { ...@@ -234,47 +269,34 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
}, },
}, },
isClusterWide: false, isClusterWide: false,
wantErr: true,
errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty", errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty",
errContains: true,
}, },
// TODO: Add test for invalid GPU range (min > max) validation
// The validation logic is in place (lines 148-152 of dynamographdeploymentrequest.go)
// but needs proper test coverage
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentRequestValidator(tt.request, tt.isClusterWide) validator := NewDynamoGraphDeploymentRequestValidator(tt.request, tt.isClusterWide, tt.gpuDiscoveryEnabled)
warnings, err := validator.Validate() warnings, err := validator.Validate()
if (err != nil) != tt.wantErr { wantErr := tt.errMsg != ""
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error = %v, wantErr %v", err, tt.wantErr) if (err != nil) != wantErr {
t.Errorf("Validate() error = %v, wantErr %v", err, wantErr)
return return
} }
if wantErr {
if tt.wantErr { for _, msg := range strings.Split(tt.errMsg, "\n") {
if tt.errContains { if !strings.Contains(err.Error(), msg) {
// For multiple errors, check that all expected error messages are present t.Errorf("Validate() error %q does not contain %q", err.Error(), msg)
errStr := err.Error()
for _, expectedMsg := range strings.Split(tt.errMsg, "\n") {
if !strings.Contains(errStr, expectedMsg) {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error message = %v, want to contain %v", errStr, expectedMsg)
}
}
} else {
if err.Error() != tt.errMsg {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error message = %v, want %v", err.Error(), tt.errMsg)
} }
} }
} }
if tt.wantWarnings && len(warnings) == 0 { wantWarning := tt.expectedWarning != ""
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() expected warnings but got none") if wantWarning && len(warnings) == 0 {
t.Errorf("Validate() expected warning %q but got none", tt.expectedWarning)
} }
if wantWarning && len(warnings) > 0 && !strings.Contains(warnings[0], tt.expectedWarning) {
if tt.wantWarnings && len(warnings) > 0 && warnings[0] != tt.expectedWarning { t.Errorf("Validate() warning %q does not contain %q", warnings[0], tt.expectedWarning)
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() warning = %v, want %v", warnings[0], tt.expectedWarning)
} }
}) })
} }
...@@ -350,7 +372,7 @@ func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) { ...@@ -350,7 +372,7 @@ func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentRequestValidator(tt.newRequest, true) validator := NewDynamoGraphDeploymentRequestValidator(tt.newRequest, true, true)
warnings, err := validator.ValidateUpdate(tt.oldRequest) warnings, err := validator.ValidateUpdate(tt.oldRequest)
if (err != nil) != tt.wantErr { if (err != nil) != tt.wantErr {
......
...@@ -227,15 +227,35 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator# ...@@ -227,15 +227,35 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#
### Automatic GPU Discovery ### Automatic GPU Discovery
The operator automatically discovers GPU resources from your Kubernetes cluster nodes when available. GPU discovery provides: The operator automatically discovers GPU resources from cluster nodes, providing hardware info (GPU model, VRAM, GPUs per node) and automatic profiling search space calculation.
- Hardware information (GPU model, VRAM, GPUs per node) **Requirements:**
- Automatic calculation of profiling search space based on model size - **Cluster-scoped operators**: Have node read permissions by default
- Hardware system identifier for AI Configurator integration - **Namespace-scoped operators**: GPU discovery is enabled by default when installing via Helm — the chart provisions the required ClusterRole/ClusterRoleBinding automatically
**Permissions**: GPU discovery requires cluster-wide node read permissions. Cluster-scoped operators automatically have these permissions. Namespace-restricted operators can also use GPU discovery if granted node read permissions via RBAC. **For namespace-scoped operators**, GPU discovery is controlled by a Helm value:
If GPU discovery is unavailable (no permissions or no GPU labels), the profiler will use manually specified hardware configuration or defaults. ```bash
# GPU discovery enabled (default) — Helm provisions read-only node access automatically
helm install dynamo-platform ... --set dynamo-operator.gpuDiscovery.enabled=true
# GPU discovery disabled — you must provide hardware config manually in each DGDR
helm install dynamo-platform ... --set dynamo-operator.gpuDiscovery.enabled=false
```
If GPU discovery is disabled, provide hardware config manually in the DGDR:
```yaml
spec:
profilingConfig:
config:
hardware:
numGpusPerNode: 8
gpuModel: "H100-SXM5-80GB"
gpuVramMib: 81920
```
If GPU discovery is disabled and no manual hardware config is provided, the DGDR is rejected at admission time with an error stating that GPU hardware configuration is required because GPU discovery is disabled.
## Configuration ## Configuration
......
...@@ -192,6 +192,30 @@ Found existing namespace-restricted Dynamo operators in namespaces: ... ...@@ -192,6 +192,30 @@ Found existing namespace-restricted Dynamo operators in namespaces: ...
--set "dynamo-operator.namespaceRestriction.targetNamespace=dynamo-namespace" # optional --set "dynamo-operator.namespaceRestriction.targetNamespace=dynamo-namespace" # optional
``` ```
### GPU Discovery for DynamoGraphDeploymentRequests with Namespace-Scoped Operators
GPU discovery is **enabled by default** for namespace-scoped operators. The Helm chart automatically provisions a ClusterRole/ClusterRoleBinding granting the operator read-only access to node GPU labels.
**To disable GPU discovery** (if your installer lacks ClusterRole creation permissions):
```bash
helm install dynamo-platform ... --set dynamo-operator.gpuDiscovery.enabled=false
```
When GPU discovery is disabled, you must provide hardware configuration manually in each DynamoGraphDeploymentRequest:
```yaml
spec:
profilingConfig:
config:
hardware:
numGpusPerNode: 8
gpuModel: "H100-SXM5-80GB"
gpuVramMib: 81920
```
> **Note**: If GPU discovery is disabled and no hardware config is provided, the DGDR is rejected at admission time with an error stating that GPU hardware configuration is required because GPU discovery is disabled.
[Verify Installation](#verify-installation) [Verify Installation](#verify-installation)
## Path B: Custom Build from Source ## Path B: Custom Build from Source
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment