Unverified Commit 6a3f2002 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: Optional GPU Discovery for Namespace-Scoped Operators (#6343)


Signed-off-by: default avatarHannah Zhang <hannahz@nvidia.com>
parent 44a76f96
aiconfigurator @ 7c08d2f2
Subproject commit 7c08d2f2c4e289afe49f48e56d392a7d7221155d
......@@ -142,6 +142,7 @@ spec:
- --namespace-scope-lease-duration={{ .Values.namespaceRestriction.lease.duration }}
- --namespace-scope-lease-renew-interval={{ .Values.namespaceRestriction.lease.renewInterval }}
{{- end }}
- --gpu-discovery-enabled={{ .Values.gpuDiscovery.enabled }}
{{- end }}
- --operator-version={{ .Chart.AppVersion }}
{{- if .Values.webhook.enabled }}
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Pre-install/pre-upgrade check for the GPU discovery RBAC.
# Rendered only when namespaceRestriction.enabled=true and gpuDiscovery.enabled=true (the default).
# A hook Job runs `kubectl auth can-i create clusterroles`; if the check fails, the hook fails and the
# install/upgrade is aborted with a message that lists the available options.
#
# NOTE(review): the Job runs under the preflight ServiceAccount defined below, so `kubectl auth can-i`
# presumably evaluates that ServiceAccount's permissions rather than those of the user running Helm.
# The only permission granted to it here is creating SelfSubjectAccessReviews. Confirm that the check
# measures what is intended before relying on it.
{{- if and .Values.namespaceRestriction.enabled .Values.gpuDiscovery.enabled }}
---
# Identity the preflight Job runs as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-10"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
---
# Allows the preflight ServiceAccount to create SelfSubjectAccessReviews, the API that
# `kubectl auth can-i` uses to answer permission questions.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-10"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
rules:
  - apiGroups: ["authorization.k8s.io"]
    resources: ["selfsubjectaccessreviews"]
    verbs: ["create"]
---
# Binds the preflight ClusterRole to the preflight ServiceAccount in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-10"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight
subjects:
  - kind: ServiceAccount
    name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight
    namespace: {{ .Release.Namespace }}
---
# The check itself. Hook weight -5 sorts after the RBAC hooks above (-10), so the ServiceAccount
# and its binding are created before this Job.
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-weight": "-5"
    # hook-succeeded only: a failed Job is left in place so its log output can be read.
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
  # One attempt, bounded to 60 seconds: a failed check is reported instead of being retried.
  backoffLimit: 0
  activeDeadlineSeconds: 60
  template:
    spec:
      restartPolicy: Never
      serviceAccountName: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight
      # Reuse the controller manager's tolerations for the check pod.
      {{- if .Values.controllerManager.tolerations }}
      tolerations:
        {{- toYaml .Values.controllerManager.tolerations | nindent 8 }}
      {{- end }}
      containers:
        - name: check
          image: alpine/k8s:1.34.1
          command:
            - /bin/sh
            - -c
            - |
              if kubectl auth can-i create clusterroles --all-namespaces > /dev/null 2>&1; then
                echo "GPU discovery preflight check passed: installer has ClusterRole creation permissions."
                exit 0
              else
                echo ""
                echo "ERROR: GPU discovery requires ClusterRole creation permissions, but the installer"
                echo "does not have them. This is needed to grant the namespace-scoped operator"
                echo "read-only node access for automatic GPU hardware discovery."
                echo ""
                echo "Options:"
                echo " 1. Ask your cluster admin to grant ClusterRole creation permissions and re-run."
                echo ""
                echo " 2. Disable GPU discovery and provide hardware config manually in each DGDR:"
                echo " helm install ... --set dynamo-operator.gpuDiscovery.enabled=false"
                echo ""
                echo " Then in your DynamoGraphDeploymentRequest:"
                echo " spec:"
                echo "   profilingConfig:"
                echo "     config:"
                echo "       hardware:"
                echo "         numGpusPerNode: 8"
                echo "         gpuModel: \"H100-SXM5-80GB\""
                echo "         gpuVramMib: 81920"
                echo ""
                exit 1
              fi
{{- end }}
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
{{- if and .Values.namespaceRestriction.enabled .Values.gpuDiscovery.enabled }}
---
# Read-only Node access for the namespace-scoped operator, used for GPU hardware discovery.
# Cluster-wide operators already have this access; namespace-scoped operators do not by default.
# NOTE(review): ClusterRole and ClusterRoleBinding names are cluster-scoped; confirm the fullname
# helper yields a distinct name for each namespace-scoped release.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
---
# Attaches the Node read role to the operator's controller-manager ServiceAccount
# in the release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery
  labels:
    {{- include "dynamo-operator.labels" . | nindent 4 }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery
subjects:
  - kind: ServiceAccount
    name: {{ include "dynamo-operator.fullname" . }}-controller-manager
    namespace: {{ .Release.Namespace }}
{{- end }}
......@@ -31,6 +31,13 @@ namespaceRestriction:
duration: 30s
# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
renewInterval: 10s
# -- GPU discovery configuration (only applies when namespaceRestriction.enabled=true)
gpuDiscovery:
# -- Whether to provision a ClusterRole for the namespace-scoped operator to read GPU node labels.
# When true (default), Helm creates a ClusterRole/ClusterRoleBinding granting node read access.
# Set to false if your installer lacks ClusterRole creation permissions; you must then provide
# hardware config manually in each DynamoGraphDeploymentRequest.
enabled: true
controllerManager:
tolerations: []
affinity: {}
......
......@@ -13,6 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
{{- $operatorValues := index .Values "dynamo-operator" }}
{{- if and $operatorValues.namespaceRestriction.enabled $operatorValues.gpuDiscovery.enabled }}
GPU Discovery: ENABLED — the operator will automatically detect GPU hardware from cluster nodes.
{{- else if $operatorValues.namespaceRestriction.enabled }}
GPU Discovery: DISABLED — each DynamoGraphDeploymentRequest must include hardware configuration in
spec.profilingConfig.config. To enable discovery instead, set dynamo-operator.gpuDiscovery.enabled=true and upgrade the release.
{{- end }}
{{- if (index .Values "dynamo-api-store").enabled }}
{{- if (index .Values "dynamo-api-store").ingress.enabled }}
Your service is available at:
......
......@@ -46,6 +46,13 @@ dynamo-operator:
# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
renewInterval: 10s
# -- GPU discovery configuration (only applies when namespaceRestriction.enabled=true)
gpuDiscovery:
# -- Whether to provision a ClusterRole for the namespace-scoped operator to read GPU node labels.
# When true (default), Helm creates a ClusterRole/ClusterRoleBinding granting node read access.
# Set to false if your installer lacks ClusterRole creation permissions; you must then provide
# hardware config manually in each DynamoGraphDeploymentRequest.
enabled: true
# -- The Dynamo discovery backend to use. Default is "kubernetes" for Kubernetes API service discovery. Set to "etcd" to use ETCD for discovery. --
discoveryBackend: "kubernetes"
......
......@@ -156,6 +156,7 @@ func main() {
var operatorVersion string
var discoveryBackend string
var enableWebhooks bool
var gpuDiscoveryEnabled bool
// Checkpoint configuration
var checkpointEnabled bool
var checkpointStorageType string
......@@ -181,6 +182,9 @@ func main() {
flag.BoolVar(&enableWebhooks, "enable-webhooks", false,
"Enable admission webhooks for validation. When enabled, controllers skip validation "+
"(webhooks handle it). When disabled, controllers perform validation.")
flag.BoolVar(&gpuDiscoveryEnabled, "gpu-discovery-enabled", true,
"Whether GPU discovery is enabled for namespace-scoped operators. When true (default), "+
"the Helm chart has provisioned a ClusterRole granting node read access for GPU hardware discovery.")
flag.StringVar(&restrictedNamespace, "restrictedNamespace", "",
"Enable resources filtering, only the resources belonging to the given namespace will be handled.")
flag.StringVar(&leaderElectionID, "leader-election-id", "", "Leader election id"+
......@@ -688,6 +692,7 @@ func main() {
// Set webhooks enabled flag in config
ctrlConfig.WebhooksEnabled = enableWebhooks
ctrlConfig.GPUDiscoveryEnabled = gpuDiscoveryEnabled
if enableWebhooks {
setupLog.Info("Webhooks are enabled - webhooks will validate, controllers will skip validation")
......@@ -733,7 +738,7 @@ func main() {
}
isClusterWide := ctrlConfig.RestrictedNamespace == ""
dgdrHandler := webhookvalidation.NewDynamoGraphDeploymentRequestHandler(isClusterWide)
dgdrHandler := webhookvalidation.NewDynamoGraphDeploymentRequestHandler(isClusterWide, gpuDiscoveryEnabled)
if err = dgdrHandler.RegisterWithManager(mgr); err != nil {
setupLog.Error(err, "unable to register webhook", "webhook", "DynamoGraphDeploymentRequest")
os.Exit(1)
......
......@@ -893,7 +893,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateSpec(ctx context.Contex
// Use the validator for simple validation (defense in depth - only when webhooks are disabled)
if !r.Config.WebhooksEnabled {
isClusterWide := r.Config.RestrictedNamespace == ""
validator := webhookvalidation.NewDynamoGraphDeploymentRequestValidator(dgdr, isClusterWide)
validator := webhookvalidation.NewDynamoGraphDeploymentRequestValidator(dgdr, isClusterWide, r.Config.GPUDiscoveryEnabled)
warnings, err := validator.Validate()
if err != nil {
return err
......@@ -1036,18 +1036,31 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
isNamespaceScoped := r.Config.RestrictedNamespace != ""
if isNamespaceScoped {
return fmt.Errorf(`GPU hardware info required but cannot be auto-discovered (namespace-scoped operator lacks node read permissions).
Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s.
See: https://github.com/ai-dynamo/dynamo/issues/6257`,
ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
tmpl := template.Must(template.New("nsGPUErr").Parse(
`GPU hardware info required but cannot be auto-discovered.` +
"\n\nOptions to resolve:" +
"\n\n1. Re-enable GPU discovery (if it was disabled during Helm install):" +
"\n helm upgrade ... --set dynamo-operator.gpuDiscovery.enabled=true" +
"\n\n2. Add hardware config to profilingConfig.config.{{.Hardware}}:" +
"\n {{.NumGPUs}}: 8" +
"\n {{.GPUModel}}: \"H100-SXM5-80GB\"" +
"\n {{.GPUVram}}: 81920" +
"\n\n3. Or specify {{.Engine}}.{{.MinGPUs}} and {{.Engine}}.{{.MaxGPUs}} for explicit GPU search ranges.",
))
var buf bytes.Buffer
_ = tmpl.Execute(&buf, map[string]string{
"Hardware": ConfigKeyHardware,
"NumGPUs": ConfigKeyNumGpusPerNode,
"GPUModel": ConfigKeyGPUModel,
"GPUVram": ConfigKeyGPUVramMib,
"Engine": ConfigKeyEngine,
"MinGPUs": ConfigKeyMinNumGpusPerEng,
"MaxGPUs": ConfigKeyMaxNumGpusPerEng,
})
return fmt.Errorf("%s", buf.String())
}
return fmt.Errorf(`GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s.
See profiling documentation for configuration details.`,
return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s",
ConfigKeyHardware, ConfigKeyNumGpusPerNode, ConfigKeyGPUModel, ConfigKeyGPUVramMib,
ConfigKeyEngine, ConfigKeyMinNumGpusPerEng, ConfigKeyEngine, ConfigKeyMaxNumGpusPerEng)
}
......
......@@ -85,6 +85,10 @@ type Config struct {
// When false, controllers perform validation (defense in depth)
WebhooksEnabled bool
// GPUDiscoveryEnabled indicates whether Helm provisioned node read access for the namespace-scoped operator.
// Only relevant for namespace-scoped operators (RestrictedNamespace != "").
GPUDiscoveryEnabled bool
// Checkpoint configuration for checkpoint/restore functionality
Checkpoint CheckpointConfig
}
......
......@@ -28,7 +28,7 @@ import (
// toFloat64 converts a numeric value (int or float64) to float64.
// Returns 0 if the value is neither int nor float64.
func toFloat64(val interface{}) float64 {
func toFloat64(val any) float64 {
switch v := val.(type) {
case float64:
return v
......@@ -44,14 +44,17 @@ func toFloat64(val interface{}) float64 {
type DynamoGraphDeploymentRequestValidator struct {
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
isClusterWideOperator bool
gpuDiscoveryEnabled bool
}
// NewDynamoGraphDeploymentRequestValidator creates a new validator for DynamoGraphDeploymentRequest.
// The isClusterWide parameter indicates whether the operator is running in cluster-wide or namespace-restricted mode.
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool) *DynamoGraphDeploymentRequestValidator {
// isClusterWide indicates whether the operator has cluster-wide permissions.
// gpuDiscoveryEnabled indicates whether Helm provisioned node read access for the operator.
func NewDynamoGraphDeploymentRequestValidator(request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest, isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestValidator {
return &DynamoGraphDeploymentRequestValidator{
request: request,
isClusterWideOperator: isClusterWide,
gpuDiscoveryEnabled: gpuDiscoveryEnabled,
}
}
......@@ -81,17 +84,17 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
// Parse config to validate structure (only if config is present)
if v.request.Spec.ProfilingConfig.Config != nil && len(v.request.Spec.ProfilingConfig.Config.Raw) > 0 {
var config map[string]interface{}
var config map[string]any
if parseErr := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); parseErr != nil {
err = errors.Join(err, fmt.Errorf("failed to parse spec.profilingConfig.config: %w", parseErr))
} else {
// Warn if deployment.model or engine.backend are specified in config (they will be overwritten by spec fields)
if engineConfig, ok := config["engine"].(map[string]interface{}); ok {
if engineConfig, ok := config["engine"].(map[string]any); ok {
if backend, ok := engineConfig["backend"].(string); ok && backend != "" && backend != v.request.Spec.Backend {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)", backend, v.request.Spec.Backend))
}
}
if deployment, ok := config["deployment"].(map[string]interface{}); ok {
if deployment, ok := config["deployment"].(map[string]any); ok {
if model, ok := deployment["model"].(string); ok && model != "" && model != v.request.Spec.Model {
warnings = append(warnings, fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)", model, v.request.Spec.Model))
}
......@@ -108,24 +111,24 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
}
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// This validation happens at admission time to fail fast before the DGDR is persisted to etcd.
// Returns an error when the operator is namespace-scoped, GPU discovery is disabled, and the config
// provides neither manual hardware config nor explicit GPU ranges.
func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error {
// Parse profiling config
var config map[string]interface{}
var config map[string]any
if v.request.Spec.ProfilingConfig.Config != nil {
if err := yaml.Unmarshal(v.request.Spec.ProfilingConfig.Config.Raw, &config); err != nil {
// Config parse errors will be caught by other validators
return nil
}
} else {
config = make(map[string]interface{})
config = make(map[string]any)
}
// Check if manual hardware config is provided
hardwareVal, hasHardware := config["hardware"]
var hasManualHardwareConfig bool
if hasHardware && hardwareVal != nil {
if hardwareConfig, ok := hardwareVal.(map[string]interface{}); ok {
if hardwareConfig, ok := hardwareVal.(map[string]any); ok {
// Check if essential hardware fields are provided
_, hasGPUModel := hardwareConfig["gpuModel"]
_, hasGPUVram := hardwareConfig["gpuVramMib"]
......@@ -137,7 +140,7 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
// Check if explicit GPU ranges are provided
var hasExplicitGPURanges bool
if engineVal, hasEngine := config["engine"]; hasEngine && engineVal != nil {
if engineConfig, ok := engineVal.(map[string]interface{}); ok {
if engineConfig, ok := engineVal.(map[string]any); ok {
minGPUs, hasMin := engineConfig["minNumGpusPerEngine"]
maxGPUs, hasMax := engineConfig["maxNumGpusPerEngine"]
// Validate explicit GPU ranges
......@@ -156,42 +159,17 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
}
}
// If manual config or explicit ranges provided, validation passes
if hasManualHardwareConfig || hasExplicitGPURanges {
return nil
}
// Neither manual config nor explicit ranges provided
// GPU discovery will be attempted at reconcile time, but if it's unavailable
// (e.g., namespace-scoped operator), the DGDR will fail
//
// Fail at admission time to give users immediate feedback
if v.isClusterWideOperator {
// Cluster-wide operator should have GPU discovery available
// Allow DGDR to be created - GPU discovery will provide hardware info
// No manual hardware config provided. Cluster-wide operators always have GPU discovery via node
// permissions. Namespace-scoped operators rely on Helm-provisioned GPU discovery (gpuDiscovery.enabled).
if v.isClusterWideOperator || v.gpuDiscoveryEnabled {
return nil
}
// Namespace-scoped operator likely doesn't have node read permissions
// Require manual hardware config or explicit GPU ranges
return errors.New(`GPU hardware configuration required for namespace-scoped operators.
Namespace-scoped operators typically lack node read permissions for GPU auto-discovery.
Provide hardware configuration in one of these ways:
1. Add hardware config in spec.profilingConfig.config:
hardware:
numGpusPerNode: 8
gpuModel: "H100-SXM5-80GB"
gpuVramMib: 81920
2. Or specify explicit GPU search ranges:
engine:
minNumGpusPerEngine: 2
maxNumGpusPerEngine: 8
See: https://github.com/ai-dynamo/dynamo/issues/6257`)
return errors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.profilingConfig.config)")
}
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
......
......@@ -41,13 +41,16 @@ const (
// It is a thin wrapper around DynamoGraphDeploymentRequestValidator.
type DynamoGraphDeploymentRequestHandler struct {
isClusterWideOperator bool
gpuDiscoveryEnabled bool
}
// NewDynamoGraphDeploymentRequestHandler creates a new handler for DynamoGraphDeploymentRequest Webhook.
// The isClusterWide parameter indicates whether the operator is running in cluster-wide or namespace-restricted mode.
func NewDynamoGraphDeploymentRequestHandler(isClusterWide bool) *DynamoGraphDeploymentRequestHandler {
// isClusterWide indicates whether the operator has cluster-wide permissions.
// gpuDiscoveryEnabled indicates whether a ClusterRole for node read access was provisioned by Helm.
func NewDynamoGraphDeploymentRequestHandler(isClusterWide bool, gpuDiscoveryEnabled bool) *DynamoGraphDeploymentRequestHandler {
return &DynamoGraphDeploymentRequestHandler{
isClusterWideOperator: isClusterWide,
gpuDiscoveryEnabled: gpuDiscoveryEnabled,
}
}
......@@ -63,7 +66,7 @@ func (h *DynamoGraphDeploymentRequestHandler) ValidateCreate(ctx context.Context
logger.Info("validate create", "name", request.Name, "namespace", request.Namespace)
// Create validator and perform validation
validator := NewDynamoGraphDeploymentRequestValidator(request, h.isClusterWideOperator)
validator := NewDynamoGraphDeploymentRequestValidator(request, h.isClusterWideOperator, h.gpuDiscoveryEnabled)
return validator.Validate()
}
......@@ -90,7 +93,7 @@ func (h *DynamoGraphDeploymentRequestHandler) ValidateUpdate(ctx context.Context
}
// Create validator and perform validation
validator := NewDynamoGraphDeploymentRequestValidator(newRequest, h.isClusterWideOperator)
validator := NewDynamoGraphDeploymentRequestValidator(newRequest, h.isClusterWideOperator, h.gpuDiscoveryEnabled)
// Validate stateless rules
warnings, err := validator.Validate()
......
......@@ -29,19 +29,20 @@ import (
func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
validConfig := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}}`
validConfigWithHardware := `{"engine": {"backend": "vllm"}, "deployment": {"model": "test-model"}, "hardware": {"numGpusPerNode": 8, "gpuModel": "H100-SXM5-80GB", "gpuVramMib": 81920}}`
minimalConfig := `{"sla": {"ttft": 200.0}}`
configWithDifferentBackend := `{"engine": {"backend": "sglang"}}`
configWithDifferentModel := `{"deployment": {"model": "different-model"}}`
invalidYAML := `{invalid yaml`
// errMsg: if non-empty, an error is expected and each newline-separated substring must appear in it.
// expectedWarning: if non-empty, at least one warning must be returned and the first warning must contain this substring.
tests := []struct {
name string
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
isClusterWide bool
wantErr bool
errMsg string
wantWarnings bool
expectedWarning string
errContains bool
name string
request *nvidiacomv1alpha1.DynamoGraphDeploymentRequest
isClusterWide bool
gpuDiscoveryEnabled bool
errMsg string
expectedWarning string
}{
{
name: "valid request",
......@@ -62,7 +63,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
},
isClusterWide: true,
wantErr: false,
},
{
name: "missing profiler image",
......@@ -83,7 +83,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
},
isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.profilerImage is required",
},
{
......@@ -103,7 +102,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
},
isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.config is required and must not be empty",
},
{
......@@ -125,11 +123,10 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
},
isClusterWide: true,
wantErr: true,
errMsg: "spec.profilingConfig.config is required and must not be empty",
},
{
name: "namespace-restricted operator (GPU discovery will fail gracefully)",
name: "namespace-scoped operator with manual hardware config (should pass)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
......@@ -146,8 +143,51 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
},
},
isClusterWide: false,
wantErr: false,
isClusterWide: false,
gpuDiscoveryEnabled: false,
},
{
name: "namespace-scoped operator with GPU discovery enabled (should pass without manual config)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(minimalConfig),
},
},
},
},
isClusterWide: false,
gpuDiscoveryEnabled: true,
},
{
name: "namespace-scoped operator with GPU discovery disabled and no hardware config (should error)",
request: &nvidiacomv1alpha1.DynamoGraphDeploymentRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "test-dgdr",
Namespace: "default",
},
Spec: nvidiacomv1alpha1.DynamoGraphDeploymentRequestSpec{
Model: "llama-3-8b",
Backend: "vllm",
ProfilingConfig: nvidiacomv1alpha1.ProfilingConfigSpec{
ProfilerImage: "profiler:latest",
Config: &apiextensionsv1.JSON{
Raw: []byte(minimalConfig),
},
},
},
},
isClusterWide: false,
gpuDiscoveryEnabled: false,
errMsg: "GPU hardware configuration required: GPU discovery is disabled",
},
{
name: "invalid config YAML",
......@@ -168,8 +208,7 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
},
isClusterWide: true,
wantErr: true,
errMsg: "failed to parse spec.profilingConfig.config: error converting YAML to JSON: yaml: line 1: did not find expected ',' or '}'",
errMsg: "failed to parse spec.profilingConfig.config",
},
{
name: "warning for different backend in config",
......@@ -190,8 +229,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
},
isClusterWide: true,
wantErr: false,
wantWarnings: true,
expectedWarning: "spec.profilingConfig.config.engine.backend (sglang) will be overwritten by spec.backend (vllm)",
},
{
......@@ -213,8 +250,6 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
},
isClusterWide: true,
wantErr: false,
wantWarnings: true,
expectedWarning: "spec.profilingConfig.config.deployment.model (different-model) will be overwritten by spec.model (llama-3-8b)",
},
{
......@@ -234,47 +269,34 @@ func TestDynamoGraphDeploymentRequestValidator_Validate(t *testing.T) {
},
},
isClusterWide: false,
wantErr: true,
errMsg: "spec.profilingConfig.profilerImage is required\nspec.profilingConfig.config is required and must not be empty",
errContains: true,
},
// TODO: Add test coverage for invalid explicit GPU ranges (min > max).
// The validation logic is in place in validateGPUHardwareInfo (dynamographdeploymentrequest.go)
// but is not exercised by any test case here.
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentRequestValidator(tt.request, tt.isClusterWide)
validator := NewDynamoGraphDeploymentRequestValidator(tt.request, tt.isClusterWide, tt.gpuDiscoveryEnabled)
warnings, err := validator.Validate()
if (err != nil) != tt.wantErr {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error = %v, wantErr %v", err, tt.wantErr)
wantErr := tt.errMsg != ""
if (err != nil) != wantErr {
t.Errorf("Validate() error = %v, wantErr %v", err, wantErr)
return
}
if tt.wantErr {
if tt.errContains {
// For multiple errors, check that all expected error messages are present
errStr := err.Error()
for _, expectedMsg := range strings.Split(tt.errMsg, "\n") {
if !strings.Contains(errStr, expectedMsg) {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error message = %v, want to contain %v", errStr, expectedMsg)
}
}
} else {
if err.Error() != tt.errMsg {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() error message = %v, want %v", err.Error(), tt.errMsg)
if wantErr {
for _, msg := range strings.Split(tt.errMsg, "\n") {
if !strings.Contains(err.Error(), msg) {
t.Errorf("Validate() error %q does not contain %q", err.Error(), msg)
}
}
}
if tt.wantWarnings && len(warnings) == 0 {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() expected warnings but got none")
wantWarning := tt.expectedWarning != ""
if wantWarning && len(warnings) == 0 {
t.Errorf("Validate() expected warning %q but got none", tt.expectedWarning)
}
if tt.wantWarnings && len(warnings) > 0 && warnings[0] != tt.expectedWarning {
t.Errorf("DynamoGraphDeploymentRequestValidator.Validate() warning = %v, want %v", warnings[0], tt.expectedWarning)
if wantWarning && len(warnings) > 0 && !strings.Contains(warnings[0], tt.expectedWarning) {
t.Errorf("Validate() warning %q does not contain %q", warnings[0], tt.expectedWarning)
}
})
}
......@@ -350,7 +372,7 @@ func TestDynamoGraphDeploymentRequestValidator_ValidateUpdate(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
validator := NewDynamoGraphDeploymentRequestValidator(tt.newRequest, true)
validator := NewDynamoGraphDeploymentRequestValidator(tt.newRequest, true, true)
warnings, err := validator.ValidateUpdate(tt.oldRequest)
if (err != nil) != tt.wantErr {
......
......@@ -227,15 +227,35 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#
### Automatic GPU Discovery
The operator automatically discovers GPU resources from your Kubernetes cluster nodes when available. GPU discovery provides:
The operator automatically discovers GPU resources from cluster nodes, providing hardware info (GPU model, VRAM, GPUs per node) and automatic profiling search space calculation.
- Hardware information (GPU model, VRAM, GPUs per node)
- Automatic calculation of profiling search space based on model size
- Hardware system identifier for AI Configurator integration
**Requirements:**
- **Cluster-scoped operators**: Have node read permissions by default
- **Namespace-scoped operators**: GPU discovery is enabled by default when installing via Helm — the chart provisions the required ClusterRole/ClusterRoleBinding automatically
**Permissions**: GPU discovery requires cluster-wide node read permissions. Cluster-scoped operators automatically have these permissions. Namespace-restricted operators can also use GPU discovery if granted node read permissions via RBAC.
**For namespace-scoped operators**, GPU discovery is controlled by a Helm value:
If GPU discovery is unavailable (no permissions or no GPU labels), the profiler will use manually specified hardware configuration or defaults.
```bash
# GPU discovery enabled (default) — Helm provisions read-only node access automatically
helm install dynamo-platform ... --set dynamo-operator.gpuDiscovery.enabled=true
# GPU discovery disabled — you must provide hardware config manually in each DGDR
helm install dynamo-platform ... --set dynamo-operator.gpuDiscovery.enabled=false
```
If GPU discovery is disabled, provide hardware config manually in the DGDR:
```yaml
spec:
profilingConfig:
config:
hardware:
numGpusPerNode: 8
gpuModel: "H100-SXM5-80GB"
gpuVramMib: 81920
```
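If GPU discovery is disabled and the DGDR provides neither manual hardware config nor explicit GPU search ranges (`engine.minNumGpusPerEngine` / `engine.maxNumGpusPerEngine`), the DGDR fails validation: it is rejected at admission time when webhooks are enabled, and otherwise when the controller validates the spec.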
## Configuration
......
......@@ -192,6 +192,30 @@ Found existing namespace-restricted Dynamo operators in namespaces: ...
--set "dynamo-operator.namespaceRestriction.targetNamespace=dynamo-namespace" # optional
```
### GPU Discovery for DynamoGraphDeploymentRequests with Namespace-Scoped Operators
GPU discovery is **enabled by default** for namespace-scoped operators. The Helm chart automatically provisions a ClusterRole/ClusterRoleBinding granting the operator read-only (`get`, `list`, `watch`) access to cluster nodes, which it uses for GPU hardware discovery.
**To disable GPU discovery** (if your installer lacks ClusterRole creation permissions):
```bash
helm install dynamo-platform ... --set dynamo-operator.gpuDiscovery.enabled=false
```
When GPU discovery is disabled, you must provide hardware configuration manually in each DynamoGraphDeploymentRequest:
```yaml
spec:
profilingConfig:
config:
hardware:
numGpusPerNode: 8
gpuModel: "H100-SXM5-80GB"
gpuVramMib: 81920
```
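> **Note**: If GPU discovery is disabled and the DGDR provides neither hardware config nor explicit GPU search ranges (`engine.minNumGpusPerEngine` / `engine.maxNumGpusPerEngine`), the DGDR fails validation with a clear error message — at admission time when webhooks are enabled, otherwise when the controller validates the spec.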
[Verify Installation](#verify-installation)
## Path B: Custom Build from Source
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment