# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
renewInterval:10s
# -- GPU discovery configuration (only applies when namespaceRestriction.enabled=true)
gpuDiscovery:
# -- Whether to provision a ClusterRole for the namespace-scoped operator to read GPU node labels.
# When true (default), Helm creates a ClusterRole/ClusterRoleBinding granting node read access.
# Set to false if your installer lacks ClusterRole creation permissions; you must then provide
# hardware config manually in each DynamoGraphDeploymentRequest.
# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
renewInterval:10s
# -- GPU discovery configuration (only applies when namespaceRestriction.enabled=true)
gpuDiscovery:
# -- Whether to provision a ClusterRole for the namespace-scoped operator to read GPU node labels.
# When true (default), Helm creates a ClusterRole/ClusterRoleBinding granting node read access.
# Set to false if your installer lacks ClusterRole creation permissions.
enabled:true
# -- The Dynamo discovery backend to use. Defaults to "kubernetes", which uses the Kubernetes API for service discovery. Set to "etcd" to use etcd for discovery instead.
"\n\n2. Add hardware config to profilingConfig.config.{{.Hardware}}:"+
"\n {{.NumGPUs}}: 8"+
"\n {{.GPUModel}}: \"H100-SXM5-80GB\""+
"\n {{.GPUVram}}: 81920"+
"\n\n3. Or specify {{.Engine}}.{{.MinGPUs}} and {{.Engine}}.{{.MaxGPUs}} for explicit GPU search ranges.",
))
varbufbytes.Buffer
_=tmpl.Execute(&buf,map[string]string{
"Hardware":ConfigKeyHardware,
"NumGPUs":ConfigKeyNumGpusPerNode,
"GPUModel":ConfigKeyGPUModel,
"GPUVram":ConfigKeyGPUVramMib,
"Engine":ConfigKeyEngine,
"MinGPUs":ConfigKeyMinNumGpusPerEng,
"MaxGPUs":ConfigKeyMaxNumGpusPerEng,
})
returnfmt.Errorf("%s",buf.String())
}
returnfmt.Errorf(`GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s.
See profiling documentation for configuration details.`,
returnfmt.Errorf("GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s",
warnings=append(warnings,fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)",backend,v.request.Spec.Backend))
warnings=append(warnings,fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)",model,v.request.Spec.Model))
}
...
...
@@ -108,24 +111,24 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
}
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// This validation happens at admission time to fail fast before the DGDR is persisted to etcd.
// Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
returnerrors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.profilingConfig.config)")
}
// ValidateUpdate performs stateful validation comparing old and new DynamoGraphDeploymentRequest.
@@ -227,15 +227,35 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#
### Automatic GPU Discovery
The operator automatically discovers GPU resources from your Kubernetes cluster nodes when available. GPU discovery provides:
The operator automatically discovers GPU resources from cluster nodes, providing hardware info (GPU model, VRAM, GPUs per node) and automatic profiling search space calculation.
- Hardware information (GPU model, VRAM, GPUs per node)
- Automatic calculation of profiling search space based on model size
- Hardware system identifier for AI Configurator integration
**Requirements:**
- **Cluster-scoped operators**: Have node read permissions by default
- **Namespace-scoped operators**: GPU discovery is enabled by default when installing via Helm — the chart provisions the required ClusterRole/ClusterRoleBinding automatically
**Permissions**: GPU discovery requires cluster-wide node read permissions. Cluster-scoped operators automatically have these permissions. Namespace-restricted operators can also use GPU discovery if granted node read permissions via RBAC.
**For namespace-scoped operators**, GPU discovery is controlled by a Helm value:
If GPU discovery is unavailable (no permissions or no GPU labels), the profiler will use manually specified hardware configuration or defaults.
### GPU Discovery for DynamoGraphDeploymentRequests with Namespace-Scoped Operators
GPU discovery is **enabled by default** for namespace-scoped operators. The Helm chart automatically provisions a ClusterRole/ClusterRoleBinding granting the operator read-only access to node GPU labels.