# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
renewInterval: 10s
renewInterval: 10s
# -- GPU discovery configuration (only applies when namespaceRestriction.enabled=true)
gpuDiscovery:
# -- Whether to provision a ClusterRole for the namespace-scoped operator to read GPU node labels.
# When true (default), Helm creates a ClusterRole/ClusterRoleBinding granting node read access.
# Set to false if your installer lacks ClusterRole creation permissions; you must then provide
# hardware config manually in each DynamoGraphDeploymentRequest.
# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
# Interval for renewing the namespace scope marker lease (namespace-restricted mode only). The namespace-restricted operator renews its lease at this interval to signal it's still running.
renewInterval: 10s
renewInterval: 10s
# -- GPU discovery configuration (only applies when namespaceRestriction.enabled=true)
gpuDiscovery:
# -- Whether to provision a ClusterRole for the namespace-scoped operator to read GPU node labels.
# When true (default), Helm creates a ClusterRole/ClusterRoleBinding granting node read access.
# Set to false if your installer lacks ClusterRole creation permissions.
enabled: true
# -- The Dynamo discovery backend to use. Default is "kubernetes" for Kubernetes API service discovery. Set to "etcd" to use ETCD for discovery. --
# -- The Dynamo discovery backend to use. Default is "kubernetes" for Kubernetes API service discovery. Set to "etcd" to use ETCD for discovery. --
"\n\n3. Or specify {{.Engine}}.{{.MinGPUs}} and {{.Engine}}.{{.MaxGPUs}} for explicit GPU search ranges.",
))
varbufbytes.Buffer
_=tmpl.Execute(&buf,map[string]string{
"Hardware":ConfigKeyHardware,
"NumGPUs":ConfigKeyNumGpusPerNode,
"GPUModel":ConfigKeyGPUModel,
"GPUVram":ConfigKeyGPUVramMib,
"Engine":ConfigKeyEngine,
"MinGPUs":ConfigKeyMinNumGpusPerEng,
"MaxGPUs":ConfigKeyMaxNumGpusPerEng,
})
returnfmt.Errorf("%s",buf.String())
}
}
returnfmt.Errorf(`GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s.
returnfmt.Errorf("GPU hardware info required but auto-discovery failed. Add hardware config to profilingConfig.config.%s (%s, %s, %s) or specify %s.%s and %s.%s",
See profiling documentation for configuration details.`,
warnings=append(warnings,fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)",backend,v.request.Spec.Backend))
warnings=append(warnings,fmt.Sprintf("spec.profilingConfig.config.engine.backend (%s) will be overwritten by spec.backend (%s)",backend,v.request.Spec.Backend))
warnings=append(warnings,fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)",model,v.request.Spec.Model))
warnings=append(warnings,fmt.Sprintf("spec.profilingConfig.config.deployment.model (%s) will be overwritten by spec.model (%s)",model,v.request.Spec.Model))
}
}
...
@@ -108,24 +111,24 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
...
@@ -108,24 +111,24 @@ func (v *DynamoGraphDeploymentRequestValidator) Validate() (admission.Warnings,
}
}
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// validateGPUHardwareInfo ensures GPU hardware information will be available for profiling.
// This validation happens at admission time to fail fast before the DGDR is persisted to etcd.
// Returns an error at admission time if GPU discovery is disabled and no manual hardware config is provided.
@@ -156,42 +159,17 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
...
@@ -156,42 +159,17 @@ func (v *DynamoGraphDeploymentRequestValidator) validateGPUHardwareInfo() error
}
}
}
}
// If manual config or explicit ranges provided, validation passes
ifhasManualHardwareConfig||hasExplicitGPURanges{
ifhasManualHardwareConfig||hasExplicitGPURanges{
returnnil
returnnil
}
}
// Neither manual config nor explicit ranges provided
// No manual hardware config provided. Cluster-wide operators always have GPU discovery via node
// GPU discovery will be attempted at reconcile time, but if it's unavailable
// permissions. Namespace-scoped operators rely on Helm-provisioned GPU discovery (gpuDiscovery.enabled).
// (e.g., namespace-scoped operator), the DGDR will fail
ifv.isClusterWideOperator||v.gpuDiscoveryEnabled{
//
// Fail at admission time to give users immediate feedback
ifv.isClusterWideOperator{
// Cluster-wide operator should have GPU discovery available
// Allow DGDR to be created - GPU discovery will provide hardware info
returnnil
returnnil
}
}
// Namespace-scoped operator likely doesn't have node read permissions
returnerrors.New("GPU hardware configuration required: GPU discovery is disabled (set dynamo-operator.gpuDiscovery.enabled=true in Helm values, or provide hardware config in spec.profilingConfig.config)")
// Require manual hardware config or explicit GPU ranges
returnerrors.New(`GPU hardware configuration required for namespace-scoped operators.
Namespace-scoped operators typically lack node read permissions for GPU auto-discovery.
Provide hardware configuration in one of these ways:
1. Add hardware config in spec.profilingConfig.config:
@@ -227,15 +227,35 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#
...
@@ -227,15 +227,35 @@ See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#
### Automatic GPU Discovery
### Automatic GPU Discovery
The operator automatically discovers GPU resources from your Kubernetes cluster nodes when available. GPU discovery provides:
The operator automatically discovers GPU resources from cluster nodes, providing hardware info (GPU model, VRAM, GPUs per node) and automatic profiling search space calculation.
- Hardware information (GPU model, VRAM, GPUs per node)
**Requirements:**
- Automatic calculation of profiling search space based on model size
- **Cluster-scoped operators**: Have node read permissions by default
- Hardware system identifier for AI Configurator integration
- **Namespace-scoped operators**: GPU discovery is enabled by default when installing via Helm — the chart provisions the required ClusterRole/ClusterRoleBinding automatically
**Permissions**: GPU discovery requires cluster-wide node read permissions. Cluster-scoped operators automatically have these permissions. Namespace-restricted operators can also use GPU discovery if granted node read permissions via RBAC.
**For namespace-scoped operators**, GPU discovery is controlled by a Helm value:
If GPU discovery is unavailable (no permissions or no GPU labels), the profiler will use manually specified hardware configuration or defaults.
### GPU Discovery for DynamoGraphDeploymentRequests with Namespace-Scoped Operators
GPU discovery is **enabled by default** for namespace-scoped operators. The Helm chart automatically provisions a ClusterRole/ClusterRoleBinding granting the operator read-only access to node GPU labels.