"components/backends/vscode:/vscode.git/clone" did not exist on "41f095cf9535121399e695bfc4e1d5ba40a9c252"
Unverified commit 9572355f, authored by Dr. Stefan Schimanski, committed by GitHub
Browse files

fix(operator): enable DCGM discovery in namespace-scoped mode (#8365)


Signed-off-by: Dr. Stefan Schimanski <sschimanski@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent c1a843b3
......@@ -27,6 +27,9 @@ rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
......
......@@ -1052,26 +1052,14 @@ func (r *DynamoGraphDeploymentRequestReconciler) validateGPUHardwareInfo(ctx con
return nil
}
isNamespaceScoped := r.Config.Namespace.Restricted != ""
if isNamespaceScoped {
return fmt.Errorf(
"GPU hardware info required but cannot be auto-discovered." +
"\n\nOptions to resolve:" +
"\n\n1. Re-enable GPU discovery (if it was disabled during Helm install):" +
"\n helm upgrade ... --set dynamo-operator.gpuDiscovery.enabled=true" +
"\n\n2. Add hardware config to spec.hardware:" +
"\n gpuSku: \"h100_sxm\"" +
"\n vramMb: 81920" +
"\n numGpusPerNode: 8" +
"\n totalGpus: 8")
}
// Try DCGM discovery. In namespace-scoped mode this requires a ClusterRole
// granting pod list/get (provisioned by the Helm chart when
// gpuDiscovery.enabled=true).
_, err := r.GPUDiscovery.DiscoverGPUsFromDCGM(ctx, r.APIReader, r.GPUDiscoveryCache)
if err == nil {
// GPU discovery is available, validation passes
return nil
}
// Refine the logger message
reason := GetGPUDiscoveryFailureReason(err)
logger.Info("GPU discovery not available", "reason", reason, "error", err.Error())
return fmt.Errorf("GPU hardware info required but auto-discovery failed. Add spec.hardware.gpuSku, spec.hardware.vramMb, spec.hardware.numGpusPerNode, spec.hardware.totalGpus")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment