Unverified commit 2b077ec2, authored by Julien Mancuso and committed by GitHub
Browse files

fix: fix gpu discovery preflight job (#6628)


Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
parent 1e44ab95
...@@ -13,110 +13,59 @@ ...@@ -13,110 +13,59 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# Pre-install check: verifies the installer has permission to create the GPU discovery ClusterRole. # Best-effort template-time check for cluster-scoped RBAC access required by GPU discovery.
# Runs only when namespaceRestriction.enabled=true and gpuDiscovery.enabled=true (the default). # Runs only when namespaceRestriction.enabled=true and gpuDiscovery.enabled=true (the default).
# If the check fails, installation is aborted with a clear error message explaining the options. #
# Limitation: Helm's lookup function can only test list/get access, not create. This check
# verifies the installer can read ClusterRole resources as a proxy for full cluster RBAC
# access. In the rare case where an installer has create but not list permissions, this
# check will false-fail and abort the install before any resource is created. Conversely,
# if the check passes but create is actually denied, Helm's native error on the ClusterRole
# creation (in gpu-discovery-rbac.yaml) will still be clear about the missing permission.
{{- if and .Values.namespaceRestriction.enabled .Values.gpuDiscovery.enabled }} {{- if and .Values.namespaceRestriction.enabled .Values.gpuDiscovery.enabled }}
--- {{- /*
apiVersion: v1 GPU discovery (gpu-discovery-rbac.yaml) creates a ClusterRole + ClusterRoleBinding to grant
kind: ServiceAccount the namespace-scoped operator read-only node access.
metadata:
name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight This is a best-effort check: lookup runs as the installing user and tests list/get access
namespace: {{ .Release.Namespace }} on ClusterRoles — not the create permission that gpu-discovery-rbac.yaml actually needs.
labels: There is no mechanism in Helm templates to test create permissions directly. In practice,
{{- include "dynamo-operator.labels" . | nindent 4 }} installers with create access almost always have list access too, so this catches the
annotations: common case. If the check is wrong, the actual resource creation will fail with a clear
"helm.sh/hook": pre-install,pre-upgrade Helm RBAC error.
"helm.sh/hook-weight": "-10"
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded lookup returns an empty dict during helm template / --dry-run, so we use a two-probe
--- heuristic to distinguish dry-run mode from insufficient permissions:
apiVersion: rbac.authorization.k8s.io/v1 1. Probe ClusterRoles — if non-empty, the installer has cluster RBAC read access.
kind: ClusterRole 2. If empty, probe the release Namespace. All authenticated users can read their own
metadata: namespace during a real install. If this succeeds but ClusterRoles did not, the
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-gpu-discovery-preflight installer likely lacks cluster-scoped access.
labels: 3. If both are empty we are in dry-run / template mode — skip the check.
{{- include "dynamo-operator.labels" . | nindent 4 }} */ -}}
annotations: {{- $clusterProbe := lookup "rbac.authorization.k8s.io/v1" "ClusterRole" "" "" -}}
"helm.sh/hook": pre-install,pre-upgrade {{- if not $clusterProbe -}}
"helm.sh/hook-weight": "-10" {{- $nsProbe := lookup "v1" "Namespace" "" .Release.Namespace -}}
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded {{- if $nsProbe -}}
rules: {{- fail (join "\n" (list ""
- apiGroups: ["authorization.k8s.io"] "ERROR: GPU discovery requires cluster-scoped RBAC permissions, but the installer"
resources: ["selfsubjectaccessreviews"] "does not have them. This is needed to grant the namespace-scoped operator"
verbs: ["create"] "read-only node access for automatic GPU hardware discovery."
--- ""
apiVersion: rbac.authorization.k8s.io/v1 "Options:"
kind: ClusterRoleBinding " 1. Ask your cluster admin to grant ClusterRole creation permissions and re-run."
metadata: ""
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-gpu-discovery-preflight " 2. Disable GPU discovery and provide hardware config manually in each DGDR:"
labels: " helm install ... --set dynamo-operator.gpuDiscovery.enabled=false"
{{- include "dynamo-operator.labels" . | nindent 4 }} ""
annotations: " Then in your DynamoGraphDeploymentRequest:"
"helm.sh/hook": pre-install,pre-upgrade " spec:"
"helm.sh/hook-weight": "-10" " profilingConfig:"
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded " config:"
roleRef: " hardware:"
apiGroup: rbac.authorization.k8s.io " numGpusPerNode: 8"
kind: ClusterRole " gpuModel: \"H100-SXM5-80GB\""
name: {{ include "dynamo-operator.fullname" . }}-{{ .Release.Namespace }}-gpu-discovery-preflight " gpuVramMib: 81920"
subjects: ""
- kind: ServiceAccount )) -}}
name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight {{- end -}}
namespace: {{ .Release.Namespace }} {{- end -}}
---
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight
namespace: {{ .Release.Namespace }}
labels:
{{- include "dynamo-operator.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "-5"
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
backoffLimit: 0
activeDeadlineSeconds: 60
template:
spec:
restartPolicy: Never
serviceAccountName: {{ include "dynamo-operator.fullname" . }}-gpu-discovery-preflight
{{- if .Values.controllerManager.tolerations }}
tolerations:
{{- toYaml .Values.controllerManager.tolerations | nindent 8 }}
{{- end }}
containers:
- name: check
image: alpine/k8s:1.34.1
command:
- /bin/sh
- -c
- |
if kubectl auth can-i create clusterroles --all-namespaces > /dev/null 2>&1; then
echo "GPU discovery preflight check passed: installer has ClusterRole creation permissions."
exit 0
else
echo ""
echo "ERROR: GPU discovery requires ClusterRole creation permissions, but the installer"
echo "does not have them. This is needed to grant the namespace-scoped operator"
echo "read-only node access for automatic GPU hardware discovery."
echo ""
echo "Options:"
echo " 1. Ask your cluster admin to grant ClusterRole creation permissions and re-run."
echo ""
echo " 2. Disable GPU discovery and provide hardware config manually in each DGDR:"
echo " helm install ... --set dynamo-operator.gpuDiscovery.enabled=false"
echo ""
echo " Then in your DynamoGraphDeploymentRequest:"
echo " spec:"
echo " profilingConfig:"
echo " config:"
echo " hardware:"
echo " numGpusPerNode: 8"
echo " gpuModel: \"H100-SXM5-80GB\""
echo " gpuVramMib: 81920"
echo ""
exit 1
fi
{{- end }} {{- end }}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment