/* * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gpu import ( "context" "fmt" "strconv" "strings" nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1" corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) const ( // NVIDIA GPU Feature Discovery (GFD) label keys LabelGPUCount = "nvidia.com/gpu.count" LabelGPUProduct = "nvidia.com/gpu.product" LabelGPUMemory = "nvidia.com/gpu.memory" ) // GPUInfo contains discovered GPU configuration from cluster nodes type GPUInfo struct { GPUsPerNode int // Maximum GPUs per node found in the cluster NodesWithGPUs int // Number of nodes that have GPUs Model string // GPU product name (e.g., "H100-SXM5-80GB") VRAMPerGPU int // VRAM in MiB per GPU System nvidiacomv1beta1.GPUSKUType // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown } // DiscoverGPUs queries Kubernetes nodes to determine GPU configuration. // It extracts GPU information from NVIDIA GPU Feature Discovery (GFD) labels // and returns aggregated GPU info, preferring nodes with higher GPU count, // then higher VRAM if counts are equal. // // This function requires cluster-wide node read permissions and expects nodes // to have GFD labels. If no nodes with GPU labels are found, it returns an error. func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error) { logger := log.FromContext(ctx) logger.Info("Starting GPU discovery from cluster nodes") // List all nodes in the cluster nodeList := &corev1.NodeList{} if err := k8sClient.List(ctx, nodeList); err != nil { return nil, fmt.Errorf("failed to list cluster nodes: %w", err) } if len(nodeList.Items) == 0 { return nil, fmt.Errorf("no nodes found in cluster") } logger.Info("Found cluster nodes", "count", len(nodeList.Items)) // Track the best GPU configuration found var bestGPUInfo *GPUInfo nodesWithGPUs := 0 for i := range nodeList.Items { node := &nodeList.Items[i] gpuInfo, err := extractGPUInfoFromNode(node) if err != nil { // Node doesn't have GPU labels or has invalid labels, skip it logger.V(1).Info("Skipping node without valid GPU info", "node", node.Name, "reason", err.Error()) continue } nodesWithGPUs++ logger.Info("Found GPU node", "node", node.Name, "gpus", gpuInfo.GPUsPerNode, "model", gpuInfo.Model, "vram", gpuInfo.VRAMPerGPU) // Select best configuration: prefer higher GPU count, then higher VRAM if bestGPUInfo == nil || gpuInfo.GPUsPerNode > bestGPUInfo.GPUsPerNode || (gpuInfo.GPUsPerNode == bestGPUInfo.GPUsPerNode && gpuInfo.VRAMPerGPU > bestGPUInfo.VRAMPerGPU) { bestGPUInfo = gpuInfo } } if bestGPUInfo == nil { return nil, fmt.Errorf("no nodes with NVIDIA GPU Feature Discovery labels found (checked %d nodes). "+ "Ensure GPU nodes have labels: %s, %s, %s", len(nodeList.Items), LabelGPUCount, LabelGPUProduct, LabelGPUMemory) } // Infer hardware system from GPU model bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model) bestGPUInfo.NodesWithGPUs = nodesWithGPUs logger.Info("GPU discovery completed", "gpusPerNode", bestGPUInfo.GPUsPerNode, "nodesWithGPUs", bestGPUInfo.NodesWithGPUs, "totalGpus", bestGPUInfo.GPUsPerNode*bestGPUInfo.NodesWithGPUs, "model", bestGPUInfo.Model, "vram", bestGPUInfo.VRAMPerGPU, "system", bestGPUInfo.System) return bestGPUInfo, nil } // extractGPUInfoFromNode extracts GPU information from a single node's labels. // Returns error if required labels are missing or invalid. func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) { labels := node.Labels if labels == nil { return nil, fmt.Errorf("node has no labels") } gpuCountStr, ok := labels[LabelGPUCount] if !ok { return nil, fmt.Errorf("missing label %s", LabelGPUCount) } gpuCount, err := strconv.Atoi(gpuCountStr) if err != nil || gpuCount <= 0 { return nil, fmt.Errorf("invalid GPU count: %s", gpuCountStr) } gpuModel, ok := labels[LabelGPUProduct] if !ok || gpuModel == "" { return nil, fmt.Errorf("missing or empty label %s", LabelGPUProduct) } // Extract VRAM (memory in MiB) gpuMemoryStr, ok := labels[LabelGPUMemory] if !ok { return nil, fmt.Errorf("missing label %s", LabelGPUMemory) } gpuMemory, err := strconv.Atoi(gpuMemoryStr) if err != nil || gpuMemory <= 0 { return nil, fmt.Errorf("invalid GPU memory: %s", gpuMemoryStr) } return &GPUInfo{ GPUsPerNode: gpuCount, Model: gpuModel, VRAMPerGPU: gpuMemory, }, nil } // InferHardwareSystem maps GPU product name to hardware system identifier. // Returns empty string if the GPU model cannot be confidently mapped. // // This is a best-effort mapping based on common NVIDIA datacenter GPU naming patterns. // The system identifier is used by the profiler for performance estimation and configuration. // // Limitations: // - Cannot distinguish SXM vs. PCIe variants from labels alone (assumes SXM for datacenter GPUs) // - New GPU models require code updates (gracefully returns empty string) // - Non-standard SKU names may not match // // Users can manually override the system in their profiling config (hardware.system) // if auto-detection is incorrect or unavailable. func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType { if gpuProduct == "" { return "" } // Normalize: uppercase, remove spaces/dashes for pattern matching normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", "")) normalized = strings.ReplaceAll(normalized, " ", "") // Map common NVIDIA datacenter GPU products to AIC hardware system identifiers. patterns := []struct { pattern string system nvidiacomv1beta1.GPUSKUType }{ {"GB200", nvidiacomv1beta1.GPUSKUTypeGB200SXM}, {"H200", nvidiacomv1beta1.GPUSKUTypeH200SXM}, {"H100", nvidiacomv1beta1.GPUSKUTypeH100SXM}, {"B200", nvidiacomv1beta1.GPUSKUTypeB200SXM}, {"A100", nvidiacomv1beta1.GPUSKUTypeA100SXM}, {"L40S", nvidiacomv1beta1.GPUSKUTypeL40S}, } for _, p := range patterns { if strings.Contains(normalized, p.pattern) { return p.system } } // Unknown GPU type, return empty value. // User must specify gpuSku explicitly in spec.hardware. return "" }