Unverified Commit 63787939 authored by devivasudevan's avatar devivasudevan Committed by GitHub
Browse files

feat: Expanding the hardware discovery to include RDMA, interconnect. (#7551)

parent df53b7a2
......@@ -59,11 +59,20 @@ class SearchStrategy(str, Enum):
class GPUSKUType(str, Enum):
GB200SXM = "gb200_sxm"
B200SXM = "b200_sxm"
H200SXM = "h200_sxm"
H100SXM = "h100_sxm"
B200SXM = "b200_sxm"
H100PCIe = "h100_pcie"
A100SXM = "a100_sxm"
A100PCIe = "a100_pcie"
L40S = "l40s"
L40 = "l40"
L4 = "l4"
V100SXM = "v100_sxm"
V100PCIe = "v100_pcie"
T4 = "t4"
MI200 = "mi200"
MI300 = "mi300"
class BackendType(str, Enum):
......@@ -210,6 +219,14 @@ class HardwareSpec(BaseModel):
numGpusPerNode: Optional[int] = Field(
default=None, description="NumGPUsPerNode is the number of GPUs per node."
)
interconnect: Optional[str] = Field(
default=None,
description='Interconnect describes the GPU interconnect type within a node. Examples: "pcie", "nvlink", "infiniband".',
)
rdma: Optional[bool] = Field(
default=None,
description="RDMA indicates whether RDMA is available on the cluster.",
)
class DynamoGraphDeploymentRequestSpec(BaseModel):
......
......@@ -579,26 +579,52 @@ spec:
allOf:
- enum:
- gb200_sxm
- b200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- h100_pcie
- a100_sxm
- a100_pcie
- l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
- enum:
- gb200_sxm
- b200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- h100_pcie
- a100_sxm
- a100_pcie
- l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
description: |-
GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
type: string
interconnect:
description: |-
Interconnect describes the GPU interconnect type within a node.
Examples: "pcie", "nvlink", "infiniband".
type: string
numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node.
format: int32
type: integer
rdma:
description: RDMA indicates whether RDMA is available on the cluster.
type: boolean
totalGpus:
description: TotalGPUs is the total number of GPUs available in the cluster.
format: int32
......
......@@ -175,16 +175,31 @@ const (
)
// GPUSKUType is the AIC hardware system identifier for a supported GPU.
// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
// +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
type GPUSKUType string
const (
// --- Blackwell ---
GPUSKUTypeGB200SXM GPUSKUType = "gb200_sxm"
GPUSKUTypeB200SXM GPUSKUType = "b200_sxm"
// --- Hopper ---
GPUSKUTypeH200SXM GPUSKUType = "h200_sxm"
GPUSKUTypeH100SXM GPUSKUType = "h100_sxm"
GPUSKUTypeB200SXM GPUSKUType = "b200_sxm"
GPUSKUTypeH100PCIe GPUSKUType = "h100_pcie"
// --- Ampere ---
GPUSKUTypeA100SXM GPUSKUType = "a100_sxm"
GPUSKUTypeL40S GPUSKUType = "l40s"
GPUSKUTypeA100PCIe GPUSKUType = "a100_pcie"
// --- Ada ---
GPUSKUTypeL40S GPUSKUType = "l40s"
GPUSKUTypeL40 GPUSKUType = "l40"
GPUSKUTypeL4 GPUSKUType = "l4"
// --- Older NVIDIA ---
GPUSKUTypeV100SXM GPUSKUType = "v100_sxm"
GPUSKUTypeV100PCIe GPUSKUType = "v100_pcie"
GPUSKUTypeT4 GPUSKUType = "t4"
// --- AMD ---
GPUSKUTypeMI200 GPUSKUType = "mi200"
GPUSKUTypeMI300 GPUSKUType = "mi300"
)
// BackendType specifies the inference backend.
......@@ -324,7 +339,7 @@ type HardwareSpec struct {
// GPUSKU is the AIC hardware system identifier for the GPU.
// When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
// +optional
// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
// +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
GPUSKU GPUSKUType `json:"gpuSku,omitempty"`
// VRAMMB is the VRAM per GPU in MiB.
......@@ -338,6 +353,13 @@ type HardwareSpec struct {
// NumGPUsPerNode is the number of GPUs per node.
// +optional
NumGPUsPerNode *int32 `json:"numGpusPerNode,omitempty"`
// Interconnect describes the GPU interconnect type within a node.
// Examples: "pcie", "nvlink", "infiniband".
// +optional
Interconnect string `json:"interconnect,omitempty"`
// RDMA indicates whether RDMA is available on the cluster.
// +optional
RDMA *bool `json:"rdma,omitempty"`
}
// DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.
......
......@@ -252,6 +252,11 @@ func (in *HardwareSpec) DeepCopyInto(out *HardwareSpec) {
*out = new(int32)
**out = **in
}
if in.RDMA != nil {
in, out := &in.RDMA, &out.RDMA
*out = new(bool)
**out = **in
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HardwareSpec.
......
......@@ -579,26 +579,52 @@ spec:
allOf:
- enum:
- gb200_sxm
- b200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- h100_pcie
- a100_sxm
- a100_pcie
- l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
- enum:
- gb200_sxm
- b200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- h100_pcie
- a100_sxm
- a100_pcie
- l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
description: |-
GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
type: string
interconnect:
description: |-
Interconnect describes the GPU interconnect type within a node.
Examples: "pcie", "nvlink", "infiniband".
type: string
numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node.
format: int32
type: integer
rdma:
description: RDMA indicates whether RDMA is available on the cluster.
type: boolean
totalGpus:
description: TotalGPUs is the total number of GPUs available in the cluster.
format: int32
......
......@@ -14,7 +14,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gpu
import (
......@@ -28,40 +27,73 @@ import (
"sync"
"time"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"github.com/prometheus/common/model"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
)
const (
defaultDCGMEndpointTemplate = "http://{POD_IP}:9400/metrics"
// NVIDIA GPU Feature Discovery (GFD) label keys
LabelGPUCount = "nvidia.com/gpu.count"
LabelGPUProduct = "nvidia.com/gpu.product"
LabelGPUMemory = "nvidia.com/gpu.memory"
// DCGM exporter label constants
LabelApp = "app"
LabelAppKubernetesName = "app.kubernetes.io/name"
LabelValueNvidiaDCGMExporter = "nvidia-dcgm-exporter"
LabelValueDCGMExporter = "dcgm-exporter"
LabelValueGPUOperator = "gpu-operator"
GPUOperatorNamespace = "gpu-operator"
requestTimeout = 5 * time.Second
dialTimeout = 3 * time.Second
tlsHandshakeTimeout = 3 * time.Second
CloudProviderGCP = "gcp"
CloudProviderAWS = "aws"
CloudProviderAKS = "aks"
CloudProviderOther = "other"
CloudProviderUnknown = "unknown"
LabelApp = "app"
LabelAppKubernetesName = "app.kubernetes.io/name"
LabelValueNvidiaDCGMExporter = "nvidia-dcgm-exporter"
LabelValueNvidiaNetworkOperator = "nvidia-network-operator"
LabelValueDCGMExporter = "dcgm-exporter"
LabelValueGPUOperator = "gpu-operator"
GPUOperatorNamespace = "gpu-operator"
requestTimeout = 5 * time.Second
dialTimeout = 3 * time.Second
tlsHandshakeTimeout = 3 * time.Second
CloudProviderGCP = "gcp"
CloudProviderAWS = "aws"
CloudProviderAKS = "aks"
CloudProviderOther = "other"
CloudProviderUnknown = "unknown"
)
// --- Normalization helpers ---
const (
strDash = "-"
strSpace = " "
strNone = "none"
)
// --- Form factor tokens ---
const (
tokenSXM = "SXM"
tokenHGX = "HGX"
tokenDGX = "DGX"
tokenPCIE = "PCIE"
formFactorSXM = "sxm"
formFactorPCIe = "pcie"
)
// --- GPU model tokens ---
const (
tokenGB200 = "GB200"
tokenB200 = "B200"
tokenH200 = "H200"
tokenH100 = "H100"
tokenA100 = "A100"
tokenL40S = "L40S"
tokenL40 = "L40"
tokenL4 = "L4"
tokenV100 = "V100"
tokenT4 = "T4"
tokenMI300 = "MI300"
tokenMI250 = "MI250"
tokenMI200 = "MI200"
LabelNVLink = "nvlink"
)
// awsInstanceTypePrefixes matches known GPU/accelerator instance families on EKS. See: https://aws.amazon.com/ec2/instance-types/
......@@ -79,27 +111,64 @@ var gcpMachineSeries = []string{
"g2-", // L4 GPU machines
}
type gpuRule struct {
token string
sxmSKU nvidiacomv1beta1.GPUSKUType
pcieSKU nvidiacomv1beta1.GPUSKUType
singleSKU nvidiacomv1beta1.GPUSKUType // for GPUs without form factor variants
}
var gpuRules = []gpuRule{
// Blackwell
{token: tokenGB200, sxmSKU: nvidiacomv1beta1.GPUSKUTypeGB200SXM},
{token: tokenB200, sxmSKU: nvidiacomv1beta1.GPUSKUTypeB200SXM},
// Hopper
{token: tokenH200, sxmSKU: nvidiacomv1beta1.GPUSKUTypeH200SXM},
{token: tokenH100, sxmSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, pcieSKU: nvidiacomv1beta1.GPUSKUTypeH100PCIe},
// Ampere
{token: tokenA100, sxmSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM, pcieSKU: nvidiacomv1beta1.GPUSKUTypeA100PCIe},
// Ada
{token: tokenL40S, singleSKU: nvidiacomv1beta1.GPUSKUTypeL40S},
{token: tokenL40, singleSKU: nvidiacomv1beta1.GPUSKUTypeL40},
{token: tokenL4, singleSKU: nvidiacomv1beta1.GPUSKUTypeL4},
// Volta / Turing
{token: tokenV100, sxmSKU: nvidiacomv1beta1.GPUSKUTypeV100SXM, pcieSKU: nvidiacomv1beta1.GPUSKUTypeV100PCIe},
{token: tokenT4, singleSKU: nvidiacomv1beta1.GPUSKUTypeT4},
// AMD
{token: tokenMI300, singleSKU: nvidiacomv1beta1.GPUSKUTypeMI300},
{token: tokenMI250, singleSKU: nvidiacomv1beta1.GPUSKUTypeMI200},
{token: tokenMI200, singleSKU: nvidiacomv1beta1.GPUSKUTypeMI200},
}
// GPUInfo contains discovered GPU configuration from cluster nodes
type GPUInfo struct {
NodeName string // Name of the node with this GPU configuration
GPUsPerNode int // Maximum GPUs per node found in the cluster
NodesWithGPUs int // Number of nodes that have GPUs
Model string // GPU product name (e.g., "H100-SXM5-80GB")
VRAMPerGPU int // VRAM in MiB per GPU
System nvidiacomv1beta1.GPUSKUType // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
MIGEnabled bool // True if MIG is enabled (inferred from model or additional labels, not implemented in this version)
MIGProfiles map[string]int // Optional: map of MIG profile name to count (requires additional label parsing, not implemented in this version)
CloudProvider string // NEW: aws | gcp | aks | other | unknown
NodeName string // Name of the node with this GPU configuration
GPUsPerNode int // Maximum GPUs per node found in the cluster
NodesWithGPUs int // Number of nodes that have GPUs
Model string // GPU product name (e.g., "H100-SXM5-80GB")
VRAMPerGPU int // VRAM in MiB per GPU
System nvidiacomv1beta1.GPUSKUType // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
MIGEnabled bool // True if MIG is enabled (inferred from model or additional labels, not implemented in this version)
MIGProfiles map[string]int // Optional: map of MIG profile name to count (requires additional label parsing, not implemented in this version)
CloudProvider string // aws | gcp | aks | other | unknown
RDMAEnabled bool // Indicates whether RDMA is enabled for this node (e.g., via InfiniBand, RoCE, or similar high-speed networking)
RDMAType string // Type of RDMA transport detected (e.g., "infiniband", "roce", "rdma", "sriov", or "none")
Interconnect string // Primary GPU-to-GPU interconnect technology used within the node (e.g., "nvlink" for high-bandwidth links or "pcie" for standard bus-based communication)
InterconnectTier string // Qualitative or platform-specific classification of the interconnect (e.g., NVLink generation, topology tier, or vendor-defined performance level)
NVLinkLinks int // Number of NVLink connections per GPU (0 if NVLink is not present or interconnect is PCIe-only)
}
type ScrapeMetricsFunc func(ctx context.Context, endpoint string) (*GPUInfo, error)
type GPUDiscoveryCache struct {
mu sync.RWMutex
value *GPUInfo
expiresAt time.Time
}
type GPUDiscovery struct {
Scraper ScrapeMetricsFunc
}
......@@ -195,7 +264,6 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
if err != nil && !strings.Contains(err.Error(), "no DCGM exporter pods found") {
return nil, fmt.Errorf("listing DCGM exporter pods failed: %w", err)
}
// If no pods found
if len(dcgmPods) == 0 {
gpuPods, err := listGPUOperatorRunningPods(ctx, k8sClient)
......@@ -204,23 +272,28 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
}
return nil, err
}
// Scrape each running pod individually
var bestNode *GPUInfo
var scrapeErrors []error
var rdmaDetected bool
var rdmaType string
nodesWithGPUs := 0
for _, pod := range dcgmPods {
if pod.Status.Phase != corev1.PodRunning || pod.Status.PodIP == "" {
continue
}
endpoint := buildDCGMEndpoint(pod.Status.PodIP)
info, err := g.Scraper(ctx, endpoint)
if err != nil {
scrapeErrors = append(scrapeErrors, fmt.Errorf("pod %s (%s): %w", pod.Name, pod.Status.PodIP, err))
continue
}
// Detect RDMA on the node of this pod
rdma, rType := detectRDMAFromNode(ctx, k8sClient, pod.Spec.NodeName)
if rdma {
rdmaDetected = true
rdmaType = rType
}
// Increment NodesWithGPUs for every node that successfully reports GPU metrics
nodesWithGPUs++
// Select best node: highest GPU count, tie-breaker by VRAM
......@@ -228,18 +301,21 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
info.GPUsPerNode > bestNode.GPUsPerNode ||
(info.GPUsPerNode == bestNode.GPUsPerNode &&
info.VRAMPerGPU > bestNode.VRAMPerGPU) {
bestNode = info
}
}
if bestNode == nil {
if len(scrapeErrors) > 0 {
return nil, fmt.Errorf("failed to scrape any DCGM exporter pod: %v", scrapeErrors)
}
return nil, fmt.Errorf("no GPU metrics could be parsed from any DCGM pod")
}
// --- Detect RDMA and InfiniBand presence ---
ib := detectIBPods(ctx, k8sClient)
if ib {
rdmaType = "infiniband"
rdmaDetected = true
}
// Infer cloud provider for the best node
cloudProvider, err := GetCloudProviderInfo(ctx, k8sClient)
if err != nil {
......@@ -247,61 +323,51 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
}
bestNode.CloudProvider = cloudProvider
bestNode.NodesWithGPUs = nodesWithGPUs
bestNode.RDMAEnabled = rdmaDetected
bestNode.RDMAType = rdmaType
if cache != nil {
// Cache result for 60 seconds
cache.Set(bestNode, 60*time.Second)
}
return bestNode, nil
}
func buildDCGMEndpoint(podIP string) string {
template := os.Getenv("DCGM_METRICS_ENDPOINT_TEMPLATE")
if template == "" {
template = defaultDCGMEndpointTemplate
}
return strings.ReplaceAll(template, "{POD_IP}", podIP)
}
func listDCGMExporterPods(ctx context.Context, k8sClient client.Reader) ([]corev1.Pod, error) {
var result []corev1.Pod
seen := make(map[string]struct{})
selectors := []client.MatchingLabels{
{LabelApp: LabelValueNvidiaDCGMExporter},
{LabelApp: LabelValueDCGMExporter},
{LabelAppKubernetesName: LabelValueDCGMExporter},
}
var lastErr error
for _, selector := range selectors {
podList := &corev1.PodList{}
err := k8sClient.List(ctx, podList, selector)
if err != nil {
lastErr = fmt.Errorf("list pods: %w", err)
continue
}
for _, pod := range podList.Items {
key := pod.Namespace + "/" + pod.Name
if _, exists := seen[key]; !exists {
seen[key] = struct{}{}
result = append(result, pod)
}
}
}
if len(result) > 0 {
return result, nil
}
if lastErr != nil {
return nil, lastErr
}
return nil, fmt.Errorf("no DCGM exporter pods found")
}
......@@ -311,20 +377,15 @@ func listDCGMExporterPods(ctx context.Context, k8sClient client.Reader) ([]corev
// It uses common GPU Operator label selectors and deduplicates results
// across selectors. If no running pods are found, an error is returned.
func listGPUOperatorRunningPods(ctx context.Context, k8sClient client.Reader) ([]corev1.Pod, error) {
var result []corev1.Pod
seen := make(map[string]struct{})
selectors := []client.MatchingLabels{
{LabelApp: LabelValueGPUOperator},
{LabelAppKubernetesName: LabelValueGPUOperator},
}
var lastErr error
for _, selector := range selectors {
podList := &corev1.PodList{}
err := k8sClient.List(
ctx,
podList,
......@@ -335,29 +396,23 @@ func listGPUOperatorRunningPods(ctx context.Context, k8sClient client.Reader) ([
lastErr = fmt.Errorf("list gpu operator pods: %w", err)
continue
}
for _, pod := range podList.Items {
if pod.Status.Phase != corev1.PodRunning {
continue
}
key := pod.Namespace + "/" + pod.Name
if _, exists := seen[key]; !exists {
seen[key] = struct{}{}
result = append(result, pod)
}
}
}
if len(result) > 0 {
return result, nil
}
if lastErr != nil {
return nil, lastErr
}
return nil, fmt.Errorf(
"gpu operator is not installed %s",
GPUOperatorNamespace,
......@@ -385,7 +440,6 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
// Set a timeout for the request
ctx, cancel := context.WithTimeout(ctx, requestTimeout)
defer cancel()
// Create a custom HTTP client with transport-level timeouts
client := &http.Client{
Transport: &http.Transport{
......@@ -396,12 +450,10 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
TLSHandshakeTimeout: tlsHandshakeTimeout, // TLS handshake timeout
},
}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return nil, fmt.Errorf("create request for %s: %w", endpoint, err)
}
resp, err := client.Do(req)
if err != nil {
return nil, fmt.Errorf("HTTP GET %s failed: %w", endpoint, err)
......@@ -412,7 +464,6 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
log.FromContext(ctx).V(1).Info("failed to close response body", "err", cerr)
}
}()
if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf(
"metrics endpoint %s returned status %d",
......@@ -420,20 +471,18 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
resp.StatusCode,
)
}
parser := expfmt.NewTextParser(model.UTF8Validation)
metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
if err != nil {
return nil, fmt.Errorf("parse prometheus metrics: %w", err)
}
return parseMetrics(ctx, metricFamilies)
}
// parseMetrics extracts GPU information for a node from DCGM Prometheus metrics.
// parseMetrics extracts GPU information and interconnect type for a node from DCGM Prometheus metrics.
//
// It parses the provided Prometheus metric families exported by the NVIDIA
// DCGM exporter and derives high-level GPU inventory information for the node.
// DCGM exporter and derives high-level GPU inventory and interconnect information for the node.
//
// The function performs the following:
//
......@@ -446,6 +495,10 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
// VRAM = FB_FREE + FB_USED + FB_RESERVED
// (values are in MiB).
//
// - Determines the interconnect type (PCIe or NVLink) from the
// DCGM_FI_DEV_NVLINK_LINK_COUNT metric. If NVLink links are present,
// interconnect is set to "nvlink", otherwise defaults to "pcie".
//
// - Assumes MIG is disabled unless explicit MIG metrics are present
// (not included in the provided DCGM metric set).
//
......@@ -464,17 +517,18 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
// - MIGEnabled: false because no MIG metrics were collected in the DCGM families
// - MIGProfiles: empty map; would contain MIG profile counts if MIG metrics were available
// - System (inferred from model)
// - Interconnect: "pcie" or "nvlink" depending on detected NVLink links
//
// Returns an error if no GPUs can be detected from the metrics.
//
// Notes:
// - This function relies on DCGM exporter metrics.
// - If required metrics are missing, zero values may be returned.
// - Interconnect detection is based on NVLink link count; other interconnects are not currently detected.
// - The implementation assumes homogeneous GPUs per node.
// - For heterogeneous configurations, per-GPU parsing should be implemented.
func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*GPUInfo, error) {
logger := log.FromContext(ctx)
getLabel := func(m *dto.Metric, name string) string {
for _, l := range m.GetLabel() {
if l.GetName() == name {
......@@ -483,18 +537,16 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
}
return ""
}
// Track unique GPUs
gpuSet := map[string]struct{}{}
var model string
var vram int
var hostName string
var nvlinkDetected bool
var nvlinkLinks int
fbFree := map[string]float64{}
fbUsed := map[string]float64{}
fbReserved := map[string]float64{}
// --- Detect GPUs + Model + Hostname ---
if mf, ok := families["DCGM_FI_DEV_GPU_TEMP"]; ok {
for _, m := range mf.Metric {
......@@ -503,19 +555,16 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
continue
}
gpuSet[gpuID] = struct{}{}
// Extract model from label
if model == "" {
model = getLabel(m, "modelName")
}
// Extract Hostname label
if hostName == "" {
hostName = getLabel(m, "Hostname")
}
}
}
// --- Collect framebuffer metrics ---
if mf, ok := families["DCGM_FI_DEV_FB_FREE"]; ok {
for _, m := range mf.Metric {
......@@ -524,13 +573,11 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
continue
}
fbFree[gpuID] = m.GetGauge().GetValue()
if hostName == "" {
hostName = getLabel(m, "Hostname")
}
}
}
if mf, ok := families["DCGM_FI_DEV_FB_USED"]; ok {
for _, m := range mf.Metric {
gpuID := getLabel(m, "gpu")
......@@ -538,13 +585,11 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
continue
}
fbUsed[gpuID] = m.GetGauge().GetValue()
if hostName == "" {
hostName = getLabel(m, "Hostname")
}
}
}
if mf, ok := families["DCGM_FI_DEV_FB_RESERVED"]; ok {
for _, m := range mf.Metric {
gpuID := getLabel(m, "gpu")
......@@ -552,13 +597,37 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
continue
}
fbReserved[gpuID] = m.GetGauge().GetValue()
if hostName == "" {
hostName = getLabel(m, "Hostname")
}
}
}
if mf, ok := families["DCGM_FI_DEV_NVLINK_LINK_COUNT"]; ok {
for _, m := range mf.Metric {
val := int(m.GetGauge().GetValue())
if val > 0 {
nvlinkDetected = true
nvlinkLinks = val
break
}
}
}
// --- Determine interconnect type ---
interconnect := "pcie"
interconnectDetail := strNone
if nvlinkDetected {
switch {
case nvlinkLinks >= 12:
interconnect = LabelNVLink
interconnectDetail = "full-mesh" // HGX / DGX class
case nvlinkLinks >= 6:
interconnect = LabelNVLink
interconnectDetail = "high"
default:
interconnect = LabelNVLink
interconnectDetail = "partial"
}
}
// --- Calculate Max VRAM
for gpuID := range gpuSet {
total := int(fbFree[gpuID] + fbUsed[gpuID] + fbReserved[gpuID])
......@@ -566,32 +635,33 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
vram = total
}
}
gpuCount := len(gpuSet)
if gpuCount == 0 {
return nil, fmt.Errorf("no GPUs detected from DCGM metrics")
}
// --- Infer system from model ---
system := InferHardwareSystem(model)
logger.Info("Parsed GPU info",
"node", hostName,
"gpuCount", gpuCount,
"model", model,
"vramMiB", vram,
"system", system,
"interconnect", interconnect,
"interconnectDetail", interconnectDetail,
"nvlinkLinks", nvlinkLinks,
)
return &GPUInfo{
NodeName: hostName,
GPUsPerNode: gpuCount,
Model: model,
VRAMPerGPU: vram,
MIGEnabled: false,
MIGProfiles: map[string]int{},
System: system, // populated from InferHardwareSystem
NodeName: hostName,
GPUsPerNode: gpuCount,
Model: model,
VRAMPerGPU: vram,
MIGEnabled: false,
MIGProfiles: map[string]int{},
System: system, // populated from InferHardwareSystem
Interconnect: interconnect,
InterconnectTier: interconnectDetail,
NVLinkLinks: nvlinkLinks,
}, nil
}
......@@ -605,23 +675,18 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error) {
logger := log.FromContext(ctx)
logger.Info("Starting GPU discovery from cluster nodes")
// List all nodes in the cluster
nodeList := &corev1.NodeList{}
if err := k8sClient.List(ctx, nodeList); err != nil {
return nil, fmt.Errorf("failed to list cluster nodes: %w", err)
}
if len(nodeList.Items) == 0 {
return nil, fmt.Errorf("no nodes found in cluster")
}
logger.Info("Found cluster nodes", "count", len(nodeList.Items))
// Track the best GPU configuration found
var bestGPUInfo *GPUInfo
nodesWithGPUs := 0
for i := range nodeList.Items {
node := &nodeList.Items[i]
gpuInfo, err := extractGPUInfoFromNode(node)
......@@ -632,14 +697,12 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
"reason", err.Error())
continue
}
nodesWithGPUs++
logger.Info("Found GPU node",
"node", node.Name,
"gpus", gpuInfo.GPUsPerNode,
"model", gpuInfo.Model,
"vram", gpuInfo.VRAMPerGPU)
// Select best configuration: prefer higher GPU count, then higher VRAM
if bestGPUInfo == nil ||
gpuInfo.GPUsPerNode > bestGPUInfo.GPUsPerNode ||
......@@ -647,17 +710,14 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
bestGPUInfo = gpuInfo
}
}
if bestGPUInfo == nil {
return nil, fmt.Errorf("no nodes with NVIDIA GPU Feature Discovery labels found (checked %d nodes). "+
"Ensure GPU nodes have labels: %s, %s, %s",
len(nodeList.Items), LabelGPUCount, LabelGPUProduct, LabelGPUMemory)
}
// Infer hardware system from GPU model
bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model)
bestGPUInfo.NodesWithGPUs = nodesWithGPUs
logger.Info("GPU discovery completed",
"gpusPerNode", bestGPUInfo.GPUsPerNode,
"nodesWithGPUs", bestGPUInfo.NodesWithGPUs,
......@@ -665,7 +725,6 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
"model", bestGPUInfo.Model,
"vram", bestGPUInfo.VRAMPerGPU,
"system", bestGPUInfo.System)
return bestGPUInfo, nil
}
......@@ -676,7 +735,6 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
if labels == nil {
return nil, fmt.Errorf("node has no labels")
}
gpuCountStr, ok := labels[LabelGPUCount]
if !ok {
return nil, fmt.Errorf("missing label %s", LabelGPUCount)
......@@ -685,12 +743,10 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
if err != nil || gpuCount <= 0 {
return nil, fmt.Errorf("invalid GPU count: %s", gpuCountStr)
}
gpuModel, ok := labels[LabelGPUProduct]
if !ok || gpuModel == "" {
return nil, fmt.Errorf("missing or empty label %s", LabelGPUProduct)
}
// Extract VRAM (memory in MiB)
gpuMemoryStr, ok := labels[LabelGPUMemory]
if !ok {
......@@ -700,7 +756,6 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
if err != nil || gpuMemory <= 0 {
return nil, fmt.Errorf("invalid GPU memory: %s", gpuMemoryStr)
}
return &GPUInfo{
GPUsPerNode: gpuCount,
Model: gpuModel,
......@@ -708,12 +763,18 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
}, nil
}
// InferHardwareSystem maps GPU product name to hardware system identifier.
// Returns empty string if the GPU model cannot be confidently mapped.
// InferHardwareSystem attempts to infer a normalized GPU SKU type from a
// free-form product string (e.g. "NVIDIA H100 SXM", "A100-PCIE").
//
// This is a best-effort mapping based on common NVIDIA datacenter GPU naming patterns.
// The system identifier is used by the profiler for performance estimation and configuration.
// The function performs three main steps:
// 1. Normalize the input string to a consistent format.
// 2. Detect the GPU form factor (SXM vs PCIe).
// 3. Match the normalized string against known GPU tokens and return
// the corresponding SKU type.
//
// Matching is based on substring checks and is tolerant of variations
// in formatting (case, spaces, dashes). If no known GPU is detected,
// an empty SKU type is returned.
// Limitations:
// - Cannot distinguish SXM vs. PCIe variants from labels alone (assumes SXM for datacenter GPUs)
// - New GPU models require code updates (gracefully returns empty string)
......@@ -726,34 +787,80 @@ func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType {
return ""
}
// Normalize: uppercase, remove spaces/dashes for pattern matching
normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", ""))
normalized = strings.ReplaceAll(normalized, " ", "")
// Map common NVIDIA datacenter GPU products to AIC hardware system identifiers.
patterns := []struct {
pattern string
system nvidiacomv1beta1.GPUSKUType
}{
{"GB200", nvidiacomv1beta1.GPUSKUTypeGB200SXM},
{"H200", nvidiacomv1beta1.GPUSKUTypeH200SXM},
{"H100", nvidiacomv1beta1.GPUSKUTypeH100SXM},
{"B200", nvidiacomv1beta1.GPUSKUTypeB200SXM},
{"A100", nvidiacomv1beta1.GPUSKUTypeA100SXM},
{"L40S", nvidiacomv1beta1.GPUSKUTypeL40S},
}
normalized := normalize(gpuProduct)
formFactor := detectFormFactor(normalized)
for _, p := range patterns {
if strings.Contains(normalized, p.pattern) {
return p.system
for _, rule := range gpuRules {
if strings.Contains(normalized, rule.token) {
if rule.singleSKU != "" {
return rule.singleSKU
}
if formFactor == formFactorSXM && rule.sxmSKU != "" {
return rule.sxmSKU
}
if rule.pcieSKU != "" {
return rule.pcieSKU
}
}
}
// Unknown GPU type, return empty value.
// User must specify gpuSku explicitly in spec.hardware.
return ""
}
// normalize standardizes a GPU product string to simplify matching.
//
// It converts the string to uppercase and removes common separators
// such as spaces and dashes. This allows consistent substring matching
// regardless of how the input is formatted (e.g. "H100-SXM",
// "h100 sxm", and "H100SXM" all normalize to the same value).
func normalize(input string) string {
s := strings.ToUpper(strings.ReplaceAll(input, strDash, strSpace))
return strings.ReplaceAll(s, " ", "")
}
// detectFormFactor determines the GPU form factor (e.g. SXM or PCIe)
// from a normalized product string.
//
// The detection is based on the presence of known substrings such as
// "SXM", "HGX", or "DGX" for SXM-based systems, and "PCIE" for PCIe.
// If no explicit indicator is found, PCIe is used as the default since
// it is the more common and safer assumption.
func detectFormFactor(normalized string) string {
switch {
case strings.Contains(normalized, tokenSXM),
strings.Contains(normalized, tokenHGX),
strings.Contains(normalized, tokenDGX):
return formFactorSXM
case strings.Contains(normalized, tokenPCIE):
return formFactorPCIe
default:
return formFactorPCIe
}
}
// GetCloudProviderInfo attempts to infer the cloud provider of the Kubernetes cluster.
//
// The function inspects the first node in the cluster (assumes homogeneous node setup)
// and uses a combination of ProviderID and node labels to detect the provider.
//
// Detection logic:
// - Primary detection uses node.Spec.ProviderID:
// - "azure" → AKS
// - "aws" → AWS
// - "gce" → GCP
// - Secondary detection uses node labels and instance type prefixes:
// - AKS: "kubernetes.azure.com/cluster" label or instance type starting with "standard_"
// - AWS: "eks.amazonaws.com/nodegroup" label or known AWS instance type prefix
// - GCP: "cloud.google.com/gke-nodepool" label or known GCP machine series prefix
// - If none match, returns "other".
//
// Parameters:
// - ctx: Context for logging, cancellation, or timeout.
// - k8sClient: Kubernetes client for reading Node objects.
//
// Returns:
// - A string identifying the cloud provider ("aks", "aws", "gcp", "other", or "unknown").
// - An error if no nodes are found or listing fails.
func GetCloudProviderInfo(ctx context.Context, k8sClient client.Reader) (string, error) {
var nodeList corev1.NodeList
if err := k8sClient.List(ctx, &nodeList); err != nil {
......@@ -801,6 +908,13 @@ func GetCloudProviderInfo(ctx context.Context, k8sClient client.Reader) (string,
return "other", nil
}
// isGCPInstanceType checks whether a given instance type string matches known GCP machine series.
//
// Parameters:
// - instanceType: string representing the node's instance type (lowercased).
//
// Returns:
// - true if the instance type belongs to a GCP machine series prefix.
func isGCPInstanceType(instanceType string) bool {
for _, prefix := range gcpMachineSeries {
if strings.HasPrefix(instanceType, prefix) {
......@@ -810,6 +924,13 @@ func isGCPInstanceType(instanceType string) bool {
return false
}
// isAWSInstanceType checks whether a given instance type string matches known AWS instance type prefixes.
//
// Parameters:
// - instanceType: string representing the node's instance type (lowercased).
//
// Returns:
// - true if the instance type belongs to an AWS instance type prefix.
func isAWSInstanceType(instanceType string) bool {
for _, prefix := range awsInstanceTypePrefixes {
if strings.HasPrefix(instanceType, prefix) {
......@@ -818,3 +939,59 @@ func isAWSInstanceType(instanceType string) bool {
}
return false
}
// detectRDMAFromNode inspects a single node for RDMA or SR-IOV network capability.
//
// Detection logic:
// - Checks node labels:
// - "nvidia.com/rdma.present" = "true" → RDMA detected
// - "feature.node.kubernetes.io/network-sriov.capable" = "true" → SR-IOV detected
//
// Parameters:
// - ctx: Context for logging or cancellation.
// - k8sClient: Kubernetes client for reading Node objects.
// - nodeName: Name of the node to inspect.
//
// Returns:
// - bool indicating whether RDMA/SR-IOV is present.
// - string representing the type ("rdma", "sriov", or "none").
func detectRDMAFromNode(ctx context.Context, k8sClient client.Reader, nodeName string) (bool, string) {
node := &corev1.Node{}
if err := k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, node); err != nil {
return false, strNone
}
labels := node.Labels
if labels["nvidia.com/rdma.present"] == "true" {
return true, "rdma"
}
if labels["feature.node.kubernetes.io/network-sriov.capable"] == "true" {
return true, "sriov"
}
return false, strNone
}
// detectIBPods checks if there are any RDMA or InfiniBand-related pods deployed
// in the "nvidia-network-operator" namespace.
//
// Detection logic:
// - Lists pods in "nvidia-network-operator" namespace.
// - If any pod name contains "rdma", returns true.
//
// Parameters:
// - ctx: Context for logging or cancellation.
// - k8sClient: Kubernetes client for listing pods.
//
// Returns:
// - true if any RDMA/IB pods are found, false otherwise.
func detectIBPods(ctx context.Context, k8sClient client.Reader) bool {
podList := &corev1.PodList{}
if err := k8sClient.List(ctx, podList, client.InNamespace(LabelValueNvidiaNetworkOperator)); err != nil {
return false
}
for _, p := range podList.Items {
if strings.Contains(p.Name, "rdma") {
return true
}
}
return false
}
......@@ -26,6 +26,7 @@ import (
"strings"
"testing"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
......@@ -322,32 +323,159 @@ func TestExtractGPUInfoFromNode_MissingLabels(t *testing.T) {
func TestInferHardwareSystem(t *testing.T) {
tests := []struct {
gpuProduct string
expectedSystem string
description string
name string
input string
expected nvidiacomv1beta1.GPUSKUType
}{
{"H100-SXM5-80GB", "h100_sxm", "H100 SXM variant"},
{"H100-PCIE-80GB", "h100_sxm", "H100 PCIe variant (mapped to SXM)"},
{"H200-SXM5-141GB", "h200_sxm", "H200 SXM variant"},
{"A100-SXM4-40GB", "a100_sxm", "A100 SXM variant"},
{"A100-PCIE-80GB", "a100_sxm", "A100 PCIe variant (mapped to SXM)"},
{"L40S", "l40s", "L40S"},
{"NVIDIA L40S", "l40s", "L40S with prefix"},
{"B200-SXM", "b200_sxm", "B200 SXM"},
{"GB200", "gb200_sxm", "GB200"},
{"Tesla V100-SXM2-16GB", "", "V100 (not in mapping)"},
{"RTX 4090", "", "Consumer GPU (not in mapping)"},
{"Unknown-GPU", "", "Unknown GPU"},
{"", "", "Empty string"},
// GFD product names as seen in real cluster labels (regression for GPUSKU bug)
{"NVIDIA-B200", "b200_sxm", "B200 with NVIDIA prefix (GFD label format)"},
{"NVIDIA-H200-SXM5-141GB", "h200_sxm", "H200 with NVIDIA prefix (GFD label format)"},
// --- Empty / unknown ---
{
name: "empty input",
input: "",
expected: "",
},
{
name: "unknown gpu",
input: "random-gpu",
expected: "",
},
// --- Blackwell ---
{
name: "GB200 SXM",
input: "GB200-SXM",
expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM,
},
{
name: "GB200 HGX (implies SXM)",
input: "HGX GB200",
expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM,
},
{
name: "B200 SXM",
input: "B200 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeB200SXM,
},
// --- Hopper ---
{
name: "H100 SXM",
input: "H100 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeH100SXM,
},
{
name: "H100 PCIe explicit",
input: "H100 PCIe",
expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
},
{
name: "H100 default PCIe",
input: "H100",
expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
},
{
name: "H200 SXM",
input: "H200 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeH200SXM,
},
// --- Ampere ---
{
name: "A100 SXM",
input: "A100-SXM",
expected: nvidiacomv1beta1.GPUSKUTypeA100SXM,
},
{
name: "A100 PCIe",
input: "A100 PCIe",
expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe,
},
{
name: "A100 default PCIe",
input: "A100",
expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe,
},
// --- Ada ---
{
name: "L40S",
input: "L40S",
expected: nvidiacomv1beta1.GPUSKUTypeL40S,
},
{
name: "L40S should not match L40",
input: "L40S",
expected: nvidiacomv1beta1.GPUSKUTypeL40S,
},
{
name: "L40",
input: "L40",
expected: nvidiacomv1beta1.GPUSKUTypeL40,
},
{
name: "L4",
input: "L4",
expected: nvidiacomv1beta1.GPUSKUTypeL4,
},
// --- Volta / Turing ---
{
name: "V100 SXM",
input: "V100 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeV100SXM,
},
{
name: "V100 PCIe",
input: "V100 PCIe",
expected: nvidiacomv1beta1.GPUSKUTypeV100PCIe,
},
{
name: "T4",
input: "T4",
expected: nvidiacomv1beta1.GPUSKUTypeT4,
},
// --- AMD ---
{
name: "MI300",
input: "MI300",
expected: nvidiacomv1beta1.GPUSKUTypeMI300,
},
{
name: "MI250",
input: "MI250",
expected: nvidiacomv1beta1.GPUSKUTypeMI200,
},
{
name: "MI200",
input: "MI200",
expected: nvidiacomv1beta1.GPUSKUTypeMI200,
},
// --- Normalization tests ---
{
name: "lowercase + spaces",
input: "h100 sxm",
expected: nvidiacomv1beta1.GPUSKUTypeH100SXM,
},
{
name: "mixed case + dash",
input: "A100-sXm",
expected: nvidiacomv1beta1.GPUSKUTypeA100SXM,
},
{
name: "with extra spaces",
input: " H100 PCIe ",
expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
},
}
for _, tt := range tests {
t.Run(tt.description, func(t *testing.T) {
result := InferHardwareSystem(tt.gpuProduct)
assert.Equal(t, tt.expectedSystem, string(result), "Failed for GPU: %s", tt.gpuProduct)
t.Run(tt.name, func(t *testing.T) {
result := InferHardwareSystem(tt.input)
if result != tt.expected {
t.Errorf("InferHardwareSystem(%q) = %v, want %v",
tt.input, result, tt.expected)
}
})
}
}
......@@ -382,6 +510,119 @@ func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
}
}
func TestNormalize(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
{
name: "basic lowercase",
input: "h100",
expected: "H100",
},
{
name: "spaces removed",
input: "H100 SXM",
expected: "H100SXM",
},
{
name: "dashes replaced and removed",
input: "H100-SXM",
expected: "H100SXM",
},
{
name: "mixed spaces and dashes",
input: "A100 - SXM",
expected: "A100SXM",
},
{
name: "extra whitespace",
input: " H100 PCIe ",
expected: "H100PCIE",
},
{
name: "complex string",
input: "h100-sxm5-80gb",
expected: "H100SXM580GB",
},
{
name: "empty string",
input: "",
expected: "",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := normalize(tt.input)
if result != tt.expected {
t.Errorf("normalize(%q) = %q, want %q",
tt.input, result, tt.expected)
}
})
}
}
func TestDetectFormFactor(t *testing.T) {
tests := []struct {
name string
input string // already normalized
expected string
}{
{
name: "detect SXM explicitly",
input: "H100SXM",
expected: formFactorSXM,
},
{
name: "detect HGX implies SXM",
input: "HGXH100",
expected: formFactorSXM,
},
{
name: "detect DGX implies SXM",
input: "DGXH100",
expected: formFactorSXM,
},
{
name: "detect PCIe explicitly",
input: "H100PCIE",
expected: formFactorPCIe,
},
{
name: "default to PCIe when unknown",
input: "H100",
expected: formFactorPCIe,
},
{
name: "SXM wins over PCIe if both present",
input: "H100SXMPCIE",
expected: formFactorSXM,
},
{
name: "random string defaults to PCIe",
input: "RANDOMGPU",
expected: formFactorPCIe,
},
{
name: "empty string defaults to PCIe",
input: "",
expected: formFactorPCIe,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := detectFormFactor(tt.input)
if result != tt.expected {
t.Errorf("detectFormFactor(%q) = %v, want %v",
tt.input, result, tt.expected)
}
})
}
}
func TestParseMetrics(t *testing.T) {
ctx := context.Background()
......@@ -873,3 +1114,117 @@ func TestGetCloudProviderInfo(t *testing.T) {
})
}
}
func TestDetectRDMAFromNode(t *testing.T) {
scheme := runtime.NewScheme()
_ = corev1.AddToScheme(scheme)
tests := []struct {
name string
node *corev1.Node
nodeName string
expectedOK bool
expectedTyp string
}{
{
name: "node not found",
node: nil,
nodeName: "missing-node",
expectedOK: false,
expectedTyp: strNone,
},
{
name: "rdma detected",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-rdma",
Labels: map[string]string{
"nvidia.com/rdma.present": "true",
},
},
},
nodeName: "node-rdma",
expectedOK: true,
expectedTyp: "rdma",
},
{
name: "sriov detected",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-sriov",
Labels: map[string]string{
"feature.node.kubernetes.io/network-sriov.capable": "true",
},
},
},
nodeName: "node-sriov",
expectedOK: true,
expectedTyp: "sriov",
},
{
name: "both rdma and sriov - rdma takes precedence",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-both",
Labels: map[string]string{
"nvidia.com/rdma.present": "true",
"feature.node.kubernetes.io/network-sriov.capable": "true",
},
},
},
nodeName: "node-both",
expectedOK: true,
expectedTyp: "rdma",
},
{
name: "no relevant labels",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-none",
Labels: map[string]string{},
},
},
nodeName: "node-none",
expectedOK: false,
expectedTyp: strNone,
},
{
name: "labels present but false",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-false",
Labels: map[string]string{
"nvidia.com/rdma.present": "false",
"feature.node.kubernetes.io/network-sriov.capable": "false",
},
},
},
nodeName: "node-false",
expectedOK: false,
expectedTyp: strNone,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var objs []runtime.Object
if tt.node != nil {
objs = append(objs, tt.node)
}
fakeClient := fake.NewClientBuilder().
WithScheme(scheme).
WithRuntimeObjects(objs...).
Build()
ok, typ := detectRDMAFromNode(context.TODO(), fakeClient, tt.nodeName)
if ok != tt.expectedOK {
t.Errorf("expected ok=%v, got %v", tt.expectedOK, ok)
}
if typ != tt.expectedTyp {
t.Errorf("expected type=%s, got %s", tt.expectedTyp, typ)
}
})
}
}
......@@ -1542,19 +1542,28 @@ _Underlying type:_ _string_
GPUSKUType is the AIC hardware system identifier for a supported GPU.
_Validation:_
- Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s]
- Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300]
_Appears in:_
- [HardwareSpec](#hardwarespec)
| Field | Description |
| --- | --- |
| `gb200_sxm` | |
| `h200_sxm` | |
| `h100_sxm` | |
| `gb200_sxm` | --- Blackwell ---<br /> |
| `b200_sxm` | |
| `a100_sxm` | |
| `l40s` | |
| `h200_sxm` | --- Hopper ---<br /> |
| `h100_sxm` | |
| `h100_pcie` | |
| `a100_sxm` | --- Ampere ---<br /> |
| `a100_pcie` | |
| `l40s` | --- Ada ---<br /> |
| `l40` | |
| `l4` | |
| `v100_sxm` | --- Older NVIDIA ---<br /> |
| `v100_pcie` | |
| `t4` | |
| `mi200` | --- AMD ---<br /> |
| `mi300` | |
#### HardwareSpec
......@@ -1571,10 +1580,12 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. | | Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s] <br />Optional: \{\} <br /> |
| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. | | Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300] <br />Optional: \{\} <br /> |
| `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. | | Optional: \{\} <br /> |
| `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. | | Optional: \{\} <br /> |
| `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. | | Optional: \{\} <br /> |
| `interconnect` _string_ | Interconnect describes the GPU interconnect type within a node.<br />Examples: "pcie", "nvlink", "infiniband". | | Optional: \{\} <br /> |
| `rdma` _boolean_ | RDMA indicates whether RDMA is available on the cluster. | | Optional: \{\} <br /> |
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment