feat: Expanding the hardware discovery to include RDMA, interconnect. (#7551)

63787939 · devivasudevan · GitHub · df53b7a2 · 63787939 · 63787939
Unverified Commit 63787939 authored Apr 16, 2026 by devivasudevan Committed by GitHub Apr 16, 2026
8 changed files
--- a/components/src/dynamo/profiler/utils/dgdr_v1beta1_types.py
+++ b/components/src/dynamo/profiler/utils/dgdr_v1beta1_types.py
@@ -59,11 +59,20 @@ class SearchStrategy(str, Enum):

 class GPUSKUType(str, Enum):
    GB200SXM = "gb200_sxm"
+    B200SXM = "b200_sxm"
    H200SXM = "h200_sxm"
    H100SXM = "h100_sxm"
-    B200SXM = "b200_sxm"
+    H100PCIe = "h100_pcie"
    A100SXM = "a100_sxm"
+    A100PCIe = "a100_pcie"
    L40S = "l40s"
+    L40 = "l40"
+    L4 = "l4"
+    V100SXM = "v100_sxm"
+    V100PCIe = "v100_pcie"
+    T4 = "t4"
+    MI200 = "mi200"
+    MI300 = "mi300"


 class BackendType(str, Enum):
@@ -210,6 +219,14 @@ class HardwareSpec(BaseModel):
    numGpusPerNode: Optional[int] = Field(
        default=None, description="NumGPUsPerNode is the number of GPUs per node."
    )
+    interconnect: Optional[str] = Field(
+        default=None,
+        description='Interconnect describes the GPU interconnect type within a node. Examples: "pcie", "nvlink", "infiniband".',
+    )
+    rdma: Optional[bool] = Field(
+        default=None,
+        description="RDMA indicates whether RDMA is available on the cluster.",
+    )


 class DynamoGraphDeploymentRequestSpec(BaseModel):

--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
@@ -579,26 +579,52 @@ spec:
                      allOf:
                        - enum:
                            - gb200_sxm
+                            - b200_sxm
                            - h200_sxm
                            - h100_sxm
-                            - b200_sxm
+                            - h100_pcie
                            - a100_sxm
+                            - a100_pcie
                            - l40s
+                            - l40
+                            - l4
+                            - v100_sxm
+                            - v100_pcie
+                            - t4
+                            - mi200
+                            - mi300
                        - enum:
                            - gb200_sxm
+                            - b200_sxm
                            - h200_sxm
                            - h100_sxm
-                            - b200_sxm
+                            - h100_pcie
                            - a100_sxm
+                            - a100_pcie
                            - l40s
+                            - l40
+                            - l4
+                            - v100_sxm
+                            - v100_pcie
+                            - t4
+                            - mi200
+                            - mi300
                      description: |-
                        GPUSKU is the AIC hardware system identifier for the GPU.
                        When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
                      type: string
+                    interconnect:
+                      description: |-
+                        Interconnect describes the GPU interconnect type within a node.
+                        Examples: "pcie", "nvlink", "infiniband".
+                      type: string
                    numGpusPerNode:
                      description: NumGPUsPerNode is the number of GPUs per node.
                      format: int32
                      type: integer
+                    rdma:
+                      description: RDMA indicates whether RDMA is available on the cluster.
+                      type: boolean
                    totalGpus:
                      description: TotalGPUs is the total number of GPUs available in the cluster.
                      format: int32

--- a/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
+++ b/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
@@ -175,16 +175,31 @@ const (
 )

 // GPUSKUType is the AIC hardware system identifier for a supported GPU.
-// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
+// +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
 type GPUSKUType string

 const (
+	// --- Blackwell ---
 	GPUSKUTypeGB200SXM GPUSKUType = "gb200_sxm"
+	GPUSKUTypeB200SXM  GPUSKUType = "b200_sxm"
+	// --- Hopper ---
 	GPUSKUTypeH200SXM  GPUSKUType = "h200_sxm"
 	GPUSKUTypeH100SXM  GPUSKUType = "h100_sxm"
-	GPUSKUTypeB200SXM  GPUSKUType = "b200_sxm"
+	GPUSKUTypeH100PCIe GPUSKUType = "h100_pcie"
+	// --- Ampere ---
 	GPUSKUTypeA100SXM  GPUSKUType = "a100_sxm"
+	GPUSKUTypeA100PCIe GPUSKUType = "a100_pcie"
+	// --- Ada ---
 	GPUSKUTypeL40S GPUSKUType = "l40s"
+	GPUSKUTypeL40  GPUSKUType = "l40"
+	GPUSKUTypeL4   GPUSKUType = "l4"
+	// --- Older NVIDIA ---
+	GPUSKUTypeV100SXM  GPUSKUType = "v100_sxm"
+	GPUSKUTypeV100PCIe GPUSKUType = "v100_pcie"
+	GPUSKUTypeT4       GPUSKUType = "t4"
+	// --- AMD ---
+	GPUSKUTypeMI200 GPUSKUType = "mi200"
+	GPUSKUTypeMI300 GPUSKUType = "mi300"
 )

 // BackendType specifies the inference backend.
@@ -324,7 +339,7 @@ type HardwareSpec struct {
 	// GPUSKU is the AIC hardware system identifier for the GPU.
 	// When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
 	// +optional
-	// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
+	// +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
 	GPUSKU GPUSKUType `json:"gpuSku,omitempty"`

 	// VRAMMB is the VRAM per GPU in MiB.
@@ -338,6 +353,13 @@ type HardwareSpec struct {
 	// NumGPUsPerNode is the number of GPUs per node.
 	// +optional
 	NumGPUsPerNode *int32 `json:"numGpusPerNode,omitempty"`
+	// Interconnect describes the GPU interconnect type within a node.
+	// Examples: "pcie", "nvlink", "infiniband".
+	// +optional
+	Interconnect string `json:"interconnect,omitempty"`
+	// RDMA indicates whether RDMA is available on the cluster.
+	// +optional
+	RDMA *bool `json:"rdma,omitempty"`
 }

 // DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.

--- a/deploy/operator/api/v1beta1/zz_generated.deepcopy.go
+++ b/deploy/operator/api/v1beta1/zz_generated.deepcopy.go
@@ -252,6 +252,11 @@ func (in *HardwareSpec) DeepCopyInto(out *HardwareSpec) {
 		*out = new(int32)
 		**out = **in
 	}
+	if in.RDMA != nil {
+		in, out := &in.RDMA, &out.RDMA
+		*out = new(bool)
+		**out = **in
+	}
 }

 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HardwareSpec.

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
@@ -579,26 +579,52 @@ spec:
                      allOf:
                        - enum:
                            - gb200_sxm
+                            - b200_sxm
                            - h200_sxm
                            - h100_sxm
-                            - b200_sxm
+                            - h100_pcie
                            - a100_sxm
+                            - a100_pcie
                            - l40s
+                            - l40
+                            - l4
+                            - v100_sxm
+                            - v100_pcie
+                            - t4
+                            - mi200
+                            - mi300
                        - enum:
                            - gb200_sxm
+                            - b200_sxm
                            - h200_sxm
                            - h100_sxm
-                            - b200_sxm
+                            - h100_pcie
                            - a100_sxm
+                            - a100_pcie
                            - l40s
+                            - l40
+                            - l4
+                            - v100_sxm
+                            - v100_pcie
+                            - t4
+                            - mi200
+                            - mi300
                      description: |-
                        GPUSKU is the AIC hardware system identifier for the GPU.
                        When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
                      type: string
+                    interconnect:
+                      description: |-
+                        Interconnect describes the GPU interconnect type within a node.
+                        Examples: "pcie", "nvlink", "infiniband".
+                      type: string
                    numGpusPerNode:
                      description: NumGPUsPerNode is the number of GPUs per node.
                      format: int32
                      type: integer
+                    rdma:
+                      description: RDMA indicates whether RDMA is available on the cluster.
+                      type: boolean
                    totalGpus:
                      description: TotalGPUs is the total number of GPUs available in the cluster.
                      format: int32

--- a/deploy/operator/internal/gpu/discovery.go
+++ b/deploy/operator/internal/gpu/discovery.go
--- a/deploy/operator/internal/gpu/discovery_test.go
+++ b/deploy/operator/internal/gpu/discovery_test.go
@@ -26,6 +26,7 @@ import (
 	"strings"
 	"testing"

+	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
 	dto "github.com/prometheus/client_model/go"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -322,32 +323,159 @@ func TestExtractGPUInfoFromNode_MissingLabels(t *testing.T) {

 func TestInferHardwareSystem(t *testing.T) {
+	// Table of raw GPU product strings and the SKU each one should resolve to.
+	// An empty expected value means the product must not be recognized, and a
+	// bare model name without a form factor is expected to yield the PCIe SKU.
 	tests := []struct {
-		gpuProduct     string
-		expectedSystem string
-		description    string
+		name     string
+		input    string
+		expected nvidiacomv1beta1.GPUSKUType
 	}{
-		{"H100-SXM5-80GB", "h100_sxm", "H100 SXM variant"},
-		{"H100-PCIE-80GB", "h100_sxm", "H100 PCIe variant (mapped to SXM)"},
-		{"H200-SXM5-141GB", "h200_sxm", "H200 SXM variant"},
-		{"A100-SXM4-40GB", "a100_sxm", "A100 SXM variant"},
-		{"A100-PCIE-80GB", "a100_sxm", "A100 PCIe variant (mapped to SXM)"},
-		{"L40S", "l40s", "L40S"},
-		{"NVIDIA L40S", "l40s", "L40S with prefix"},
-		{"B200-SXM", "b200_sxm", "B200 SXM"},
-		{"GB200", "gb200_sxm", "GB200"},
-		{"Tesla V100-SXM2-16GB", "", "V100 (not in mapping)"},
-		{"RTX 4090", "", "Consumer GPU (not in mapping)"},
-		{"Unknown-GPU", "", "Unknown GPU"},
-		{"", "", "Empty string"},
-		// GFD product names as seen in real cluster labels (regression for GPUSKU bug)
-		{"NVIDIA-B200", "b200_sxm", "B200 with NVIDIA prefix (GFD label format)"},
-		{"NVIDIA-H200-SXM5-141GB", "h200_sxm", "H200 with NVIDIA prefix (GFD label format)"},
+		// --- Empty / unknown ---
+		{name: "empty input", input: "", expected: ""},
+		{name: "unknown gpu", input: "random-gpu", expected: ""},
+
+		// --- Blackwell ---
+		{name: "GB200 SXM", input: "GB200-SXM", expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM},
+		{name: "GB200 HGX (implies SXM)", input: "HGX GB200", expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM},
+		{name: "B200 SXM", input: "B200 SXM", expected: nvidiacomv1beta1.GPUSKUTypeB200SXM},
+
+		// --- Hopper ---
+		{name: "H100 SXM", input: "H100 SXM", expected: nvidiacomv1beta1.GPUSKUTypeH100SXM},
+		{name: "H100 PCIe explicit", input: "H100 PCIe", expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe},
+		{name: "H100 default PCIe", input: "H100", expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe},
+		{name: "H200 SXM", input: "H200 SXM", expected: nvidiacomv1beta1.GPUSKUTypeH200SXM},
+
+		// --- Ampere ---
+		{name: "A100 SXM", input: "A100-SXM", expected: nvidiacomv1beta1.GPUSKUTypeA100SXM},
+		{name: "A100 PCIe", input: "A100 PCIe", expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe},
+		{name: "A100 default PCIe", input: "A100", expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe},
+
+		// --- Ada ---
+		// "L40S" contains "L40" (and "L4"), so it must resolve to the most
+		// specific SKU rather than to one of its prefixes.
+		{name: "L40S should not match L40", input: "L40S", expected: nvidiacomv1beta1.GPUSKUTypeL40S},
+		{name: "L40", input: "L40", expected: nvidiacomv1beta1.GPUSKUTypeL40},
+		{name: "L4", input: "L4", expected: nvidiacomv1beta1.GPUSKUTypeL4},
+
+		// --- Volta / Turing ---
+		{name: "V100 SXM", input: "V100 SXM", expected: nvidiacomv1beta1.GPUSKUTypeV100SXM},
+		{name: "V100 PCIe", input: "V100 PCIe", expected: nvidiacomv1beta1.GPUSKUTypeV100PCIe},
+		{name: "T4", input: "T4", expected: nvidiacomv1beta1.GPUSKUTypeT4},
+
+		// --- AMD ---
+		{name: "MI300", input: "MI300", expected: nvidiacomv1beta1.GPUSKUTypeMI300},
+		{name: "MI250", input: "MI250", expected: nvidiacomv1beta1.GPUSKUTypeMI200},
+		{name: "MI200", input: "MI200", expected: nvidiacomv1beta1.GPUSKUTypeMI200},
+
+		// --- Normalization tests ---
+		{name: "lowercase + spaces", input: "h100 sxm", expected: nvidiacomv1beta1.GPUSKUTypeH100SXM},
+		{name: "mixed case + dash", input: "A100-sXm", expected: nvidiacomv1beta1.GPUSKUTypeA100SXM},
+		{name: "with extra spaces", input: "  H100   PCIe ", expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe},
 	}

 	for _, tt := range tests {
-		t.Run(tt.description, func(t *testing.T) {
-			result := InferHardwareSystem(tt.gpuProduct)
-			assert.Equal(t, tt.expectedSystem, string(result), "Failed for GPU: %s", tt.gpuProduct)
+		t.Run(tt.name, func(t *testing.T) {
+			assert.Equal(t, tt.expected, InferHardwareSystem(tt.input),
+				"InferHardwareSystem(%q)", tt.input)
 		})
 	}
 }
@@ -382,6 +510,119 @@ func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
 	}
 }

+// TestNormalize checks that normalize upper-cases a GPU product string and
+// strips every space and dash from it, so that differently formatted spellings
+// of the same product collapse to one comparable token.
+func TestNormalize(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{name: "lowercase is upper-cased", input: "h100", expected: "H100"},
+		{name: "spaces removed", input: "H100 SXM", expected: "H100SXM"},
+		{name: "dashes removed", input: "H100-SXM", expected: "H100SXM"},
+		{name: "mixed spaces and dashes", input: "A100 - SXM", expected: "A100SXM"},
+		{name: "extra whitespace", input: "  H100   PCIe ", expected: "H100PCIE"},
+		{name: "complex string", input: "h100-sxm5-80gb", expected: "H100SXM580GB"},
+		{name: "empty string", input: "", expected: ""},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			assert.Equal(t, tt.expected, normalize(tt.input), "normalize(%q)", tt.input)
+		})
+	}
+}
+
+// TestDetectFormFactor exercises detectFormFactor with product strings that
+// have already been normalized. The cases expect SXM for inputs containing
+// "SXM", "HGX" or "DGX", and PCIe for everything else, including the empty
+// string.
+func TestDetectFormFactor(t *testing.T) {
+	cases := []struct {
+		name string
+		in   string // already normalized
+		want string
+	}{
+		{name: "detect SXM explicitly", in: "H100SXM", want: formFactorSXM},
+		{name: "detect HGX implies SXM", in: "HGXH100", want: formFactorSXM},
+		{name: "detect DGX implies SXM", in: "DGXH100", want: formFactorSXM},
+		{name: "detect PCIe explicitly", in: "H100PCIE", want: formFactorPCIe},
+		{name: "default to PCIe when unknown", in: "H100", want: formFactorPCIe},
+		{name: "SXM wins over PCIe if both present", in: "H100SXMPCIE", want: formFactorSXM},
+		{name: "random string defaults to PCIe", in: "RANDOMGPU", want: formFactorPCIe},
+		{name: "empty string defaults to PCIe", in: "", want: formFactorPCIe},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := detectFormFactor(tc.in); got != tc.want {
+				t.Errorf("detectFormFactor(%q) = %v, want %v",
+					tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
 func TestParseMetrics(t *testing.T) {
 	ctx := context.Background()

@@ -873,3 +1114,117 @@ func TestGetCloudProviderInfo(t *testing.T) {
 		})
 	}
 }
+
+// TestDetectRDMAFromNode verifies how detectRDMAFromNode classifies a node from
+// its labels, using a fake client seeded with at most one Node per case.
+//
+// The cases expect "rdma" when nvidia.com/rdma.present is "true", "sriov" when
+// only feature.node.kubernetes.io/network-sriov.capable is "true", and
+// (false, strNone) when the node is missing, unlabeled, or the labels are
+// "false".
+func TestDetectRDMAFromNode(t *testing.T) {
+	scheme := runtime.NewScheme()
+	// Without the core types registered every Node lookup would fail for the
+	// wrong reason, so a registration error must abort the test immediately.
+	require.NoError(t, corev1.AddToScheme(scheme))
+
+	tests := []struct {
+		name        string
+		node        *corev1.Node
+		nodeName    string
+		expectedOK  bool
+		expectedTyp string
+	}{
+		{
+			name:        "node not found",
+			node:        nil,
+			nodeName:    "missing-node",
+			expectedOK:  false,
+			expectedTyp: strNone,
+		},
+		{
+			name: "rdma detected",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node-rdma",
+					Labels: map[string]string{
+						"nvidia.com/rdma.present": "true",
+					},
+				},
+			},
+			nodeName:    "node-rdma",
+			expectedOK:  true,
+			expectedTyp: "rdma",
+		},
+		{
+			name: "sriov detected",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node-sriov",
+					Labels: map[string]string{
+						"feature.node.kubernetes.io/network-sriov.capable": "true",
+					},
+				},
+			},
+			nodeName:    "node-sriov",
+			expectedOK:  true,
+			expectedTyp: "sriov",
+		},
+		{
+			name: "both rdma and sriov - rdma takes precedence",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node-both",
+					Labels: map[string]string{
+						"nvidia.com/rdma.present":                          "true",
+						"feature.node.kubernetes.io/network-sriov.capable": "true",
+					},
+				},
+			},
+			nodeName:    "node-both",
+			expectedOK:  true,
+			expectedTyp: "rdma",
+		},
+		{
+			name: "no relevant labels",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:   "node-none",
+					Labels: map[string]string{},
+				},
+			},
+			nodeName:    "node-none",
+			expectedOK:  false,
+			expectedTyp: strNone,
+		},
+		{
+			name: "labels present but false",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node-false",
+					Labels: map[string]string{
+						"nvidia.com/rdma.present":                          "false",
+						"feature.node.kubernetes.io/network-sriov.capable": "false",
+					},
+				},
+			},
+			nodeName:    "node-false",
+			expectedOK:  false,
+			expectedTyp: strNone,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// A nil node leaves the fake cluster empty, which models the
+			// "node not found" path.
+			var objs []runtime.Object
+			if tt.node != nil {
+				objs = append(objs, tt.node)
+			}
+
+			fakeClient := fake.NewClientBuilder().
+				WithScheme(scheme).
+				WithRuntimeObjects(objs...).
+				Build()
+
+			ok, typ := detectRDMAFromNode(context.Background(), fakeClient, tt.nodeName)
+
+			assert.Equal(t, tt.expectedOK, ok, "detection result for node %q", tt.nodeName)
+			assert.Equal(t, tt.expectedTyp, typ, "detected type for node %q", tt.nodeName)
+		})
+	}
+}
--- a/docs/kubernetes/api-reference.md
+++ b/docs/kubernetes/api-reference.md
@@ -1542,19 +1542,28 @@ _Underlying type:_ _string_
 GPUSKUType is the AIC hardware system identifier for a supported GPU.

 _Validation:_
- Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s]
+- Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300]

 _Appears in:_
 - [HardwareSpec](#hardwarespec)

 | Field | Description |
 | --- | --- |
-| `gb200_sxm` |  |
-| `h200_sxm` |  |
-| `h100_sxm` |  |
+| `gb200_sxm` | --- Blackwell ---<br /> |
 | `b200_sxm` |  |
-| `a100_sxm` |  |
-| `l40s` |  |
+| `h200_sxm` | --- Hopper ---<br /> |
+| `h100_sxm` |  |
+| `h100_pcie` |  |
+| `a100_sxm` | --- Ampere ---<br /> |
+| `a100_pcie` |  |
+| `l40s` | --- Ada ---<br /> |
+| `l40` |  |
+| `l4` |  |
+| `v100_sxm` | --- Older NVIDIA ---<br /> |
+| `v100_pcie` |  |
+| `t4` |  |
+| `mi200` | --- AMD ---<br /> |
+| `mi300` |  |


 #### HardwareSpec
@@ -1571,10 +1580,12 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. |  | Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s] <br />Optional: \{\} <br /> |
+| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. |  | Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300] <br />Optional: \{\} <br /> |
 | `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. |  | Optional: \{\} <br /> |
 | `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. |  | Optional: \{\} <br /> |
 | `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. |  | Optional: \{\} <br /> |
+| `interconnect` _string_ | Interconnect describes the GPU interconnect type within a node.<br />Examples: "pcie", "nvlink", "infiniband". |  | Optional: \{\} <br /> |
+| `rdma` _boolean_ | RDMA indicates whether RDMA is available on the cluster. |  | Optional: \{\} <br /> |