feat: Expanding the hardware discovery to include RDMA, interconnect. (#7551)

63787939 · devivasudevan · GitHub · df53b7a2 · 63787939 · 63787939
Unverified Commit 63787939 authored Apr 16, 2026 by devivasudevan Committed by GitHub Apr 16, 2026
8 changed files
--- a/components/src/dynamo/profiler/utils/dgdr_v1beta1_types.py
+++ b/components/src/dynamo/profiler/utils/dgdr_v1beta1_types.py
@@ -59,11 +59,20 @@ class SearchStrategy(str, Enum):
 class GPUSKUType(str, Enum):
    GB200SXM = "gb200_sxm"
+    B200SXM = "b200_sxm"
    H200SXM = "h200_sxm"
    H100SXM = "h100_sxm"
-    B200SXM = "b200_sxm"
+    H100PCIe = "h100_pcie"
    A100SXM = "a100_sxm"
+    A100PCIe = "a100_pcie"
    L40S = "l40s"
+    L40 = "l40"
+    L4 = "l4"
+    V100SXM = "v100_sxm"
+    V100PCIe = "v100_pcie"
+    T4 = "t4"
+    MI200 = "mi200"
+    MI300 = "mi300"
 class BackendType(str, Enum):
@@ -210,6 +219,14 @@ class HardwareSpec(BaseModel):
    numGpusPerNode: Optional[int] = Field(
        default=None, description="NumGPUsPerNode is the number of GPUs per node."
    )
+    interconnect: Optional[str] = Field(
+        default=None,
+        description='Interconnect describes the GPU interconnect type within a node. Examples: "pcie", "nvlink", "infiniband".',
+    )
+    rdma: Optional[bool] = Field(
+        default=None,
+        description="RDMA indicates whether RDMA is available on the cluster.",
+    )
 class DynamoGraphDeploymentRequestSpec(BaseModel):

--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
@@ -579,26 +579,52 @@ spec:
                      allOf:
                        - enum:
                            - gb200_sxm
+                            - b200_sxm
                            - h200_sxm
                            - h100_sxm
-                            - b200_sxm
+                            - h100_pcie
                            - a100_sxm
+                            - a100_pcie
                            - l40s
+                            - l40
+                            - l4
+                            - v100_sxm
+                            - v100_pcie
+                            - t4
+                            - mi200
+                            - mi300
                        - enum:
                            - gb200_sxm
+                            - b200_sxm
                            - h200_sxm
                            - h100_sxm
-                            - b200_sxm
+                            - h100_pcie
                            - a100_sxm
+                            - a100_pcie
                            - l40s
+                            - l40
+                            - l4
+                            - v100_sxm
+                            - v100_pcie
+                            - t4
+                            - mi200
+                            - mi300
                      description: |-
                        GPUSKU is the AIC hardware system identifier for the GPU.
                        When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
                      type: string
+                    interconnect:
+                      description: |-
+                        Interconnect describes the GPU interconnect type within a node.
+                        Examples: "pcie", "nvlink", "infiniband".
+                      type: string
                    numGpusPerNode:
                      description: NumGPUsPerNode is the number of GPUs per node.
                      format: int32
                      type: integer
+                    rdma:
+                      description: RDMA indicates whether RDMA is available on the cluster.
+                      type: boolean
                    totalGpus:
                      description: TotalGPUs is the total number of GPUs available in the cluster.
                      format: int32

--- a/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
+++ b/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
@@ -175,16 +175,31 @@ const (
 )
 // GPUSKUType is the AIC hardware system identifier for a supported GPU.
-// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
+// +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
 type GPUSKUType string
 const (
+	// --- Blackwell ---
 	GPUSKUTypeGB200SXM GPUSKUType = "gb200_sxm"
+	GPUSKUTypeB200SXM  GPUSKUType = "b200_sxm"
+	// --- Hopper ---
 	GPUSKUTypeH200SXM  GPUSKUType = "h200_sxm"
 	GPUSKUTypeH100SXM  GPUSKUType = "h100_sxm"
-	GPUSKUTypeB200SXM  GPUSKUType = "b200_sxm"
+	GPUSKUTypeH100PCIe GPUSKUType = "h100_pcie"
+	// --- Ampere ---
 	GPUSKUTypeA100SXM  GPUSKUType = "a100_sxm"
-	GPUSKUTypeL40S     GPUSKUType = "l40s"
+	GPUSKUTypeA100PCIe GPUSKUType = "a100_pcie"
+	// --- Ada ---
+	GPUSKUTypeL40S GPUSKUType = "l40s"
+	GPUSKUTypeL40  GPUSKUType = "l40"
+	GPUSKUTypeL4   GPUSKUType = "l4"
+	// --- Older NVIDIA ---
+	GPUSKUTypeV100SXM  GPUSKUType = "v100_sxm"
+	GPUSKUTypeV100PCIe GPUSKUType = "v100_pcie"
+	GPUSKUTypeT4       GPUSKUType = "t4"
+	// --- AMD ---
+	GPUSKUTypeMI200 GPUSKUType = "mi200"
+	GPUSKUTypeMI300 GPUSKUType = "mi300"
 )
 // BackendType specifies the inference backend.
@@ -324,7 +339,7 @@ type HardwareSpec struct {
 	// GPUSKU is the AIC hardware system identifier for the GPU.
 	// When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
 	// +optional
-	// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
+	// +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
 	GPUSKU GPUSKUType `json:"gpuSku,omitempty"`
 	// VRAMMB is the VRAM per GPU in MiB.
@@ -338,6 +353,13 @@ type HardwareSpec struct {
 	// NumGPUsPerNode is the number of GPUs per node.
 	// +optional
 	NumGPUsPerNode *int32 `json:"numGpusPerNode,omitempty"`
+	// Interconnect describes the GPU interconnect type within a node.
+	// Examples: "pcie", "nvlink", "infiniband".
+	// +optional
+	Interconnect string `json:"interconnect,omitempty"`
+	// RDMA indicates whether RDMA is available on the cluster.
+	// +optional
+	RDMA *bool `json:"rdma,omitempty"`
 }
 // DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.

--- a/deploy/operator/api/v1beta1/zz_generated.deepcopy.go
+++ b/deploy/operator/api/v1beta1/zz_generated.deepcopy.go
@@ -252,6 +252,11 @@ func (in *HardwareSpec) DeepCopyInto(out *HardwareSpec) {
 		*out = new(int32)
 		**out = **in
 	}
+	if in.RDMA != nil {
+		in, out := &in.RDMA, &out.RDMA
+		*out = new(bool)
+		**out = **in
+	}
 }
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HardwareSpec.

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
@@ -579,26 +579,52 @@ spec:
                      allOf:
                        - enum:
                            - gb200_sxm
+                            - b200_sxm
                            - h200_sxm
                            - h100_sxm
-                            - b200_sxm
+                            - h100_pcie
                            - a100_sxm
+                            - a100_pcie
                            - l40s
+                            - l40
+                            - l4
+                            - v100_sxm
+                            - v100_pcie
+                            - t4
+                            - mi200
+                            - mi300
                        - enum:
                            - gb200_sxm
+                            - b200_sxm
                            - h200_sxm
                            - h100_sxm
-                            - b200_sxm
+                            - h100_pcie
                            - a100_sxm
+                            - a100_pcie
                            - l40s
+                            - l40
+                            - l4
+                            - v100_sxm
+                            - v100_pcie
+                            - t4
+                            - mi200
+                            - mi300
                      description: |-
                        GPUSKU is the AIC hardware system identifier for the GPU.
                        When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
                      type: string
+                    interconnect:
+                      description: |-
+                        Interconnect describes the GPU interconnect type within a node.
+                        Examples: "pcie", "nvlink", "infiniband".
+                      type: string
                    numGpusPerNode:
                      description: NumGPUsPerNode is the number of GPUs per node.
                      format: int32
                      type: integer
+                    rdma:
+                      description: RDMA indicates whether RDMA is available on the cluster.
+                      type: boolean
                    totalGpus:
                      description: TotalGPUs is the total number of GPUs available in the cluster.
                      format: int32

--- a/deploy/operator/internal/gpu/discovery.go
+++ b/deploy/operator/internal/gpu/discovery.go
@@ -14,7 +14,6 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package gpu
 import (
@@ -28,40 +27,73 @@ import (
 	"sync"
 	"time"
+	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
 	dto "github.com/prometheus/client_model/go"
 	"github.com/prometheus/common/expfmt"
 	"github.com/prometheus/common/model"
-	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 )
 const (
 	defaultDCGMEndpointTemplate = "http://{POD_IP}:9400/metrics"
 	// NVIDIA GPU Feature Discovery (GFD) label keys
 	LabelGPUCount   = "nvidia.com/gpu.count"
 	LabelGPUProduct = "nvidia.com/gpu.product"
 	LabelGPUMemory  = "nvidia.com/gpu.memory"
 	// DCGM exporter label constants
-	LabelApp                     = "app"
+	LabelApp                        = "app"
-	LabelAppKubernetesName       = "app.kubernetes.io/name"
+	LabelAppKubernetesName          = "app.kubernetes.io/name"
-	LabelValueNvidiaDCGMExporter = "nvidia-dcgm-exporter"
+	LabelValueNvidiaDCGMExporter    = "nvidia-dcgm-exporter"
-	LabelValueDCGMExporter       = "dcgm-exporter"
+	LabelValueNvidiaNetworkOperator = "nvidia-network-operator"
-	LabelValueGPUOperator        = "gpu-operator"
+	LabelValueDCGMExporter          = "dcgm-exporter"
-	GPUOperatorNamespace         = "gpu-operator"
+	LabelValueGPUOperator           = "gpu-operator"
+	GPUOperatorNamespace            = "gpu-operator"
-	requestTimeout      = 5 * time.Second
+	requestTimeout                  = 5 * time.Second
-	dialTimeout         = 3 * time.Second
+	dialTimeout                     = 3 * time.Second
-	tlsHandshakeTimeout = 3 * time.Second
+	tlsHandshakeTimeout             = 3 * time.Second
+	CloudProviderGCP                = "gcp"
-	CloudProviderGCP     = "gcp"
+	CloudProviderAWS                = "aws"
-	CloudProviderAWS     = "aws"
+	CloudProviderAKS                = "aks"
-	CloudProviderAKS     = "aks"
+	CloudProviderOther              = "other"
-	CloudProviderOther   = "other"
+	CloudProviderUnknown            = "unknown"
-	CloudProviderUnknown = "unknown"
+)
+// --- Normalization helpers ---
+const (
+	strDash  = "-"
+	strSpace = " "
+	strNone  = "none"
+)
+// --- Form factor tokens ---
+const (
+	tokenSXM       = "SXM"
+	tokenHGX       = "HGX"
+	tokenDGX       = "DGX"
+	tokenPCIE      = "PCIE"
+	formFactorSXM  = "sxm"
+	formFactorPCIe = "pcie"
+)
+// --- GPU model tokens ---
+const (
+	tokenGB200  = "GB200"
+	tokenB200   = "B200"
+	tokenH200   = "H200"
+	tokenH100   = "H100"
+	tokenA100   = "A100"
+	tokenL40S   = "L40S"
+	tokenL40    = "L40"
+	tokenL4     = "L4"
+	tokenV100   = "V100"
+	tokenT4     = "T4"
+	tokenMI300  = "MI300"
+	tokenMI250  = "MI250"
+	tokenMI200  = "MI200"
+	LabelNVLink = "nvlink"
 )
 // awsInstanceTypePrefixes matches known GPU/accelerator instance families on EKS. See: https://aws.amazon.com/ec2/instance-types/
@@ -79,27 +111,64 @@ var gcpMachineSeries = []string{
 	"g2-", // L4 GPU machines
 }
+type gpuRule struct {
+	token     string
+	sxmSKU    nvidiacomv1beta1.GPUSKUType
+	pcieSKU   nvidiacomv1beta1.GPUSKUType
+	singleSKU nvidiacomv1beta1.GPUSKUType // for GPUs without form factor variants
+}
+var gpuRules = []gpuRule{
+	// Blackwell
+	{token: tokenGB200, sxmSKU: nvidiacomv1beta1.GPUSKUTypeGB200SXM},
+	{token: tokenB200, sxmSKU: nvidiacomv1beta1.GPUSKUTypeB200SXM},
+	// Hopper
+	{token: tokenH200, sxmSKU: nvidiacomv1beta1.GPUSKUTypeH200SXM},
+	{token: tokenH100, sxmSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, pcieSKU: nvidiacomv1beta1.GPUSKUTypeH100PCIe},
+	// Ampere
+	{token: tokenA100, sxmSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM, pcieSKU: nvidiacomv1beta1.GPUSKUTypeA100PCIe},
+	// Ada
+	{token: tokenL40S, singleSKU: nvidiacomv1beta1.GPUSKUTypeL40S},
+	{token: tokenL40, singleSKU: nvidiacomv1beta1.GPUSKUTypeL40},
+	{token: tokenL4, singleSKU: nvidiacomv1beta1.GPUSKUTypeL4},
+	// Volta / Turing
+	{token: tokenV100, sxmSKU: nvidiacomv1beta1.GPUSKUTypeV100SXM, pcieSKU: nvidiacomv1beta1.GPUSKUTypeV100PCIe},
+	{token: tokenT4, singleSKU: nvidiacomv1beta1.GPUSKUTypeT4},
+	// AMD
+	{token: tokenMI300, singleSKU: nvidiacomv1beta1.GPUSKUTypeMI300},
+	{token: tokenMI250, singleSKU: nvidiacomv1beta1.GPUSKUTypeMI200},
+	{token: tokenMI200, singleSKU: nvidiacomv1beta1.GPUSKUTypeMI200},
+}
 // GPUInfo contains discovered GPU configuration from cluster nodes
 type GPUInfo struct {
-	NodeName      string                      // Name of the node with this GPU configuration
+	NodeName         string                      // Name of the node with this GPU configuration
-	GPUsPerNode   int                         // Maximum GPUs per node found in the cluster
+	GPUsPerNode      int                         // Maximum GPUs per node found in the cluster
-	NodesWithGPUs int                         // Number of nodes that have GPUs
+	NodesWithGPUs    int                         // Number of nodes that have GPUs
-	Model         string                      // GPU product name (e.g., "H100-SXM5-80GB")
+	Model            string                      // GPU product name (e.g., "H100-SXM5-80GB")
-	VRAMPerGPU    int                         // VRAM in MiB per GPU
+	VRAMPerGPU       int                         // VRAM in MiB per GPU
-	System        nvidiacomv1beta1.GPUSKUType // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
+	System           nvidiacomv1beta1.GPUSKUType // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
-	MIGEnabled    bool                        // True if MIG is enabled (inferred from model or additional labels, not implemented in this version)
+	MIGEnabled       bool                        // True if MIG is enabled (inferred from model or additional labels, not implemented in this version)
-	MIGProfiles   map[string]int              // Optional: map of MIG profile name to count (requires additional label parsing, not implemented in this version)
+	MIGProfiles      map[string]int              // Optional: map of MIG profile name to count (requires additional label parsing, not implemented in this version)
-	CloudProvider string                      // NEW: aws | gcp | aks | other | unknown
+	CloudProvider    string                      // aws | gcp | aks | other | unknown
+	RDMAEnabled      bool                        // Indicates whether RDMA is enabled for this node (e.g., via InfiniBand, RoCE, or similar high-speed networking)
+	RDMAType         string                      // Type of RDMA transport detected (e.g., "infiniband", "roce", "rdma", "sriov", or "none")
+	Interconnect     string                      // Primary GPU-to-GPU interconnect technology used within the node (e.g., "nvlink" for high-bandwidth links or "pcie" for standard bus-based communication)
+	InterconnectTier string                      // Qualitative or platform-specific classification of the interconnect (e.g., NVLink generation, topology tier, or vendor-defined performance level)
+	NVLinkLinks      int                         // Number of NVLink connections per GPU (0 if NVLink is not present or interconnect is PCIe-only)
 }
 type ScrapeMetricsFunc func(ctx context.Context, endpoint string) (*GPUInfo, error)
 type GPUDiscoveryCache struct {
 	mu        sync.RWMutex
 	value     *GPUInfo
 	expiresAt time.Time
 }
 type GPUDiscovery struct {
 	Scraper ScrapeMetricsFunc
 }
@@ -195,7 +264,6 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
 	if err != nil && !strings.Contains(err.Error(), "no DCGM exporter pods found") {
 		return nil, fmt.Errorf("listing DCGM exporter pods failed: %w", err)
 	}
 	// If no pods found
 	if len(dcgmPods) == 0 {
 		gpuPods, err := listGPUOperatorRunningPods(ctx, k8sClient)
@@ -204,23 +272,28 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
 		}
 		return nil, err
 	}
 	// Scrape each running pod individually
 	var bestNode *GPUInfo
 	var scrapeErrors []error
+	var rdmaDetected bool
+	var rdmaType string
 	nodesWithGPUs := 0
 	for _, pod := range dcgmPods {
 		if pod.Status.Phase != corev1.PodRunning || pod.Status.PodIP == "" {
 			continue
 		}
 		endpoint := buildDCGMEndpoint(pod.Status.PodIP)
 		info, err := g.Scraper(ctx, endpoint)
 		if err != nil {
 			scrapeErrors = append(scrapeErrors, fmt.Errorf("pod %s (%s): %w", pod.Name, pod.Status.PodIP, err))
 			continue
 		}
+		// Detect RDMA on the node of this pod
+		rdma, rType := detectRDMAFromNode(ctx, k8sClient, pod.Spec.NodeName)
+		if rdma {
+			rdmaDetected = true
+			rdmaType = rType
+		}
 		// Increment NodesWithGPUs for every node that successfully reports GPU metrics
 		nodesWithGPUs++
 		// Select best node: highest GPU count, tie-breaker by VRAM
@@ -228,18 +301,21 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
 			info.GPUsPerNode > bestNode.GPUsPerNode ||
 			(info.GPUsPerNode == bestNode.GPUsPerNode &&
 				info.VRAMPerGPU > bestNode.VRAMPerGPU) {
 			bestNode = info
 		}
 	}
 	if bestNode == nil {
 		if len(scrapeErrors) > 0 {
 			return nil, fmt.Errorf("failed to scrape any DCGM exporter pod: %v", scrapeErrors)
 		}
 		return nil, fmt.Errorf("no GPU metrics could be parsed from any DCGM pod")
 	}
+	// --- Detect RDMA and InfiniBand presence ---
+	ib := detectIBPods(ctx, k8sClient)
+	if ib {
+		rdmaType = "infiniband"
+		rdmaDetected = true
+	}
 	// Infer cloud provider for the best node
 	cloudProvider, err := GetCloudProviderInfo(ctx, k8sClient)
 	if err != nil {
@@ -247,61 +323,51 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
 	}
 	bestNode.CloudProvider = cloudProvider
 	bestNode.NodesWithGPUs = nodesWithGPUs
+	bestNode.RDMAEnabled = rdmaDetected
+	bestNode.RDMAType = rdmaType
 	if cache != nil {
 		// Cache result for 60 seconds
 		cache.Set(bestNode, 60*time.Second)
 	}
 	return bestNode, nil
 }
 func buildDCGMEndpoint(podIP string) string {
 	template := os.Getenv("DCGM_METRICS_ENDPOINT_TEMPLATE")
 	if template == "" {
 		template = defaultDCGMEndpointTemplate
 	}
 	return strings.ReplaceAll(template, "{POD_IP}", podIP)
 }
 func listDCGMExporterPods(ctx context.Context, k8sClient client.Reader) ([]corev1.Pod, error) {
 	var result []corev1.Pod
 	seen := make(map[string]struct{})
 	selectors := []client.MatchingLabels{
 		{LabelApp: LabelValueNvidiaDCGMExporter},
 		{LabelApp: LabelValueDCGMExporter},
 		{LabelAppKubernetesName: LabelValueDCGMExporter},
 	}
 	var lastErr error
 	for _, selector := range selectors {
 		podList := &corev1.PodList{}
 		err := k8sClient.List(ctx, podList, selector)
 		if err != nil {
 			lastErr = fmt.Errorf("list pods: %w", err)
 			continue
 		}
 		for _, pod := range podList.Items {
 			key := pod.Namespace + "/" + pod.Name
 			if _, exists := seen[key]; !exists {
 				seen[key] = struct{}{}
 				result = append(result, pod)
 			}
 		}
 	}
 	if len(result) > 0 {
 		return result, nil
 	}
 	if lastErr != nil {
 		return nil, lastErr
 	}
 	return nil, fmt.Errorf("no DCGM exporter pods found")
 }
@@ -311,20 +377,15 @@ func listDCGMExporterPods(ctx context.Context, k8sClient client.Reader) ([]corev
 // It uses common GPU Operator label selectors and deduplicates results
 // across selectors. If no running pods are found, an error is returned.
 func listGPUOperatorRunningPods(ctx context.Context, k8sClient client.Reader) ([]corev1.Pod, error) {
 	var result []corev1.Pod
 	seen := make(map[string]struct{})
 	selectors := []client.MatchingLabels{
 		{LabelApp: LabelValueGPUOperator},
 		{LabelAppKubernetesName: LabelValueGPUOperator},
 	}
 	var lastErr error
 	for _, selector := range selectors {
 		podList := &corev1.PodList{}
 		err := k8sClient.List(
 			ctx,
 			podList,
@@ -335,29 +396,23 @@ func listGPUOperatorRunningPods(ctx context.Context, k8sClient client.Reader) ([
 			lastErr = fmt.Errorf("list gpu operator pods: %w", err)
 			continue
 		}
 		for _, pod := range podList.Items {
 			if pod.Status.Phase != corev1.PodRunning {
 				continue
 			}
 			key := pod.Namespace + "/" + pod.Name
 			if _, exists := seen[key]; !exists {
 				seen[key] = struct{}{}
 				result = append(result, pod)
 			}
 		}
 	}
 	if len(result) > 0 {
 		return result, nil
 	}
 	if lastErr != nil {
 		return nil, lastErr
 	}
 	return nil, fmt.Errorf(
 		"gpu operator is not installed %s",
 		GPUOperatorNamespace,
@@ -385,7 +440,6 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
 	// Set a timeout for the request
 	ctx, cancel := context.WithTimeout(ctx, requestTimeout)
 	defer cancel()
 	// Create a custom HTTP client with transport-level timeouts
 	client := &http.Client{
 		Transport: &http.Transport{
@@ -396,12 +450,10 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
 			TLSHandshakeTimeout: tlsHandshakeTimeout, // TLS handshake timeout
 		},
 	}
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
 	if err != nil {
 		return nil, fmt.Errorf("create request for %s: %w", endpoint, err)
 	}
 	resp, err := client.Do(req)
 	if err != nil {
 		return nil, fmt.Errorf("HTTP GET %s failed: %w", endpoint, err)
@@ -412,7 +464,6 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
 			log.FromContext(ctx).V(1).Info("failed to close response body", "err", cerr)
 		}
 	}()
 	if resp.StatusCode != http.StatusOK {
 		return nil, fmt.Errorf(
 			"metrics endpoint %s returned status %d",
@@ -420,20 +471,18 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
 			resp.StatusCode,
 		)
 	}
 	parser := expfmt.NewTextParser(model.UTF8Validation)
 	metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
 	if err != nil {
 		return nil, fmt.Errorf("parse prometheus metrics: %w", err)
 	}
 	return parseMetrics(ctx, metricFamilies)
 }
-// parseMetrics extracts GPU information for a node from DCGM Prometheus metrics.
+// parseMetrics extracts GPU information and interconnect type for a node from DCGM Prometheus metrics.
 //
 // It parses the provided Prometheus metric families exported by the NVIDIA
-// DCGM exporter and derives high-level GPU inventory information for the node.
+// DCGM exporter and derives high-level GPU inventory and interconnect information for the node.
 //
 // The function performs the following:
 //
@@ -446,6 +495,10 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
 //     VRAM = FB_FREE + FB_USED + FB_RESERVED
 //     (values are in MiB).
 //
+//   - Determines the interconnect type (PCIe or NVLink) from the
+//     DCGM_FI_DEV_NVLINK_LINK_COUNT metric. If NVLink links are present,
+//     interconnect is set to "nvlink", otherwise defaults to "pcie".
+//
 //   - Assumes MIG is disabled unless explicit MIG metrics are present
 //     (not included in the provided DCGM metric set).
 //
@@ -464,17 +517,18 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
 //	  - MIGEnabled: false because no MIG metrics were collected in the DCGM families
 //	  - MIGProfiles: empty map; would contain MIG profile counts if MIG metrics were available
 //	  - System (inferred from model)
+//	  - Interconnect: "pcie" or "nvlink" depending on detected NVLink links
 //
 // Returns an error if no GPUs can be detected from the metrics.
 //
 // Notes:
 //   - This function relies on DCGM exporter metrics.
 //   - If required metrics are missing, zero values may be returned.
+//   - Interconnect detection is based on NVLink link count; other interconnects are not currently detected.
 //   - The implementation assumes homogeneous GPUs per node.
 //   - For heterogeneous configurations, per-GPU parsing should be implemented.
 func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*GPUInfo, error) {
 	logger := log.FromContext(ctx)
 	getLabel := func(m *dto.Metric, name string) string {
 		for _, l := range m.GetLabel() {
 			if l.GetName() == name {
@@ -483,18 +537,16 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
 		}
 		return ""
 	}
 	// Track unique GPUs
 	gpuSet := map[string]struct{}{}
 	var model string
 	var vram int
 	var hostName string
+	var nvlinkDetected bool
+	var nvlinkLinks int
 	fbFree := map[string]float64{}
 	fbUsed := map[string]float64{}
 	fbReserved := map[string]float64{}
 	// --- Detect GPUs + Model + Hostname ---
 	if mf, ok := families["DCGM_FI_DEV_GPU_TEMP"]; ok {
 		for _, m := range mf.Metric {
@@ -503,19 +555,16 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
 				continue
 			}
 			gpuSet[gpuID] = struct{}{}
 			// Extract model from label
 			if model == "" {
 				model = getLabel(m, "modelName")
 			}
 			// Extract Hostname label
 			if hostName == "" {
 				hostName = getLabel(m, "Hostname")
 			}
 		}
 	}
 	// --- Collect framebuffer metrics ---
 	if mf, ok := families["DCGM_FI_DEV_FB_FREE"]; ok {
 		for _, m := range mf.Metric {
@@ -524,13 +573,11 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
 				continue
 			}
 			fbFree[gpuID] = m.GetGauge().GetValue()
 			if hostName == "" {
 				hostName = getLabel(m, "Hostname")
 			}
 		}
 	}
 	if mf, ok := families["DCGM_FI_DEV_FB_USED"]; ok {
 		for _, m := range mf.Metric {
 			gpuID := getLabel(m, "gpu")
@@ -538,13 +585,11 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
 				continue
 			}
 			fbUsed[gpuID] = m.GetGauge().GetValue()
 			if hostName == "" {
 				hostName = getLabel(m, "Hostname")
 			}
 		}
 	}
 	if mf, ok := families["DCGM_FI_DEV_FB_RESERVED"]; ok {
 		for _, m := range mf.Metric {
 			gpuID := getLabel(m, "gpu")
@@ -552,13 +597,37 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
 				continue
 			}
 			fbReserved[gpuID] = m.GetGauge().GetValue()
 			if hostName == "" {
 				hostName = getLabel(m, "Hostname")
 			}
 		}
 	}
+	if mf, ok := families["DCGM_FI_DEV_NVLINK_LINK_COUNT"]; ok {
+		for _, m := range mf.Metric {
+			val := int(m.GetGauge().GetValue())
+			if val > 0 {
+				nvlinkDetected = true
+				nvlinkLinks = val
+				break
+			}
+		}
+	}
+	// --- Determine interconnect type ---
+	interconnect := "pcie"
+	interconnectDetail := strNone
+	if nvlinkDetected {
+		switch {
+		case nvlinkLinks >= 12:
+			interconnect = LabelNVLink
+			interconnectDetail = "full-mesh" // HGX / DGX class
+		case nvlinkLinks >= 6:
+			interconnect = LabelNVLink
+			interconnectDetail = "high"
+		default:
+			interconnect = LabelNVLink
+			interconnectDetail = "partial"
+		}
+	}
 	// --- Calculate Max VRAM
 	for gpuID := range gpuSet {
 		total := int(fbFree[gpuID] + fbUsed[gpuID] + fbReserved[gpuID])
@@ -566,32 +635,33 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
 			vram = total
 		}
 	}
 	gpuCount := len(gpuSet)
 	if gpuCount == 0 {
 		return nil, fmt.Errorf("no GPUs detected from DCGM metrics")
 	}
 	// --- Infer system from model ---
 	system := InferHardwareSystem(model)
 	logger.Info("Parsed GPU info",
 		"node", hostName,
 		"gpuCount", gpuCount,
 		"model", model,
 		"vramMiB", vram,
 		"system", system,
+		"interconnect", interconnect,
+		"interconnectDetail", interconnectDetail,
+		"nvlinkLinks", nvlinkLinks,
 	)
 	return &GPUInfo{
-		NodeName:    hostName,
+		NodeName:         hostName,
-		GPUsPerNode: gpuCount,
+		GPUsPerNode:      gpuCount,
-		Model:       model,
+		Model:            model,
-		VRAMPerGPU:  vram,
+		VRAMPerGPU:       vram,
-		MIGEnabled:  false,
+		MIGEnabled:       false,
-		MIGProfiles: map[string]int{},
+		MIGProfiles:      map[string]int{},
-		System:      system, // populated from InferHardwareSystem
+		System:           system, // populated from InferHardwareSystem
+		Interconnect:     interconnect,
+		InterconnectTier: interconnectDetail,
+		NVLinkLinks:      nvlinkLinks,
 	}, nil
 }
@@ -605,23 +675,18 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
 func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error) {
 	logger := log.FromContext(ctx)
 	logger.Info("Starting GPU discovery from cluster nodes")
 	// List all nodes in the cluster
 	nodeList := &corev1.NodeList{}
 	if err := k8sClient.List(ctx, nodeList); err != nil {
 		return nil, fmt.Errorf("failed to list cluster nodes: %w", err)
 	}
 	if len(nodeList.Items) == 0 {
 		return nil, fmt.Errorf("no nodes found in cluster")
 	}
 	logger.Info("Found cluster nodes", "count", len(nodeList.Items))
 	// Track the best GPU configuration found
 	var bestGPUInfo *GPUInfo
 	nodesWithGPUs := 0
 	for i := range nodeList.Items {
 		node := &nodeList.Items[i]
 		gpuInfo, err := extractGPUInfoFromNode(node)
@@ -632,14 +697,12 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
 				"reason", err.Error())
 			continue
 		}
 		nodesWithGPUs++
 		logger.Info("Found GPU node",
 			"node", node.Name,
 			"gpus", gpuInfo.GPUsPerNode,
 			"model", gpuInfo.Model,
 			"vram", gpuInfo.VRAMPerGPU)
 		// Select best configuration: prefer higher GPU count, then higher VRAM
 		if bestGPUInfo == nil ||
 			gpuInfo.GPUsPerNode > bestGPUInfo.GPUsPerNode ||
@@ -647,17 +710,14 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
 			bestGPUInfo = gpuInfo
 		}
 	}
 	if bestGPUInfo == nil {
 		return nil, fmt.Errorf("no nodes with NVIDIA GPU Feature Discovery labels found (checked %d nodes). "+
 			"Ensure GPU nodes have labels: %s, %s, %s",
 			len(nodeList.Items), LabelGPUCount, LabelGPUProduct, LabelGPUMemory)
 	}
 	// Infer hardware system from GPU model
 	bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model)
 	bestGPUInfo.NodesWithGPUs = nodesWithGPUs
 	logger.Info("GPU discovery completed",
 		"gpusPerNode", bestGPUInfo.GPUsPerNode,
 		"nodesWithGPUs", bestGPUInfo.NodesWithGPUs,
@@ -665,7 +725,6 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
 		"model", bestGPUInfo.Model,
 		"vram", bestGPUInfo.VRAMPerGPU,
 		"system", bestGPUInfo.System)
 	return bestGPUInfo, nil
 }
@@ -676,7 +735,6 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
 	if labels == nil {
 		return nil, fmt.Errorf("node has no labels")
 	}
 	gpuCountStr, ok := labels[LabelGPUCount]
 	if !ok {
 		return nil, fmt.Errorf("missing label %s", LabelGPUCount)
@@ -685,12 +743,10 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
 	if err != nil || gpuCount <= 0 {
 		return nil, fmt.Errorf("invalid GPU count: %s", gpuCountStr)
 	}
 	gpuModel, ok := labels[LabelGPUProduct]
 	if !ok || gpuModel == "" {
 		return nil, fmt.Errorf("missing or empty label %s", LabelGPUProduct)
 	}
 	// Extract VRAM (memory in MiB)
 	gpuMemoryStr, ok := labels[LabelGPUMemory]
 	if !ok {
@@ -700,7 +756,6 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
 	if err != nil || gpuMemory <= 0 {
 		return nil, fmt.Errorf("invalid GPU memory: %s", gpuMemoryStr)
 	}
 	return &GPUInfo{
 		GPUsPerNode: gpuCount,
 		Model:       gpuModel,
@@ -708,12 +763,18 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
 	}, nil
 }
-// InferHardwareSystem maps GPU product name to hardware system identifier.
+// InferHardwareSystem attempts to infer a normalized GPU SKU type from a
-// Returns empty string if the GPU model cannot be confidently mapped.
+// free-form product string (e.g. "NVIDIA H100 SXM", "A100-PCIE").
 //
-// This is a best-effort mapping based on common NVIDIA datacenter GPU naming patterns.
+// The function performs three main steps:
-// The system identifier is used by the profiler for performance estimation and configuration.
+//  1. Normalize the input string to a consistent format.
+//  2. Detect the GPU form factor (SXM vs PCIe).
+//  3. Match the normalized string against known GPU tokens and return
+//     the corresponding SKU type.
 //
+// Matching is based on substring checks and is tolerant of variations
+// in formatting (case, spaces, dashes). If no known GPU is detected,
+// an empty SKU type is returned.
 // Limitations:
 //   - Cannot distinguish SXM vs. PCIe variants from labels alone (assumes SXM for datacenter GPUs)
 //   - New GPU models require code updates (gracefully returns empty string)
@@ -726,34 +787,80 @@ func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType {
 		return ""
 	}
-	// Normalize: uppercase, remove spaces/dashes for pattern matching
+	normalized := normalize(gpuProduct)
-	normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", ""))
+	formFactor := detectFormFactor(normalized)
-	normalized = strings.ReplaceAll(normalized, " ", "")
-	// Map common NVIDIA datacenter GPU products to AIC hardware system identifiers.
-	patterns := []struct {
-		pattern string
-		system  nvidiacomv1beta1.GPUSKUType
-	}{
-		{"GB200", nvidiacomv1beta1.GPUSKUTypeGB200SXM},
-		{"H200", nvidiacomv1beta1.GPUSKUTypeH200SXM},
-		{"H100", nvidiacomv1beta1.GPUSKUTypeH100SXM},
-		{"B200", nvidiacomv1beta1.GPUSKUTypeB200SXM},
-		{"A100", nvidiacomv1beta1.GPUSKUTypeA100SXM},
-		{"L40S", nvidiacomv1beta1.GPUSKUTypeL40S},
-	}
-	for _, p := range patterns {
+	for _, rule := range gpuRules {
-		if strings.Contains(normalized, p.pattern) {
+		if strings.Contains(normalized, rule.token) {
-			return p.system
+			if rule.singleSKU != "" {
+				return rule.singleSKU
+			}
+			if formFactor == formFactorSXM && rule.sxmSKU != "" {
+				return rule.sxmSKU
+			}
+			if rule.pcieSKU != "" {
+				return rule.pcieSKU
+			}
 		}
 	}
-	// Unknown GPU type, return empty value.
-	// User must specify gpuSku explicitly in spec.hardware.
 	return ""
 }
+// normalize standardizes a GPU product string to simplify matching.
+//
+// It converts the string to uppercase and removes common separators
+// such as spaces and dashes. This allows consistent substring matching
+// regardless of how the input is formatted (e.g. "H100-SXM",
+// "h100 sxm", and "H100SXM" all normalize to the same value).
+func normalize(input string) string {
+	s := strings.ToUpper(strings.ReplaceAll(input, strDash, strSpace))
+	return strings.ReplaceAll(s, " ", "")
+}
+// detectFormFactor determines the GPU form factor (e.g. SXM or PCIe)
+// from a normalized product string.
+//
+// The detection is based on the presence of known substrings such as
+// "SXM", "HGX", or "DGX" for SXM-based systems, and "PCIE" for PCIe.
+// If no explicit indicator is found, PCIe is used as the default since
+// it is the more common and safer assumption.
+func detectFormFactor(normalized string) string {
+	switch {
+	case strings.Contains(normalized, tokenSXM),
+		strings.Contains(normalized, tokenHGX),
+		strings.Contains(normalized, tokenDGX):
+		return formFactorSXM
+	case strings.Contains(normalized, tokenPCIE):
+		return formFactorPCIe
+	default:
+		return formFactorPCIe
+	}
+}
+// GetCloudProviderInfo attempts to infer the cloud provider of the Kubernetes cluster.
+//
+// The function inspects the first node in the cluster (assumes homogeneous node setup)
+// and uses a combination of ProviderID and node labels to detect the provider.
+//
+// Detection logic:
+//   - Primary detection uses node.Spec.ProviderID:
+//   - "azure" → AKS
+//   - "aws"   → AWS
+//   - "gce"   → GCP
+//   - Secondary detection uses node labels and instance type prefixes:
+//   - AKS: "kubernetes.azure.com/cluster" label or instance type starting with "standard_"
+//   - AWS: "eks.amazonaws.com/nodegroup" label or known AWS instance type prefix
+//   - GCP: "cloud.google.com/gke-nodepool" label or known GCP machine series prefix
+//   - If none match, returns "other".
+//
+// Parameters:
+//   - ctx: Context for logging, cancellation, or timeout.
+//   - k8sClient: Kubernetes client for reading Node objects.
+//
+// Returns:
+//   - A string identifying the cloud provider ("aks", "aws", "gcp", "other", or "unknown").
+//   - An error if no nodes are found or listing fails.
 func GetCloudProviderInfo(ctx context.Context, k8sClient client.Reader) (string, error) {
 	var nodeList corev1.NodeList
 	if err := k8sClient.List(ctx, &nodeList); err != nil {
@@ -801,6 +908,13 @@ func GetCloudProviderInfo(ctx context.Context, k8sClient client.Reader) (string,
 	return "other", nil
 }
+// isGCPInstanceType checks whether a given instance type string matches known GCP machine series.
+//
+// Parameters:
+//   - instanceType: string representing the node's instance type (lowercased).
+//
+// Returns:
+//   - true if the instance type belongs to a GCP machine series prefix.
 func isGCPInstanceType(instanceType string) bool {
 	for _, prefix := range gcpMachineSeries {
 		if strings.HasPrefix(instanceType, prefix) {
@@ -810,6 +924,13 @@ func isGCPInstanceType(instanceType string) bool {
 	return false
 }
+// isAWSInstanceType checks whether a given instance type string matches known AWS instance type prefixes.
+//
+// Parameters:
+//   - instanceType: string representing the node's instance type (lowercased).
+//
+// Returns:
+//   - true if the instance type belongs to an AWS instance type prefix.
 func isAWSInstanceType(instanceType string) bool {
 	for _, prefix := range awsInstanceTypePrefixes {
 		if strings.HasPrefix(instanceType, prefix) {
@@ -818,3 +939,59 @@ func isAWSInstanceType(instanceType string) bool {
 	}
 	return false
 }
+// detectRDMAFromNode inspects a single node for RDMA or SR-IOV network capability.
+//
+// Detection logic:
+//   - Checks node labels:
+//   - "nvidia.com/rdma.present" = "true" → RDMA detected
+//   - "feature.node.kubernetes.io/network-sriov.capable" = "true" → SR-IOV detected
+//
+// Parameters:
+//   - ctx: Context for logging or cancellation.
+//   - k8sClient: Kubernetes client for reading Node objects.
+//   - nodeName: Name of the node to inspect.
+//
+// Returns:
+//   - bool indicating whether RDMA/SR-IOV is present.
+//   - string representing the type ("rdma", "sriov", or "none").
+func detectRDMAFromNode(ctx context.Context, k8sClient client.Reader, nodeName string) (bool, string) {
+	node := &corev1.Node{}
+	if err := k8sClient.Get(ctx, types.NamespacedName{Name: nodeName}, node); err != nil {
+		return false, strNone
+	}
+	labels := node.Labels
+	if labels["nvidia.com/rdma.present"] == "true" {
+		return true, "rdma"
+	}
+	if labels["feature.node.kubernetes.io/network-sriov.capable"] == "true" {
+		return true, "sriov"
+	}
+	return false, strNone
+}
+// detectIBPods checks if there are any RDMA or InfiniBand-related pods deployed
+// in the "nvidia-network-operator" namespace.
+//
+// Detection logic:
+//   - Lists pods in "nvidia-network-operator" namespace.
+//   - If any pod name contains "rdma", returns true.
+//
+// Parameters:
+//   - ctx: Context for logging or cancellation.
+//   - k8sClient: Kubernetes client for listing pods.
+//
+// Returns:
+//   - true if any RDMA/IB pods are found, false otherwise.
+func detectIBPods(ctx context.Context, k8sClient client.Reader) bool {
+	podList := &corev1.PodList{}
+	if err := k8sClient.List(ctx, podList, client.InNamespace(LabelValueNvidiaNetworkOperator)); err != nil {
+		return false
+	}
+	for _, p := range podList.Items {
+		if strings.Contains(p.Name, "rdma") {
+			return true
+		}
+	}
+	return false
+}
--- a/deploy/operator/internal/gpu/discovery_test.go
+++ b/deploy/operator/internal/gpu/discovery_test.go
@@ -26,6 +26,7 @@ import (
 	"strings"
 	"testing"
+	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
 	dto "github.com/prometheus/client_model/go"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -322,32 +323,159 @@ func TestExtractGPUInfoFromNode_MissingLabels(t *testing.T) {
 func TestInferHardwareSystem(t *testing.T) {
 	tests := []struct {
-		gpuProduct     string
+		name     string
-		expectedSystem string
+		input    string
-		description    string
+		expected nvidiacomv1beta1.GPUSKUType
 	}{
-		{"H100-SXM5-80GB", "h100_sxm", "H100 SXM variant"},
+		// --- Empty / unknown ---
-		{"H100-PCIE-80GB", "h100_sxm", "H100 PCIe variant (mapped to SXM)"},
+		{
-		{"H200-SXM5-141GB", "h200_sxm", "H200 SXM variant"},
+			name:     "empty input",
-		{"A100-SXM4-40GB", "a100_sxm", "A100 SXM variant"},
+			input:    "",
-		{"A100-PCIE-80GB", "a100_sxm", "A100 PCIe variant (mapped to SXM)"},
+			expected: "",
-		{"L40S", "l40s", "L40S"},
+		},
-		{"NVIDIA L40S", "l40s", "L40S with prefix"},
+		{
-		{"B200-SXM", "b200_sxm", "B200 SXM"},
+			name:     "unknown gpu",
-		{"GB200", "gb200_sxm", "GB200"},
+			input:    "random-gpu",
-		{"Tesla V100-SXM2-16GB", "", "V100 (not in mapping)"},
+			expected: "",
-		{"RTX 4090", "", "Consumer GPU (not in mapping)"},
+		},
-		{"Unknown-GPU", "", "Unknown GPU"},
-		{"", "", "Empty string"},
+		// --- Blackwell ---
-		// GFD product names as seen in real cluster labels (regression for GPUSKU bug)
+		{
-		{"NVIDIA-B200", "b200_sxm", "B200 with NVIDIA prefix (GFD label format)"},
+			name:     "GB200 SXM",
-		{"NVIDIA-H200-SXM5-141GB", "h200_sxm", "H200 with NVIDIA prefix (GFD label format)"},
+			input:    "GB200-SXM",
+			expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM,
+		},
+		{
+			name:     "GB200 HGX (implies SXM)",
+			input:    "HGX GB200",
+			expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM,
+		},
+		{
+			name:     "B200 SXM",
+			input:    "B200 SXM",
+			expected: nvidiacomv1beta1.GPUSKUTypeB200SXM,
+		},
+		// --- Hopper ---
+		{
+			name:     "H100 SXM",
+			input:    "H100 SXM",
+			expected: nvidiacomv1beta1.GPUSKUTypeH100SXM,
+		},
+		{
+			name:     "H100 PCIe explicit",
+			input:    "H100 PCIe",
+			expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
+		},
+		{
+			name:     "H100 default PCIe",
+			input:    "H100",
+			expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
+		},
+		{
+			name:     "H200 SXM",
+			input:    "H200 SXM",
+			expected: nvidiacomv1beta1.GPUSKUTypeH200SXM,
+		},
+		// --- Ampere ---
+		{
+			name:     "A100 SXM",
+			input:    "A100-SXM",
+			expected: nvidiacomv1beta1.GPUSKUTypeA100SXM,
+		},
+		{
+			name:     "A100 PCIe",
+			input:    "A100 PCIe",
+			expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe,
+		},
+		{
+			name:     "A100 default PCIe",
+			input:    "A100",
+			expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe,
+		},
+		// --- Ada ---
+		{
+			name:     "L40S",
+			input:    "L40S",
+			expected: nvidiacomv1beta1.GPUSKUTypeL40S,
+		},
+		{
+			name:     "L40S should not match L40",
+			input:    "L40S",
+			expected: nvidiacomv1beta1.GPUSKUTypeL40S,
+		},
+		{
+			name:     "L40",
+			input:    "L40",
+			expected: nvidiacomv1beta1.GPUSKUTypeL40,
+		},
+		{
+			name:     "L4",
+			input:    "L4",
+			expected: nvidiacomv1beta1.GPUSKUTypeL4,
+		},
+		// --- Volta / Turing ---
+		{
+			name:     "V100 SXM",
+			input:    "V100 SXM",
+			expected: nvidiacomv1beta1.GPUSKUTypeV100SXM,
+		},
+		{
+			name:     "V100 PCIe",
+			input:    "V100 PCIe",
+			expected: nvidiacomv1beta1.GPUSKUTypeV100PCIe,
+		},
+		{
+			name:     "T4",
+			input:    "T4",
+			expected: nvidiacomv1beta1.GPUSKUTypeT4,
+		},
+		// --- AMD ---
+		{
+			name:     "MI300",
+			input:    "MI300",
+			expected: nvidiacomv1beta1.GPUSKUTypeMI300,
+		},
+		{
+			name:     "MI250",
+			input:    "MI250",
+			expected: nvidiacomv1beta1.GPUSKUTypeMI200,
+		},
+		{
+			name:     "MI200",
+			input:    "MI200",
+			expected: nvidiacomv1beta1.GPUSKUTypeMI200,
+		},
+		// --- Normalization tests ---
+		{
+			name:     "lowercase + spaces",
+			input:    "h100 sxm",
+			expected: nvidiacomv1beta1.GPUSKUTypeH100SXM,
+		},
+		{
+			name:     "mixed case + dash",
+			input:    "A100-sXm",
+			expected: nvidiacomv1beta1.GPUSKUTypeA100SXM,
+		},
+		{
+			name:     "with extra spaces",
+			input:    "  H100   PCIe ",
+			expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
+		},
 	}
 	for _, tt := range tests {
-		t.Run(tt.description, func(t *testing.T) {
+		t.Run(tt.name, func(t *testing.T) {
-			result := InferHardwareSystem(tt.gpuProduct)
+			result := InferHardwareSystem(tt.input)
-			assert.Equal(t, tt.expectedSystem, string(result), "Failed for GPU: %s", tt.gpuProduct)
+			if result != tt.expected {
+				t.Errorf("InferHardwareSystem(%q) = %v, want %v",
+					tt.input, result, tt.expected)
+			}
 		})
 	}
 }
@@ -382,6 +510,119 @@ func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
 	}
 }
+func TestNormalize(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "basic lowercase",
+			input:    "h100",
+			expected: "H100",
+		},
+		{
+			name:     "spaces removed",
+			input:    "H100 SXM",
+			expected: "H100SXM",
+		},
+		{
+			name:     "dashes replaced and removed",
+			input:    "H100-SXM",
+			expected: "H100SXM",
+		},
+		{
+			name:     "mixed spaces and dashes",
+			input:    "A100 - SXM",
+			expected: "A100SXM",
+		},
+		{
+			name:     "extra whitespace",
+			input:    "  H100   PCIe ",
+			expected: "H100PCIE",
+		},
+		{
+			name:     "complex string",
+			input:    "h100-sxm5-80gb",
+			expected: "H100SXM580GB",
+		},
+		{
+			name:     "empty string",
+			input:    "",
+			expected: "",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := normalize(tt.input)
+			if result != tt.expected {
+				t.Errorf("normalize(%q) = %q, want %q",
+					tt.input, result, tt.expected)
+			}
+		})
+	}
+}
+func TestDetectFormFactor(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string // already normalized
+		expected string
+	}{
+		{
+			name:     "detect SXM explicitly",
+			input:    "H100SXM",
+			expected: formFactorSXM,
+		},
+		{
+			name:     "detect HGX implies SXM",
+			input:    "HGXH100",
+			expected: formFactorSXM,
+		},
+		{
+			name:     "detect DGX implies SXM",
+			input:    "DGXH100",
+			expected: formFactorSXM,
+		},
+		{
+			name:     "detect PCIe explicitly",
+			input:    "H100PCIE",
+			expected: formFactorPCIe,
+		},
+		{
+			name:     "default to PCIe when unknown",
+			input:    "H100",
+			expected: formFactorPCIe,
+		},
+		{
+			name:     "SXM wins over PCIe if both present",
+			input:    "H100SXMPCIE",
+			expected: formFactorSXM,
+		},
+		{
+			name:     "random string defaults to PCIe",
+			input:    "RANDOMGPU",
+			expected: formFactorPCIe,
+		},
+		{
+			name:     "empty string defaults to PCIe",
+			input:    "",
+			expected: formFactorPCIe,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := detectFormFactor(tt.input)
+			if result != tt.expected {
+				t.Errorf("detectFormFactor(%q) = %v, want %v",
+					tt.input, result, tt.expected)
+			}
+		})
+	}
+}
 func TestParseMetrics(t *testing.T) {
 	ctx := context.Background()
@@ -873,3 +1114,117 @@ func TestGetCloudProviderInfo(t *testing.T) {
 		})
 	}
 }
+func TestDetectRDMAFromNode(t *testing.T) {
+	scheme := runtime.NewScheme()
+	_ = corev1.AddToScheme(scheme)
+	tests := []struct {
+		name        string
+		node        *corev1.Node
+		nodeName    string
+		expectedOK  bool
+		expectedTyp string
+	}{
+		{
+			name:        "node not found",
+			node:        nil,
+			nodeName:    "missing-node",
+			expectedOK:  false,
+			expectedTyp: strNone,
+		},
+		{
+			name: "rdma detected",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node-rdma",
+					Labels: map[string]string{
+						"nvidia.com/rdma.present": "true",
+					},
+				},
+			},
+			nodeName:    "node-rdma",
+			expectedOK:  true,
+			expectedTyp: "rdma",
+		},
+		{
+			name: "sriov detected",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node-sriov",
+					Labels: map[string]string{
+						"feature.node.kubernetes.io/network-sriov.capable": "true",
+					},
+				},
+			},
+			nodeName:    "node-sriov",
+			expectedOK:  true,
+			expectedTyp: "sriov",
+		},
+		{
+			name: "both rdma and sriov - rdma takes precedence",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node-both",
+					Labels: map[string]string{
+						"nvidia.com/rdma.present":                          "true",
+						"feature.node.kubernetes.io/network-sriov.capable": "true",
+					},
+				},
+			},
+			nodeName:    "node-both",
+			expectedOK:  true,
+			expectedTyp: "rdma",
+		},
+		{
+			name: "no relevant labels",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:   "node-none",
+					Labels: map[string]string{},
+				},
+			},
+			nodeName:    "node-none",
+			expectedOK:  false,
+			expectedTyp: strNone,
+		},
+		{
+			name: "labels present but false",
+			node: &corev1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: "node-false",
+					Labels: map[string]string{
+						"nvidia.com/rdma.present":                          "false",
+						"feature.node.kubernetes.io/network-sriov.capable": "false",
+					},
+				},
+			},
+			nodeName:    "node-false",
+			expectedOK:  false,
+			expectedTyp: strNone,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			var objs []runtime.Object
+			if tt.node != nil {
+				objs = append(objs, tt.node)
+			}
+			fakeClient := fake.NewClientBuilder().
+				WithScheme(scheme).
+				WithRuntimeObjects(objs...).
+				Build()
+			ok, typ := detectRDMAFromNode(context.TODO(), fakeClient, tt.nodeName)
+			if ok != tt.expectedOK {
+				t.Errorf("expected ok=%v, got %v", tt.expectedOK, ok)
+			}
+			if typ != tt.expectedTyp {
+				t.Errorf("expected type=%s, got %s", tt.expectedTyp, typ)
+			}
+		})
+	}
+}
--- a/docs/kubernetes/api-reference.md
+++ b/docs/kubernetes/api-reference.md
@@ -1542,19 +1542,28 @@ _Underlying type:_ _string_
 GPUSKUType is the AIC hardware system identifier for a supported GPU.
 _Validation:_
- Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s]
+- Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300]
 _Appears in:_
 - [HardwareSpec](#hardwarespec)
 | Field | Description |
 | --- | --- |
-| `gb200_sxm` |  |
+| `gb200_sxm` | --- Blackwell ---<br /> |
-| `h200_sxm` |  |
-| `h100_sxm` |  |
 | `b200_sxm` |  |
-| `a100_sxm` |  |
+| `h200_sxm` | --- Hopper ---<br /> |
-| `l40s` |  |
+| `h100_sxm` |  |
+| `h100_pcie` |  |
+| `a100_sxm` | --- Ampere ---<br /> |
+| `a100_pcie` |  |
+| `l40s` | --- Ada ---<br /> |
+| `l40` |  |
+| `l4` |  |
+| `v100_sxm` | --- Older NVIDIA ---<br /> |
+| `v100_pcie` |  |
+| `t4` |  |
+| `mi200` | --- AMD ---<br /> |
+| `mi300` |  |
 #### HardwareSpec
@@ -1571,10 +1580,12 @@ _Appears in:_
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. |  | Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s] <br />Optional: \{\} <br /> |
+| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. |  | Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300] <br />Optional: \{\} <br /> |
 | `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. |  | Optional: \{\} <br /> |
 | `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. |  | Optional: \{\} <br /> |
 | `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. |  | Optional: \{\} <br /> |
+| `interconnect` _string_ | Interconnect describes the GPU interconnect type within a node.<br />Examples: "pcie", "nvlink", "infiniband". |  | Optional: \{\} <br /> |
+| `rdma` _boolean_ | RDMA indicates whether RDMA is available on the cluster. |  | Optional: \{\} <br /> |