fix: add hardware discovery fallbacks and infer attempt (#8507)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

fix: add hardware discovery fallbacks and infer attempt (#8507)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
71fafe3f · hhzhang16 · GitHub · 9572355f · 71fafe3f · 71fafe3f
Unverified Commit 71fafe3f authored Apr 23, 2026 by hhzhang16 Committed by GitHub Apr 23, 2026
4 changed files
--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
@@ -1453,10 +1453,13 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
 		"cloudprovider", gpuInfo.CloudProvider)
 	if hw.GPUSKU == "" {
-		if gpuInfo.System != "" {
+		inferred := gpu.InferHardwareSystem(gpuInfo.Model)
+		switch {
+		case gpuInfo.System != "":
 			hw.GPUSKU = gpuInfo.System
-		} else {
+		case inferred != "":
-			// Unknown GPU type: use raw model name; profiler will attempt naive config generation.
+			hw.GPUSKU = inferred
+		default:
 			hw.GPUSKU = nvidiacomv1beta1.GPUSKUType(gpuInfo.Model)
 		}
 	}

--- a/deploy/operator/internal/controller/enrich_hardware_test.go
+++ b/deploy/operator/internal/controller/enrich_hardware_test.go
@@ -19,6 +19,7 @@ package controller
 import (
 	"context"
+	"fmt"
 	"testing"
 	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
@@ -46,6 +47,23 @@ func newFakeReconciler(objs ...client.Object) *DynamoGraphDeploymentRequestRecon
 	}
 }
+// gpuNode returns a Node named name whose labels advertise a GPU inventory:
+// gpupkg.LabelGPUCount, gpupkg.LabelGPUProduct and gpupkg.LabelGPUMemory are set
+// from gpuCount, product and vramMiB respectively. The numeric values are rendered
+// as decimal strings via intStr; vramMiB is stored as given (the parameter name
+// suggests MiB, no conversion is applied here).
+func gpuNode(name, product string, gpuCount int, vramMiB int) *corev1.Node {
+	return &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: name,
+			Labels: map[string]string{
+				gpupkg.LabelGPUCount:   intStr(gpuCount),
+				gpupkg.LabelGPUProduct: product,
+				gpupkg.LabelGPUMemory:  intStr(vramMiB),
+			},
+		},
+	}
+}
+// intStr formats n as a base-10 string; gpuNode uses it to build label values.
+func intStr(n int) string {
+	return fmt.Sprintf("%d", n)
+}
 func dcgmPod(name, ip string) *corev1.Pod {
 	return &corev1.Pod{
 		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "gpu-operator",
@@ -175,3 +193,101 @@ func TestEnrichHardwareFromDiscovery(t *testing.T) {
 		})
 	}
 }
+// TestEnrichHardwareFromDiscovery_NormalizesBareModelFromDCGM is the regression test for
+// the bug where DCGM reports "NVIDIA H200" (no SXM suffix, system="") and the controller
+// serialized the raw string into the profiling job config instead of normalizing it to
+// "h200_sxm", causing the Python profiler's Pydantic enum validation to fail.
+//
+// Each case feeds a bare model name through a mock scraper with System left empty and
+// asserts that the resulting Spec.Hardware.GPUSKU equals the expected normalized SKU.
+func TestEnrichHardwareFromDiscovery_NormalizesBareModelFromDCGM(t *testing.T) {
+	tests := []struct {
+		name           string
+		dcgmModel      string
+		expectedGPUSKU string
+	}{
+		{
+			name:           "NVIDIA H200 from DCGM normalizes to h200_sxm",
+			dcgmModel:      "NVIDIA H200",
+			expectedGPUSKU: "h200_sxm",
+		},
+		{
+			name:           "NVIDIA B200 from DCGM normalizes to b200_sxm",
+			dcgmModel:      "NVIDIA B200",
+			expectedGPUSKU: "b200_sxm",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Register the core/v1 types on the scheme handed to the fake client below.
+			scheme := runtime.NewScheme()
+			_ = corev1.AddToScheme(scheme)
+			// A running pod carrying the DCGM exporter app label and a pod IP.
+			// Presumably this is what discovery looks up before invoking the
+			// scraper; confirm against the gpupkg discovery code.
+			dcgmPod := &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "dcgm-exporter",
+					Namespace: "default",
+					Labels: map[string]string{
+						gpupkg.LabelApp: gpupkg.LabelValueNvidiaDCGMExporter,
+					},
+				},
+				Status: corev1.PodStatus{
+					Phase: corev1.PodRunning,
+					PodIP: "10.0.0.1",
+				},
+			}
+			fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(dcgmPod).Build()
+			// Mock scraper returns System="" to simulate the scenario where
+			// DCGM metrics lack a form factor suffix (e.g. "NVIDIA H200").
+			mockScraper := func(_ context.Context, _ string) (*gpupkg.GPUInfo, error) {
+				return &gpupkg.GPUInfo{
+					NodeName:    "gpu-node",
+					GPUsPerNode: 8,
+					Model:       tt.dcgmModel,
+					VRAMPerGPU:  143770,
+					System:      "",
+				}, nil
+			}
+			// The same fake client serves as both the cached Client and the APIReader.
+			r := &DynamoGraphDeploymentRequestReconciler{
+				Client:            fakeClient,
+				APIReader:         fakeClient,
+				Recorder:          &record.FakeRecorder{},
+				GPUDiscovery:      gpupkg.NewGPUDiscovery(mockScraper),
+				GPUDiscoveryCache: gpupkg.NewGPUDiscoveryCache(),
+			}
+			// Empty spec: no Hardware is supplied by the caller, so any Hardware
+			// present afterwards was populated by enrichHardwareFromDiscovery.
+			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
+				Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{},
+			}
+			err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
+			require.NoError(t, err)
+			require.NotNil(t, dgdr.Spec.Hardware)
+			assert.Equal(t, tt.expectedGPUSKU, string(dgdr.Spec.Hardware.GPUSKU),
+				"gpuSku must be a valid profiler enum, not the raw DCGM model string %q", tt.dcgmModel)
+		})
+	}
+}
+// TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU is intended to verify
+// that for GPUs not in the AIC support matrix, the raw GFD product name is used as a
+// fallback.
+//
+// NOTE(review): the request below already sets GPUSKU to the exact string that is
+// asserted afterwards, and the controller only assigns GPUSKU when it is empty
+// (`if hw.GPUSKU == ""`). As written this looks like it checks that a user-supplied
+// SKU is left untouched rather than exercising the raw-model fallback branch.
+// TODO confirm, and consider leaving GPUSKU empty if the fake reconciler can
+// complete discovery from the node labels alone.
+func TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU(t *testing.T) {
+	r := newFakeReconciler(gpuNode("gpu-node-1", "Tesla-V100-SXM2-16GB", 8, 16384))
+	dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
+		Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
+			Hardware: &nvidiacomv1beta1.HardwareSpec{
+				GPUSKU:         "Tesla-V100-SXM2-16GB",
+				VRAMMB:         ptr.To(float64(16384)),
+				NumGPUsPerNode: ptr.To(int32(8)),
+				TotalGPUs:      ptr.To(int32(8)),
+			},
+		},
+	}
+	err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
+	require.NoError(t, err)
+	require.NotNil(t, dgdr.Spec.Hardware)
+	assert.Equal(t, "Tesla-V100-SXM2-16GB", string(dgdr.Spec.Hardware.GPUSKU),
+		"Unknown GPU should fall back to raw model name")
+}
--- a/deploy/operator/internal/gpu/discovery.go
+++ b/deploy/operator/internal/gpu/discovery.go
@@ -850,6 +850,12 @@ func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType {
 			if rule.pcieSKU != "" {
 				return rule.pcieSKU
 			}
+			// Token matched but no form factor indicator was present in the string
+			// (e.g. "NVIDIA H200" from DCGM has no SXM/HGX/DGX suffix). If the GPU
+			// has no PCIe variant it must be SXM-only (H200, B200, GB200).
+			if rule.sxmSKU != "" {
+				return rule.sxmSKU
+			}
 		}
 	}

--- a/deploy/operator/internal/gpu/discovery_test.go
+++ b/deploy/operator/internal/gpu/discovery_test.go
@@ -451,6 +451,43 @@ func TestInferHardwareSystem(t *testing.T) {
 			expected: nvidiacomv1beta1.GPUSKUTypeMI200,
 		},
+		// --- Bare DCGM model names (no form factor suffix) ---
+		// DCGM often reports "NVIDIA H200" / "NVIDIA B200" with system="" because
+		// there is no SXM/HGX/DGX token in the string. GPUs that have no PCIe
+		// variant must still resolve to their SXM SKU.
+		{
+			name:     "NVIDIA H200 bare (DCGM format, no SXM suffix)",
+			input:    "NVIDIA H200",
+			expected: nvidiacomv1beta1.GPUSKUTypeH200SXM,
+		},
+		{
+			name:     "NVIDIA B200 bare (DCGM format, no SXM suffix)",
+			input:    "NVIDIA B200",
+			expected: nvidiacomv1beta1.GPUSKUTypeB200SXM,
+		},
+		{
+			name:     "NVIDIA GB200 bare (DCGM format, no SXM suffix)",
+			input:    "NVIDIA GB200",
+			expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM,
+		},
+		{
+			name:     "H200 bare without vendor prefix",
+			input:    "H200",
+			expected: nvidiacomv1beta1.GPUSKUTypeH200SXM,
+		},
+		// H100/A100 still default to PCIe when no form factor indicator is present,
+		// because those GPUs have a real PCIe variant.
+		{
+			name:     "H100 bare still defaults to PCIe (has PCIe variant)",
+			input:    "H100",
+			expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
+		},
+		{
+			name:     "A100 bare still defaults to PCIe (has PCIe variant)",
+			input:    "A100",
+			expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe,
+		},
 		// --- Normalization tests ---
 		{
 			name:     "lowercase + spaces",