Unverified commit 71fafe3f, authored by hhzhang16 and committed by GitHub
Browse files

fix: add hardware discovery fallbacks and infer attempt (#8507)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 9572355f
...@@ -1453,10 +1453,13 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx ...@@ -1453,10 +1453,13 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
"cloudprovider", gpuInfo.CloudProvider) "cloudprovider", gpuInfo.CloudProvider)
if hw.GPUSKU == "" { if hw.GPUSKU == "" {
if gpuInfo.System != "" { inferred := gpu.InferHardwareSystem(gpuInfo.Model)
switch {
case gpuInfo.System != "":
hw.GPUSKU = gpuInfo.System hw.GPUSKU = gpuInfo.System
} else { case inferred != "":
// Unknown GPU type: use raw model name; profiler will attempt naive config generation. hw.GPUSKU = inferred
default:
hw.GPUSKU = nvidiacomv1beta1.GPUSKUType(gpuInfo.Model) hw.GPUSKU = nvidiacomv1beta1.GPUSKUType(gpuInfo.Model)
} }
} }
......
...@@ -19,6 +19,7 @@ package controller ...@@ -19,6 +19,7 @@ package controller
import ( import (
"context" "context"
"fmt"
"testing" "testing"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1" nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
...@@ -46,6 +47,23 @@ func newFakeReconciler(objs ...client.Object) *DynamoGraphDeploymentRequestRecon ...@@ -46,6 +47,23 @@ func newFakeReconciler(objs ...client.Object) *DynamoGraphDeploymentRequestRecon
} }
} }
// gpuNode builds a Node test fixture called name whose labels carry the GPU
// count, the GPU product string, and the GPU memory value passed in. The two
// numeric arguments are rendered as decimal strings via intStr.
func gpuNode(name, product string, gpuCount int, vramMiB int) *corev1.Node {
	labels := map[string]string{
		gpupkg.LabelGPUCount:   intStr(gpuCount),
		gpupkg.LabelGPUProduct: product,
		gpupkg.LabelGPUMemory:  intStr(vramMiB),
	}

	meta := metav1.ObjectMeta{
		Name:   name,
		Labels: labels,
	}
	return &corev1.Node{ObjectMeta: meta}
}
// intStr returns the base-10 decimal representation of n.
func intStr(n int) string {
	return fmt.Sprint(n)
}
func dcgmPod(name, ip string) *corev1.Pod { func dcgmPod(name, ip string) *corev1.Pod {
return &corev1.Pod{ return &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "gpu-operator", ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "gpu-operator",
...@@ -175,3 +193,101 @@ func TestEnrichHardwareFromDiscovery(t *testing.T) { ...@@ -175,3 +193,101 @@ func TestEnrichHardwareFromDiscovery(t *testing.T) {
}) })
} }
} }
// TestEnrichHardwareFromDiscovery_NormalizesBareModelFromDCGM is the regression test for
// the bug where DCGM reports "NVIDIA H200" (no SXM suffix, system="") and the controller
// serialized the raw string into the profiling job config instead of normalizing it to
// "h200_sxm", causing the Python profiler's Pydantic enum validation to fail.
//
// Each case runs enrichHardwareFromDiscovery against a fake client holding one running
// DCGM exporter pod and a mock scraper that returns the bare model name with an empty
// System field, then asserts the resulting GPUSKU.
func TestEnrichHardwareFromDiscovery_NormalizesBareModelFromDCGM(t *testing.T) {
	tests := []struct {
		name           string
		dcgmModel      string
		expectedGPUSKU string
	}{
		{
			name:           "NVIDIA H200 from DCGM normalizes to h200_sxm",
			dcgmModel:      "NVIDIA H200",
			expectedGPUSKU: "h200_sxm",
		},
		{
			name:           "NVIDIA B200 from DCGM normalizes to b200_sxm",
			dcgmModel:      "NVIDIA B200",
			expectedGPUSKU: "b200_sxm",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			scheme := runtime.NewScheme()
			// Fail here if core types cannot be registered, rather than letting a
			// half-built scheme surface as an unrelated client error further down.
			require.NoError(t, corev1.AddToScheme(scheme))

			// Named exporterPod so it does not shadow the package-level dcgmPod helper.
			exporterPod := &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "dcgm-exporter",
					Namespace: "default",
					Labels: map[string]string{
						gpupkg.LabelApp: gpupkg.LabelValueNvidiaDCGMExporter,
					},
				},
				Status: corev1.PodStatus{
					Phase: corev1.PodRunning,
					PodIP: "10.0.0.1",
				},
			}

			fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(exporterPod).Build()

			// Mock scraper returns System="" to simulate the scenario where
			// DCGM metrics lack a form factor suffix (e.g. "NVIDIA H200").
			mockScraper := func(_ context.Context, _ string) (*gpupkg.GPUInfo, error) {
				return &gpupkg.GPUInfo{
					NodeName:    "gpu-node",
					GPUsPerNode: 8,
					Model:       tt.dcgmModel,
					VRAMPerGPU:  143770,
					System:      "",
				}, nil
			}

			r := &DynamoGraphDeploymentRequestReconciler{
				Client:            fakeClient,
				APIReader:         fakeClient,
				Recorder:          &record.FakeRecorder{},
				GPUDiscovery:      gpupkg.NewGPUDiscovery(mockScraper),
				GPUDiscoveryCache: gpupkg.NewGPUDiscoveryCache(),
			}

			// Empty spec: no hardware is supplied by the user, so every field
			// must come from discovery.
			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
				Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{},
			}

			err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
			require.NoError(t, err)
			require.NotNil(t, dgdr.Spec.Hardware)

			assert.Equal(t, tt.expectedGPUSKU, string(dgdr.Spec.Hardware.GPUSKU),
				"gpuSku must be a valid profiler enum, not the raw DCGM model string %q", tt.dcgmModel)
		})
	}
}
// TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU is intended to cover GPUs
// that are not in the AIC support matrix, where the raw GFD product name should end up
// as the GPU SKU. It runs enrichHardwareFromDiscovery on a request whose hardware spec
// names "Tesla-V100-SXM2-16GB" and asserts that string is still the SKU afterwards.
//
// NOTE(review): the request already has GPUSKU (and the other hardware fields) set
// before the call, so this assertion may pass without the discovery fallback ever
// running — confirm whether Hardware should be left empty here.
func TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU(t *testing.T) {
	const rawModel = "Tesla-V100-SXM2-16GB"

	r := newFakeReconciler(gpuNode("gpu-node-1", rawModel, 8, 16384))

	hardware := &nvidiacomv1beta1.HardwareSpec{
		GPUSKU:         rawModel,
		VRAMMB:         ptr.To(float64(16384)),
		NumGPUsPerNode: ptr.To(int32(8)),
		TotalGPUs:      ptr.To(int32(8)),
	}
	dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
		Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{Hardware: hardware},
	}

	require.NoError(t, r.enrichHardwareFromDiscovery(context.Background(), dgdr))
	require.NotNil(t, dgdr.Spec.Hardware)

	gotSKU := string(dgdr.Spec.Hardware.GPUSKU)
	assert.Equal(t, rawModel, gotSKU,
		"Unknown GPU should fall back to raw model name")
}
...@@ -850,6 +850,12 @@ func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType { ...@@ -850,6 +850,12 @@ func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType {
if rule.pcieSKU != "" { if rule.pcieSKU != "" {
return rule.pcieSKU return rule.pcieSKU
} }
// Token matched but no form factor indicator was present in the string
// (e.g. "NVIDIA H200" from DCGM has no SXM/HGX/DGX suffix). If the GPU
// has no PCIe variant it must be SXM-only (H200, B200, GB200).
if rule.sxmSKU != "" {
return rule.sxmSKU
}
} }
} }
......
...@@ -451,6 +451,43 @@ func TestInferHardwareSystem(t *testing.T) { ...@@ -451,6 +451,43 @@ func TestInferHardwareSystem(t *testing.T) {
expected: nvidiacomv1beta1.GPUSKUTypeMI200, expected: nvidiacomv1beta1.GPUSKUTypeMI200,
}, },
// --- Bare DCGM model names (no form factor suffix) ---
// DCGM often reports "NVIDIA H200" / "NVIDIA B200" with system="" because
// there is no SXM/HGX/DGX token in the string. GPUs that have no PCIe
// variant must still resolve to their SXM SKU.
{
name: "NVIDIA H200 bare (DCGM format, no SXM suffix)",
input: "NVIDIA H200",
expected: nvidiacomv1beta1.GPUSKUTypeH200SXM,
},
{
name: "NVIDIA B200 bare (DCGM format, no SXM suffix)",
input: "NVIDIA B200",
expected: nvidiacomv1beta1.GPUSKUTypeB200SXM,
},
{
name: "NVIDIA GB200 bare (DCGM format, no SXM suffix)",
input: "NVIDIA GB200",
expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM,
},
{
name: "H200 bare without vendor prefix",
input: "H200",
expected: nvidiacomv1beta1.GPUSKUTypeH200SXM,
},
// H100/A100 still default to PCIe when no form factor indicator is present,
// because those GPUs have a real PCIe variant.
{
name: "H100 bare still defaults to PCIe (has PCIe variant)",
input: "H100",
expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
},
{
name: "A100 bare still defaults to PCIe (has PCIe variant)",
input: "A100",
expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe,
},
// --- Normalization tests --- // --- Normalization tests ---
{ {
name: "lowercase + spaces", name: "lowercase + spaces",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment