Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
71fafe3f
Unverified
Commit
71fafe3f
authored
Apr 23, 2026
by
hhzhang16
Committed by
GitHub
Apr 23, 2026
Browse files
fix: add hardware discovery fallbacks and infer attempt (#8507)
Signed-off-by:
Hannah Zhang
<
hannahz@nvidia.com
>
parent
9572355f
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
165 additions
and
3 deletions
+165
-3
deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
...nal/controller/dynamographdeploymentrequest_controller.go
+6
-3
deploy/operator/internal/controller/enrich_hardware_test.go
deploy/operator/internal/controller/enrich_hardware_test.go
+116
-0
deploy/operator/internal/gpu/discovery.go
deploy/operator/internal/gpu/discovery.go
+6
-0
deploy/operator/internal/gpu/discovery_test.go
deploy/operator/internal/gpu/discovery_test.go
+37
-0
No files found.
deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
View file @
71fafe3f
...
...
@@ -1453,10 +1453,13 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
"cloudprovider"
,
gpuInfo
.
CloudProvider
)
if
hw
.
GPUSKU
==
""
{
if
gpuInfo
.
System
!=
""
{
inferred
:=
gpu
.
InferHardwareSystem
(
gpuInfo
.
Model
)
switch
{
case
gpuInfo
.
System
!=
""
:
hw
.
GPUSKU
=
gpuInfo
.
System
}
else
{
// Unknown GPU type: use raw model name; profiler will attempt naive config generation.
case
inferred
!=
""
:
hw
.
GPUSKU
=
inferred
default
:
hw
.
GPUSKU
=
nvidiacomv1beta1
.
GPUSKUType
(
gpuInfo
.
Model
)
}
}
...
...
deploy/operator/internal/controller/enrich_hardware_test.go
View file @
71fafe3f
...
...
@@ -19,6 +19,7 @@ package controller
import
(
"context"
"fmt"
"testing"
nvidiacomv1beta1
"github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
...
...
@@ -46,6 +47,23 @@ func newFakeReconciler(objs ...client.Object) *DynamoGraphDeploymentRequestRecon
}
}
func
gpuNode
(
name
,
product
string
,
gpuCount
int
,
vramMiB
int
)
*
corev1
.
Node
{
return
&
corev1
.
Node
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
name
,
Labels
:
map
[
string
]
string
{
gpupkg
.
LabelGPUCount
:
intStr
(
gpuCount
),
gpupkg
.
LabelGPUProduct
:
product
,
gpupkg
.
LabelGPUMemory
:
intStr
(
vramMiB
),
},
},
}
}
// intStr formats n as its base-10 decimal string (e.g. 8 -> "8").
func intStr(n int) string {
	return fmt.Sprint(n)
}
func
dcgmPod
(
name
,
ip
string
)
*
corev1
.
Pod
{
return
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
name
,
Namespace
:
"gpu-operator"
,
...
...
@@ -175,3 +193,101 @@ func TestEnrichHardwareFromDiscovery(t *testing.T) {
})
}
}
// TestEnrichHardwareFromDiscovery_NormalizesBareModelFromDCGM is the regression test for
// the bug where DCGM reports "NVIDIA H200" (no SXM suffix, system="") and the controller
// serialized the raw string into the profiling job config instead of normalizing it to
// "h200_sxm", causing the Python profiler's Pydantic enum validation to fail.
func
TestEnrichHardwareFromDiscovery_NormalizesBareModelFromDCGM
(
t
*
testing
.
T
)
{
tests
:=
[]
struct
{
name
string
dcgmModel
string
expectedGPUSKU
string
}{
{
name
:
"NVIDIA H200 from DCGM normalizes to h200_sxm"
,
dcgmModel
:
"NVIDIA H200"
,
expectedGPUSKU
:
"h200_sxm"
,
},
{
name
:
"NVIDIA B200 from DCGM normalizes to b200_sxm"
,
dcgmModel
:
"NVIDIA B200"
,
expectedGPUSKU
:
"b200_sxm"
,
},
}
for
_
,
tt
:=
range
tests
{
t
.
Run
(
tt
.
name
,
func
(
t
*
testing
.
T
)
{
scheme
:=
runtime
.
NewScheme
()
_
=
corev1
.
AddToScheme
(
scheme
)
dcgmPod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"dcgm-exporter"
,
Namespace
:
"default"
,
Labels
:
map
[
string
]
string
{
gpupkg
.
LabelApp
:
gpupkg
.
LabelValueNvidiaDCGMExporter
,
},
},
Status
:
corev1
.
PodStatus
{
Phase
:
corev1
.
PodRunning
,
PodIP
:
"10.0.0.1"
,
},
}
fakeClient
:=
fake
.
NewClientBuilder
()
.
WithScheme
(
scheme
)
.
WithObjects
(
dcgmPod
)
.
Build
()
// Mock scraper returns System="" to simulate the scenario where
// DCGM metrics lack a form factor suffix (e.g. "NVIDIA H200").
mockScraper
:=
func
(
_
context
.
Context
,
_
string
)
(
*
gpupkg
.
GPUInfo
,
error
)
{
return
&
gpupkg
.
GPUInfo
{
NodeName
:
"gpu-node"
,
GPUsPerNode
:
8
,
Model
:
tt
.
dcgmModel
,
VRAMPerGPU
:
143770
,
System
:
""
,
},
nil
}
r
:=
&
DynamoGraphDeploymentRequestReconciler
{
Client
:
fakeClient
,
APIReader
:
fakeClient
,
Recorder
:
&
record
.
FakeRecorder
{},
GPUDiscovery
:
gpupkg
.
NewGPUDiscovery
(
mockScraper
),
GPUDiscoveryCache
:
gpupkg
.
NewGPUDiscoveryCache
(),
}
dgdr
:=
&
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
{
Spec
:
nvidiacomv1beta1
.
DynamoGraphDeploymentRequestSpec
{},
}
err
:=
r
.
enrichHardwareFromDiscovery
(
context
.
Background
(),
dgdr
)
require
.
NoError
(
t
,
err
)
require
.
NotNil
(
t
,
dgdr
.
Spec
.
Hardware
)
assert
.
Equal
(
t
,
tt
.
expectedGPUSKU
,
string
(
dgdr
.
Spec
.
Hardware
.
GPUSKU
),
"gpuSku must be a valid profiler enum, not the raw DCGM model string %q"
,
tt
.
dcgmModel
)
})
}
}
// TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU verifies that for GPUs
// not in the AIC support matrix, the raw GFD product name is used as a fallback.
func
TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU
(
t
*
testing
.
T
)
{
r
:=
newFakeReconciler
(
gpuNode
(
"gpu-node-1"
,
"Tesla-V100-SXM2-16GB"
,
8
,
16384
))
dgdr
:=
&
nvidiacomv1beta1
.
DynamoGraphDeploymentRequest
{
Spec
:
nvidiacomv1beta1
.
DynamoGraphDeploymentRequestSpec
{
Hardware
:
&
nvidiacomv1beta1
.
HardwareSpec
{
GPUSKU
:
"Tesla-V100-SXM2-16GB"
,
VRAMMB
:
ptr
.
To
(
float64
(
16384
)),
NumGPUsPerNode
:
ptr
.
To
(
int32
(
8
)),
TotalGPUs
:
ptr
.
To
(
int32
(
8
)),
},
},
}
err
:=
r
.
enrichHardwareFromDiscovery
(
context
.
Background
(),
dgdr
)
require
.
NoError
(
t
,
err
)
require
.
NotNil
(
t
,
dgdr
.
Spec
.
Hardware
)
assert
.
Equal
(
t
,
"Tesla-V100-SXM2-16GB"
,
string
(
dgdr
.
Spec
.
Hardware
.
GPUSKU
),
"Unknown GPU should fall back to raw model name"
)
}
deploy/operator/internal/gpu/discovery.go
View file @
71fafe3f
...
...
@@ -850,6 +850,12 @@ func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType {
if
rule
.
pcieSKU
!=
""
{
return
rule
.
pcieSKU
}
// Token matched but no form factor indicator was present in the string
// (e.g. "NVIDIA H200" from DCGM has no SXM/HGX/DGX suffix). If the GPU
// has no PCIe variant it must be SXM-only (H200, B200, GB200).
if
rule
.
sxmSKU
!=
""
{
return
rule
.
sxmSKU
}
}
}
...
...
deploy/operator/internal/gpu/discovery_test.go
View file @
71fafe3f
...
...
@@ -451,6 +451,43 @@ func TestInferHardwareSystem(t *testing.T) {
expected
:
nvidiacomv1beta1
.
GPUSKUTypeMI200
,
},
// --- Bare DCGM model names (no form factor suffix) ---
// DCGM often reports "NVIDIA H200" / "NVIDIA B200" with system="" because
// there is no SXM/HGX/DGX token in the string. GPUs that have no PCIe
// variant must still resolve to their SXM SKU.
{
name
:
"NVIDIA H200 bare (DCGM format, no SXM suffix)"
,
input
:
"NVIDIA H200"
,
expected
:
nvidiacomv1beta1
.
GPUSKUTypeH200SXM
,
},
{
name
:
"NVIDIA B200 bare (DCGM format, no SXM suffix)"
,
input
:
"NVIDIA B200"
,
expected
:
nvidiacomv1beta1
.
GPUSKUTypeB200SXM
,
},
{
name
:
"NVIDIA GB200 bare (DCGM format, no SXM suffix)"
,
input
:
"NVIDIA GB200"
,
expected
:
nvidiacomv1beta1
.
GPUSKUTypeGB200SXM
,
},
{
name
:
"H200 bare without vendor prefix"
,
input
:
"H200"
,
expected
:
nvidiacomv1beta1
.
GPUSKUTypeH200SXM
,
},
// H100/A100 still default to PCIe when no form factor indicator is present,
// because those GPUs have a real PCIe variant.
{
name
:
"H100 bare still defaults to PCIe (has PCIe variant)"
,
input
:
"H100"
,
expected
:
nvidiacomv1beta1
.
GPUSKUTypeH100PCIe
,
},
{
name
:
"A100 bare still defaults to PCIe (has PCIe variant)"
,
input
:
"A100"
,
expected
:
nvidiacomv1beta1
.
GPUSKUTypeA100PCIe
,
},
// --- Normalization tests ---
{
name
:
"lowercase + spaces"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment