Unverified commit 71f9e7a9, authored by hhzhang16, committed by GitHub
Browse files

fix: normalize GPUSKU to AIC system identifier (#6984)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 45fd53d0
......@@ -62,6 +62,15 @@ class SearchStrategy(str, Enum):
Thorough = "thorough"
class GPUSKUType(str, Enum):
GB200SXM = "gb200_sxm"
H200SXM = "h200_sxm"
H100SXM = "h100_sxm"
B200SXM = "b200_sxm"
A100SXM = "a100_sxm"
L40S = "l40s"
class BackendType(str, Enum):
Auto = "auto"
Sglang = "sglang"
......@@ -200,9 +209,9 @@ class FeaturesSpec(BaseModel):
class HardwareSpec(BaseModel):
"""HardwareSpec describes the hardware resources available for profiling and deployment. These fields are typically auto-filled by the operator from cluster discovery."""
gpuSku: Optional[str] = Field(
gpuSku: Optional[GPUSKUType] = Field(
default=None,
description='GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB").',
description="GPUSKU is the AIC hardware system identifier for the GPU. When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.",
)
vramMb: Optional[float] = Field(
default=None, description="VRAMMB is the VRAM per GPU in MiB."
......
......@@ -578,7 +578,24 @@ spec:
Typically auto-filled by the operator from cluster discovery.
properties:
gpuSku:
description: GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB").
allOf:
- enum:
- gb200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- a100_sxm
- l40s
- enum:
- gb200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- a100_sxm
- l40s
description: |-
GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
type: string
numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node.
......
......@@ -183,6 +183,19 @@ const (
SearchStrategyThorough SearchStrategy = "thorough"
)
// GPUSKUType is the AIC hardware system identifier for a supported GPU.
// It is a string type; the marker below lists the six identifiers accepted
// by CRD validation, one per constant declared in the group that follows.
// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
type GPUSKUType string
// Supported GPUSKUType values. Each constant's string value must stay in sync
// with the kubebuilder Enum marker on the GPUSKUType declaration.
const (
// GPUSKUTypeGB200SXM is the "gb200_sxm" system identifier.
GPUSKUTypeGB200SXM GPUSKUType = "gb200_sxm"
// GPUSKUTypeH200SXM is the "h200_sxm" system identifier.
GPUSKUTypeH200SXM GPUSKUType = "h200_sxm"
// GPUSKUTypeH100SXM is the "h100_sxm" system identifier.
GPUSKUTypeH100SXM GPUSKUType = "h100_sxm"
// GPUSKUTypeB200SXM is the "b200_sxm" system identifier.
GPUSKUTypeB200SXM GPUSKUType = "b200_sxm"
// GPUSKUTypeA100SXM is the "a100_sxm" system identifier.
GPUSKUTypeA100SXM GPUSKUType = "a100_sxm"
// GPUSKUTypeL40S is the "l40s" system identifier.
GPUSKUTypeL40S GPUSKUType = "l40s"
)
// BackendType specifies the inference backend.
// +kubebuilder:validation:Enum=auto;sglang;trtllm;vllm
type BackendType string
......@@ -324,9 +337,11 @@ type FeaturesSpec struct {
// HardwareSpec describes the hardware resources available for profiling and deployment.
// These fields are typically auto-filled by the operator from cluster discovery.
type HardwareSpec struct {
// GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB").
// GPUSKU is the AIC hardware system identifier for the GPU.
// When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
// +optional
GPUSKU string `json:"gpuSku,omitempty"`
// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
GPUSKU GPUSKUType `json:"gpuSku,omitempty"`
// VRAMMB is the VRAM per GPU in MiB.
// +optional
......
......@@ -578,7 +578,24 @@ spec:
Typically auto-filled by the operator from cluster discovery.
properties:
gpuSku:
description: GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB").
allOf:
- enum:
- gb200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- a100_sxm
- l40s
- enum:
- gb200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- a100_sxm
- l40s
description: |-
GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
type: string
numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node.
......
......@@ -1176,6 +1176,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
dgdr.Spec.Hardware = &nvidiacomv1beta1.HardwareSpec{}
}
hw := dgdr.Spec.Hardware
if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil {
return nil // all fields already set by user; TotalGPUs is filled below when discovery runs
}
......@@ -1191,10 +1192,16 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
"nodesWithGPUs", gpuInfo.NodesWithGPUs,
"totalGpus", gpuInfo.GPUsPerNode*gpuInfo.NodesWithGPUs,
"model", gpuInfo.Model,
"system", gpuInfo.System,
"vramMiB", gpuInfo.VRAMPerGPU)
if hw.GPUSKU == "" {
hw.GPUSKU = gpuInfo.Model
if gpuInfo.System != "" {
hw.GPUSKU = gpuInfo.System
} else {
// Unknown GPU type: use raw model name; profiler will attempt naive config generation.
hw.GPUSKU = nvidiacomv1beta1.GPUSKUType(gpuInfo.Model)
}
}
if hw.VRAMMB == nil {
vram := float64(gpuInfo.VRAMPerGPU)
......
......@@ -100,7 +100,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
AutoApply: ptr.To(true),
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -152,7 +152,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -227,7 +227,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -314,7 +314,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
SearchStrategy: "rapid",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -375,7 +375,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -486,7 +486,7 @@ spec:
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -611,7 +611,7 @@ spec:
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -687,7 +687,7 @@ spec:
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -807,7 +807,7 @@ var _ = Describe("DGDR Validation", func() {
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -831,7 +831,7 @@ var _ = Describe("DGDR Validation", func() {
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -892,7 +892,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Backend: "trtllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "H200-SXM",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH200SXM,
NumGPUsPerNode: ptr.To[int32](8),
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
......@@ -957,7 +957,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Image: "test-profiler:latest",
SearchStrategy: "rapid",
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "H200-SXM",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH200SXM,
NumGPUsPerNode: ptr.To[int32](8),
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
......@@ -1022,7 +1022,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -1104,7 +1104,7 @@ var _ = Describe("DGDR Error Handling", func() {
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -1468,7 +1468,7 @@ spec:
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](4),
GPUSKU: "A100-SXM4-40GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM,
VRAMMB: ptr.To(40960.0),
},
SLA: &nvidiacomv1beta1.SLASpec{
......@@ -1680,7 +1680,7 @@ spec:
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -1745,7 +1745,7 @@ spec:
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -1790,7 +1790,7 @@ spec:
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -1884,7 +1884,7 @@ spec:
AutoApply: ptr.To(true),
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -1977,7 +1977,7 @@ spec:
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
......@@ -2029,7 +2029,7 @@ spec:
Backend: "vllm",
Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "A100-SXM4-40GB",
GPUSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM,
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"context"
"fmt"
"testing"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
gpupkg "github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
)
// newFakeReconciler returns a DynamoGraphDeploymentRequestReconciler wired to a
// fake controller-runtime client seeded with the supplied nodes. The same fake
// client is used for both the Client and APIReader fields.
func newFakeReconciler(nodes ...*corev1.Node) *DynamoGraphDeploymentRequestReconciler {
	testScheme := runtime.NewScheme()
	// The registration error is intentionally ignored in this test helper.
	_ = corev1.AddToScheme(testScheme)

	// Widen each *corev1.Node to client.Object, as WithObjects takes that interface.
	seed := make([]client.Object, 0, len(nodes))
	for _, node := range nodes {
		seed = append(seed, node)
	}

	c := fake.NewClientBuilder().
		WithScheme(testScheme).
		WithObjects(seed...).
		Build()

	return &DynamoGraphDeploymentRequestReconciler{
		Client:    c,
		APIReader: c,
		Recorder:  &record.FakeRecorder{},
	}
}
// gpuNode builds a Node with the given name whose labels record the GPU
// product string, the GPU count, and the GPU memory value under the label
// keys exported by the gpu package. Numeric values are stored as decimal
// strings.
func gpuNode(name, product string, gpuCount int, vramMiB int) *corev1.Node {
	meta := metav1.ObjectMeta{Name: name}
	meta.Labels = map[string]string{
		gpupkg.LabelGPUProduct: product,
		gpupkg.LabelGPUCount:   intStr(gpuCount),
		gpupkg.LabelGPUMemory:  intStr(vramMiB),
	}
	return &corev1.Node{ObjectMeta: meta}
}
// intStr renders n as its base-10 string form.
func intStr(n int) string {
	return fmt.Sprint(n)
}
// TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier is the regression test for the
// bug where GPUSKU was set to the raw GFD product name (e.g. "NVIDIA-B200") instead of
// the AIC system identifier (e.g. "b200_sxm"), causing AIC support checks to always fail
// and forcing every model/backend to fall back to naive config generation.
func TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier(t *testing.T) {
tests := []struct {
name string
gfdProduct string // raw GFD label value
expectedGPUSKU string // what the profiler needs
}{
{
name: "B200 GFD label maps to AIC system identifier",
gfdProduct: "NVIDIA-B200",
expectedGPUSKU: "b200_sxm",
},
{
name: "H200 GFD label maps to AIC system identifier",
gfdProduct: "NVIDIA-H200-SXM5-141GB",
expectedGPUSKU: "h200_sxm",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
r := newFakeReconciler(gpuNode("gpu-node-1", tt.gfdProduct, 8, 141312))
dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}
err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
require.NoError(t, err)
require.NotNil(t, dgdr.Spec.Hardware)
assert.Equal(t, tt.expectedGPUSKU, string(dgdr.Spec.Hardware.GPUSKU),
"GPUSKU should be the AIC system identifier, not the raw GFD product name %q", tt.gfdProduct)
})
}
}
// TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU checks that a
// GPU outside the AIC support matrix leaves GPUSKU set to the raw GFD product
// name rather than an AIC system identifier.
func TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU(t *testing.T) {
	const unknownProduct = "Tesla-V100-SXM2-16GB"

	reconciler := newFakeReconciler(gpuNode("gpu-node-1", unknownProduct, 8, 16384))
	request := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}

	require.NoError(t, reconciler.enrichHardwareFromDiscovery(context.Background(), request))
	require.NotNil(t, request.Spec.Hardware)

	got := string(request.Spec.Hardware.GPUSKU)
	assert.Equal(t, unknownProduct, got, "Unknown GPU should fall back to raw model name")
}
......@@ -23,6 +23,7 @@ import (
"strconv"
"strings"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
......@@ -37,11 +38,11 @@ const (
// GPUInfo contains discovered GPU configuration from cluster nodes
type GPUInfo struct {
GPUsPerNode int // Maximum GPUs per node found in the cluster
NodesWithGPUs int // Number of nodes that have GPUs
Model string // GPU product name (e.g., "H100-SXM5-80GB")
VRAMPerGPU int // VRAM in MiB per GPU
System string // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
GPUsPerNode int // Maximum GPUs per node found in the cluster
NodesWithGPUs int // Number of nodes that have GPUs
Model string // GPU product name (e.g., "H100-SXM5-80GB")
VRAMPerGPU int // VRAM in MiB per GPU
System nvidiacomv1beta1.GPUSKUType // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
}
// DiscoverGPUs queries Kubernetes nodes to determine GPU configuration.
......@@ -170,7 +171,7 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
//
// Users can manually override the system in their profiling config (hardware.system)
// if auto-detection is incorrect or unavailable.
func InferHardwareSystem(gpuProduct string) string {
func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType {
if gpuProduct == "" {
return ""
}
......@@ -179,17 +180,17 @@ func InferHardwareSystem(gpuProduct string) string {
normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", ""))
normalized = strings.ReplaceAll(normalized, " ", "")
// Map common NVIDIA datacenter GPU products to hardware system identifiers
// Map common NVIDIA datacenter GPU products to AIC hardware system identifiers.
patterns := []struct {
pattern string
system string
system nvidiacomv1beta1.GPUSKUType
}{
{"GB200", "gb200_sxm"},
{"H200", "h200_sxm"},
{"H100", "h100_sxm"},
{"B200", "b200_sxm"},
{"A100", "a100_sxm"},
{"L40S", "l40s"},
{"GB200", nvidiacomv1beta1.GPUSKUTypeGB200SXM},
{"H200", nvidiacomv1beta1.GPUSKUTypeH200SXM},
{"H100", nvidiacomv1beta1.GPUSKUTypeH100SXM},
{"B200", nvidiacomv1beta1.GPUSKUTypeB200SXM},
{"A100", nvidiacomv1beta1.GPUSKUTypeA100SXM},
{"L40S", nvidiacomv1beta1.GPUSKUTypeL40S},
}
for _, p := range patterns {
......@@ -198,7 +199,7 @@ func InferHardwareSystem(gpuProduct string) string {
}
}
// Unknown GPU type, return empty string
// User must specify system manually in profiling config (hardware.system)
// Unknown GPU type, return empty value.
// User must specify gpuSku explicitly in spec.hardware.
return ""
}
......@@ -63,7 +63,7 @@ func TestDiscoverGPUs_SingleNode(t *testing.T) {
assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
assert.Equal(t, "h100_sxm", gpuInfo.System)
assert.Equal(t, "h100_sxm", string(gpuInfo.System))
}
func TestDiscoverGPUs_MultipleNodesHomogeneous(t *testing.T) {
......@@ -333,12 +333,15 @@ func TestInferHardwareSystem(t *testing.T) {
{"RTX 4090", "", "Consumer GPU (not in mapping)"},
{"Unknown-GPU", "", "Unknown GPU"},
{"", "", "Empty string"},
// GFD product names as seen in real cluster labels (regression for GPUSKU bug)
{"NVIDIA-B200", "b200_sxm", "B200 with NVIDIA prefix (GFD label format)"},
{"NVIDIA-H200-SXM5-141GB", "h200_sxm", "H200 with NVIDIA prefix (GFD label format)"},
}
for _, tt := range tests {
t.Run(tt.description, func(t *testing.T) {
result := InferHardwareSystem(tt.gpuProduct)
assert.Equal(t, tt.expectedSystem, result, "Failed for GPU: %s", tt.gpuProduct)
assert.Equal(t, tt.expectedSystem, string(result), "Failed for GPU: %s", tt.gpuProduct)
})
}
}
......@@ -354,7 +357,7 @@ func TestInferHardwareSystem_CaseInsensitive(t *testing.T) {
for _, variant := range variants {
result := InferHardwareSystem(variant)
assert.Equal(t, "h100_sxm", result, "Should handle case variations: %s", variant)
assert.Equal(t, "h100_sxm", string(result), "Should handle case variations: %s", variant)
}
}
......@@ -369,6 +372,6 @@ func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
for _, variant := range variants {
result := InferHardwareSystem(variant)
assert.Equal(t, "h100_sxm", result, "Should normalize spaces/dashes: %s", variant)
assert.Equal(t, "h100_sxm", string(result), "Should normalize spaces/dashes: %s", variant)
}
}
......@@ -1402,6 +1402,28 @@ _Appears in:_
| `mocker` _[MockerSpec](#mockerspec)_ | Mocker configures the simulated (mocker) backend for testing without GPUs. | | Optional: \{\} <br /> |
#### GPUSKUType
_Underlying type:_ _string_
GPUSKUType is the AIC hardware system identifier for a supported GPU.
_Validation:_
- Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s]
_Appears in:_
- [HardwareSpec](#hardwarespec)
| Field | Description |
| --- | --- |
| `gb200_sxm` | |
| `h200_sxm` | |
| `h100_sxm` | |
| `b200_sxm` | |
| `a100_sxm` | |
| `l40s` | |
#### HardwareSpec
......@@ -1416,7 +1438,7 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `gpuSku` _string_ | GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB"). | | Optional: \{\} <br /> |
| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. | | Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s] <br />Optional: \{\} <br /> |
| `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. | | Optional: \{\} <br /> |
| `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. | | Optional: \{\} <br /> |
| `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. | | Optional: \{\} <br /> |
......
......@@ -209,7 +209,7 @@ class TestValidDgdrSpec:
@pytest.mark.gpu_0
def test_missing_gpu_sku_raises(self):
"""hardware.gpuSku is required."""
dgdr = _make_dgdr(hardware=HardwareSpec(gpuSku="", numGpusPerNode=8))
dgdr = _make_dgdr(hardware=HardwareSpec(gpuSku=None, numGpusPerNode=8))
with pytest.raises(ValueError, match="gpuSku.*required"):
valid_dgdr_spec(dgdr)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment