fix: normalize GPUSKU to AIC system identifier (#6984)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

fix: normalize GPUSKU to AIC system identifier (#6984)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Commit 71f9e7a9 (Unverified) · hhzhang16 · GitHub · 45fd53d0
Authored Mar 09, 2026 by hhzhang16 · Committed Mar 09, 2026 by GitHub
11 changed files
--- a/components/src/dynamo/profiler/utils/dgdr_v1beta1_types.py
+++ b/components/src/dynamo/profiler/utils/dgdr_v1beta1_types.py
@@ -62,6 +62,15 @@ class SearchStrategy(str, Enum):
    Thorough = "thorough"


+class GPUSKUType(str, Enum):
+    GB200SXM = "gb200_sxm"
+    H200SXM = "h200_sxm"
+    H100SXM = "h100_sxm"
+    B200SXM = "b200_sxm"
+    A100SXM = "a100_sxm"
+    L40S = "l40s"
+
+
 class BackendType(str, Enum):
    Auto = "auto"
    Sglang = "sglang"
@@ -200,9 +209,9 @@ class FeaturesSpec(BaseModel):
 class HardwareSpec(BaseModel):
    """HardwareSpec describes the hardware resources available for profiling and deployment. These fields are typically auto-filled by the operator from cluster discovery."""

-    gpuSku: Optional[str] = Field(
+    gpuSku: Optional[GPUSKUType] = Field(
        default=None,
-        description='GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB").',
+        description="GPUSKU is the AIC hardware system identifier for the GPU. When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.",
    )
    vramMb: Optional[float] = Field(
        default=None, description="VRAMMB is the VRAM per GPU in MiB."

--- a/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/helm/charts/platform/components/operator/crds/nvidia.com_dynamographdeploymentrequests.yaml
@@ -578,7 +578,24 @@ spec:
                    Typically auto-filled by the operator from cluster discovery.
                  properties:
                    gpuSku:
-                      description: GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB").
+                      allOf:
+                        - enum:
+                            - gb200_sxm
+                            - h200_sxm
+                            - h100_sxm
+                            - b200_sxm
+                            - a100_sxm
+                            - l40s
+                        - enum:
+                            - gb200_sxm
+                            - h200_sxm
+                            - h100_sxm
+                            - b200_sxm
+                            - a100_sxm
+                            - l40s
+                      description: |-
+                        GPUSKU is the AIC hardware system identifier for the GPU.
+                        When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
                      type: string
                    numGpusPerNode:
                      description: NumGPUsPerNode is the number of GPUs per node.

--- a/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
+++ b/deploy/operator/api/v1beta1/dynamographdeploymentrequest_types.go
@@ -183,6 +183,19 @@ const (
 	SearchStrategyThorough SearchStrategy = "thorough"
 )

+// GPUSKUType is the AIC hardware system identifier for a supported GPU.
+// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
+type GPUSKUType string
+
+const (
+	GPUSKUTypeGB200SXM GPUSKUType = "gb200_sxm"
+	GPUSKUTypeH200SXM  GPUSKUType = "h200_sxm"
+	GPUSKUTypeH100SXM  GPUSKUType = "h100_sxm"
+	GPUSKUTypeB200SXM  GPUSKUType = "b200_sxm"
+	GPUSKUTypeA100SXM  GPUSKUType = "a100_sxm"
+	GPUSKUTypeL40S     GPUSKUType = "l40s"
+)
+
 // BackendType specifies the inference backend.
 // +kubebuilder:validation:Enum=auto;sglang;trtllm;vllm
 type BackendType string
@@ -324,9 +337,11 @@ type FeaturesSpec struct {
 // HardwareSpec describes the hardware resources available for profiling and deployment.
 // These fields are typically auto-filled by the operator from cluster discovery.
 type HardwareSpec struct {
-	// GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB").
+	// GPUSKU is the AIC hardware system identifier for the GPU.
+	// When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
 	// +optional
-	GPUSKU string `json:"gpuSku,omitempty"`
+	// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
+	GPUSKU GPUSKUType `json:"gpuSku,omitempty"`

 	// VRAMMB is the VRAM per GPU in MiB.
 	// +optional

--- a/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
+++ b/deploy/operator/config/crd/bases/nvidia.com_dynamographdeploymentrequests.yaml
@@ -578,7 +578,24 @@ spec:
                    Typically auto-filled by the operator from cluster discovery.
                  properties:
                    gpuSku:
-                      description: GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB").
+                      allOf:
+                        - enum:
+                            - gb200_sxm
+                            - h200_sxm
+                            - h100_sxm
+                            - b200_sxm
+                            - a100_sxm
+                            - l40s
+                        - enum:
+                            - gb200_sxm
+                            - h200_sxm
+                            - h100_sxm
+                            - b200_sxm
+                            - a100_sxm
+                            - l40s
+                      description: |-
+                        GPUSKU is the AIC hardware system identifier for the GPU.
+                        When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
                      type: string
                    numGpusPerNode:
                      description: NumGPUsPerNode is the number of GPUs per node.

--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller.go
@@ -1176,6 +1176,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
 		dgdr.Spec.Hardware = &nvidiacomv1beta1.HardwareSpec{}
 	}
 	hw := dgdr.Spec.Hardware
+
 	if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil {
 		return nil // all fields already set by user; TotalGPUs is filled below when discovery runs
 	}
@@ -1191,10 +1192,16 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
 		"nodesWithGPUs", gpuInfo.NodesWithGPUs,
 		"totalGpus", gpuInfo.GPUsPerNode*gpuInfo.NodesWithGPUs,
 		"model", gpuInfo.Model,
+		"system", gpuInfo.System,
 		"vramMiB", gpuInfo.VRAMPerGPU)

 	if hw.GPUSKU == "" {
-		hw.GPUSKU = gpuInfo.Model
+		if gpuInfo.System != "" {
+			hw.GPUSKU = gpuInfo.System
+		} else {
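+			// Unknown GPU type: fall back to the raw model name so the profiler can attempt naive config generation. NOTE(review): this value is outside the GPUSKUType enum — confirm CRD enum validation does not reject it if the spec is persisted.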
+			hw.GPUSKU = nvidiacomv1beta1.GPUSKUType(gpuInfo.Model)
+		}
 	}
 	if hw.VRAMMB == nil {
 		vram := float64(gpuInfo.VRAMPerGPU)

--- a/deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
+++ b/deploy/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
@@ -100,7 +100,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 					AutoApply: ptr.To(true),
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -152,7 +152,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -227,7 +227,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -314,7 +314,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 					SearchStrategy: "rapid",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -375,7 +375,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -486,7 +486,7 @@ spec:
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -611,7 +611,7 @@ spec:
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -687,7 +687,7 @@ spec:
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -807,7 +807,7 @@ var _ = Describe("DGDR Validation", func() {
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -831,7 +831,7 @@ var _ = Describe("DGDR Validation", func() {
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -892,7 +892,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
 					Backend: "trtllm",
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
-						GPUSKU:         "H200-SXM",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH200SXM,
 						NumGPUsPerNode: ptr.To[int32](8),
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
@@ -957,7 +957,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
 					Image:          "test-profiler:latest",
 					SearchStrategy: "rapid",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
-						GPUSKU:         "H200-SXM",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH200SXM,
 						NumGPUsPerNode: ptr.To[int32](8),
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
@@ -1022,7 +1022,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -1104,7 +1104,7 @@ var _ = Describe("DGDR Error Handling", func() {
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -1468,7 +1468,7 @@ spec:
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](4),
-						GPUSKU:         "A100-SXM4-40GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeA100SXM,
 						VRAMMB:         ptr.To(40960.0),
 					},
 					SLA: &nvidiacomv1beta1.SLASpec{
@@ -1680,7 +1680,7 @@ spec:
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -1745,7 +1745,7 @@ spec:
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -1790,7 +1790,7 @@ spec:
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -1884,7 +1884,7 @@ spec:
 					AutoApply: ptr.To(true),
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -1977,7 +1977,7 @@ spec:
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
 						NumGPUsPerNode: ptr.To[int32](8),
-						GPUSKU:         "H100-SXM5-80GB",
+						GPUSKU:         nvidiacomv1beta1.GPUSKUTypeH100SXM,
 						VRAMMB:         ptr.To(81920.0),
 						TotalGPUs:      ptr.To[int32](128),
 					},
@@ -2029,7 +2029,7 @@ spec:
 					Backend: "vllm",
 					Image:   "test-profiler:latest",
 					Hardware: &nvidiacomv1beta1.HardwareSpec{
-						GPUSKU: "A100-SXM4-40GB",
+						GPUSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM,
 					},
 					SLA: &nvidiacomv1beta1.SLASpec{
 						TTFT: ptr.To(100.0),

--- a/deploy/operator/internal/controller/enrich_hardware_test.go
+++ b/deploy/operator/internal/controller/enrich_hardware_test.go
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package controller
+
+import (
+	"context"
+	"fmt"
+	"testing"
+
+	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/client-go/tools/record"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	gpupkg "github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
+)
+
+func newFakeReconciler(nodes ...*corev1.Node) *DynamoGraphDeploymentRequestReconciler {
+	scheme := runtime.NewScheme()
+	_ = corev1.AddToScheme(scheme)
+	objs := make([]client.Object, len(nodes))
+	for i, n := range nodes {
+		objs[i] = n
+	}
+	fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build()
+	return &DynamoGraphDeploymentRequestReconciler{
+		Client:    fakeClient,
+		APIReader: fakeClient,
+		Recorder:  &record.FakeRecorder{},
+	}
+}
+
+func gpuNode(name, product string, gpuCount int, vramMiB int) *corev1.Node {
+	return &corev1.Node{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: name,
+			Labels: map[string]string{
+				gpupkg.LabelGPUCount:   intStr(gpuCount),
+				gpupkg.LabelGPUProduct: product,
+				gpupkg.LabelGPUMemory:  intStr(vramMiB),
+			},
+		},
+	}
+}
+
+func intStr(n int) string {
+	return fmt.Sprintf("%d", n)
+}
+
+// TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier is the regression test for the
+// bug where GPUSKU was set to the raw GFD product name (e.g. "NVIDIA-B200") instead of
+// the AIC system identifier (e.g. "b200_sxm"), causing AIC support checks to always fail
+// and forcing every model/backend to fall back to naive config generation.
+func TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier(t *testing.T) {
+	tests := []struct {
+		name           string
+		gfdProduct     string // raw GFD label value
+		expectedGPUSKU string // what the profiler needs
+	}{
+		{
+			name:           "B200 GFD label maps to AIC system identifier",
+			gfdProduct:     "NVIDIA-B200",
+			expectedGPUSKU: "b200_sxm",
+		},
+		{
+			name:           "H200 GFD label maps to AIC system identifier",
+			gfdProduct:     "NVIDIA-H200-SXM5-141GB",
+			expectedGPUSKU: "h200_sxm",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			r := newFakeReconciler(gpuNode("gpu-node-1", tt.gfdProduct, 8, 141312))
+			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}
+
+			err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
+			require.NoError(t, err)
+			require.NotNil(t, dgdr.Spec.Hardware)
+			assert.Equal(t, tt.expectedGPUSKU, string(dgdr.Spec.Hardware.GPUSKU),
+				"GPUSKU should be the AIC system identifier, not the raw GFD product name %q", tt.gfdProduct)
+		})
+	}
+}
+
+// TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU verifies that for GPUs
+// not in the AIC support matrix, the raw GFD product name is used as a fallback.
+func TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU(t *testing.T) {
+	r := newFakeReconciler(gpuNode("gpu-node-1", "Tesla-V100-SXM2-16GB", 8, 16384))
+	dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}
+
+	err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
+	require.NoError(t, err)
+	require.NotNil(t, dgdr.Spec.Hardware)
+	assert.Equal(t, "Tesla-V100-SXM2-16GB", string(dgdr.Spec.Hardware.GPUSKU),
+		"Unknown GPU should fall back to raw model name")
+}
--- a/deploy/operator/internal/gpu/discovery.go
+++ b/deploy/operator/internal/gpu/discovery.go
@@ -23,6 +23,7 @@ import (
 	"strconv"
 	"strings"

+	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
 	corev1 "k8s.io/api/core/v1"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/log"
@@ -37,11 +38,11 @@ const (

 // GPUInfo contains discovered GPU configuration from cluster nodes
 type GPUInfo struct {
-	GPUsPerNode   int    // Maximum GPUs per node found in the cluster
-	NodesWithGPUs int    // Number of nodes that have GPUs
-	Model         string // GPU product name (e.g., "H100-SXM5-80GB")
-	VRAMPerGPU    int    // VRAM in MiB per GPU
-	System        string // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
+	GPUsPerNode   int                         // Maximum GPUs per node found in the cluster
+	NodesWithGPUs int                         // Number of nodes that have GPUs
+	Model         string                      // GPU product name (e.g., "H100-SXM5-80GB")
+	VRAMPerGPU    int                         // VRAM in MiB per GPU
+	System        nvidiacomv1beta1.GPUSKUType // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
 }

 // DiscoverGPUs queries Kubernetes nodes to determine GPU configuration.
@@ -170,7 +171,7 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
 //
 // Users can manually override the system in their profiling config (hardware.system)
 // if auto-detection is incorrect or unavailable.
-func InferHardwareSystem(gpuProduct string) string {
+func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType {
 	if gpuProduct == "" {
 		return ""
 	}
@@ -179,17 +180,17 @@ func InferHardwareSystem(gpuProduct string) string {
 	normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", ""))
 	normalized = strings.ReplaceAll(normalized, " ", "")

-	// Map common NVIDIA datacenter GPU products to hardware system identifiers
+	// Map common NVIDIA datacenter GPU products to AIC hardware system identifiers.
 	patterns := []struct {
 		pattern string
-		system  string
+		system  nvidiacomv1beta1.GPUSKUType
 	}{
-		{"GB200", "gb200_sxm"},
-		{"H200", "h200_sxm"},
-		{"H100", "h100_sxm"},
-		{"B200", "b200_sxm"},
-		{"A100", "a100_sxm"},
-		{"L40S", "l40s"},
+		{"GB200", nvidiacomv1beta1.GPUSKUTypeGB200SXM},
+		{"H200", nvidiacomv1beta1.GPUSKUTypeH200SXM},
+		{"H100", nvidiacomv1beta1.GPUSKUTypeH100SXM},
+		{"B200", nvidiacomv1beta1.GPUSKUTypeB200SXM},
+		{"A100", nvidiacomv1beta1.GPUSKUTypeA100SXM},
+		{"L40S", nvidiacomv1beta1.GPUSKUTypeL40S},
 	}

 	for _, p := range patterns {
@@ -198,7 +199,7 @@ func InferHardwareSystem(gpuProduct string) string {
 		}
 	}

-	// Unknown GPU type, return empty string
-	// User must specify system manually in profiling config (hardware.system)
+	// Unknown GPU type, return empty value.
+	// User must specify gpuSku explicitly in spec.hardware.
 	return ""
 }
--- a/deploy/operator/internal/gpu/discovery_test.go
+++ b/deploy/operator/internal/gpu/discovery_test.go
@@ -63,7 +63,7 @@ func TestDiscoverGPUs_SingleNode(t *testing.T) {
 	assert.Equal(t, 8, gpuInfo.GPUsPerNode)
 	assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
 	assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
-	assert.Equal(t, "h100_sxm", gpuInfo.System)
+	assert.Equal(t, "h100_sxm", string(gpuInfo.System))
 }

 func TestDiscoverGPUs_MultipleNodesHomogeneous(t *testing.T) {
@@ -333,12 +333,15 @@ func TestInferHardwareSystem(t *testing.T) {
 		{"RTX 4090", "", "Consumer GPU (not in mapping)"},
 		{"Unknown-GPU", "", "Unknown GPU"},
 		{"", "", "Empty string"},
+		// GFD product names as seen in real cluster labels (regression for GPUSKU bug)
+		{"NVIDIA-B200", "b200_sxm", "B200 with NVIDIA prefix (GFD label format)"},
+		{"NVIDIA-H200-SXM5-141GB", "h200_sxm", "H200 with NVIDIA prefix (GFD label format)"},
 	}

 	for _, tt := range tests {
 		t.Run(tt.description, func(t *testing.T) {
 			result := InferHardwareSystem(tt.gpuProduct)
-			assert.Equal(t, tt.expectedSystem, result, "Failed for GPU: %s", tt.gpuProduct)
+			assert.Equal(t, tt.expectedSystem, string(result), "Failed for GPU: %s", tt.gpuProduct)
 		})
 	}
 }
@@ -354,7 +357,7 @@ func TestInferHardwareSystem_CaseInsensitive(t *testing.T) {

 	for _, variant := range variants {
 		result := InferHardwareSystem(variant)
-		assert.Equal(t, "h100_sxm", result, "Should handle case variations: %s", variant)
+		assert.Equal(t, "h100_sxm", string(result), "Should handle case variations: %s", variant)
 	}
 }

@@ -369,6 +372,6 @@ func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {

 	for _, variant := range variants {
 		result := InferHardwareSystem(variant)
-		assert.Equal(t, "h100_sxm", result, "Should normalize spaces/dashes: %s", variant)
+		assert.Equal(t, "h100_sxm", string(result), "Should normalize spaces/dashes: %s", variant)
 	}
 }
--- a/docs/kubernetes/api-reference.md
+++ b/docs/kubernetes/api-reference.md
@@ -1402,6 +1402,28 @@ _Appears in:_
 | `mocker` _[MockerSpec](#mockerspec)_ | Mocker configures the simulated (mocker) backend for testing without GPUs. |  | Optional: \{\} <br /> |


+#### GPUSKUType
+
+_Underlying type:_ _string_
+
+GPUSKUType is the AIC hardware system identifier for a supported GPU.
+
+_Validation:_
+- Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s]
+
+_Appears in:_
+- [HardwareSpec](#hardwarespec)
+
+| Field | Description |
+| --- | --- |
+| `gb200_sxm` |  |
+| `h200_sxm` |  |
+| `h100_sxm` |  |
+| `b200_sxm` |  |
+| `a100_sxm` |  |
+| `l40s` |  |
+
+
 #### HardwareSpec


@@ -1416,7 +1438,7 @@ _Appears in:_

 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `gpuSku` _string_ | GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB"). |  | Optional: \{\} <br /> |
+| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. |  | Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s] <br />Optional: \{\} <br /> |
 | `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. |  | Optional: \{\} <br /> |
 | `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. |  | Optional: \{\} <br /> |
 | `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. |  | Optional: \{\} <br /> |

--- a/tests/profiler/test_helpers_profile_sla.py
+++ b/tests/profiler/test_helpers_profile_sla.py
@@ -209,7 +209,7 @@ class TestValidDgdrSpec:
    @pytest.mark.gpu_0
    def test_missing_gpu_sku_raises(self):
        """hardware.gpuSku is required."""
-        dgdr = _make_dgdr(hardware=HardwareSpec(gpuSku="", numGpusPerNode=8))
+        dgdr = _make_dgdr(hardware=HardwareSpec(gpuSku=None, numGpusPerNode=8))
        with pytest.raises(ValueError, match="gpuSku.*required"):
            valid_dgdr_spec(dgdr)