Unverified Commit 71f9e7a9 authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

fix: normalize GPUSKU to AIC system identifier (#6984)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 45fd53d0
...@@ -62,6 +62,15 @@ class SearchStrategy(str, Enum): ...@@ -62,6 +62,15 @@ class SearchStrategy(str, Enum):
Thorough = "thorough" Thorough = "thorough"
class GPUSKUType(str, Enum):
GB200SXM = "gb200_sxm"
H200SXM = "h200_sxm"
H100SXM = "h100_sxm"
B200SXM = "b200_sxm"
A100SXM = "a100_sxm"
L40S = "l40s"
class BackendType(str, Enum): class BackendType(str, Enum):
Auto = "auto" Auto = "auto"
Sglang = "sglang" Sglang = "sglang"
...@@ -200,9 +209,9 @@ class FeaturesSpec(BaseModel): ...@@ -200,9 +209,9 @@ class FeaturesSpec(BaseModel):
class HardwareSpec(BaseModel): class HardwareSpec(BaseModel):
"""HardwareSpec describes the hardware resources available for profiling and deployment. These fields are typically auto-filled by the operator from cluster discovery.""" """HardwareSpec describes the hardware resources available for profiling and deployment. These fields are typically auto-filled by the operator from cluster discovery."""
gpuSku: Optional[str] = Field( gpuSku: Optional[GPUSKUType] = Field(
default=None, default=None,
description='GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB").', description="GPUSKU is the AIC hardware system identifier for the GPU. When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.",
) )
vramMb: Optional[float] = Field( vramMb: Optional[float] = Field(
default=None, description="VRAMMB is the VRAM per GPU in MiB." default=None, description="VRAMMB is the VRAM per GPU in MiB."
......
...@@ -578,7 +578,24 @@ spec: ...@@ -578,7 +578,24 @@ spec:
Typically auto-filled by the operator from cluster discovery. Typically auto-filled by the operator from cluster discovery.
properties: properties:
gpuSku: gpuSku:
description: GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB"). allOf:
- enum:
- gb200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- a100_sxm
- l40s
- enum:
- gb200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- a100_sxm
- l40s
description: |-
GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
type: string type: string
numGpusPerNode: numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node. description: NumGPUsPerNode is the number of GPUs per node.
......
...@@ -183,6 +183,19 @@ const ( ...@@ -183,6 +183,19 @@ const (
SearchStrategyThorough SearchStrategy = "thorough" SearchStrategyThorough SearchStrategy = "thorough"
) )
// GPUSKUType is the AIC hardware system identifier for a supported GPU.
// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
type GPUSKUType string
const (
GPUSKUTypeGB200SXM GPUSKUType = "gb200_sxm"
GPUSKUTypeH200SXM GPUSKUType = "h200_sxm"
GPUSKUTypeH100SXM GPUSKUType = "h100_sxm"
GPUSKUTypeB200SXM GPUSKUType = "b200_sxm"
GPUSKUTypeA100SXM GPUSKUType = "a100_sxm"
GPUSKUTypeL40S GPUSKUType = "l40s"
)
// BackendType specifies the inference backend. // BackendType specifies the inference backend.
// +kubebuilder:validation:Enum=auto;sglang;trtllm;vllm // +kubebuilder:validation:Enum=auto;sglang;trtllm;vllm
type BackendType string type BackendType string
...@@ -324,9 +337,11 @@ type FeaturesSpec struct { ...@@ -324,9 +337,11 @@ type FeaturesSpec struct {
// HardwareSpec describes the hardware resources available for profiling and deployment. // HardwareSpec describes the hardware resources available for profiling and deployment.
// These fields are typically auto-filled by the operator from cluster discovery. // These fields are typically auto-filled by the operator from cluster discovery.
type HardwareSpec struct { type HardwareSpec struct {
// GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB"). // GPUSKU is the AIC hardware system identifier for the GPU.
// When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
// +optional // +optional
GPUSKU string `json:"gpuSku,omitempty"` // +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
GPUSKU GPUSKUType `json:"gpuSku,omitempty"`
// VRAMMB is the VRAM per GPU in MiB. // VRAMMB is the VRAM per GPU in MiB.
// +optional // +optional
......
...@@ -578,7 +578,24 @@ spec: ...@@ -578,7 +578,24 @@ spec:
Typically auto-filled by the operator from cluster discovery. Typically auto-filled by the operator from cluster discovery.
properties: properties:
gpuSku: gpuSku:
description: GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB"). allOf:
- enum:
- gb200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- a100_sxm
- l40s
- enum:
- gb200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- a100_sxm
- l40s
description: |-
GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
type: string type: string
numGpusPerNode: numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node. description: NumGPUsPerNode is the number of GPUs per node.
......
...@@ -1176,6 +1176,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx ...@@ -1176,6 +1176,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
dgdr.Spec.Hardware = &nvidiacomv1beta1.HardwareSpec{} dgdr.Spec.Hardware = &nvidiacomv1beta1.HardwareSpec{}
} }
hw := dgdr.Spec.Hardware hw := dgdr.Spec.Hardware
if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil { if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil {
return nil // all fields already set by user; TotalGPUs is filled below when discovery runs return nil // all fields already set by user; TotalGPUs is filled below when discovery runs
} }
...@@ -1191,10 +1192,16 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx ...@@ -1191,10 +1192,16 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
"nodesWithGPUs", gpuInfo.NodesWithGPUs, "nodesWithGPUs", gpuInfo.NodesWithGPUs,
"totalGpus", gpuInfo.GPUsPerNode*gpuInfo.NodesWithGPUs, "totalGpus", gpuInfo.GPUsPerNode*gpuInfo.NodesWithGPUs,
"model", gpuInfo.Model, "model", gpuInfo.Model,
"system", gpuInfo.System,
"vramMiB", gpuInfo.VRAMPerGPU) "vramMiB", gpuInfo.VRAMPerGPU)
if hw.GPUSKU == "" { if hw.GPUSKU == "" {
hw.GPUSKU = gpuInfo.Model if gpuInfo.System != "" {
hw.GPUSKU = gpuInfo.System
} else {
// Unknown GPU type: use raw model name; profiler will attempt naive config generation.
hw.GPUSKU = nvidiacomv1beta1.GPUSKUType(gpuInfo.Model)
}
} }
if hw.VRAMMB == nil { if hw.VRAMMB == nil {
vram := float64(gpuInfo.VRAMPerGPU) vram := float64(gpuInfo.VRAMPerGPU)
......
...@@ -100,7 +100,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -100,7 +100,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
AutoApply: ptr.To(true), AutoApply: ptr.To(true),
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -152,7 +152,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -152,7 +152,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -227,7 +227,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -227,7 +227,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -314,7 +314,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -314,7 +314,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
SearchStrategy: "rapid", SearchStrategy: "rapid",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -375,7 +375,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -375,7 +375,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -486,7 +486,7 @@ spec: ...@@ -486,7 +486,7 @@ spec:
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -611,7 +611,7 @@ spec: ...@@ -611,7 +611,7 @@ spec:
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -687,7 +687,7 @@ spec: ...@@ -687,7 +687,7 @@ spec:
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -807,7 +807,7 @@ var _ = Describe("DGDR Validation", func() { ...@@ -807,7 +807,7 @@ var _ = Describe("DGDR Validation", func() {
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -831,7 +831,7 @@ var _ = Describe("DGDR Validation", func() { ...@@ -831,7 +831,7 @@ var _ = Describe("DGDR Validation", func() {
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -892,7 +892,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -892,7 +892,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Backend: "trtllm", Backend: "trtllm",
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "H200-SXM", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH200SXM,
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
...@@ -957,7 +957,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -957,7 +957,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Image: "test-profiler:latest", Image: "test-profiler:latest",
SearchStrategy: "rapid", SearchStrategy: "rapid",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "H200-SXM", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH200SXM,
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
...@@ -1022,7 +1022,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1022,7 +1022,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -1104,7 +1104,7 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1104,7 +1104,7 @@ var _ = Describe("DGDR Error Handling", func() {
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -1468,7 +1468,7 @@ spec: ...@@ -1468,7 +1468,7 @@ spec:
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](4), NumGPUsPerNode: ptr.To[int32](4),
GPUSKU: "A100-SXM4-40GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM,
VRAMMB: ptr.To(40960.0), VRAMMB: ptr.To(40960.0),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
...@@ -1680,7 +1680,7 @@ spec: ...@@ -1680,7 +1680,7 @@ spec:
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -1745,7 +1745,7 @@ spec: ...@@ -1745,7 +1745,7 @@ spec:
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -1790,7 +1790,7 @@ spec: ...@@ -1790,7 +1790,7 @@ spec:
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -1884,7 +1884,7 @@ spec: ...@@ -1884,7 +1884,7 @@ spec:
AutoApply: ptr.To(true), AutoApply: ptr.To(true),
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -1977,7 +1977,7 @@ spec: ...@@ -1977,7 +1977,7 @@ spec:
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM,
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128), TotalGPUs: ptr.To[int32](128),
}, },
...@@ -2029,7 +2029,7 @@ spec: ...@@ -2029,7 +2029,7 @@ spec:
Backend: "vllm", Backend: "vllm",
Image: "test-profiler:latest", Image: "test-profiler:latest",
Hardware: &nvidiacomv1beta1.HardwareSpec{ Hardware: &nvidiacomv1beta1.HardwareSpec{
GPUSKU: "A100-SXM4-40GB", GPUSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM,
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
......
/*
* SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package controller
import (
"context"
"fmt"
"testing"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
gpupkg "github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
)
// newFakeReconciler returns a DynamoGraphDeploymentRequestReconciler for tests.
// Its Client and APIReader are the same fake client, seeded with the supplied
// nodes and backed by a scheme that has the core/v1 types registered.
func newFakeReconciler(nodes ...*corev1.Node) *DynamoGraphDeploymentRequestReconciler {
	sch := runtime.NewScheme()
	// The registration error is deliberately discarded in this test helper.
	_ = corev1.AddToScheme(sch)

	// Widen each *corev1.Node to client.Object so the builder accepts them.
	seeded := make([]client.Object, 0, len(nodes))
	for _, node := range nodes {
		seeded = append(seeded, node)
	}

	builder := fake.NewClientBuilder().WithScheme(sch).WithObjects(seeded...)
	cl := builder.Build()

	reconciler := &DynamoGraphDeploymentRequestReconciler{
		Client:    cl,
		APIReader: cl,
		Recorder:  &record.FakeRecorder{},
	}
	return reconciler
}
// gpuNode builds a Node called name whose labels carry the GPU count, the GPU
// product string, and the GPU memory value under the gpupkg label keys.
// gpuCount and vramMiB are rendered as decimal strings.
func gpuNode(name, product string, gpuCount int, vramMiB int) *corev1.Node {
	labels := map[string]string{
		gpupkg.LabelGPUCount:   intStr(gpuCount),
		gpupkg.LabelGPUProduct: product,
		gpupkg.LabelGPUMemory:  intStr(vramMiB),
	}

	node := &corev1.Node{}
	node.Name = name
	node.Labels = labels
	return node
}
// intStr renders n as a base-10 string, for use as a node label value.
func intStr(n int) string {
	decimal := fmt.Sprintf("%d", n)
	return decimal
}
// TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier is a regression test:
// GPUSKU used to be filled with the raw GFD product name (e.g. "NVIDIA-B200")
// rather than the AIC system identifier (e.g. "b200_sxm"). That made AIC support
// checks fail every time and pushed every model/backend onto naive config
// generation. Each case discovers a single labelled node and checks the
// identifier written into the request's hardware spec.
func TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier(t *testing.T) {
	type scenario struct {
		name           string
		gfdProduct     string // raw GFD label value
		expectedGPUSKU string // what the profiler needs
	}

	cases := []scenario{
		{
			name:           "B200 GFD label maps to AIC system identifier",
			gfdProduct:     "NVIDIA-B200",
			expectedGPUSKU: "b200_sxm",
		},
		{
			name:           "H200 GFD label maps to AIC system identifier",
			gfdProduct:     "NVIDIA-H200-SXM5-141GB",
			expectedGPUSKU: "h200_sxm",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			reconciler := newFakeReconciler(gpuNode("gpu-node-1", tc.gfdProduct, 8, 141312))
			request := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}

			require.NoError(t, reconciler.enrichHardwareFromDiscovery(context.Background(), request))
			require.NotNil(t, request.Spec.Hardware)

			got := string(request.Spec.Hardware.GPUSKU)
			assert.Equal(t, tc.expectedGPUSKU, got,
				"GPUSKU should be the AIC system identifier, not the raw GFD product name %q", tc.gfdProduct)
		})
	}
}
// TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU checks the
// fallback path: when the discovered GPU is not in the AIC support matrix,
// GPUSKU is expected to hold the raw GFD product name unchanged.
func TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU(t *testing.T) {
	const unknownProduct = "Tesla-V100-SXM2-16GB"

	reconciler := newFakeReconciler(gpuNode("gpu-node-1", unknownProduct, 8, 16384))
	request := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}

	require.NoError(t, reconciler.enrichHardwareFromDiscovery(context.Background(), request))
	require.NotNil(t, request.Spec.Hardware)

	got := string(request.Spec.Hardware.GPUSKU)
	assert.Equal(t, "Tesla-V100-SXM2-16GB", got,
		"Unknown GPU should fall back to raw model name")
}
...@@ -23,6 +23,7 @@ import ( ...@@ -23,6 +23,7 @@ import (
"strconv" "strconv"
"strings" "strings"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log"
...@@ -37,11 +38,11 @@ const ( ...@@ -37,11 +38,11 @@ const (
// GPUInfo contains discovered GPU configuration from cluster nodes // GPUInfo contains discovered GPU configuration from cluster nodes
type GPUInfo struct { type GPUInfo struct {
GPUsPerNode int // Maximum GPUs per node found in the cluster GPUsPerNode int // Maximum GPUs per node found in the cluster
NodesWithGPUs int // Number of nodes that have GPUs NodesWithGPUs int // Number of nodes that have GPUs
Model string // GPU product name (e.g., "H100-SXM5-80GB") Model string // GPU product name (e.g., "H100-SXM5-80GB")
VRAMPerGPU int // VRAM in MiB per GPU VRAMPerGPU int // VRAM in MiB per GPU
System string // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown System nvidiacomv1beta1.GPUSKUType // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
} }
// DiscoverGPUs queries Kubernetes nodes to determine GPU configuration. // DiscoverGPUs queries Kubernetes nodes to determine GPU configuration.
...@@ -170,7 +171,7 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) { ...@@ -170,7 +171,7 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
// //
// Users can manually override the system in their profiling config (hardware.system) // Users can manually override the system in their profiling config (hardware.system)
// if auto-detection is incorrect or unavailable. // if auto-detection is incorrect or unavailable.
func InferHardwareSystem(gpuProduct string) string { func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType {
if gpuProduct == "" { if gpuProduct == "" {
return "" return ""
} }
...@@ -179,17 +180,17 @@ func InferHardwareSystem(gpuProduct string) string { ...@@ -179,17 +180,17 @@ func InferHardwareSystem(gpuProduct string) string {
normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", "")) normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", ""))
normalized = strings.ReplaceAll(normalized, " ", "") normalized = strings.ReplaceAll(normalized, " ", "")
// Map common NVIDIA datacenter GPU products to hardware system identifiers // Map common NVIDIA datacenter GPU products to AIC hardware system identifiers.
patterns := []struct { patterns := []struct {
pattern string pattern string
system string system nvidiacomv1beta1.GPUSKUType
}{ }{
{"GB200", "gb200_sxm"}, {"GB200", nvidiacomv1beta1.GPUSKUTypeGB200SXM},
{"H200", "h200_sxm"}, {"H200", nvidiacomv1beta1.GPUSKUTypeH200SXM},
{"H100", "h100_sxm"}, {"H100", nvidiacomv1beta1.GPUSKUTypeH100SXM},
{"B200", "b200_sxm"}, {"B200", nvidiacomv1beta1.GPUSKUTypeB200SXM},
{"A100", "a100_sxm"}, {"A100", nvidiacomv1beta1.GPUSKUTypeA100SXM},
{"L40S", "l40s"}, {"L40S", nvidiacomv1beta1.GPUSKUTypeL40S},
} }
for _, p := range patterns { for _, p := range patterns {
...@@ -198,7 +199,7 @@ func InferHardwareSystem(gpuProduct string) string { ...@@ -198,7 +199,7 @@ func InferHardwareSystem(gpuProduct string) string {
} }
} }
// Unknown GPU type, return empty string // Unknown GPU type, return empty value.
// User must specify system manually in profiling config (hardware.system) // User must specify gpuSku explicitly in spec.hardware.
return "" return ""
} }
...@@ -63,7 +63,7 @@ func TestDiscoverGPUs_SingleNode(t *testing.T) { ...@@ -63,7 +63,7 @@ func TestDiscoverGPUs_SingleNode(t *testing.T) {
assert.Equal(t, 8, gpuInfo.GPUsPerNode) assert.Equal(t, 8, gpuInfo.GPUsPerNode)
assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model) assert.Equal(t, "H100-SXM5-80GB", gpuInfo.Model)
assert.Equal(t, 81920, gpuInfo.VRAMPerGPU) assert.Equal(t, 81920, gpuInfo.VRAMPerGPU)
assert.Equal(t, "h100_sxm", gpuInfo.System) assert.Equal(t, "h100_sxm", string(gpuInfo.System))
} }
func TestDiscoverGPUs_MultipleNodesHomogeneous(t *testing.T) { func TestDiscoverGPUs_MultipleNodesHomogeneous(t *testing.T) {
...@@ -333,12 +333,15 @@ func TestInferHardwareSystem(t *testing.T) { ...@@ -333,12 +333,15 @@ func TestInferHardwareSystem(t *testing.T) {
{"RTX 4090", "", "Consumer GPU (not in mapping)"}, {"RTX 4090", "", "Consumer GPU (not in mapping)"},
{"Unknown-GPU", "", "Unknown GPU"}, {"Unknown-GPU", "", "Unknown GPU"},
{"", "", "Empty string"}, {"", "", "Empty string"},
// GFD product names as seen in real cluster labels (regression for GPUSKU bug)
{"NVIDIA-B200", "b200_sxm", "B200 with NVIDIA prefix (GFD label format)"},
{"NVIDIA-H200-SXM5-141GB", "h200_sxm", "H200 with NVIDIA prefix (GFD label format)"},
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.description, func(t *testing.T) { t.Run(tt.description, func(t *testing.T) {
result := InferHardwareSystem(tt.gpuProduct) result := InferHardwareSystem(tt.gpuProduct)
assert.Equal(t, tt.expectedSystem, result, "Failed for GPU: %s", tt.gpuProduct) assert.Equal(t, tt.expectedSystem, string(result), "Failed for GPU: %s", tt.gpuProduct)
}) })
} }
} }
...@@ -354,7 +357,7 @@ func TestInferHardwareSystem_CaseInsensitive(t *testing.T) { ...@@ -354,7 +357,7 @@ func TestInferHardwareSystem_CaseInsensitive(t *testing.T) {
for _, variant := range variants { for _, variant := range variants {
result := InferHardwareSystem(variant) result := InferHardwareSystem(variant)
assert.Equal(t, "h100_sxm", result, "Should handle case variations: %s", variant) assert.Equal(t, "h100_sxm", string(result), "Should handle case variations: %s", variant)
} }
} }
...@@ -369,6 +372,6 @@ func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) { ...@@ -369,6 +372,6 @@ func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
for _, variant := range variants { for _, variant := range variants {
result := InferHardwareSystem(variant) result := InferHardwareSystem(variant)
assert.Equal(t, "h100_sxm", result, "Should normalize spaces/dashes: %s", variant) assert.Equal(t, "h100_sxm", string(result), "Should normalize spaces/dashes: %s", variant)
} }
} }
...@@ -1402,6 +1402,28 @@ _Appears in:_ ...@@ -1402,6 +1402,28 @@ _Appears in:_
| `mocker` _[MockerSpec](#mockerspec)_ | Mocker configures the simulated (mocker) backend for testing without GPUs. | | Optional: \{\} <br /> | | `mocker` _[MockerSpec](#mockerspec)_ | Mocker configures the simulated (mocker) backend for testing without GPUs. | | Optional: \{\} <br /> |
#### GPUSKUType
_Underlying type:_ _string_
GPUSKUType is the AIC hardware system identifier for a supported GPU.
_Validation:_
- Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s]
_Appears in:_
- [HardwareSpec](#hardwarespec)
| Field | Description |
| --- | --- |
| `gb200_sxm` | |
| `h200_sxm` | |
| `h100_sxm` | |
| `b200_sxm` | |
| `a100_sxm` | |
| `l40s` | |
#### HardwareSpec #### HardwareSpec
...@@ -1416,7 +1438,7 @@ _Appears in:_ ...@@ -1416,7 +1438,7 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `gpuSku` _string_ | GPUSKU is the GPU SKU identifier (e.g., "H100_SXM", "A100_80GB"). | | Optional: \{\} <br /> | | `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. | | Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s] <br />Optional: \{\} <br /> |
| `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. | | Optional: \{\} <br /> | | `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. | | Optional: \{\} <br /> |
| `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. | | Optional: \{\} <br /> | | `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. | | Optional: \{\} <br /> |
| `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. | | Optional: \{\} <br /> | | `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. | | Optional: \{\} <br /> |
......
...@@ -209,7 +209,7 @@ class TestValidDgdrSpec: ...@@ -209,7 +209,7 @@ class TestValidDgdrSpec:
@pytest.mark.gpu_0 @pytest.mark.gpu_0
def test_missing_gpu_sku_raises(self): def test_missing_gpu_sku_raises(self):
"""hardware.gpuSku is required.""" """hardware.gpuSku is required."""
dgdr = _make_dgdr(hardware=HardwareSpec(gpuSku="", numGpusPerNode=8)) dgdr = _make_dgdr(hardware=HardwareSpec(gpuSku=None, numGpusPerNode=8))
with pytest.raises(ValueError, match="gpuSku.*required"): with pytest.raises(ValueError, match="gpuSku.*required"):
valid_dgdr_spec(dgdr) valid_dgdr_spec(dgdr)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment