Unverified Commit 63787939 authored by devivasudevan's avatar devivasudevan Committed by GitHub
Browse files

feat: Expand hardware discovery to include RDMA and interconnect. (#7551)

parent df53b7a2
......@@ -59,11 +59,20 @@ class SearchStrategy(str, Enum):
class GPUSKUType(str, Enum):
    """String-valued enum of the supported GPU SKU identifiers.

    Each member's value is the lowercase identifier string for that GPU.
    Member names must be unique: ``Enum`` raises ``TypeError`` at class
    creation if the same name is assigned twice, so every SKU is listed
    exactly once.
    """

    # Blackwell
    GB200SXM = "gb200_sxm"
    B200SXM = "b200_sxm"
    # Hopper
    H200SXM = "h200_sxm"
    H100SXM = "h100_sxm"
    H100PCIe = "h100_pcie"
    # Ampere
    A100SXM = "a100_sxm"
    A100PCIe = "a100_pcie"
    # Ada
    L40S = "l40s"
    L40 = "l40"
    L4 = "l4"
    # Older NVIDIA
    V100SXM = "v100_sxm"
    V100PCIe = "v100_pcie"
    T4 = "t4"
    # AMD
    MI200 = "mi200"
    MI300 = "mi300"
class BackendType(str, Enum):
......@@ -210,6 +219,14 @@ class HardwareSpec(BaseModel):
numGpusPerNode: Optional[int] = Field(
default=None, description="NumGPUsPerNode is the number of GPUs per node."
)
interconnect: Optional[str] = Field(
default=None,
description='Interconnect describes the GPU interconnect type within a node. Examples: "pcie", "nvlink", "infiniband".',
)
rdma: Optional[bool] = Field(
default=None,
description="RDMA indicates whether RDMA is available on the cluster.",
)
class DynamoGraphDeploymentRequestSpec(BaseModel):
......
......@@ -579,26 +579,52 @@ spec:
allOf:
- enum:
- gb200_sxm
- b200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- h100_pcie
- a100_sxm
- a100_pcie
- l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
- enum:
- gb200_sxm
- b200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- h100_pcie
- a100_sxm
- a100_pcie
- l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
description: |-
GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
type: string
interconnect:
description: |-
Interconnect describes the GPU interconnect type within a node.
Examples: "pcie", "nvlink", "infiniband".
type: string
numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node.
format: int32
type: integer
rdma:
description: RDMA indicates whether RDMA is available on the cluster.
type: boolean
totalGpus:
description: TotalGPUs is the total number of GPUs available in the cluster.
format: int32
......
......@@ -175,16 +175,31 @@ const (
)
// GPUSKUType is the AIC hardware system identifier for a supported GPU.
// +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
type GPUSKUType string

// Supported GPUSKUType values, grouped by GPU family. Each identifier is
// declared exactly once, and the set matches the Enum marker on GPUSKUType.
const (
	// --- Blackwell ---
	GPUSKUTypeGB200SXM GPUSKUType = "gb200_sxm"
	GPUSKUTypeB200SXM  GPUSKUType = "b200_sxm"
	// --- Hopper ---
	GPUSKUTypeH200SXM  GPUSKUType = "h200_sxm"
	GPUSKUTypeH100SXM  GPUSKUType = "h100_sxm"
	GPUSKUTypeH100PCIe GPUSKUType = "h100_pcie"
	// --- Ampere ---
	GPUSKUTypeA100SXM  GPUSKUType = "a100_sxm"
	GPUSKUTypeA100PCIe GPUSKUType = "a100_pcie"
	// --- Ada ---
	GPUSKUTypeL40S GPUSKUType = "l40s"
	GPUSKUTypeL40  GPUSKUType = "l40"
	GPUSKUTypeL4   GPUSKUType = "l4"
	// --- Older NVIDIA ---
	GPUSKUTypeV100SXM  GPUSKUType = "v100_sxm"
	GPUSKUTypeV100PCIe GPUSKUType = "v100_pcie"
	GPUSKUTypeT4       GPUSKUType = "t4"
	// --- AMD ---
	GPUSKUTypeMI200 GPUSKUType = "mi200"
	GPUSKUTypeMI300 GPUSKUType = "mi300"
)
// BackendType specifies the inference backend.
......@@ -324,7 +339,7 @@ type HardwareSpec struct {
// GPUSKU is the AIC hardware system identifier for the GPU.
// When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
// +optional
// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s
// +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
GPUSKU GPUSKUType `json:"gpuSku,omitempty"`
// VRAMMB is the VRAM per GPU in MiB.
......@@ -338,6 +353,13 @@ type HardwareSpec struct {
// NumGPUsPerNode is the number of GPUs per node.
// +optional
NumGPUsPerNode *int32 `json:"numGpusPerNode,omitempty"`
// Interconnect describes the GPU interconnect type within a node.
// Examples: "pcie", "nvlink", "infiniband".
// +optional
Interconnect string `json:"interconnect,omitempty"`
// RDMA indicates whether RDMA is available on the cluster.
// +optional
RDMA *bool `json:"rdma,omitempty"`
}
// DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.
......
......@@ -252,6 +252,11 @@ func (in *HardwareSpec) DeepCopyInto(out *HardwareSpec) {
*out = new(int32)
**out = **in
}
if in.RDMA != nil {
in, out := &in.RDMA, &out.RDMA
*out = new(bool)
**out = **in
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HardwareSpec.
......
......@@ -579,26 +579,52 @@ spec:
allOf:
- enum:
- gb200_sxm
- b200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- h100_pcie
- a100_sxm
- a100_pcie
- l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
- enum:
- gb200_sxm
- b200_sxm
- h200_sxm
- h100_sxm
- b200_sxm
- h100_pcie
- a100_sxm
- a100_pcie
- l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
description: |-
GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
type: string
interconnect:
description: |-
Interconnect describes the GPU interconnect type within a node.
Examples: "pcie", "nvlink", "infiniband".
type: string
numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node.
format: int32
type: integer
rdma:
description: RDMA indicates whether RDMA is available on the cluster.
type: boolean
totalGpus:
description: TotalGPUs is the total number of GPUs available in the cluster.
format: int32
......
This diff is collapsed.
......@@ -26,6 +26,7 @@ import (
"strings"
"testing"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
......@@ -322,32 +323,159 @@ func TestExtractGPUInfoFromNode_MissingLabels(t *testing.T) {
func TestInferHardwareSystem(t *testing.T) {
tests := []struct {
gpuProduct string
expectedSystem string
description string
name string
input string
expected nvidiacomv1beta1.GPUSKUType
}{
{"H100-SXM5-80GB", "h100_sxm", "H100 SXM variant"},
{"H100-PCIE-80GB", "h100_sxm", "H100 PCIe variant (mapped to SXM)"},
{"H200-SXM5-141GB", "h200_sxm", "H200 SXM variant"},
{"A100-SXM4-40GB", "a100_sxm", "A100 SXM variant"},
{"A100-PCIE-80GB", "a100_sxm", "A100 PCIe variant (mapped to SXM)"},
{"L40S", "l40s", "L40S"},
{"NVIDIA L40S", "l40s", "L40S with prefix"},
{"B200-SXM", "b200_sxm", "B200 SXM"},
{"GB200", "gb200_sxm", "GB200"},
{"Tesla V100-SXM2-16GB", "", "V100 (not in mapping)"},
{"RTX 4090", "", "Consumer GPU (not in mapping)"},
{"Unknown-GPU", "", "Unknown GPU"},
{"", "", "Empty string"},
// GFD product names as seen in real cluster labels (regression for GPUSKU bug)
{"NVIDIA-B200", "b200_sxm", "B200 with NVIDIA prefix (GFD label format)"},
{"NVIDIA-H200-SXM5-141GB", "h200_sxm", "H200 with NVIDIA prefix (GFD label format)"},
// --- Empty / unknown ---
{
name: "empty input",
input: "",
expected: "",
},
{
name: "unknown gpu",
input: "random-gpu",
expected: "",
},
// --- Blackwell ---
{
name: "GB200 SXM",
input: "GB200-SXM",
expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM,
},
{
name: "GB200 HGX (implies SXM)",
input: "HGX GB200",
expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM,
},
{
name: "B200 SXM",
input: "B200 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeB200SXM,
},
// --- Hopper ---
{
name: "H100 SXM",
input: "H100 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeH100SXM,
},
{
name: "H100 PCIe explicit",
input: "H100 PCIe",
expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
},
{
name: "H100 default PCIe",
input: "H100",
expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
},
{
name: "H200 SXM",
input: "H200 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeH200SXM,
},
// --- Ampere ---
{
name: "A100 SXM",
input: "A100-SXM",
expected: nvidiacomv1beta1.GPUSKUTypeA100SXM,
},
{
name: "A100 PCIe",
input: "A100 PCIe",
expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe,
},
{
name: "A100 default PCIe",
input: "A100",
expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe,
},
// --- Ada ---
{
name: "L40S",
input: "L40S",
expected: nvidiacomv1beta1.GPUSKUTypeL40S,
},
{
name: "L40S should not match L40",
input: "L40S",
expected: nvidiacomv1beta1.GPUSKUTypeL40S,
},
{
name: "L40",
input: "L40",
expected: nvidiacomv1beta1.GPUSKUTypeL40,
},
{
name: "L4",
input: "L4",
expected: nvidiacomv1beta1.GPUSKUTypeL4,
},
// --- Volta / Turing ---
{
name: "V100 SXM",
input: "V100 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeV100SXM,
},
{
name: "V100 PCIe",
input: "V100 PCIe",
expected: nvidiacomv1beta1.GPUSKUTypeV100PCIe,
},
{
name: "T4",
input: "T4",
expected: nvidiacomv1beta1.GPUSKUTypeT4,
},
// --- AMD ---
{
name: "MI300",
input: "MI300",
expected: nvidiacomv1beta1.GPUSKUTypeMI300,
},
{
name: "MI250",
input: "MI250",
expected: nvidiacomv1beta1.GPUSKUTypeMI200,
},
{
name: "MI200",
input: "MI200",
expected: nvidiacomv1beta1.GPUSKUTypeMI200,
},
// --- Normalization tests ---
{
name: "lowercase + spaces",
input: "h100 sxm",
expected: nvidiacomv1beta1.GPUSKUTypeH100SXM,
},
{
name: "mixed case + dash",
input: "A100-sXm",
expected: nvidiacomv1beta1.GPUSKUTypeA100SXM,
},
{
name: "with extra spaces",
input: " H100 PCIe ",
expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
},
}
for _, tt := range tests {
t.Run(tt.description, func(t *testing.T) {
result := InferHardwareSystem(tt.gpuProduct)
assert.Equal(t, tt.expectedSystem, string(result), "Failed for GPU: %s", tt.gpuProduct)
t.Run(tt.name, func(t *testing.T) {
result := InferHardwareSystem(tt.input)
if result != tt.expected {
t.Errorf("InferHardwareSystem(%q) = %v, want %v",
tt.input, result, tt.expected)
}
})
}
}
......@@ -382,6 +510,119 @@ func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
}
}
// TestNormalize feeds raw product strings to normalize and compares the result
// with the exact string expected for each table row.
func TestNormalize(t *testing.T) {
	cases := []struct {
		label string
		raw   string
		want  string
	}{
		{label: "basic lowercase", raw: "h100", want: "H100"},
		{label: "spaces removed", raw: "H100 SXM", want: "H100SXM"},
		{label: "dashes replaced and removed", raw: "H100-SXM", want: "H100SXM"},
		{label: "mixed spaces and dashes", raw: "A100 - SXM", want: "A100SXM"},
		{label: "extra whitespace", raw: " H100 PCIe ", want: "H100PCIE"},
		{label: "complex string", raw: "h100-sxm5-80gb", want: "H100SXM580GB"},
		{label: "empty string", raw: "", want: ""},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.label, func(t *testing.T) {
			if got := normalize(tc.raw); got != tc.want {
				t.Errorf("normalize(%q) = %q, want %q",
					tc.raw, got, tc.want)
			}
		})
	}
}
// TestDetectFormFactor passes already-normalized product strings to
// detectFormFactor and checks the returned form factor for each table row.
func TestDetectFormFactor(t *testing.T) {
	cases := []struct {
		label      string
		normalized string // input is expected to be normalized already
		want       string
	}{
		{label: "detect SXM explicitly", normalized: "H100SXM", want: formFactorSXM},
		{label: "detect HGX implies SXM", normalized: "HGXH100", want: formFactorSXM},
		{label: "detect DGX implies SXM", normalized: "DGXH100", want: formFactorSXM},
		{label: "detect PCIe explicitly", normalized: "H100PCIE", want: formFactorPCIe},
		{label: "default to PCIe when unknown", normalized: "H100", want: formFactorPCIe},
		{label: "SXM wins over PCIe if both present", normalized: "H100SXMPCIE", want: formFactorSXM},
		{label: "random string defaults to PCIe", normalized: "RANDOMGPU", want: formFactorPCIe},
		{label: "empty string defaults to PCIe", normalized: "", want: formFactorPCIe},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.label, func(t *testing.T) {
			if got := detectFormFactor(tc.normalized); got != tc.want {
				t.Errorf("detectFormFactor(%q) = %v, want %v",
					tc.normalized, got, tc.want)
			}
		})
	}
}
func TestParseMetrics(t *testing.T) {
ctx := context.Background()
......@@ -873,3 +1114,117 @@ func TestGetCloudProviderInfo(t *testing.T) {
})
}
}
// TestDetectRDMAFromNode exercises detectRDMAFromNode against a fake client.
// Each row optionally seeds the client with one Node, looks it up by nodeName,
// and checks both return values: the boolean and the detected type string.
// Rows cover a missing node, the "nvidia.com/rdma.present" label, the
// "feature.node.kubernetes.io/network-sriov.capable" label, both labels
// together, no labels, and labels explicitly set to "false".
func TestDetectRDMAFromNode(t *testing.T) {
	scheme := runtime.NewScheme()
	// Fail fast if the core types cannot be registered; without them the fake
	// client cannot serve Node objects and every subtest would be meaningless.
	require.NoError(t, corev1.AddToScheme(scheme))

	tests := []struct {
		name        string
		node        *corev1.Node
		nodeName    string
		expectedOK  bool
		expectedTyp string
	}{
		{
			name:        "node not found",
			node:        nil,
			nodeName:    "missing-node",
			expectedOK:  false,
			expectedTyp: strNone,
		},
		{
			name: "rdma detected",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-rdma",
					Labels: map[string]string{
						"nvidia.com/rdma.present": "true",
					},
				},
			},
			nodeName:    "node-rdma",
			expectedOK:  true,
			expectedTyp: "rdma",
		},
		{
			name: "sriov detected",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-sriov",
					Labels: map[string]string{
						"feature.node.kubernetes.io/network-sriov.capable": "true",
					},
				},
			},
			nodeName:    "node-sriov",
			expectedOK:  true,
			expectedTyp: "sriov",
		},
		{
			name: "both rdma and sriov - rdma takes precedence",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-both",
					Labels: map[string]string{
						"nvidia.com/rdma.present":                          "true",
						"feature.node.kubernetes.io/network-sriov.capable": "true",
					},
				},
			},
			nodeName:    "node-both",
			expectedOK:  true,
			expectedTyp: "rdma",
		},
		{
			name: "no relevant labels",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name:   "node-none",
					Labels: map[string]string{},
				},
			},
			nodeName:    "node-none",
			expectedOK:  false,
			expectedTyp: strNone,
		},
		{
			name: "labels present but false",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-false",
					Labels: map[string]string{
						"nvidia.com/rdma.present":                          "false",
						"feature.node.kubernetes.io/network-sriov.capable": "false",
					},
				},
			},
			nodeName:    "node-false",
			expectedOK:  false,
			expectedTyp: strNone,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// A nil node leaves the fake client empty, which models the
			// "node not found" lookup.
			var objs []runtime.Object
			if tt.node != nil {
				objs = append(objs, tt.node)
			}

			fakeClient := fake.NewClientBuilder().
				WithScheme(scheme).
				WithRuntimeObjects(objs...).
				Build()

			ok, typ := detectRDMAFromNode(context.Background(), fakeClient, tt.nodeName)
			if ok != tt.expectedOK {
				t.Errorf("expected ok=%v, got %v", tt.expectedOK, ok)
			}
			if typ != tt.expectedTyp {
				t.Errorf("expected type=%s, got %s", tt.expectedTyp, typ)
			}
		})
	}
}
......@@ -1542,19 +1542,28 @@ _Underlying type:_ _string_
GPUSKUType is the AIC hardware system identifier for a supported GPU.
_Validation:_
- Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s]
- Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300]
_Appears in:_
- [HardwareSpec](#hardwarespec)
| Field | Description |
| --- | --- |
| `gb200_sxm` | |
| `h200_sxm` | |
| `h100_sxm` | |
| `gb200_sxm` | --- Blackwell ---<br /> |
| `b200_sxm` | |
| `a100_sxm` | |
| `l40s` | |
| `h200_sxm` | --- Hopper ---<br /> |
| `h100_sxm` | |
| `h100_pcie` | |
| `a100_sxm` | --- Ampere ---<br /> |
| `a100_pcie` | |
| `l40s` | --- Ada ---<br /> |
| `l40` | |
| `l4` | |
| `v100_sxm` | --- Older NVIDIA ---<br /> |
| `v100_pcie` | |
| `t4` | |
| `mi200` | --- AMD ---<br /> |
| `mi300` | |
#### HardwareSpec
......@@ -1571,10 +1580,12 @@ _Appears in:_
| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. | | Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s] <br />Optional: \{\} <br /> |
| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. | | Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300] <br />Optional: \{\} <br /> |
| `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. | | Optional: \{\} <br /> |
| `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. | | Optional: \{\} <br /> |
| `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. | | Optional: \{\} <br /> |
| `interconnect` _string_ | Interconnect describes the GPU interconnect type within a node.<br />Examples: "pcie", "nvlink", "infiniband". | | Optional: \{\} <br /> |
| `rdma` _boolean_ | RDMA indicates whether RDMA is available on the cluster. | | Optional: \{\} <br /> |
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment