Unverified Commit 63787939 authored by devivasudevan's avatar devivasudevan Committed by GitHub
Browse files

feat: Expanding the hardware discovery to include RDMA, interconnect. (#7551)

parent df53b7a2
...@@ -59,11 +59,20 @@ class SearchStrategy(str, Enum): ...@@ -59,11 +59,20 @@ class SearchStrategy(str, Enum):
class GPUSKUType(str, Enum): class GPUSKUType(str, Enum):
GB200SXM = "gb200_sxm" GB200SXM = "gb200_sxm"
B200SXM = "b200_sxm"
H200SXM = "h200_sxm" H200SXM = "h200_sxm"
H100SXM = "h100_sxm" H100SXM = "h100_sxm"
B200SXM = "b200_sxm" H100PCIe = "h100_pcie"
A100SXM = "a100_sxm" A100SXM = "a100_sxm"
A100PCIe = "a100_pcie"
L40S = "l40s" L40S = "l40s"
L40 = "l40"
L4 = "l4"
V100SXM = "v100_sxm"
V100PCIe = "v100_pcie"
T4 = "t4"
MI200 = "mi200"
MI300 = "mi300"
class BackendType(str, Enum): class BackendType(str, Enum):
...@@ -210,6 +219,14 @@ class HardwareSpec(BaseModel): ...@@ -210,6 +219,14 @@ class HardwareSpec(BaseModel):
numGpusPerNode: Optional[int] = Field( numGpusPerNode: Optional[int] = Field(
default=None, description="NumGPUsPerNode is the number of GPUs per node." default=None, description="NumGPUsPerNode is the number of GPUs per node."
) )
interconnect: Optional[str] = Field(
default=None,
description='Interconnect describes the GPU interconnect type within a node. Examples: "pcie", "nvlink", "infiniband".',
)
rdma: Optional[bool] = Field(
default=None,
description="RDMA indicates whether RDMA is available on the cluster.",
)
class DynamoGraphDeploymentRequestSpec(BaseModel): class DynamoGraphDeploymentRequestSpec(BaseModel):
......
...@@ -579,26 +579,52 @@ spec: ...@@ -579,26 +579,52 @@ spec:
allOf: allOf:
- enum: - enum:
- gb200_sxm - gb200_sxm
- b200_sxm
- h200_sxm - h200_sxm
- h100_sxm - h100_sxm
- b200_sxm - h100_pcie
- a100_sxm - a100_sxm
- a100_pcie
- l40s - l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
- enum: - enum:
- gb200_sxm - gb200_sxm
- b200_sxm
- h200_sxm - h200_sxm
- h100_sxm - h100_sxm
- b200_sxm - h100_pcie
- a100_sxm - a100_sxm
- a100_pcie
- l40s - l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
description: |- description: |-
GPUSKU is the AIC hardware system identifier for the GPU. GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
type: string type: string
interconnect:
description: |-
Interconnect describes the GPU interconnect type within a node.
Examples: "pcie", "nvlink", "infiniband".
type: string
numGpusPerNode: numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node. description: NumGPUsPerNode is the number of GPUs per node.
format: int32 format: int32
type: integer type: integer
rdma:
description: RDMA indicates whether RDMA is available on the cluster.
type: boolean
totalGpus: totalGpus:
description: TotalGPUs is the total number of GPUs available in the cluster. description: TotalGPUs is the total number of GPUs available in the cluster.
format: int32 format: int32
......
...@@ -175,16 +175,31 @@ const ( ...@@ -175,16 +175,31 @@ const (
) )
// GPUSKUType is the AIC hardware system identifier for a supported GPU. // GPUSKUType is the AIC hardware system identifier for a supported GPU.
// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s // +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
type GPUSKUType string type GPUSKUType string
const ( const (
// --- Blackwell ---
GPUSKUTypeGB200SXM GPUSKUType = "gb200_sxm" GPUSKUTypeGB200SXM GPUSKUType = "gb200_sxm"
GPUSKUTypeB200SXM GPUSKUType = "b200_sxm"
// --- Hopper ---
GPUSKUTypeH200SXM GPUSKUType = "h200_sxm" GPUSKUTypeH200SXM GPUSKUType = "h200_sxm"
GPUSKUTypeH100SXM GPUSKUType = "h100_sxm" GPUSKUTypeH100SXM GPUSKUType = "h100_sxm"
GPUSKUTypeB200SXM GPUSKUType = "b200_sxm" GPUSKUTypeH100PCIe GPUSKUType = "h100_pcie"
// --- Ampere ---
GPUSKUTypeA100SXM GPUSKUType = "a100_sxm" GPUSKUTypeA100SXM GPUSKUType = "a100_sxm"
GPUSKUTypeL40S GPUSKUType = "l40s" GPUSKUTypeA100PCIe GPUSKUType = "a100_pcie"
// --- Ada ---
GPUSKUTypeL40S GPUSKUType = "l40s"
GPUSKUTypeL40 GPUSKUType = "l40"
GPUSKUTypeL4 GPUSKUType = "l4"
// --- Older NVIDIA ---
GPUSKUTypeV100SXM GPUSKUType = "v100_sxm"
GPUSKUTypeV100PCIe GPUSKUType = "v100_pcie"
GPUSKUTypeT4 GPUSKUType = "t4"
// --- AMD ---
GPUSKUTypeMI200 GPUSKUType = "mi200"
GPUSKUTypeMI300 GPUSKUType = "mi300"
) )
// BackendType specifies the inference backend. // BackendType specifies the inference backend.
...@@ -324,7 +339,7 @@ type HardwareSpec struct { ...@@ -324,7 +339,7 @@ type HardwareSpec struct {
// GPUSKU is the AIC hardware system identifier for the GPU. // GPUSKU is the AIC hardware system identifier for the GPU.
// When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. // When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
// +optional // +optional
// +kubebuilder:validation:Enum=gb200_sxm;h200_sxm;h100_sxm;b200_sxm;a100_sxm;l40s // +kubebuilder:validation:Enum=gb200_sxm;b200_sxm;h200_sxm;h100_sxm;h100_pcie;a100_sxm;a100_pcie;l40s;l40;l4;v100_sxm;v100_pcie;t4;mi200;mi300
GPUSKU GPUSKUType `json:"gpuSku,omitempty"` GPUSKU GPUSKUType `json:"gpuSku,omitempty"`
// VRAMMB is the VRAM per GPU in MiB. // VRAMMB is the VRAM per GPU in MiB.
...@@ -338,6 +353,13 @@ type HardwareSpec struct { ...@@ -338,6 +353,13 @@ type HardwareSpec struct {
// NumGPUsPerNode is the number of GPUs per node. // NumGPUsPerNode is the number of GPUs per node.
// +optional // +optional
NumGPUsPerNode *int32 `json:"numGpusPerNode,omitempty"` NumGPUsPerNode *int32 `json:"numGpusPerNode,omitempty"`
// Interconnect describes the GPU interconnect type within a node.
// Examples: "pcie", "nvlink", "infiniband".
// +optional
Interconnect string `json:"interconnect,omitempty"`
// RDMA indicates whether RDMA is available on the cluster.
// +optional
RDMA *bool `json:"rdma,omitempty"`
} }
// DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest. // DynamoGraphDeploymentRequestSpec defines the desired state of a DynamoGraphDeploymentRequest.
......
...@@ -252,6 +252,11 @@ func (in *HardwareSpec) DeepCopyInto(out *HardwareSpec) { ...@@ -252,6 +252,11 @@ func (in *HardwareSpec) DeepCopyInto(out *HardwareSpec) {
*out = new(int32) *out = new(int32)
**out = **in **out = **in
} }
if in.RDMA != nil {
in, out := &in.RDMA, &out.RDMA
*out = new(bool)
**out = **in
}
} }
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HardwareSpec. // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HardwareSpec.
......
...@@ -579,26 +579,52 @@ spec: ...@@ -579,26 +579,52 @@ spec:
allOf: allOf:
- enum: - enum:
- gb200_sxm - gb200_sxm
- b200_sxm
- h200_sxm - h200_sxm
- h100_sxm - h100_sxm
- b200_sxm - h100_pcie
- a100_sxm - a100_sxm
- a100_pcie
- l40s - l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
- enum: - enum:
- gb200_sxm - gb200_sxm
- b200_sxm
- h200_sxm - h200_sxm
- h100_sxm - h100_sxm
- b200_sxm - h100_pcie
- a100_sxm - a100_sxm
- a100_pcie
- l40s - l40s
- l40
- l4
- v100_sxm
- v100_pcie
- t4
- mi200
- mi300
description: |- description: |-
GPUSKU is the AIC hardware system identifier for the GPU. GPUSKU is the AIC hardware system identifier for the GPU.
When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels.
type: string type: string
interconnect:
description: |-
Interconnect describes the GPU interconnect type within a node.
Examples: "pcie", "nvlink", "infiniband".
type: string
numGpusPerNode: numGpusPerNode:
description: NumGPUsPerNode is the number of GPUs per node. description: NumGPUsPerNode is the number of GPUs per node.
format: int32 format: int32
type: integer type: integer
rdma:
description: RDMA indicates whether RDMA is available on the cluster.
type: boolean
totalGpus: totalGpus:
description: TotalGPUs is the total number of GPUs available in the cluster. description: TotalGPUs is the total number of GPUs available in the cluster.
format: int32 format: int32
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package gpu package gpu
import ( import (
...@@ -28,40 +27,73 @@ import ( ...@@ -28,40 +27,73 @@ import (
"sync" "sync"
"time" "time"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
dto "github.com/prometheus/client_model/go" dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt" "github.com/prometheus/common/expfmt"
"github.com/prometheus/common/model" "github.com/prometheus/common/model"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log"
) )
const ( const (
defaultDCGMEndpointTemplate = "http://{POD_IP}:9400/metrics" defaultDCGMEndpointTemplate = "http://{POD_IP}:9400/metrics"
// NVIDIA GPU Feature Discovery (GFD) label keys // NVIDIA GPU Feature Discovery (GFD) label keys
LabelGPUCount = "nvidia.com/gpu.count" LabelGPUCount = "nvidia.com/gpu.count"
LabelGPUProduct = "nvidia.com/gpu.product" LabelGPUProduct = "nvidia.com/gpu.product"
LabelGPUMemory = "nvidia.com/gpu.memory" LabelGPUMemory = "nvidia.com/gpu.memory"
// DCGM exporter label constants // DCGM exporter label constants
LabelApp = "app" LabelApp = "app"
LabelAppKubernetesName = "app.kubernetes.io/name" LabelAppKubernetesName = "app.kubernetes.io/name"
LabelValueNvidiaDCGMExporter = "nvidia-dcgm-exporter" LabelValueNvidiaDCGMExporter = "nvidia-dcgm-exporter"
LabelValueDCGMExporter = "dcgm-exporter" LabelValueNvidiaNetworkOperator = "nvidia-network-operator"
LabelValueGPUOperator = "gpu-operator" LabelValueDCGMExporter = "dcgm-exporter"
GPUOperatorNamespace = "gpu-operator" LabelValueGPUOperator = "gpu-operator"
GPUOperatorNamespace = "gpu-operator"
requestTimeout = 5 * time.Second requestTimeout = 5 * time.Second
dialTimeout = 3 * time.Second dialTimeout = 3 * time.Second
tlsHandshakeTimeout = 3 * time.Second tlsHandshakeTimeout = 3 * time.Second
CloudProviderGCP = "gcp"
CloudProviderGCP = "gcp" CloudProviderAWS = "aws"
CloudProviderAWS = "aws" CloudProviderAKS = "aks"
CloudProviderAKS = "aks" CloudProviderOther = "other"
CloudProviderOther = "other" CloudProviderUnknown = "unknown"
CloudProviderUnknown = "unknown" )
// --- Normalization helpers ---
// Named string literals so the same values are not repeated inline.
// NOTE(review): the code that uses these is outside this view. Presumably
// strDash and strSpace are used when normalizing GPU product names, and
// strNone is the "nothing detected" placeholder (the GPUInfo.RDMAType comment
// lists "none" as a possible value) — confirm against the callers.
const (
strDash = "-"
strSpace = " "
strNone = "none"
)
// --- Form factor tokens ---
// The token* values are upper-case markers; the formFactor* values are the
// lower-case form factor names, which match the suffixes used by GPUSKUType
// identifiers such as "h100_sxm" and "h100_pcie".
// NOTE(review): the matching code is not visible here. Presumably HGX and DGX
// in a product name are treated as implying the SXM form factor — confirm.
const (
tokenSXM = "SXM"
tokenHGX = "HGX"
tokenDGX = "DGX"
tokenPCIE = "PCIE"
formFactorSXM = "sxm"
formFactorPCIe = "pcie"
)
// --- GPU model tokens ---
// Upper-case GPU model markers referenced by the gpuRules table.
// tokenMI250 has no SKU of its own: gpuRules maps it to GPUSKUTypeMI200.
const (
tokenGB200 = "GB200"
tokenB200 = "B200"
tokenH200 = "H200"
tokenH100 = "H100"
tokenA100 = "A100"
tokenL40S = "L40S"
tokenL40 = "L40"
tokenL4 = "L4"
tokenV100 = "V100"
tokenT4 = "T4"
tokenMI300 = "MI300"
tokenMI250 = "MI250"
tokenMI200 = "MI200"
// LabelNVLink is the "nvlink" interconnect value (see GPUInfo.Interconnect).
// NOTE(review): this is an exported interconnect name, not a GPU model token;
// consider moving it out of this block to sit with the other exported Label* constants.
LabelNVLink = "nvlink"
)
// awsInstanceTypePrefixes matches known GPU/accelerator instance families on EKS. See: https://aws.amazon.com/ec2/instance-types/ // awsInstanceTypePrefixes matches known GPU/accelerator instance families on EKS. See: https://aws.amazon.com/ec2/instance-types/
...@@ -79,27 +111,64 @@ var gcpMachineSeries = []string{ ...@@ -79,27 +111,64 @@ var gcpMachineSeries = []string{
"g2-", // L4 GPU machines "g2-", // L4 GPU machines
} }
// gpuRule associates one GPU model token with the GPUSKUType value(s) defined
// for that model. In the gpuRules table a rule sets either sxmSKU (optionally
// with pcieSKU) or singleSKU; the unset fields keep their zero value "".
type gpuRule struct {
token string // GPU model token, e.g. tokenH100 ("H100")
sxmSKU nvidiacomv1beta1.GPUSKUType // SKU for the SXM variant of the model
pcieSKU nvidiacomv1beta1.GPUSKUType // SKU for the PCIe variant; empty when no PCIe SKU is defined
singleSKU nvidiacomv1beta1.GPUSKUType // for GPUs without form factor variants
}
// gpuRules lists the recognized GPU model tokens and the SKU(s) each maps to.
// Rules with sxmSKU/pcieSKU distinguish form factors; rules with singleSKU have
// exactly one SKU. GB200, B200 and H200 define no pcieSKU. MI250 and MI200 both
// map to GPUSKUTypeMI200.
//
// NOTE(review): the lookup code is outside this view. If rules are evaluated in
// slice order with substring matching, the order is significant: "B200" is a
// substring of "GB200", and "L4" of "L40" of "L40S", so the more specific token
// must stay first (as it is here). Confirm before reordering or inserting entries.
var gpuRules = []gpuRule{
// Blackwell
{token: tokenGB200, sxmSKU: nvidiacomv1beta1.GPUSKUTypeGB200SXM},
{token: tokenB200, sxmSKU: nvidiacomv1beta1.GPUSKUTypeB200SXM},
// Hopper
{token: tokenH200, sxmSKU: nvidiacomv1beta1.GPUSKUTypeH200SXM},
{token: tokenH100, sxmSKU: nvidiacomv1beta1.GPUSKUTypeH100SXM, pcieSKU: nvidiacomv1beta1.GPUSKUTypeH100PCIe},
// Ampere
{token: tokenA100, sxmSKU: nvidiacomv1beta1.GPUSKUTypeA100SXM, pcieSKU: nvidiacomv1beta1.GPUSKUTypeA100PCIe},
// Ada
{token: tokenL40S, singleSKU: nvidiacomv1beta1.GPUSKUTypeL40S},
{token: tokenL40, singleSKU: nvidiacomv1beta1.GPUSKUTypeL40},
{token: tokenL4, singleSKU: nvidiacomv1beta1.GPUSKUTypeL4},
// Volta / Turing
{token: tokenV100, sxmSKU: nvidiacomv1beta1.GPUSKUTypeV100SXM, pcieSKU: nvidiacomv1beta1.GPUSKUTypeV100PCIe},
{token: tokenT4, singleSKU: nvidiacomv1beta1.GPUSKUTypeT4},
// AMD
{token: tokenMI300, singleSKU: nvidiacomv1beta1.GPUSKUTypeMI300},
{token: tokenMI250, singleSKU: nvidiacomv1beta1.GPUSKUTypeMI200},
{token: tokenMI200, singleSKU: nvidiacomv1beta1.GPUSKUTypeMI200},
}
// GPUInfo contains discovered GPU configuration from cluster nodes // GPUInfo contains discovered GPU configuration from cluster nodes
// DiscoverGPUsFromDCGM returns the GPUInfo of the selected "best" node (highest
// GPU count, ties broken by VRAM) and then fills in the cluster-level fields
// NodesWithGPUs, CloudProvider, RDMAEnabled and RDMAType on that same value.
type GPUInfo struct { type GPUInfo struct {
NodeName string // Name of the node with this GPU configuration NodeName string // Name of the node with this GPU configuration
GPUsPerNode int // Maximum GPUs per node found in the cluster GPUsPerNode int // Maximum GPUs per node found in the cluster
NodesWithGPUs int // Number of nodes that have GPUs NodesWithGPUs int // Number of nodes that have GPUs
Model string // GPU product name (e.g., "H100-SXM5-80GB") Model string // GPU product name (e.g., "H100-SXM5-80GB")
VRAMPerGPU int // VRAM in MiB per GPU VRAMPerGPU int // VRAM in MiB per GPU
System nvidiacomv1beta1.GPUSKUType // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown System nvidiacomv1beta1.GPUSKUType // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
MIGEnabled bool // True if MIG is enabled (inferred from model or additional labels, not implemented in this version) MIGEnabled bool // True if MIG is enabled (inferred from model or additional labels, not implemented in this version)
MIGProfiles map[string]int // Optional: map of MIG profile name to count (requires additional label parsing, not implemented in this version) MIGProfiles map[string]int // Optional: map of MIG profile name to count (requires additional label parsing, not implemented in this version)
CloudProvider string // NEW: aws | gcp | aks | other | unknown CloudProvider string // aws | gcp | aks | other | unknown
// NOTE: as populated by DiscoverGPUsFromDCGM, RDMAEnabled and RDMAType are
// aggregated over every successfully scraped node (plus a separate InfiniBand
// pod check) and then stored on the selected node's GPUInfo, so they describe
// the cluster rather than NodeName specifically.
RDMAEnabled bool // Indicates whether RDMA is enabled for this node (e.g., via InfiniBand, RoCE, or similar high-speed networking)
RDMAType string // Type of RDMA transport detected (e.g., "infiniband", "roce", "rdma", "sriov", or "none")
Interconnect string // Primary GPU-to-GPU interconnect technology used within the node (e.g., "nvlink" for high-bandwidth links or "pcie" for standard bus-based communication)
InterconnectTier string // Qualitative or platform-specific classification of the interconnect (e.g., NVLink generation, topology tier, or vendor-defined performance level)
NVLinkLinks int // Number of NVLink connections per GPU (0 if NVLink is not present or interconnect is PCIe-only)
}
// ScrapeMetricsFunc fetches metrics from the given endpoint and returns the
// GPUInfo derived from them. DiscoverGPUsFromDCGM calls it once per running
// DCGM exporter pod, passing the URL produced by buildDCGMEndpoint.
type ScrapeMetricsFunc func(ctx context.Context, endpoint string) (*GPUInfo, error) type ScrapeMetricsFunc func(ctx context.Context, endpoint string) (*GPUInfo, error)
// GPUDiscoveryCache holds one discovered GPUInfo together with the time at
// which it expires. DiscoverGPUsFromDCGM stores its result here via Set with a
// 60-second lifetime.
// NOTE(review): the accessor methods are outside this view; presumably mu
// guards both value and expiresAt — confirm against Get/Set.
type GPUDiscoveryCache struct { type GPUDiscoveryCache struct {
mu sync.RWMutex mu sync.RWMutex
value *GPUInfo value *GPUInfo
expiresAt time.Time expiresAt time.Time
} }
// GPUDiscovery performs GPU discovery using a pluggable metrics scraper.
// Scraper is the function DiscoverGPUsFromDCGM invokes for each DCGM endpoint.
type GPUDiscovery struct { type GPUDiscovery struct {
Scraper ScrapeMetricsFunc Scraper ScrapeMetricsFunc
} }
...@@ -195,7 +264,6 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien ...@@ -195,7 +264,6 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
if err != nil && !strings.Contains(err.Error(), "no DCGM exporter pods found") { if err != nil && !strings.Contains(err.Error(), "no DCGM exporter pods found") {
return nil, fmt.Errorf("listing DCGM exporter pods failed: %w", err) return nil, fmt.Errorf("listing DCGM exporter pods failed: %w", err)
} }
// If no pods found // If no pods found
if len(dcgmPods) == 0 { if len(dcgmPods) == 0 {
gpuPods, err := listGPUOperatorRunningPods(ctx, k8sClient) gpuPods, err := listGPUOperatorRunningPods(ctx, k8sClient)
...@@ -204,23 +272,28 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien ...@@ -204,23 +272,28 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
} }
return nil, err return nil, err
} }
// Scrape each running pod individually // Scrape each running pod individually
var bestNode *GPUInfo var bestNode *GPUInfo
var scrapeErrors []error var scrapeErrors []error
var rdmaDetected bool
var rdmaType string
nodesWithGPUs := 0 nodesWithGPUs := 0
for _, pod := range dcgmPods { for _, pod := range dcgmPods {
if pod.Status.Phase != corev1.PodRunning || pod.Status.PodIP == "" { if pod.Status.Phase != corev1.PodRunning || pod.Status.PodIP == "" {
continue continue
} }
endpoint := buildDCGMEndpoint(pod.Status.PodIP) endpoint := buildDCGMEndpoint(pod.Status.PodIP)
info, err := g.Scraper(ctx, endpoint) info, err := g.Scraper(ctx, endpoint)
if err != nil { if err != nil {
scrapeErrors = append(scrapeErrors, fmt.Errorf("pod %s (%s): %w", pod.Name, pod.Status.PodIP, err)) scrapeErrors = append(scrapeErrors, fmt.Errorf("pod %s (%s): %w", pod.Name, pod.Status.PodIP, err))
continue continue
} }
// Detect RDMA on the node of this pod
rdma, rType := detectRDMAFromNode(ctx, k8sClient, pod.Spec.NodeName)
if rdma {
rdmaDetected = true
rdmaType = rType
}
// Increment NodesWithGPUs for every node that successfully reports GPU metrics // Increment NodesWithGPUs for every node that successfully reports GPU metrics
nodesWithGPUs++ nodesWithGPUs++
// Select best node: highest GPU count, tie-breaker by VRAM // Select best node: highest GPU count, tie-breaker by VRAM
...@@ -228,18 +301,21 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien ...@@ -228,18 +301,21 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
info.GPUsPerNode > bestNode.GPUsPerNode || info.GPUsPerNode > bestNode.GPUsPerNode ||
(info.GPUsPerNode == bestNode.GPUsPerNode && (info.GPUsPerNode == bestNode.GPUsPerNode &&
info.VRAMPerGPU > bestNode.VRAMPerGPU) { info.VRAMPerGPU > bestNode.VRAMPerGPU) {
bestNode = info bestNode = info
} }
} }
if bestNode == nil { if bestNode == nil {
if len(scrapeErrors) > 0 { if len(scrapeErrors) > 0 {
return nil, fmt.Errorf("failed to scrape any DCGM exporter pod: %v", scrapeErrors) return nil, fmt.Errorf("failed to scrape any DCGM exporter pod: %v", scrapeErrors)
} }
return nil, fmt.Errorf("no GPU metrics could be parsed from any DCGM pod") return nil, fmt.Errorf("no GPU metrics could be parsed from any DCGM pod")
} }
// --- Detect RDMA and InfiniBand presence ---
ib := detectIBPods(ctx, k8sClient)
if ib {
rdmaType = "infiniband"
rdmaDetected = true
}
// Infer cloud provider for the best node // Infer cloud provider for the best node
cloudProvider, err := GetCloudProviderInfo(ctx, k8sClient) cloudProvider, err := GetCloudProviderInfo(ctx, k8sClient)
if err != nil { if err != nil {
...@@ -247,61 +323,51 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien ...@@ -247,61 +323,51 @@ func (g *GPUDiscovery) DiscoverGPUsFromDCGM(ctx context.Context, k8sClient clien
} }
bestNode.CloudProvider = cloudProvider bestNode.CloudProvider = cloudProvider
bestNode.NodesWithGPUs = nodesWithGPUs bestNode.NodesWithGPUs = nodesWithGPUs
bestNode.RDMAEnabled = rdmaDetected
bestNode.RDMAType = rdmaType
if cache != nil { if cache != nil {
// Cache result for 60 seconds // Cache result for 60 seconds
cache.Set(bestNode, 60*time.Second) cache.Set(bestNode, 60*time.Second)
} }
return bestNode, nil return bestNode, nil
} }
// buildDCGMEndpoint returns the metrics URL for the DCGM exporter pod at podIP.
//
// The URL template is read from the DCGM_METRICS_ENDPOINT_TEMPLATE environment
// variable; when that is unset or empty, defaultDCGMEndpointTemplate
// ("http://{POD_IP}:9400/metrics") is used. Every occurrence of the literal
// placeholder "{POD_IP}" in the template is replaced with podIP.
//
// The template is not validated: one without the placeholder is returned
// unchanged, so every pod would resolve to the same endpoint.
func buildDCGMEndpoint(podIP string) string { func buildDCGMEndpoint(podIP string) string {
template := os.Getenv("DCGM_METRICS_ENDPOINT_TEMPLATE") template := os.Getenv("DCGM_METRICS_ENDPOINT_TEMPLATE")
if template == "" { if template == "" {
template = defaultDCGMEndpointTemplate template = defaultDCGMEndpointTemplate
} }
return strings.ReplaceAll(template, "{POD_IP}", podIP) return strings.ReplaceAll(template, "{POD_IP}", podIP)
} }
// listDCGMExporterPods returns the pods that carry any of the known DCGM
// exporter labels.
//
// It issues one List call per label selector, with no namespace option, and
// merges the results, de-duplicating pods by "namespace/name". Pods are not
// filtered by phase here; the caller skips non-running pods itself.
//
// Error handling is best-effort: a failing selector is remembered but does not
// stop the remaining selectors, and its error is dropped if any selector
// returned pods. When no pods were found, the last list error is returned if
// there was one; otherwise the error is "no DCGM exporter pods found".
//
// NOTE: DiscoverGPUsFromDCGM recognizes the not-found case by matching that
// exact message as a substring of err.Error(), so the text must stay in sync
// with the caller.
func listDCGMExporterPods(ctx context.Context, k8sClient client.Reader) ([]corev1.Pod, error) { func listDCGMExporterPods(ctx context.Context, k8sClient client.Reader) ([]corev1.Pod, error) {
var result []corev1.Pod var result []corev1.Pod
// seen holds "namespace/name" keys of pods already added to result.
seen := make(map[string]struct{}) seen := make(map[string]struct{})
selectors := []client.MatchingLabels{ selectors := []client.MatchingLabels{
{LabelApp: LabelValueNvidiaDCGMExporter}, {LabelApp: LabelValueNvidiaDCGMExporter},
{LabelApp: LabelValueDCGMExporter}, {LabelApp: LabelValueDCGMExporter},
{LabelAppKubernetesName: LabelValueDCGMExporter}, {LabelAppKubernetesName: LabelValueDCGMExporter},
} }
var lastErr error var lastErr error
for _, selector := range selectors { for _, selector := range selectors {
podList := &corev1.PodList{} podList := &corev1.PodList{}
err := k8sClient.List(ctx, podList, selector) err := k8sClient.List(ctx, podList, selector)
if err != nil { if err != nil {
// Keep going: another selector may still match.
lastErr = fmt.Errorf("list pods: %w", err) lastErr = fmt.Errorf("list pods: %w", err)
continue continue
} }
for _, pod := range podList.Items { for _, pod := range podList.Items {
key := pod.Namespace + "/" + pod.Name key := pod.Namespace + "/" + pod.Name
if _, exists := seen[key]; !exists { if _, exists := seen[key]; !exists {
seen[key] = struct{}{} seen[key] = struct{}{}
result = append(result, pod) result = append(result, pod)
} }
} }
} }
// Any match wins over earlier list errors.
if len(result) > 0 { if len(result) > 0 {
return result, nil return result, nil
} }
if lastErr != nil { if lastErr != nil {
return nil, lastErr return nil, lastErr
} }
return nil, fmt.Errorf("no DCGM exporter pods found") return nil, fmt.Errorf("no DCGM exporter pods found")
} }
...@@ -311,20 +377,15 @@ func listDCGMExporterPods(ctx context.Context, k8sClient client.Reader) ([]corev ...@@ -311,20 +377,15 @@ func listDCGMExporterPods(ctx context.Context, k8sClient client.Reader) ([]corev
// It uses common GPU Operator label selectors and deduplicates results // It uses common GPU Operator label selectors and deduplicates results
// across selectors. If no running pods are found, an error is returned. // across selectors. If no running pods are found, an error is returned.
func listGPUOperatorRunningPods(ctx context.Context, k8sClient client.Reader) ([]corev1.Pod, error) { func listGPUOperatorRunningPods(ctx context.Context, k8sClient client.Reader) ([]corev1.Pod, error) {
var result []corev1.Pod var result []corev1.Pod
seen := make(map[string]struct{}) seen := make(map[string]struct{})
selectors := []client.MatchingLabels{ selectors := []client.MatchingLabels{
{LabelApp: LabelValueGPUOperator}, {LabelApp: LabelValueGPUOperator},
{LabelAppKubernetesName: LabelValueGPUOperator}, {LabelAppKubernetesName: LabelValueGPUOperator},
} }
var lastErr error var lastErr error
for _, selector := range selectors { for _, selector := range selectors {
podList := &corev1.PodList{} podList := &corev1.PodList{}
err := k8sClient.List( err := k8sClient.List(
ctx, ctx,
podList, podList,
...@@ -335,29 +396,23 @@ func listGPUOperatorRunningPods(ctx context.Context, k8sClient client.Reader) ([ ...@@ -335,29 +396,23 @@ func listGPUOperatorRunningPods(ctx context.Context, k8sClient client.Reader) ([
lastErr = fmt.Errorf("list gpu operator pods: %w", err) lastErr = fmt.Errorf("list gpu operator pods: %w", err)
continue continue
} }
for _, pod := range podList.Items { for _, pod := range podList.Items {
if pod.Status.Phase != corev1.PodRunning { if pod.Status.Phase != corev1.PodRunning {
continue continue
} }
key := pod.Namespace + "/" + pod.Name key := pod.Namespace + "/" + pod.Name
if _, exists := seen[key]; !exists { if _, exists := seen[key]; !exists {
seen[key] = struct{}{} seen[key] = struct{}{}
result = append(result, pod) result = append(result, pod)
} }
} }
} }
if len(result) > 0 { if len(result) > 0 {
return result, nil return result, nil
} }
if lastErr != nil { if lastErr != nil {
return nil, lastErr return nil, lastErr
} }
return nil, fmt.Errorf( return nil, fmt.Errorf(
"gpu operator is not installed %s", "gpu operator is not installed %s",
GPUOperatorNamespace, GPUOperatorNamespace,
...@@ -385,7 +440,6 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro ...@@ -385,7 +440,6 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
// Set a timeout for the request // Set a timeout for the request
ctx, cancel := context.WithTimeout(ctx, requestTimeout) ctx, cancel := context.WithTimeout(ctx, requestTimeout)
defer cancel() defer cancel()
// Create a custom HTTP client with transport-level timeouts // Create a custom HTTP client with transport-level timeouts
client := &http.Client{ client := &http.Client{
Transport: &http.Transport{ Transport: &http.Transport{
...@@ -396,12 +450,10 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro ...@@ -396,12 +450,10 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
TLSHandshakeTimeout: tlsHandshakeTimeout, // TLS handshake timeout TLSHandshakeTimeout: tlsHandshakeTimeout, // TLS handshake timeout
}, },
} }
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil { if err != nil {
return nil, fmt.Errorf("create request for %s: %w", endpoint, err) return nil, fmt.Errorf("create request for %s: %w", endpoint, err)
} }
resp, err := client.Do(req) resp, err := client.Do(req)
if err != nil { if err != nil {
return nil, fmt.Errorf("HTTP GET %s failed: %w", endpoint, err) return nil, fmt.Errorf("HTTP GET %s failed: %w", endpoint, err)
...@@ -412,7 +464,6 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro ...@@ -412,7 +464,6 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
log.FromContext(ctx).V(1).Info("failed to close response body", "err", cerr) log.FromContext(ctx).V(1).Info("failed to close response body", "err", cerr)
} }
}() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
return nil, fmt.Errorf( return nil, fmt.Errorf(
"metrics endpoint %s returned status %d", "metrics endpoint %s returned status %d",
...@@ -420,20 +471,18 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro ...@@ -420,20 +471,18 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
resp.StatusCode, resp.StatusCode,
) )
} }
parser := expfmt.NewTextParser(model.UTF8Validation) parser := expfmt.NewTextParser(model.UTF8Validation)
metricFamilies, err := parser.TextToMetricFamilies(resp.Body) metricFamilies, err := parser.TextToMetricFamilies(resp.Body)
if err != nil { if err != nil {
return nil, fmt.Errorf("parse prometheus metrics: %w", err) return nil, fmt.Errorf("parse prometheus metrics: %w", err)
} }
return parseMetrics(ctx, metricFamilies) return parseMetrics(ctx, metricFamilies)
} }
// parseMetrics extracts GPU information for a node from DCGM Prometheus metrics. // parseMetrics extracts GPU information and interconnect type for a node from DCGM Prometheus metrics.
// //
// It parses the provided Prometheus metric families exported by the NVIDIA // It parses the provided Prometheus metric families exported by the NVIDIA
// DCGM exporter and derives high-level GPU inventory information for the node. // DCGM exporter and derives high-level GPU inventory and interconnect information for the node.
// //
// The function performs the following: // The function performs the following:
// //
...@@ -446,6 +495,10 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro ...@@ -446,6 +495,10 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
// VRAM = FB_FREE + FB_USED + FB_RESERVED // VRAM = FB_FREE + FB_USED + FB_RESERVED
// (values are in MiB). // (values are in MiB).
// //
// - Determines the interconnect type (PCIe or NVLink) from the
// DCGM_FI_DEV_NVLINK_LINK_COUNT metric. If NVLink links are present,
// interconnect is set to "nvlink", otherwise defaults to "pcie".
//
// - Assumes MIG is disabled unless explicit MIG metrics are present // - Assumes MIG is disabled unless explicit MIG metrics are present
// (not included in the provided DCGM metric set). // (not included in the provided DCGM metric set).
// //
...@@ -464,17 +517,18 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro ...@@ -464,17 +517,18 @@ func ScrapeMetricsEndpoint(ctx context.Context, endpoint string) (*GPUInfo, erro
// - MIGEnabled: false because no MIG metrics were collected in the DCGM families // - MIGEnabled: false because no MIG metrics were collected in the DCGM families
// - MIGProfiles: empty map; would contain MIG profile counts if MIG metrics were available // - MIGProfiles: empty map; would contain MIG profile counts if MIG metrics were available
// - System (inferred from model) // - System (inferred from model)
// - Interconnect: "pcie" or "nvlink" depending on detected NVLink links
// //
// Returns an error if no GPUs can be detected from the metrics. // Returns an error if no GPUs can be detected from the metrics.
// //
// Notes: // Notes:
// - This function relies on DCGM exporter metrics. // - This function relies on DCGM exporter metrics.
// - If required metrics are missing, zero values may be returned. // - If required metrics are missing, zero values may be returned.
// - Interconnect detection is based on NVLink link count; other interconnects are not currently detected.
// - The implementation assumes homogeneous GPUs per node. // - The implementation assumes homogeneous GPUs per node.
// - For heterogeneous configurations, per-GPU parsing should be implemented. // - For heterogeneous configurations, per-GPU parsing should be implemented.
func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*GPUInfo, error) { func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*GPUInfo, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
getLabel := func(m *dto.Metric, name string) string { getLabel := func(m *dto.Metric, name string) string {
for _, l := range m.GetLabel() { for _, l := range m.GetLabel() {
if l.GetName() == name { if l.GetName() == name {
...@@ -483,18 +537,16 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (* ...@@ -483,18 +537,16 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
} }
return "" return ""
} }
// Track unique GPUs // Track unique GPUs
gpuSet := map[string]struct{}{} gpuSet := map[string]struct{}{}
var model string var model string
var vram int var vram int
var hostName string var hostName string
var nvlinkDetected bool
var nvlinkLinks int
fbFree := map[string]float64{} fbFree := map[string]float64{}
fbUsed := map[string]float64{} fbUsed := map[string]float64{}
fbReserved := map[string]float64{} fbReserved := map[string]float64{}
// --- Detect GPUs + Model + Hostname --- // --- Detect GPUs + Model + Hostname ---
if mf, ok := families["DCGM_FI_DEV_GPU_TEMP"]; ok { if mf, ok := families["DCGM_FI_DEV_GPU_TEMP"]; ok {
for _, m := range mf.Metric { for _, m := range mf.Metric {
...@@ -503,19 +555,16 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (* ...@@ -503,19 +555,16 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
continue continue
} }
gpuSet[gpuID] = struct{}{} gpuSet[gpuID] = struct{}{}
// Extract model from label // Extract model from label
if model == "" { if model == "" {
model = getLabel(m, "modelName") model = getLabel(m, "modelName")
} }
// Extract Hostname label // Extract Hostname label
if hostName == "" { if hostName == "" {
hostName = getLabel(m, "Hostname") hostName = getLabel(m, "Hostname")
} }
} }
} }
// --- Collect framebuffer metrics --- // --- Collect framebuffer metrics ---
if mf, ok := families["DCGM_FI_DEV_FB_FREE"]; ok { if mf, ok := families["DCGM_FI_DEV_FB_FREE"]; ok {
for _, m := range mf.Metric { for _, m := range mf.Metric {
...@@ -524,13 +573,11 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (* ...@@ -524,13 +573,11 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
continue continue
} }
fbFree[gpuID] = m.GetGauge().GetValue() fbFree[gpuID] = m.GetGauge().GetValue()
if hostName == "" { if hostName == "" {
hostName = getLabel(m, "Hostname") hostName = getLabel(m, "Hostname")
} }
} }
} }
if mf, ok := families["DCGM_FI_DEV_FB_USED"]; ok { if mf, ok := families["DCGM_FI_DEV_FB_USED"]; ok {
for _, m := range mf.Metric { for _, m := range mf.Metric {
gpuID := getLabel(m, "gpu") gpuID := getLabel(m, "gpu")
...@@ -538,13 +585,11 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (* ...@@ -538,13 +585,11 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
continue continue
} }
fbUsed[gpuID] = m.GetGauge().GetValue() fbUsed[gpuID] = m.GetGauge().GetValue()
if hostName == "" { if hostName == "" {
hostName = getLabel(m, "Hostname") hostName = getLabel(m, "Hostname")
} }
} }
} }
if mf, ok := families["DCGM_FI_DEV_FB_RESERVED"]; ok { if mf, ok := families["DCGM_FI_DEV_FB_RESERVED"]; ok {
for _, m := range mf.Metric { for _, m := range mf.Metric {
gpuID := getLabel(m, "gpu") gpuID := getLabel(m, "gpu")
...@@ -552,13 +597,37 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (* ...@@ -552,13 +597,37 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
continue continue
} }
fbReserved[gpuID] = m.GetGauge().GetValue() fbReserved[gpuID] = m.GetGauge().GetValue()
if hostName == "" { if hostName == "" {
hostName = getLabel(m, "Hostname") hostName = getLabel(m, "Hostname")
} }
} }
} }
if mf, ok := families["DCGM_FI_DEV_NVLINK_LINK_COUNT"]; ok {
for _, m := range mf.Metric {
val := int(m.GetGauge().GetValue())
if val > 0 {
nvlinkDetected = true
nvlinkLinks = val
break
}
}
}
// --- Determine interconnect type ---
interconnect := "pcie"
interconnectDetail := strNone
if nvlinkDetected {
switch {
case nvlinkLinks >= 12:
interconnect = LabelNVLink
interconnectDetail = "full-mesh" // HGX / DGX class
case nvlinkLinks >= 6:
interconnect = LabelNVLink
interconnectDetail = "high"
default:
interconnect = LabelNVLink
interconnectDetail = "partial"
}
}
// --- Calculate Max VRAM // --- Calculate Max VRAM
for gpuID := range gpuSet { for gpuID := range gpuSet {
total := int(fbFree[gpuID] + fbUsed[gpuID] + fbReserved[gpuID]) total := int(fbFree[gpuID] + fbUsed[gpuID] + fbReserved[gpuID])
...@@ -566,32 +635,33 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (* ...@@ -566,32 +635,33 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
vram = total vram = total
} }
} }
gpuCount := len(gpuSet) gpuCount := len(gpuSet)
if gpuCount == 0 { if gpuCount == 0 {
return nil, fmt.Errorf("no GPUs detected from DCGM metrics") return nil, fmt.Errorf("no GPUs detected from DCGM metrics")
} }
// --- Infer system from model --- // --- Infer system from model ---
system := InferHardwareSystem(model) system := InferHardwareSystem(model)
logger.Info("Parsed GPU info", logger.Info("Parsed GPU info",
"node", hostName, "node", hostName,
"gpuCount", gpuCount, "gpuCount", gpuCount,
"model", model, "model", model,
"vramMiB", vram, "vramMiB", vram,
"system", system, "system", system,
"interconnect", interconnect,
"interconnectDetail", interconnectDetail,
"nvlinkLinks", nvlinkLinks,
) )
return &GPUInfo{ return &GPUInfo{
NodeName: hostName, NodeName: hostName,
GPUsPerNode: gpuCount, GPUsPerNode: gpuCount,
Model: model, Model: model,
VRAMPerGPU: vram, VRAMPerGPU: vram,
MIGEnabled: false, MIGEnabled: false,
MIGProfiles: map[string]int{}, MIGProfiles: map[string]int{},
System: system, // populated from InferHardwareSystem System: system, // populated from InferHardwareSystem
Interconnect: interconnect,
InterconnectTier: interconnectDetail,
NVLinkLinks: nvlinkLinks,
}, nil }, nil
} }
...@@ -605,23 +675,18 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (* ...@@ -605,23 +675,18 @@ func parseMetrics(ctx context.Context, families map[string]*dto.MetricFamily) (*
func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error) { func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error) {
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("Starting GPU discovery from cluster nodes") logger.Info("Starting GPU discovery from cluster nodes")
// List all nodes in the cluster // List all nodes in the cluster
nodeList := &corev1.NodeList{} nodeList := &corev1.NodeList{}
if err := k8sClient.List(ctx, nodeList); err != nil { if err := k8sClient.List(ctx, nodeList); err != nil {
return nil, fmt.Errorf("failed to list cluster nodes: %w", err) return nil, fmt.Errorf("failed to list cluster nodes: %w", err)
} }
if len(nodeList.Items) == 0 { if len(nodeList.Items) == 0 {
return nil, fmt.Errorf("no nodes found in cluster") return nil, fmt.Errorf("no nodes found in cluster")
} }
logger.Info("Found cluster nodes", "count", len(nodeList.Items)) logger.Info("Found cluster nodes", "count", len(nodeList.Items))
// Track the best GPU configuration found // Track the best GPU configuration found
var bestGPUInfo *GPUInfo var bestGPUInfo *GPUInfo
nodesWithGPUs := 0 nodesWithGPUs := 0
for i := range nodeList.Items { for i := range nodeList.Items {
node := &nodeList.Items[i] node := &nodeList.Items[i]
gpuInfo, err := extractGPUInfoFromNode(node) gpuInfo, err := extractGPUInfoFromNode(node)
...@@ -632,14 +697,12 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error ...@@ -632,14 +697,12 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
"reason", err.Error()) "reason", err.Error())
continue continue
} }
nodesWithGPUs++ nodesWithGPUs++
logger.Info("Found GPU node", logger.Info("Found GPU node",
"node", node.Name, "node", node.Name,
"gpus", gpuInfo.GPUsPerNode, "gpus", gpuInfo.GPUsPerNode,
"model", gpuInfo.Model, "model", gpuInfo.Model,
"vram", gpuInfo.VRAMPerGPU) "vram", gpuInfo.VRAMPerGPU)
// Select best configuration: prefer higher GPU count, then higher VRAM // Select best configuration: prefer higher GPU count, then higher VRAM
if bestGPUInfo == nil || if bestGPUInfo == nil ||
gpuInfo.GPUsPerNode > bestGPUInfo.GPUsPerNode || gpuInfo.GPUsPerNode > bestGPUInfo.GPUsPerNode ||
...@@ -647,17 +710,14 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error ...@@ -647,17 +710,14 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
bestGPUInfo = gpuInfo bestGPUInfo = gpuInfo
} }
} }
if bestGPUInfo == nil { if bestGPUInfo == nil {
return nil, fmt.Errorf("no nodes with NVIDIA GPU Feature Discovery labels found (checked %d nodes). "+ return nil, fmt.Errorf("no nodes with NVIDIA GPU Feature Discovery labels found (checked %d nodes). "+
"Ensure GPU nodes have labels: %s, %s, %s", "Ensure GPU nodes have labels: %s, %s, %s",
len(nodeList.Items), LabelGPUCount, LabelGPUProduct, LabelGPUMemory) len(nodeList.Items), LabelGPUCount, LabelGPUProduct, LabelGPUMemory)
} }
// Infer hardware system from GPU model // Infer hardware system from GPU model
bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model) bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model)
bestGPUInfo.NodesWithGPUs = nodesWithGPUs bestGPUInfo.NodesWithGPUs = nodesWithGPUs
logger.Info("GPU discovery completed", logger.Info("GPU discovery completed",
"gpusPerNode", bestGPUInfo.GPUsPerNode, "gpusPerNode", bestGPUInfo.GPUsPerNode,
"nodesWithGPUs", bestGPUInfo.NodesWithGPUs, "nodesWithGPUs", bestGPUInfo.NodesWithGPUs,
...@@ -665,7 +725,6 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error ...@@ -665,7 +725,6 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
"model", bestGPUInfo.Model, "model", bestGPUInfo.Model,
"vram", bestGPUInfo.VRAMPerGPU, "vram", bestGPUInfo.VRAMPerGPU,
"system", bestGPUInfo.System) "system", bestGPUInfo.System)
return bestGPUInfo, nil return bestGPUInfo, nil
} }
...@@ -676,7 +735,6 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) { ...@@ -676,7 +735,6 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
if labels == nil { if labels == nil {
return nil, fmt.Errorf("node has no labels") return nil, fmt.Errorf("node has no labels")
} }
gpuCountStr, ok := labels[LabelGPUCount] gpuCountStr, ok := labels[LabelGPUCount]
if !ok { if !ok {
return nil, fmt.Errorf("missing label %s", LabelGPUCount) return nil, fmt.Errorf("missing label %s", LabelGPUCount)
...@@ -685,12 +743,10 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) { ...@@ -685,12 +743,10 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
if err != nil || gpuCount <= 0 { if err != nil || gpuCount <= 0 {
return nil, fmt.Errorf("invalid GPU count: %s", gpuCountStr) return nil, fmt.Errorf("invalid GPU count: %s", gpuCountStr)
} }
gpuModel, ok := labels[LabelGPUProduct] gpuModel, ok := labels[LabelGPUProduct]
if !ok || gpuModel == "" { if !ok || gpuModel == "" {
return nil, fmt.Errorf("missing or empty label %s", LabelGPUProduct) return nil, fmt.Errorf("missing or empty label %s", LabelGPUProduct)
} }
// Extract VRAM (memory in MiB) // Extract VRAM (memory in MiB)
gpuMemoryStr, ok := labels[LabelGPUMemory] gpuMemoryStr, ok := labels[LabelGPUMemory]
if !ok { if !ok {
...@@ -700,7 +756,6 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) { ...@@ -700,7 +756,6 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
if err != nil || gpuMemory <= 0 { if err != nil || gpuMemory <= 0 {
return nil, fmt.Errorf("invalid GPU memory: %s", gpuMemoryStr) return nil, fmt.Errorf("invalid GPU memory: %s", gpuMemoryStr)
} }
return &GPUInfo{ return &GPUInfo{
GPUsPerNode: gpuCount, GPUsPerNode: gpuCount,
Model: gpuModel, Model: gpuModel,
...@@ -708,12 +763,18 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) { ...@@ -708,12 +763,18 @@ func extractGPUInfoFromNode(node *corev1.Node) (*GPUInfo, error) {
}, nil }, nil
} }
// InferHardwareSystem maps GPU product name to hardware system identifier. // InferHardwareSystem attempts to infer a normalized GPU SKU type from a
// Returns empty string if the GPU model cannot be confidently mapped. // free-form product string (e.g. "NVIDIA H100 SXM", "A100-PCIE").
// //
// This is a best-effort mapping based on common NVIDIA datacenter GPU naming patterns. // The function performs three main steps:
// The system identifier is used by the profiler for performance estimation and configuration. // 1. Normalize the input string to a consistent format.
// 2. Detect the GPU form factor (SXM vs PCIe).
// 3. Match the normalized string against known GPU tokens and return
// the corresponding SKU type.
// //
// Matching is based on substring checks and is tolerant of variations
// in formatting (case, spaces, dashes). If no known GPU is detected,
// an empty SKU type is returned.
// Limitations: // Limitations:
// - Cannot distinguish SXM vs. PCIe variants from labels alone (assumes SXM for datacenter GPUs) // - Product strings with no SXM/HGX/DGX marker are treated as PCIe (see detectFormFactor), so an SXM part whose label omits the form factor may be reported as the PCIe SKU
// - New GPU models require code updates (gracefully returns empty string) // - New GPU models require code updates (gracefully returns empty string)
...@@ -726,34 +787,80 @@ func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType { ...@@ -726,34 +787,80 @@ func InferHardwareSystem(gpuProduct string) nvidiacomv1beta1.GPUSKUType {
return "" return ""
} }
// Normalize: uppercase, remove spaces/dashes for pattern matching normalized := normalize(gpuProduct)
normalized := strings.ToUpper(strings.ReplaceAll(gpuProduct, "-", "")) formFactor := detectFormFactor(normalized)
normalized = strings.ReplaceAll(normalized, " ", "")
// Map common NVIDIA datacenter GPU products to AIC hardware system identifiers.
patterns := []struct {
pattern string
system nvidiacomv1beta1.GPUSKUType
}{
{"GB200", nvidiacomv1beta1.GPUSKUTypeGB200SXM},
{"H200", nvidiacomv1beta1.GPUSKUTypeH200SXM},
{"H100", nvidiacomv1beta1.GPUSKUTypeH100SXM},
{"B200", nvidiacomv1beta1.GPUSKUTypeB200SXM},
{"A100", nvidiacomv1beta1.GPUSKUTypeA100SXM},
{"L40S", nvidiacomv1beta1.GPUSKUTypeL40S},
}
for _, p := range patterns { for _, rule := range gpuRules {
if strings.Contains(normalized, p.pattern) { if strings.Contains(normalized, rule.token) {
return p.system if rule.singleSKU != "" {
return rule.singleSKU
}
if formFactor == formFactorSXM && rule.sxmSKU != "" {
return rule.sxmSKU
}
if rule.pcieSKU != "" {
return rule.pcieSKU
}
} }
} }
// Unknown GPU type, return empty value.
// User must specify gpuSku explicitly in spec.hardware.
return "" return ""
} }
// normalize canonicalizes a GPU product string so that later substring
// matching does not depend on the caller's formatting.
//
// The transformation runs in three steps:
//  1. every occurrence of strDash is rewritten to strSpace,
//  2. the result is upper-cased,
//  3. every literal space character is removed.
//
// Per the package tests, "H100-SXM", "h100 sxm" and "H100SXM" all come out
// as "H100SXM", and an empty input yields an empty string.
func normalize(input string) string {
	spaced := strings.ReplaceAll(input, strDash, strSpace)
	upper := strings.ToUpper(spaced)
	return strings.ReplaceAll(upper, " ", "")
}
// detectFormFactor classifies an already-normalized product string as
// either formFactorSXM or formFactorPCIe.
//
// The string is reported as SXM when it contains any of tokenSXM, tokenHGX
// or tokenDGX. Everything else is reported as PCIe: an explicit PCIe marker
// and the complete absence of a marker resolve to the same answer, so PCIe
// doubles as the default. Because the SXM markers are checked first, a
// string that carries both an SXM and a PCIe marker is classified as SXM.
func detectFormFactor(normalized string) string {
	for _, marker := range []string{tokenSXM, tokenHGX, tokenDGX} {
		if strings.Contains(normalized, marker) {
			return formFactorSXM
		}
	}
	// Explicit PCIe and "no marker at all" share the same result.
	return formFactorPCIe
}
// GetCloudProviderInfo attempts to infer the cloud provider of the Kubernetes cluster.
//
// The function inspects the first node in the cluster (assumes homogeneous node setup)
// and uses a combination of ProviderID and node labels to detect the provider.
//
// Detection logic:
// - Primary detection uses node.Spec.ProviderID:
// - "azure" → AKS
// - "aws" → AWS
// - "gce" → GCP
// - Secondary detection uses node labels and instance type prefixes:
// - AKS: "kubernetes.azure.com/cluster" label or instance type starting with "standard_"
// - AWS: "eks.amazonaws.com/nodegroup" label or known AWS instance type prefix
// - GCP: "cloud.google.com/gke-nodepool" label or known GCP machine series prefix
// - If none match, returns "other".
//
// Parameters:
// - ctx: Context for logging, cancellation, or timeout.
// - k8sClient: Kubernetes client for reading Node objects.
//
// Returns:
// - A string identifying the cloud provider ("aks", "aws", "gcp", "other", or "unknown").
// - An error if no nodes are found or listing fails.
func GetCloudProviderInfo(ctx context.Context, k8sClient client.Reader) (string, error) { func GetCloudProviderInfo(ctx context.Context, k8sClient client.Reader) (string, error) {
var nodeList corev1.NodeList var nodeList corev1.NodeList
if err := k8sClient.List(ctx, &nodeList); err != nil { if err := k8sClient.List(ctx, &nodeList); err != nil {
...@@ -801,6 +908,13 @@ func GetCloudProviderInfo(ctx context.Context, k8sClient client.Reader) (string, ...@@ -801,6 +908,13 @@ func GetCloudProviderInfo(ctx context.Context, k8sClient client.Reader) (string,
return "other", nil return "other", nil
} }
// isGCPInstanceType checks whether a given instance type string matches known GCP machine series.
//
// Parameters:
// - instanceType: string representing the node's instance type (lowercased).
//
// Returns:
// - true if the instance type belongs to a GCP machine series prefix.
func isGCPInstanceType(instanceType string) bool { func isGCPInstanceType(instanceType string) bool {
for _, prefix := range gcpMachineSeries { for _, prefix := range gcpMachineSeries {
if strings.HasPrefix(instanceType, prefix) { if strings.HasPrefix(instanceType, prefix) {
...@@ -810,6 +924,13 @@ func isGCPInstanceType(instanceType string) bool { ...@@ -810,6 +924,13 @@ func isGCPInstanceType(instanceType string) bool {
return false return false
} }
// isAWSInstanceType checks whether a given instance type string matches known AWS instance type prefixes.
//
// Parameters:
// - instanceType: string representing the node's instance type (lowercased).
//
// Returns:
// - true if the instance type belongs to an AWS instance type prefix.
func isAWSInstanceType(instanceType string) bool { func isAWSInstanceType(instanceType string) bool {
for _, prefix := range awsInstanceTypePrefixes { for _, prefix := range awsInstanceTypePrefixes {
if strings.HasPrefix(instanceType, prefix) { if strings.HasPrefix(instanceType, prefix) {
...@@ -818,3 +939,59 @@ func isAWSInstanceType(instanceType string) bool { ...@@ -818,3 +939,59 @@ func isAWSInstanceType(instanceType string) bool {
} }
return false return false
} }
// detectRDMAFromNode reports whether a single node advertises RDMA or
// SR-IOV networking through its labels.
//
// The node is fetched by name and two labels are consulted, in this order:
//   - "nvidia.com/rdma.present" == "true"                          → (true, "rdma")
//   - "feature.node.kubernetes.io/network-sriov.capable" == "true" → (true, "sriov")
//
// When both labels are set, "rdma" wins because it is checked first.
//
// Parameters:
//   - ctx: passed through to the client Get call.
//   - k8sClient: Kubernetes reader used to fetch the Node object.
//   - nodeName: name of the node to inspect.
//
// Returns:
//   - true together with "rdma" or "sriov" when a matching label is found.
//   - false together with strNone otherwise. This is also the result when
//     the node cannot be fetched: the lookup error is intentionally not
//     surfaced, so callers cannot tell "not capable" from "lookup failed".
func detectRDMAFromNode(ctx context.Context, k8sClient client.Reader, nodeName string) (bool, string) {
	var target corev1.Node
	key := types.NamespacedName{Name: nodeName}
	if err := k8sClient.Get(ctx, key, &target); err != nil {
		return false, strNone
	}

	// Indexing a nil label map is safe in Go and simply yields "".
	switch {
	case target.Labels["nvidia.com/rdma.present"] == "true":
		return true, "rdma"
	case target.Labels["feature.node.kubernetes.io/network-sriov.capable"] == "true":
		return true, "sriov"
	}
	return false, strNone
}
// detectIBPods reports whether any pod whose name contains "rdma" exists in
// the namespace identified by LabelValueNvidiaNetworkOperator (presumably
// "nvidia-network-operator" — confirm against the constant's definition).
//
// Only the pod name is examined; pod phase, labels and containers are not
// considered, and no substring other than "rdma" is matched.
//
// Parameters:
//   - ctx: passed through to the client List call.
//   - k8sClient: Kubernetes reader used to list pods.
//
// Returns:
//   - true as soon as one matching pod name is found.
//   - false when no pod matches, or when listing the pods fails (the list
//     error is intentionally not surfaced).
func detectIBPods(ctx context.Context, k8sClient client.Reader) bool {
	var pods corev1.PodList
	listErr := k8sClient.List(ctx, &pods, client.InNamespace(LabelValueNvidiaNetworkOperator))
	if listErr != nil {
		return false
	}

	for i := range pods.Items {
		if strings.Contains(pods.Items[i].Name, "rdma") {
			return true
		}
	}
	return false
}
...@@ -26,6 +26,7 @@ import ( ...@@ -26,6 +26,7 @@ import (
"strings" "strings"
"testing" "testing"
nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
dto "github.com/prometheus/client_model/go" dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
...@@ -322,32 +323,159 @@ func TestExtractGPUInfoFromNode_MissingLabels(t *testing.T) { ...@@ -322,32 +323,159 @@ func TestExtractGPUInfoFromNode_MissingLabels(t *testing.T) {
func TestInferHardwareSystem(t *testing.T) { func TestInferHardwareSystem(t *testing.T) {
tests := []struct { tests := []struct {
gpuProduct string name string
expectedSystem string input string
description string expected nvidiacomv1beta1.GPUSKUType
}{ }{
{"H100-SXM5-80GB", "h100_sxm", "H100 SXM variant"}, // --- Empty / unknown ---
{"H100-PCIE-80GB", "h100_sxm", "H100 PCIe variant (mapped to SXM)"}, {
{"H200-SXM5-141GB", "h200_sxm", "H200 SXM variant"}, name: "empty input",
{"A100-SXM4-40GB", "a100_sxm", "A100 SXM variant"}, input: "",
{"A100-PCIE-80GB", "a100_sxm", "A100 PCIe variant (mapped to SXM)"}, expected: "",
{"L40S", "l40s", "L40S"}, },
{"NVIDIA L40S", "l40s", "L40S with prefix"}, {
{"B200-SXM", "b200_sxm", "B200 SXM"}, name: "unknown gpu",
{"GB200", "gb200_sxm", "GB200"}, input: "random-gpu",
{"Tesla V100-SXM2-16GB", "", "V100 (not in mapping)"}, expected: "",
{"RTX 4090", "", "Consumer GPU (not in mapping)"}, },
{"Unknown-GPU", "", "Unknown GPU"},
{"", "", "Empty string"}, // --- Blackwell ---
// GFD product names as seen in real cluster labels (regression for GPUSKU bug) {
{"NVIDIA-B200", "b200_sxm", "B200 with NVIDIA prefix (GFD label format)"}, name: "GB200 SXM",
{"NVIDIA-H200-SXM5-141GB", "h200_sxm", "H200 with NVIDIA prefix (GFD label format)"}, input: "GB200-SXM",
expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM,
},
{
name: "GB200 HGX (implies SXM)",
input: "HGX GB200",
expected: nvidiacomv1beta1.GPUSKUTypeGB200SXM,
},
{
name: "B200 SXM",
input: "B200 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeB200SXM,
},
// --- Hopper ---
{
name: "H100 SXM",
input: "H100 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeH100SXM,
},
{
name: "H100 PCIe explicit",
input: "H100 PCIe",
expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
},
{
name: "H100 default PCIe",
input: "H100",
expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
},
{
name: "H200 SXM",
input: "H200 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeH200SXM,
},
// --- Ampere ---
{
name: "A100 SXM",
input: "A100-SXM",
expected: nvidiacomv1beta1.GPUSKUTypeA100SXM,
},
{
name: "A100 PCIe",
input: "A100 PCIe",
expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe,
},
{
name: "A100 default PCIe",
input: "A100",
expected: nvidiacomv1beta1.GPUSKUTypeA100PCIe,
},
// --- Ada ---
{
name: "L40S",
input: "L40S",
expected: nvidiacomv1beta1.GPUSKUTypeL40S,
},
{
name: "L40S should not match L40",
input: "L40S",
expected: nvidiacomv1beta1.GPUSKUTypeL40S,
},
{
name: "L40",
input: "L40",
expected: nvidiacomv1beta1.GPUSKUTypeL40,
},
{
name: "L4",
input: "L4",
expected: nvidiacomv1beta1.GPUSKUTypeL4,
},
// --- Volta / Turing ---
{
name: "V100 SXM",
input: "V100 SXM",
expected: nvidiacomv1beta1.GPUSKUTypeV100SXM,
},
{
name: "V100 PCIe",
input: "V100 PCIe",
expected: nvidiacomv1beta1.GPUSKUTypeV100PCIe,
},
{
name: "T4",
input: "T4",
expected: nvidiacomv1beta1.GPUSKUTypeT4,
},
// --- AMD ---
{
name: "MI300",
input: "MI300",
expected: nvidiacomv1beta1.GPUSKUTypeMI300,
},
{
name: "MI250",
input: "MI250",
expected: nvidiacomv1beta1.GPUSKUTypeMI200,
},
{
name: "MI200",
input: "MI200",
expected: nvidiacomv1beta1.GPUSKUTypeMI200,
},
// --- Normalization tests ---
{
name: "lowercase + spaces",
input: "h100 sxm",
expected: nvidiacomv1beta1.GPUSKUTypeH100SXM,
},
{
name: "mixed case + dash",
input: "A100-sXm",
expected: nvidiacomv1beta1.GPUSKUTypeA100SXM,
},
{
name: "with extra spaces",
input: " H100 PCIe ",
expected: nvidiacomv1beta1.GPUSKUTypeH100PCIe,
},
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.description, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
result := InferHardwareSystem(tt.gpuProduct) result := InferHardwareSystem(tt.input)
assert.Equal(t, tt.expectedSystem, string(result), "Failed for GPU: %s", tt.gpuProduct) if result != tt.expected {
t.Errorf("InferHardwareSystem(%q) = %v, want %v",
tt.input, result, tt.expected)
}
}) })
} }
} }
...@@ -382,6 +510,119 @@ func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) { ...@@ -382,6 +510,119 @@ func TestInferHardwareSystem_SpacesAndDashes(t *testing.T) {
} }
} }
// TestNormalize checks that normalize upper-cases its input and strips
// spaces and dashes, covering surrounding whitespace, mixed separators and
// the empty string.
func TestNormalize(t *testing.T) {
	type normalizeCase struct {
		name  string
		input string
		want  string
	}

	cases := []normalizeCase{
		{"basic lowercase", "h100", "H100"},
		{"spaces removed", "H100 SXM", "H100SXM"},
		{"dashes replaced and removed", "H100-SXM", "H100SXM"},
		{"mixed spaces and dashes", "A100 - SXM", "A100SXM"},
		{"extra whitespace", "  H100   PCIe  ", "H100PCIE"},
		{"complex string", "h100-sxm5-80gb", "H100SXM580GB"},
		{"empty string", "", ""},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := normalize(tc.input); got != tc.want {
				t.Errorf("normalize(%q) = %q, want %q",
					tc.input, got, tc.want)
			}
		})
	}
}
// TestDetectFormFactor checks detectFormFactor on already-normalized
// strings: SXM/HGX/DGX markers map to formFactorSXM, an explicit PCIE marker
// maps to formFactorPCIe, SXM takes precedence when both markers appear, and
// anything without a marker (including the empty string) falls back to
// formFactorPCIe.
func TestDetectFormFactor(t *testing.T) {
	type formFactorCase struct {
		name  string
		input string // already normalized
		want  string
	}

	cases := []formFactorCase{
		{"detect SXM explicitly", "H100SXM", formFactorSXM},
		{"detect HGX implies SXM", "HGXH100", formFactorSXM},
		{"detect DGX implies SXM", "DGXH100", formFactorSXM},
		{"detect PCIe explicitly", "H100PCIE", formFactorPCIe},
		{"default to PCIe when unknown", "H100", formFactorPCIe},
		{"SXM wins over PCIe if both present", "H100SXMPCIE", formFactorSXM},
		{"random string defaults to PCIe", "RANDOMGPU", formFactorPCIe},
		{"empty string defaults to PCIe", "", formFactorPCIe},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := detectFormFactor(tc.input); got != tc.want {
				t.Errorf("detectFormFactor(%q) = %v, want %v",
					tc.input, got, tc.want)
			}
		})
	}
}
func TestParseMetrics(t *testing.T) { func TestParseMetrics(t *testing.T) {
ctx := context.Background() ctx := context.Background()
...@@ -873,3 +1114,117 @@ func TestGetCloudProviderInfo(t *testing.T) { ...@@ -873,3 +1114,117 @@ func TestGetCloudProviderInfo(t *testing.T) {
}) })
} }
} }
// TestDetectRDMAFromNode exercises detectRDMAFromNode against a fake client
// seeded with at most one Node per case. It covers a missing node, each
// label on its own, both labels together (rdma takes precedence), a node
// with no relevant labels, and labels explicitly set to "false".
func TestDetectRDMAFromNode(t *testing.T) {
	scheme := runtime.NewScheme()
	// Fail fast if the core types cannot be registered: every case below
	// would otherwise fail with a misleading lookup error.
	if err := corev1.AddToScheme(scheme); err != nil {
		t.Fatalf("failed to register corev1 types in scheme: %v", err)
	}

	tests := []struct {
		name        string
		node        *corev1.Node // nil means the fake client holds no node
		nodeName    string
		expectedOK  bool
		expectedTyp string
	}{
		{
			name:        "node not found",
			node:        nil,
			nodeName:    "missing-node",
			expectedOK:  false,
			expectedTyp: strNone,
		},
		{
			name: "rdma detected",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-rdma",
					Labels: map[string]string{
						"nvidia.com/rdma.present": "true",
					},
				},
			},
			nodeName:    "node-rdma",
			expectedOK:  true,
			expectedTyp: "rdma",
		},
		{
			name: "sriov detected",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-sriov",
					Labels: map[string]string{
						"feature.node.kubernetes.io/network-sriov.capable": "true",
					},
				},
			},
			nodeName:    "node-sriov",
			expectedOK:  true,
			expectedTyp: "sriov",
		},
		{
			name: "both rdma and sriov - rdma takes precedence",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-both",
					Labels: map[string]string{
						"nvidia.com/rdma.present":                          "true",
						"feature.node.kubernetes.io/network-sriov.capable": "true",
					},
				},
			},
			nodeName:    "node-both",
			expectedOK:  true,
			expectedTyp: "rdma",
		},
		{
			name: "no relevant labels",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name:   "node-none",
					Labels: map[string]string{},
				},
			},
			nodeName:    "node-none",
			expectedOK:  false,
			expectedTyp: strNone,
		},
		{
			name: "labels present but false",
			node: &corev1.Node{
				ObjectMeta: metav1.ObjectMeta{
					Name: "node-false",
					Labels: map[string]string{
						"nvidia.com/rdma.present":                          "false",
						"feature.node.kubernetes.io/network-sriov.capable": "false",
					},
				},
			},
			nodeName:    "node-false",
			expectedOK:  false,
			expectedTyp: strNone,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Only seed the fake client when the case supplies a node; a nil
			// *corev1.Node must never be wrapped in a runtime.Object.
			var objs []runtime.Object
			if tt.node != nil {
				objs = append(objs, tt.node)
			}
			fakeClient := fake.NewClientBuilder().
				WithScheme(scheme).
				WithRuntimeObjects(objs...).
				Build()

			ok, typ := detectRDMAFromNode(context.Background(), fakeClient, tt.nodeName)
			if ok != tt.expectedOK {
				t.Errorf("expected ok=%v, got %v", tt.expectedOK, ok)
			}
			if typ != tt.expectedTyp {
				t.Errorf("expected type=%s, got %s", tt.expectedTyp, typ)
			}
		})
	}
}
...@@ -1542,19 +1542,28 @@ _Underlying type:_ _string_ ...@@ -1542,19 +1542,28 @@ _Underlying type:_ _string_
GPUSKUType is the AIC hardware system identifier for a supported GPU. GPUSKUType is the AIC hardware system identifier for a supported GPU.
_Validation:_ _Validation:_
- Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s] - Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300]
_Appears in:_ _Appears in:_
- [HardwareSpec](#hardwarespec) - [HardwareSpec](#hardwarespec)
| Field | Description | | Field | Description |
| --- | --- | | --- | --- |
| `gb200_sxm` | | | `gb200_sxm` | --- Blackwell ---<br /> |
| `h200_sxm` | |
| `h100_sxm` | |
| `b200_sxm` | | | `b200_sxm` | |
| `a100_sxm` | | | `h200_sxm` | --- Hopper ---<br /> |
| `l40s` | | | `h100_sxm` | |
| `h100_pcie` | |
| `a100_sxm` | --- Ampere ---<br /> |
| `a100_pcie` | |
| `l40s` | --- Ada ---<br /> |
| `l40` | |
| `l4` | |
| `v100_sxm` | --- Older NVIDIA ---<br /> |
| `v100_pcie` | |
| `t4` | |
| `mi200` | --- AMD ---<br /> |
| `mi300` | |
#### HardwareSpec #### HardwareSpec
...@@ -1571,10 +1580,12 @@ _Appears in:_ ...@@ -1571,10 +1580,12 @@ _Appears in:_
| Field | Description | Default | Validation | | Field | Description | Default | Validation |
| --- | --- | --- | --- | | --- | --- | --- | --- |
| `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. | | Enum: [gb200_sxm h200_sxm h100_sxm b200_sxm a100_sxm l40s] <br />Optional: \{\} <br /> | | `gpuSku` _[GPUSKUType](#gpuskutype)_ | GPUSKU is the AIC hardware system identifier for the GPU.<br />When omitted, the operator auto-detects this via InferHardwareSystem from cluster GPU node labels. | | Enum: [gb200_sxm b200_sxm h200_sxm h100_sxm h100_pcie a100_sxm a100_pcie l40s l40 l4 v100_sxm v100_pcie t4 mi200 mi300] <br />Optional: \{\} <br /> |
| `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. | | Optional: \{\} <br /> | | `vramMb` _float_ | VRAMMB is the VRAM per GPU in MiB. | | Optional: \{\} <br /> |
| `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. | | Optional: \{\} <br /> | | `totalGpus` _integer_ | TotalGPUs is the total number of GPUs available in the cluster. | | Optional: \{\} <br /> |
| `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. | | Optional: \{\} <br /> | | `numGpusPerNode` _integer_ | NumGPUsPerNode is the number of GPUs per node. | | Optional: \{\} <br /> |
| `interconnect` _string_ | Interconnect describes the GPU interconnect type within a node.<br />Examples: "pcie", "nvlink", "infiniband". | | Optional: \{\} <br /> |
| `rdma` _boolean_ | RDMA indicates whether RDMA is available on the cluster. | | Optional: \{\} <br /> |
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment