Commit 98a6d3b9 (unverified), authored by hhzhang16, committed by GitHub
Browse files

fix: store nodesWithGPUs (#6690)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 35c30493
......@@ -1169,7 +1169,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
}
hw := dgdr.Spec.Hardware
if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil {
return nil // all fields already set by user
return nil // all fields already set by user; TotalGPUs is filled below when discovery runs
}
gpuInfo, err := gpu.DiscoverGPUs(ctx, r.APIReader)
......@@ -1180,6 +1180,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
logger := log.FromContext(ctx)
logger.Info("GPU discovery completed successfully",
"gpusPerNode", gpuInfo.GPUsPerNode,
"nodesWithGPUs", gpuInfo.NodesWithGPUs,
"totalGpus", gpuInfo.GPUsPerNode*gpuInfo.NodesWithGPUs,
"model", gpuInfo.Model,
"vramMiB", gpuInfo.VRAMPerGPU)
......@@ -1194,6 +1196,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
n := int32(gpuInfo.GPUsPerNode)
hw.NumGPUsPerNode = &n
}
if hw.TotalGPUs == nil {
total := int32(gpuInfo.GPUsPerNode * gpuInfo.NodesWithGPUs)
hw.TotalGPUs = &total
}
return nil
}
......
......@@ -102,6 +102,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -153,6 +154,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -227,6 +229,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -313,6 +316,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -373,6 +377,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -483,6 +488,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -607,6 +613,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -682,6 +689,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -801,6 +809,7 @@ var _ = Describe("DGDR Validation", func() {
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -824,6 +833,7 @@ var _ = Describe("DGDR Validation", func() {
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -885,6 +895,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
GPUSKU: "H200-SXM",
NumGPUsPerNode: ptr.To[int32](8),
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(50.0),
......@@ -949,6 +960,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
GPUSKU: "H200-SXM",
NumGPUsPerNode: ptr.To[int32](8),
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(50.0),
......@@ -1012,6 +1024,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(50.0),
......@@ -1093,6 +1106,7 @@ var _ = Describe("DGDR Error Handling", func() {
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -1668,6 +1682,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -1732,6 +1747,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -1776,6 +1792,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -1869,6 +1886,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......@@ -1961,6 +1979,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
},
SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0),
......
......@@ -37,10 +37,11 @@ const (
// GPUInfo contains discovered GPU configuration from cluster nodes.
//
// Callers derive the cluster-wide GPU count as GPUsPerNode * NodesWithGPUs.
// Each field is declared exactly once: Go rejects a struct type that repeats
// a field name, so the pre-change and post-change field lists must not be
// left side by side in the same struct body.
type GPUInfo struct {
	GPUsPerNode   int    // Maximum GPUs per node found in the cluster
	NodesWithGPUs int    // Number of nodes that have GPUs
	Model         string // GPU product name (e.g., "H100-SXM5-80GB")
	VRAMPerGPU    int    // VRAM in MiB per GPU
	System        string // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
}
// DiscoverGPUs queries Kubernetes nodes to determine GPU configuration.
......@@ -104,13 +105,15 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
// Infer hardware system from GPU model
bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model)
bestGPUInfo.NodesWithGPUs = nodesWithGPUs
logger.Info("GPU discovery completed",
"gpusPerNode", bestGPUInfo.GPUsPerNode,
"nodesWithGPUs", bestGPUInfo.NodesWithGPUs,
"totalGpus", bestGPUInfo.GPUsPerNode*bestGPUInfo.NodesWithGPUs,
"model", bestGPUInfo.Model,
"vram", bestGPUInfo.VRAMPerGPU,
"system", bestGPUInfo.System,
"nodesWithGPUs", nodesWithGPUs)
"system", bestGPUInfo.System)
return bestGPUInfo, nil
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment