Commit 98a6d3b9 (unverified), authored by hhzhang16, committed by GitHub
Browse files

fix: store nodesWithGPUs (#6690)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 35c30493
...@@ -1169,7 +1169,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx ...@@ -1169,7 +1169,7 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
} }
hw := dgdr.Spec.Hardware hw := dgdr.Spec.Hardware
if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil { if hw.GPUSKU != "" && hw.VRAMMB != nil && hw.NumGPUsPerNode != nil {
return nil // all fields already set by user return nil // all fields already set by user; TotalGPUs is filled below when discovery runs
} }
gpuInfo, err := gpu.DiscoverGPUs(ctx, r.APIReader) gpuInfo, err := gpu.DiscoverGPUs(ctx, r.APIReader)
...@@ -1180,6 +1180,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx ...@@ -1180,6 +1180,8 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
logger := log.FromContext(ctx) logger := log.FromContext(ctx)
logger.Info("GPU discovery completed successfully", logger.Info("GPU discovery completed successfully",
"gpusPerNode", gpuInfo.GPUsPerNode, "gpusPerNode", gpuInfo.GPUsPerNode,
"nodesWithGPUs", gpuInfo.NodesWithGPUs,
"totalGpus", gpuInfo.GPUsPerNode*gpuInfo.NodesWithGPUs,
"model", gpuInfo.Model, "model", gpuInfo.Model,
"vramMiB", gpuInfo.VRAMPerGPU) "vramMiB", gpuInfo.VRAMPerGPU)
...@@ -1194,6 +1196,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx ...@@ -1194,6 +1196,10 @@ func (r *DynamoGraphDeploymentRequestReconciler) enrichHardwareFromDiscovery(ctx
n := int32(gpuInfo.GPUsPerNode) n := int32(gpuInfo.GPUsPerNode)
hw.NumGPUsPerNode = &n hw.NumGPUsPerNode = &n
} }
if hw.TotalGPUs == nil {
total := int32(gpuInfo.GPUsPerNode * gpuInfo.NodesWithGPUs)
hw.TotalGPUs = &total
}
return nil return nil
} }
......
...@@ -102,6 +102,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -102,6 +102,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -153,6 +154,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -153,6 +154,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -227,6 +229,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -227,6 +229,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -313,6 +316,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -313,6 +316,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -373,6 +377,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -373,6 +377,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -483,6 +488,7 @@ spec: ...@@ -483,6 +488,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -607,6 +613,7 @@ spec: ...@@ -607,6 +613,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -682,6 +689,7 @@ spec: ...@@ -682,6 +689,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -801,6 +809,7 @@ var _ = Describe("DGDR Validation", func() { ...@@ -801,6 +809,7 @@ var _ = Describe("DGDR Validation", func() {
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -824,6 +833,7 @@ var _ = Describe("DGDR Validation", func() { ...@@ -824,6 +833,7 @@ var _ = Describe("DGDR Validation", func() {
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -885,6 +895,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -885,6 +895,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
GPUSKU: "H200-SXM", GPUSKU: "H200-SXM",
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(50.0), TTFT: ptr.To(50.0),
...@@ -949,6 +960,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -949,6 +960,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
GPUSKU: "H200-SXM", GPUSKU: "H200-SXM",
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(50.0), TTFT: ptr.To(50.0),
...@@ -1012,6 +1024,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1012,6 +1024,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(50.0), TTFT: ptr.To(50.0),
...@@ -1093,6 +1106,7 @@ var _ = Describe("DGDR Error Handling", func() { ...@@ -1093,6 +1106,7 @@ var _ = Describe("DGDR Error Handling", func() {
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -1668,6 +1682,7 @@ spec: ...@@ -1668,6 +1682,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -1732,6 +1747,7 @@ spec: ...@@ -1732,6 +1747,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -1776,6 +1792,7 @@ spec: ...@@ -1776,6 +1792,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -1869,6 +1886,7 @@ spec: ...@@ -1869,6 +1886,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
...@@ -1961,6 +1979,7 @@ spec: ...@@ -1961,6 +1979,7 @@ spec:
NumGPUsPerNode: ptr.To[int32](8), NumGPUsPerNode: ptr.To[int32](8),
GPUSKU: "H100-SXM5-80GB", GPUSKU: "H100-SXM5-80GB",
VRAMMB: ptr.To(81920.0), VRAMMB: ptr.To(81920.0),
TotalGPUs: ptr.To[int32](128),
}, },
SLA: &nvidiacomv1beta1.SLASpec{ SLA: &nvidiacomv1beta1.SLASpec{
TTFT: ptr.To(100.0), TTFT: ptr.To(100.0),
......
...@@ -37,10 +37,11 @@ const ( ...@@ -37,10 +37,11 @@ const (
// GPUInfo contains discovered GPU configuration from cluster nodes // GPUInfo contains discovered GPU configuration from cluster nodes
type GPUInfo struct { type GPUInfo struct {
GPUsPerNode int // Maximum GPUs per node found in the cluster GPUsPerNode int // Maximum GPUs per node found in the cluster
Model string // GPU product name (e.g., "H100-SXM5-80GB") NodesWithGPUs int // Number of nodes that have GPUs
VRAMPerGPU int // VRAM in MiB per GPU Model string // GPU product name (e.g., "H100-SXM5-80GB")
System string // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown VRAMPerGPU int // VRAM in MiB per GPU
System string // AIC hardware system identifier (e.g., "h100_sxm", "h200_sxm"), empty if unknown
} }
// DiscoverGPUs queries Kubernetes nodes to determine GPU configuration. // DiscoverGPUs queries Kubernetes nodes to determine GPU configuration.
...@@ -104,13 +105,15 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error ...@@ -104,13 +105,15 @@ func DiscoverGPUs(ctx context.Context, k8sClient client.Reader) (*GPUInfo, error
// Infer hardware system from GPU model // Infer hardware system from GPU model
bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model) bestGPUInfo.System = InferHardwareSystem(bestGPUInfo.Model)
bestGPUInfo.NodesWithGPUs = nodesWithGPUs
logger.Info("GPU discovery completed", logger.Info("GPU discovery completed",
"gpusPerNode", bestGPUInfo.GPUsPerNode, "gpusPerNode", bestGPUInfo.GPUsPerNode,
"nodesWithGPUs", bestGPUInfo.NodesWithGPUs,
"totalGpus", bestGPUInfo.GPUsPerNode*bestGPUInfo.NodesWithGPUs,
"model", bestGPUInfo.Model, "model", bestGPUInfo.Model,
"vram", bestGPUInfo.VRAMPerGPU, "vram", bestGPUInfo.VRAMPerGPU,
"system", bestGPUInfo.System, "system", bestGPUInfo.System)
"nodesWithGPUs", nodesWithGPUs)
return bestGPUInfo, nil return bestGPUInfo, nil
} }
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment