enrich_hardware_test.go 6.36 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package controller

import (
	"context"
	"testing"

	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/tools/record"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"

	gpupkg "github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
35
	"k8s.io/utils/ptr"
36
37
)

38
func newFakeReconciler(objs ...client.Object) *DynamoGraphDeploymentRequestReconciler {
39
40
41
42
43
44
45
46
47
48
	scheme := runtime.NewScheme()
	_ = corev1.AddToScheme(scheme)
	fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build()
	return &DynamoGraphDeploymentRequestReconciler{
		Client:    fakeClient,
		APIReader: fakeClient,
		Recorder:  &record.FakeRecorder{},
	}
}

49
50
51
52
53
func dcgmPod(name, ip string) *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "gpu-operator",
			Labels: map[string]string{gpupkg.LabelApp: gpupkg.LabelValueNvidiaDCGMExporter}},
		Status: corev1.PodStatus{Phase: corev1.PodRunning, PodIP: ip},
54
55
56
	}
}

57
func TestEnrichHardwareFromDiscovery(t *testing.T) {
58
	tests := []struct {
59
60
61
62
63
64
65
66
67
68
69
70
71
72
		name string

		// Input hardware spec (nil fields = not set by user).
		hardware *nvidiacomv1beta1.HardwareSpec

		// Discovery mock: what DCGM returns. Nil means no discovery available.
		discoveredGPU *gpupkg.GPUInfo

		// Expected outcome.
		wantErr       string // non-empty means error expected, substring match
		wantGPUSKU    string
		wantVRAM      float64
		wantGPUsNode  int32
		wantTotalGPUs int32
73
74
	}{
		{
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
			name: "all four fields set, discovery skipped",
			hardware: &nvidiacomv1beta1.HardwareSpec{
				GPUSKU: "h100_sxm", VRAMMB: ptr.To(81920.0),
				NumGPUsPerNode: ptr.To(int32(8)), TotalGPUs: ptr.To(int32(16)),
			},
			wantGPUSKU: "h100_sxm", wantVRAM: 81920, wantGPUsNode: 8, wantTotalGPUs: 16,
		},
		{
			name:          "nothing set, full discovery",
			discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 8, Model: "H100-SXM5-80GB", VRAMPerGPU: 81920},
			wantGPUSKU:    "h100_sxm", wantVRAM: 81920, wantGPUsNode: 8, wantTotalGPUs: 8,
		},
		{
			name:          "nothing set, V100 discovered",
			discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 8, Model: "Tesla-V100-SXM2-16GB", VRAMPerGPU: 16384},
			wantGPUSKU:    "v100_sxm", wantVRAM: 16384, wantGPUsNode: 8, wantTotalGPUs: 8,
		},
		{
			name:          "nothing set, unknown GPU falls back to model name",
			discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 4, Model: "FutureGPU-X1000", VRAMPerGPU: 65536},
			wantGPUSKU:    "FutureGPU-X1000", wantVRAM: 65536, wantGPUsNode: 4, wantTotalGPUs: 4,
		},
		{
			name: "only totalGpus missing, discovery fills it",
			hardware: &nvidiacomv1beta1.HardwareSpec{
				GPUSKU: "b200_sxm", VRAMMB: ptr.To(141312.0), NumGPUsPerNode: ptr.To(int32(8)),
			},
			discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 8, Model: "B200-SXM-180GB", VRAMPerGPU: 141312},
			wantGPUSKU:    "b200_sxm", wantVRAM: 141312, wantGPUsNode: 8, wantTotalGPUs: 8,
		},
		{
			name: "only gpuSku missing, discovery fills it",
			hardware: &nvidiacomv1beta1.HardwareSpec{
				VRAMMB: ptr.To(81920.0), NumGPUsPerNode: ptr.To(int32(8)), TotalGPUs: ptr.To(int32(16)),
			},
			discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 8, Model: "H200-SXM5-141GB", VRAMPerGPU: 141312},
			wantGPUSKU:    "h200_sxm", wantVRAM: 81920, wantGPUsNode: 8, wantTotalGPUs: 16, // user overrides win
		},
		{
			name: "vramMb and numGpusPerNode override discovery",
			hardware: &nvidiacomv1beta1.HardwareSpec{
				GPUSKU: "a100_sxm", VRAMMB: ptr.To(40960.0), NumGPUsPerNode: ptr.To(int32(4)),
			},
			discoveredGPU: &gpupkg.GPUInfo{NodeName: "n1", GPUsPerNode: 8, Model: "A100-SXM4-80GB", VRAMPerGPU: 81920},
			wantGPUSKU:    "a100_sxm", wantVRAM: 40960, wantGPUsNode: 4, wantTotalGPUs: 8,
		},
		{
			name:    "no fields set, discovery fails",
			wantErr: "auto-discovery failed",
124
125
		},
		{
126
127
128
129
130
			name: "three fields set, discovery fails",
			hardware: &nvidiacomv1beta1.HardwareSpec{
				GPUSKU: "h100_sxm", VRAMMB: ptr.To(81920.0), NumGPUsPerNode: ptr.To(int32(8)),
			},
			wantErr: "auto-discovery failed",
131
132
133
134
135
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
			var r *DynamoGraphDeploymentRequestReconciler
			if tt.discoveredGPU != nil {
				// Set up a DCGM pod and mock scraper so discovery works.
				r = newFakeReconciler(dcgmPod("dcgm-exporter", "10.0.0.1"))
				r.GPUDiscovery = gpupkg.NewGPUDiscovery(func(ctx context.Context, endpoint string) (*gpupkg.GPUInfo, error) {
					return tt.discoveredGPU, nil
				})
				r.GPUDiscoveryCache = gpupkg.NewGPUDiscoveryCache()
			} else if tt.wantErr != "" {
				// No discovery — will fail if discovery is attempted.
				r = newFakeReconciler()
			} else {
				// All fields set — discovery not needed, no mock required.
				r = newFakeReconciler()
			}
151

152
153
154
155
			hw := tt.hardware
			if hw == nil {
				hw = &nvidiacomv1beta1.HardwareSpec{}
			}
156
157
			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{
				Spec: nvidiacomv1beta1.DynamoGraphDeploymentRequestSpec{
158
					Hardware: hw,
159
160
				},
			}
161

162
			err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
163

164
165
166
167
168
			if tt.wantErr != "" {
				require.Error(t, err)
				assert.Contains(t, err.Error(), tt.wantErr)
				return
			}
169
170
			require.NoError(t, err)
			require.NotNil(t, dgdr.Spec.Hardware)
171
172
173
174
			assert.Equal(t, tt.wantGPUSKU, string(dgdr.Spec.Hardware.GPUSKU))
			assert.Equal(t, tt.wantVRAM, *dgdr.Spec.Hardware.VRAMMB)
			assert.Equal(t, tt.wantGPUsNode, *dgdr.Spec.Hardware.NumGPUsPerNode)
			assert.Equal(t, tt.wantTotalGPUs, *dgdr.Spec.Hardware.TotalGPUs)
175
176
177
		})
	}
}