enrich_hardware_test.go 4.02 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package controller

import (
	"context"
	"fmt"
	"testing"

	nvidiacomv1beta1 "github.com/ai-dynamo/dynamo/deploy/operator/api/v1beta1"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/tools/record"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"

	gpupkg "github.com/ai-dynamo/dynamo/deploy/operator/internal/gpu"
)

func newFakeReconciler(nodes ...*corev1.Node) *DynamoGraphDeploymentRequestReconciler {
	scheme := runtime.NewScheme()
	_ = corev1.AddToScheme(scheme)
	objs := make([]client.Object, len(nodes))
	for i, n := range nodes {
		objs[i] = n
	}
	fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build()
	return &DynamoGraphDeploymentRequestReconciler{
		Client:    fakeClient,
		APIReader: fakeClient,
		Recorder:  &record.FakeRecorder{},
	}
}

func gpuNode(name, product string, gpuCount int, vramMiB int) *corev1.Node {
	return &corev1.Node{
		ObjectMeta: metav1.ObjectMeta{
			Name: name,
			Labels: map[string]string{
				gpupkg.LabelGPUCount:   intStr(gpuCount),
				gpupkg.LabelGPUProduct: product,
				gpupkg.LabelGPUMemory:  intStr(vramMiB),
			},
		},
	}
}

func intStr(n int) string {
	return fmt.Sprintf("%d", n)
}

// TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier is the regression test for the
// bug where GPUSKU was set to the raw GFD product name (e.g. "NVIDIA-B200") instead of
// the AIC system identifier (e.g. "b200_sxm"), causing AIC support checks to always fail
// and forcing every model/backend to fall back to naive config generation.
func TestEnrichHardwareFromDiscovery_UsesAICSystemIdentifier(t *testing.T) {
	tests := []struct {
		name           string
		gfdProduct     string // raw GFD label value
		expectedGPUSKU string // what the profiler needs
	}{
		{
			name:           "B200 GFD label maps to AIC system identifier",
			gfdProduct:     "NVIDIA-B200",
			expectedGPUSKU: "b200_sxm",
		},
		{
			name:           "H200 GFD label maps to AIC system identifier",
			gfdProduct:     "NVIDIA-H200-SXM5-141GB",
			expectedGPUSKU: "h200_sxm",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			r := newFakeReconciler(gpuNode("gpu-node-1", tt.gfdProduct, 8, 141312))
			dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}

			err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
			require.NoError(t, err)
			require.NotNil(t, dgdr.Spec.Hardware)
			assert.Equal(t, tt.expectedGPUSKU, string(dgdr.Spec.Hardware.GPUSKU),
				"GPUSKU should be the AIC system identifier, not the raw GFD product name %q", tt.gfdProduct)
		})
	}
}

// TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU verifies that for GPUs
// not in the AIC support matrix, the raw GFD product name is used as a fallback.
func TestEnrichHardwareFromDiscovery_FallsBackToModelForUnknownGPU(t *testing.T) {
	r := newFakeReconciler(gpuNode("gpu-node-1", "Tesla-V100-SXM2-16GB", 8, 16384))
	dgdr := &nvidiacomv1beta1.DynamoGraphDeploymentRequest{}

	err := r.enrichHardwareFromDiscovery(context.Background(), dgdr)
	require.NoError(t, err)
	require.NotNil(t, dgdr.Spec.Hardware)
	assert.Equal(t, "Tesla-V100-SXM2-16GB", string(dgdr.Spec.Hardware.GPUSKU),
		"Unknown GPU should fall back to raw model name")
}