package cuda import ( "context" "errors" "net" "path/filepath" "strings" "testing" "time" "github.com/go-logr/logr" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" corev1 "k8s.io/api/core/v1" resourcev1 "k8s.io/api/resource/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/fake" podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1" ) func TestBuildDeviceMap(t *testing.T) { tests := []struct { name string source []string target []string want string wantErr bool }{ { name: "single GPU", source: []string{"GPU-aaa"}, target: []string{"GPU-bbb"}, want: "GPU-aaa=GPU-bbb", }, { name: "multiple GPUs", source: []string{"GPU-aaa", "GPU-bbb"}, target: []string{"GPU-ccc", "GPU-ddd"}, want: "GPU-aaa=GPU-ccc,GPU-bbb=GPU-ddd", }, { name: "mismatched lengths", source: []string{"GPU-aaa", "GPU-bbb"}, target: []string{"GPU-ccc"}, wantErr: true, }, { name: "both empty", source: []string{}, target: []string{}, wantErr: true, }, { name: "source empty target non-empty", source: []string{}, target: []string{"GPU-aaa"}, wantErr: true, }, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { got, err := BuildDeviceMap(tc.source, tc.target, logr.Discard()) if tc.wantErr { if err == nil { t.Errorf("expected error, got %q", got) } return } if err != nil { t.Fatalf("unexpected error: %v", err) } if got != tc.want { t.Errorf("got %q, want %q", got, tc.want) } }) } } type testPodResourcesServer struct { podresourcesv1.UnimplementedPodResourcesListerServer resp *podresourcesv1.ListPodResourcesResponse } func (s *testPodResourcesServer) List(context.Context, *podresourcesv1.ListPodResourcesRequest) (*podresourcesv1.ListPodResourcesResponse, error) { return s.resp, nil } func (s *testPodResourcesServer) GetAllocatableResources(context.Context, *podresourcesv1.AllocatableResourcesRequest) (*podresourcesv1.AllocatableResourcesResponse, error) { return nil, status.Error(codes.Unimplemented, "not implemented in test") } func (s *testPodResourcesServer) Get(context.Context, *podresourcesv1.GetPodResourcesRequest) (*podresourcesv1.GetPodResourcesResponse, error) { return nil, status.Error(codes.Unimplemented, "not implemented in test") } func installTestPodResourcesServer(t *testing.T, resp *podresourcesv1.ListPodResourcesResponse) { socketDir := t.TempDir() socketPath := filepath.Join(socketDir, "kubelet.sock") listener, err := net.Listen("unix", socketPath) if err != nil { t.Fatalf("listen unix socket: %v", err) } server := grpc.NewServer() podresourcesv1.RegisterPodResourcesListerServer(server, &testPodResourcesServer{ resp: resp, }) go func() { if serveErr := server.Serve(listener); serveErr != nil { if errors.Is(serveErr, grpc.ErrServerStopped) || strings.Contains(serveErr.Error(), "use of closed network connection") { return } t.Errorf("serve test pod-resources gRPC server: %v", serveErr) } }() t.Cleanup(server.Stop) t.Cleanup(func() { _ = listener.Close() }) previousSocketPath := podResourcesSocketPath podResourcesSocketPath = socketPath t.Cleanup(func() { podResourcesSocketPath = previousSocketPath }) } func TestGetPodGPUUUIDs(t *testing.T) { installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{ PodResources: []*podresourcesv1.PodResources{ { Name: "other-pod", Namespace: "default", Containers: []*podresourcesv1.ContainerResources{ { Name: "main", Devices: []*podresourcesv1.ContainerDevices{ { ResourceName: nvidiaGPUResource, DeviceIds: []string{"GPU-ignore"}, }, }, }, }, }, { Name: "test-pod", Namespace: "default", Containers: []*podresourcesv1.ContainerResources{ { Name: "sidecar", Devices: []*podresourcesv1.ContainerDevices{ { ResourceName: nvidiaGPUResource, DeviceIds: []string{"GPU-sidecar"}, }, }, }, { Name: "main", Devices: []*podresourcesv1.ContainerDevices{ { ResourceName: nvidiaGPUResource, DeviceIds: []string{"GPU-a", "GPU-b"}, }, { ResourceName: "example.com/fpga", DeviceIds: []string{"FPGA-ignore"}, }, { ResourceName: nvidiaGPUResource, DeviceIds: []string{"GPU-c"}, }, }, }, }, }, }, }) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() got, err := GetPodGPUUUIDs(ctx, "test-pod", "default", "main") if err != nil { t.Fatalf("GetPodGPUUUIDs: %v", err) } want := []string{"GPU-a", "GPU-b", "GPU-c"} if len(got) != len(want) { t.Fatalf("got %v, want %v", got, want) } for i := range want { if got[i] != want[i] { t.Fatalf("got %v, want %v", got, want) } } } func TestDiscoverGPUUUIDsUsesPodResourcesForClassicPod(t *testing.T) { installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{ PodResources: []*podresourcesv1.PodResources{ { Name: "test-pod", Namespace: "default", Containers: []*podresourcesv1.ContainerResources{ { Name: "main", Devices: []*podresourcesv1.ContainerDevices{ { ResourceName: nvidiaGPUResource, DeviceIds: []string{"GPU-a", "GPU-b"}, }, }, }, }, }, }, }) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() got, err := DiscoverGPUUUIDs( ctx, nil, "test-pod", "default", "main", "/proc", 123, logr.Discard(), ) if err != nil { t.Fatalf("DiscoverGPUUUIDs: %v", err) } want := []string{"GPU-a", "GPU-b"} if len(got) != len(want) { t.Fatalf("got %v, want %v", got, want) } for i := range want { if got[i] != want[i] { t.Fatalf("got %v, want %v", got, want) } } } func TestDiscoverGPUUUIDsFallsBackToPodResourcesAfterDRAAPILookupError(t *testing.T) { installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{ PodResources: []*podresourcesv1.PodResources{ { Name: "test-pod", Namespace: "default", Containers: []*podresourcesv1.ContainerResources{ { Name: "main", Devices: []*podresourcesv1.ContainerDevices{ { ResourceName: nvidiaGPUResource, DeviceIds: []string{"GPU-a"}, }, }, }, }, }, }, }) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() got, err := DiscoverGPUUUIDs( ctx, fake.NewSimpleClientset(), "test-pod", "default", "main", "/proc", 123, logr.Discard(), ) if err != nil { t.Fatalf("DiscoverGPUUUIDs: %v", err) } if len(got) != 1 || got[0] != "GPU-a" { t.Fatalf("got %v, want [GPU-a]", got) } } func TestDiscoverGPUUUIDsPrefersDRAForDRAPod(t *testing.T) { previousSocketPath := podResourcesSocketPath podResourcesSocketPath = filepath.Join(t.TempDir(), "missing-kubelet.sock") t.Cleanup(func() { podResourcesSocketPath = previousSocketPath }) nodeName := "node-1" poolName := "pool-node-1" namespace := "default" podName := "test-pod" claimName := "gpu-claim" uuid := "GPU-ffffffff-1111-2222-3333-444444444444" pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace}, Spec: corev1.PodSpec{ NodeName: nodeName, ResourceClaims: []corev1.PodResourceClaim{ { Name: "gpu", ResourceClaimName: &claimName, }, }, }, } claim := &resourcev1.ResourceClaim{ ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace}, Status: resourcev1.ResourceClaimStatus{ Allocation: &resourcev1.AllocationResult{ Devices: resourcev1.DeviceAllocationResult{ Results: []resourcev1.DeviceRequestAllocationResult{ {Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-0", Request: "gpu"}, }, }, }, }, } slice := &resourcev1.ResourceSlice{ ObjectMeta: metav1.ObjectMeta{Name: poolName + "-gpu.nvidia.com-xxx"}, Spec: resourcev1.ResourceSliceSpec{ Driver: nvidiaGPUDRADriver, NodeName: &nodeName, Pool: resourcev1.ResourcePool{Name: poolName}, Devices: []resourcev1.Device{ { Name: "gpu-0", Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{ resourcev1.QualifiedName("uuid"): {StringValue: &uuid}, }, }, }, }, } client := fake.NewSimpleClientset(pod, claim, slice) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() got, err := DiscoverGPUUUIDs( ctx, client, podName, namespace, "main", "/proc", 123, logr.Discard(), ) if err != nil { t.Fatalf("DiscoverGPUUUIDs: %v", err) } if len(got) != 1 || got[0] != uuid { t.Fatalf("got %v, want [%s]", got, uuid) } }