Commit 2cc6d1e2 (unverified), authored by Schwinn Saereesitthipitak, committed by GitHub
Browse files

fix(snapshot): resolve DRA GPU UUIDs from claims (#8292)

parent 8428c65f
...@@ -13,7 +13,7 @@ import ( ...@@ -13,7 +13,7 @@ import (
"github.com/go-logr/logr" "github.com/go-logr/logr"
"google.golang.org/grpc" "google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure" "google.golang.org/grpc/credentials/insecure"
"k8s.io/client-go/kubernetes"
podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1" podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
) )
...@@ -104,6 +104,45 @@ func GetGPUUUIDsViaNvidiaSmi(ctx context.Context, hostProcPath string, pid int) ...@@ -104,6 +104,45 @@ func GetGPUUUIDsViaNvidiaSmi(ctx context.Context, hostProcPath string, pid int)
return uuids, nil return uuids, nil
} }
// DiscoverGPUUUIDs returns the GPU UUIDs for a pod's container, consulting the
// discovery sources in a fixed order:
//
//  1. The DRA API (GetGPUUUIDsViaDRAAPI). A lookup error is logged and treated
//     as "no result" so the remaining sources are still tried.
//  2. The PodResources API (GetPodGPUUUIDs), consulted only when the DRA lookup
//     did not report an NVIDIA DRA allocation. An error here is returned.
//  3. nvidia-smi (GetGPUUUIDsViaNvidiaSmi) for pid under hostProcPath, used when
//     no earlier source produced any UUIDs. An error here is returned.
//
// The first source that yields at least one UUID wins.
func DiscoverGPUUUIDs(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace, containerName, hostProcPath string, pid int, log logr.Logger) ([]string, error) {
	// reason records why the nvidia-smi fallback is being used; it is refined
	// as each earlier source is exhausted.
	reason := "DRA API returned no GPU UUIDs"

	draUUIDs, usesDRA, draErr := GetGPUUUIDsViaDRAAPI(ctx, clientset, podName, podNamespace, log)
	switch {
	case draErr != nil:
		// A failed DRA lookup is not fatal: discard its result and keep going.
		log.Error(
			draErr,
			"DRA API GPU UUID lookup failed, trying other discovery paths",
			"pod", podNamespace+"/"+podName,
			"has_nvidia_dra_allocation", usesDRA,
		)
		reason = "DRA API GPU UUID lookup failed"
	case len(draUUIDs) > 0:
		return draUUIDs, nil
	}

	// PodResources is skipped for pods that hold an NVIDIA DRA allocation.
	if !usesDRA {
		classicUUIDs, err := GetPodGPUUUIDs(ctx, podName, podNamespace, containerName)
		if err != nil {
			return nil, fmt.Errorf("PodResources GPU UUID lookup failed: %w", err)
		}
		if len(classicUUIDs) != 0 {
			return classicUUIDs, nil
		}
		reason = "PodResources API returned no GPU UUIDs"
	}

	log.Info(reason+", falling back to nvidia-smi", "pid", pid)
	smiUUIDs, err := GetGPUUUIDsViaNvidiaSmi(ctx, hostProcPath, pid)
	if err != nil {
		return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err)
	}
	log.Info("nvidia-smi fallback discovered GPU UUIDs", "uuids", smiUUIDs)
	return smiUUIDs, nil
}
// FilterProcesses returns the subset of candidate PIDs that hold actual CUDA contexts. // FilterProcesses returns the subset of candidate PIDs that hold actual CUDA contexts.
// Uses --get-restore-tid (the same technique as the CRIU CUDA plugin) instead of // Uses --get-restore-tid (the same technique as the CRIU CUDA plugin) instead of
// --get-state, because --get-state incorrectly matches coordinator processes like // --get-state, because --get-state incorrectly matches coordinator processes like
......
...@@ -13,7 +13,10 @@ import ( ...@@ -13,7 +13,10 @@ import (
"google.golang.org/grpc" "google.golang.org/grpc"
"google.golang.org/grpc/codes" "google.golang.org/grpc/codes"
"google.golang.org/grpc/status" "google.golang.org/grpc/status"
corev1 "k8s.io/api/core/v1"
resourcev1 "k8s.io/api/resource/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1" podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
) )
...@@ -93,7 +96,7 @@ func (s *testPodResourcesServer) Get(context.Context, *podresourcesv1.GetPodReso ...@@ -93,7 +96,7 @@ func (s *testPodResourcesServer) Get(context.Context, *podresourcesv1.GetPodReso
return nil, status.Error(codes.Unimplemented, "not implemented in test") return nil, status.Error(codes.Unimplemented, "not implemented in test")
} }
func TestGetPodGPUUUIDs(t *testing.T) { func installTestPodResourcesServer(t *testing.T, resp *podresourcesv1.ListPodResourcesResponse) {
socketDir := t.TempDir() socketDir := t.TempDir()
socketPath := filepath.Join(socketDir, "kubelet.sock") socketPath := filepath.Join(socketDir, "kubelet.sock")
...@@ -101,61 +104,10 @@ func TestGetPodGPUUUIDs(t *testing.T) { ...@@ -101,61 +104,10 @@ func TestGetPodGPUUUIDs(t *testing.T) {
if err != nil { if err != nil {
t.Fatalf("listen unix socket: %v", err) t.Fatalf("listen unix socket: %v", err)
} }
defer listener.Close()
server := grpc.NewServer() server := grpc.NewServer()
podresourcesv1.RegisterPodResourcesListerServer(server, &testPodResourcesServer{ podresourcesv1.RegisterPodResourcesListerServer(server, &testPodResourcesServer{
resp: &podresourcesv1.ListPodResourcesResponse{ resp: resp,
PodResources: []*podresourcesv1.PodResources{
{
Name: "other-pod",
Namespace: "default",
Containers: []*podresourcesv1.ContainerResources{
{
Name: "main",
Devices: []*podresourcesv1.ContainerDevices{
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-ignore"},
},
},
},
},
},
{
Name: "test-pod",
Namespace: "default",
Containers: []*podresourcesv1.ContainerResources{
{
Name: "sidecar",
Devices: []*podresourcesv1.ContainerDevices{
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-sidecar"},
},
},
},
{
Name: "main",
Devices: []*podresourcesv1.ContainerDevices{
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-a", "GPU-b"},
},
{
ResourceName: "example.com/fpga",
DeviceIds: []string{"FPGA-ignore"},
},
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-c"},
},
},
},
},
},
},
},
}) })
go func() { go func() {
...@@ -167,12 +119,69 @@ func TestGetPodGPUUUIDs(t *testing.T) { ...@@ -167,12 +119,69 @@ func TestGetPodGPUUUIDs(t *testing.T) {
} }
}() }()
t.Cleanup(server.Stop) t.Cleanup(server.Stop)
t.Cleanup(func() {
_ = listener.Close()
})
previousSocketPath := podResourcesSocketPath previousSocketPath := podResourcesSocketPath
podResourcesSocketPath = socketPath podResourcesSocketPath = socketPath
t.Cleanup(func() { t.Cleanup(func() {
podResourcesSocketPath = previousSocketPath podResourcesSocketPath = previousSocketPath
}) })
}
func TestGetPodGPUUUIDs(t *testing.T) {
installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{
PodResources: []*podresourcesv1.PodResources{
{
Name: "other-pod",
Namespace: "default",
Containers: []*podresourcesv1.ContainerResources{
{
Name: "main",
Devices: []*podresourcesv1.ContainerDevices{
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-ignore"},
},
},
},
},
},
{
Name: "test-pod",
Namespace: "default",
Containers: []*podresourcesv1.ContainerResources{
{
Name: "sidecar",
Devices: []*podresourcesv1.ContainerDevices{
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-sidecar"},
},
},
},
{
Name: "main",
Devices: []*podresourcesv1.ContainerDevices{
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-a", "GPU-b"},
},
{
ResourceName: "example.com/fpga",
DeviceIds: []string{"FPGA-ignore"},
},
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-c"},
},
},
},
},
},
},
})
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel() defer cancel()
...@@ -192,3 +201,172 @@ func TestGetPodGPUUUIDs(t *testing.T) { ...@@ -192,3 +201,172 @@ func TestGetPodGPUUUIDs(t *testing.T) {
} }
} }
} }
func TestDiscoverGPUUUIDsUsesPodResourcesForClassicPod(t *testing.T) {
installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{
PodResources: []*podresourcesv1.PodResources{
{
Name: "test-pod",
Namespace: "default",
Containers: []*podresourcesv1.ContainerResources{
{
Name: "main",
Devices: []*podresourcesv1.ContainerDevices{
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-a", "GPU-b"},
},
},
},
},
},
},
})
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
got, err := DiscoverGPUUUIDs(
ctx,
nil,
"test-pod",
"default",
"main",
"/proc",
123,
logr.Discard(),
)
if err != nil {
t.Fatalf("DiscoverGPUUUIDs: %v", err)
}
want := []string{"GPU-a", "GPU-b"}
if len(got) != len(want) {
t.Fatalf("got %v, want %v", got, want)
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("got %v, want %v", got, want)
}
}
}
func TestDiscoverGPUUUIDsFallsBackToPodResourcesAfterDRAAPILookupError(t *testing.T) {
installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{
PodResources: []*podresourcesv1.PodResources{
{
Name: "test-pod",
Namespace: "default",
Containers: []*podresourcesv1.ContainerResources{
{
Name: "main",
Devices: []*podresourcesv1.ContainerDevices{
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-a"},
},
},
},
},
},
},
})
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
got, err := DiscoverGPUUUIDs(
ctx,
fake.NewSimpleClientset(),
"test-pod",
"default",
"main",
"/proc",
123,
logr.Discard(),
)
if err != nil {
t.Fatalf("DiscoverGPUUUIDs: %v", err)
}
if len(got) != 1 || got[0] != "GPU-a" {
t.Fatalf("got %v, want [GPU-a]", got)
}
}
// TestDiscoverGPUUUIDsPrefersDRAForDRAPod points the PodResources socket at a
// path inside a fresh temp directory where nothing is listening, then verifies
// that DiscoverGPUUUIDs returns the UUID published in the ResourceSlice for the
// device allocated to the pod's resource claim.
func TestDiscoverGPUUUIDsPrefersDRAForDRAPod(t *testing.T) {
	const (
		poolName  = "pool-node-1"
		namespace = "default"
		podName   = "test-pod"
	)
	// These are variables rather than constants because their addresses are taken below.
	nodeName := "node-1"
	claimName := "gpu-claim"
	uuid := "GPU-ffffffff-1111-2222-3333-444444444444"

	// Redirect the package-level socket path for this test and restore it afterwards.
	savedSocketPath := podResourcesSocketPath
	podResourcesSocketPath = filepath.Join(t.TempDir(), "missing-kubelet.sock")
	t.Cleanup(func() {
		podResourcesSocketPath = savedSocketPath
	})

	// Pod referencing the claim directly by name.
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace},
		Spec: corev1.PodSpec{
			NodeName: nodeName,
			ResourceClaims: []corev1.PodResourceClaim{
				{Name: "gpu", ResourceClaimName: &claimName},
			},
		},
	}

	// Claim whose allocation names device gpu-0 in the pool.
	allocationResults := []resourcev1.DeviceRequestAllocationResult{
		{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-0", Request: "gpu"},
	}
	claim := &resourcev1.ResourceClaim{
		ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace},
		Status: resourcev1.ResourceClaimStatus{
			Allocation: &resourcev1.AllocationResult{
				Devices: resourcev1.DeviceAllocationResult{Results: allocationResults},
			},
		},
	}

	// ResourceSlice carrying the UUID attribute for gpu-0.
	gpuDevice := resourcev1.Device{
		Name: "gpu-0",
		Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
			resourcev1.QualifiedName("uuid"): {StringValue: &uuid},
		},
	}
	slice := &resourcev1.ResourceSlice{
		ObjectMeta: metav1.ObjectMeta{Name: poolName + "-gpu.nvidia.com-xxx"},
		Spec: resourcev1.ResourceSliceSpec{
			Driver:   nvidiaGPUDRADriver,
			NodeName: &nodeName,
			Pool:     resourcev1.ResourcePool{Name: poolName},
			Devices:  []resourcev1.Device{gpuDevice},
		},
	}

	client := fake.NewSimpleClientset(pod, claim, slice)

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	got, err := DiscoverGPUUUIDs(ctx, client, podName, namespace, "main", "/proc", 123, logr.Discard())
	if err != nil {
		t.Fatalf("DiscoverGPUUUIDs: %v", err)
	}
	if !(len(got) == 1 && got[0] == uuid) {
		t.Fatalf("got %v, want [%s]", got, uuid)
	}
}
...@@ -14,66 +14,93 @@ const ( ...@@ -14,66 +14,93 @@ const (
resourceAttributeUUID = "uuid" resourceAttributeUUID = "uuid"
) )
// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API: type allocatedDRADevice struct {
// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes). pool string
// Returns nil without error if the pod has no DRA claims or the driver is not gpu.nvidia.com. device string
func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace string, log logr.Logger) ([]string, error) { }
func getAllocatedNVIDIADRADevices(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace string, log logr.Logger) ([]allocatedDRADevice, string, bool, error) {
if clientset == nil { if clientset == nil {
return nil, nil return nil, "", false, nil
} }
if podName == "" || podNamespace == "" { if podName == "" || podNamespace == "" {
return nil, nil return nil, "", false, nil
} }
pod, err := clientset.CoreV1().Pods(podNamespace).Get(ctx, podName, metav1.GetOptions{}) pod, err := clientset.CoreV1().Pods(podNamespace).Get(ctx, podName, metav1.GetOptions{})
if err != nil { if err != nil {
return nil, fmt.Errorf("get pod %s/%s: %w", podNamespace, podName, err) return nil, "", false, fmt.Errorf("get pod %s/%s: %w", podNamespace, podName, err)
} }
if len(pod.Spec.ResourceClaims) == 0 { if len(pod.Spec.ResourceClaims) == 0 {
return nil, nil return nil, pod.Spec.NodeName, false, nil
} }
nodeName := pod.Spec.NodeName if pod.Spec.NodeName == "" {
if nodeName == "" {
log.V(1).Info("pod has no node name, skipping DRA API lookup") log.V(1).Info("pod has no node name, skipping DRA API lookup")
return nil, nil return nil, "", false, nil
} }
var allocated []struct { claimNamesByPodRef := make(map[string]string, len(pod.Spec.ResourceClaims))
driver string for _, ref := range pod.Spec.ResourceClaims {
pool string if ref.ResourceClaimName != nil && *ref.ResourceClaimName != "" {
device string claimNamesByPodRef[ref.Name] = *ref.ResourceClaimName
}
}
for _, status := range pod.Status.ResourceClaimStatuses {
if status.ResourceClaimName == nil || *status.ResourceClaimName == "" {
continue
}
if _, exists := claimNamesByPodRef[status.Name]; !exists {
claimNamesByPodRef[status.Name] = *status.ResourceClaimName
}
} }
var allocated []allocatedDRADevice
hasNVIDIADRAAllocation := false
for _, ref := range pod.Spec.ResourceClaims { for _, ref := range pod.Spec.ResourceClaims {
if ref.ResourceClaimName == nil || *ref.ResourceClaimName == "" { claimName := claimNamesByPodRef[ref.Name]
if claimName == "" {
log.V(1).Info("pod resource claim has no resolved claim name", "pod_claim", ref.Name)
continue continue
} }
claimName := *ref.ResourceClaimName
claim, err := clientset.ResourceV1().ResourceClaims(podNamespace).Get(ctx, claimName, metav1.GetOptions{}) claim, err := clientset.ResourceV1().ResourceClaims(podNamespace).Get(ctx, claimName, metav1.GetOptions{})
if err != nil { if err != nil {
return nil, fmt.Errorf("get resource claim %s/%s: %w", podNamespace, claimName, err) return nil, pod.Spec.NodeName, hasNVIDIADRAAllocation, fmt.Errorf("get resource claim %s/%s: %w", podNamespace, claimName, err)
} }
if claim.Status.Allocation == nil || len(claim.Status.Allocation.Devices.Results) == 0 { if claim.Status.Allocation == nil || len(claim.Status.Allocation.Devices.Results) == 0 {
continue continue
} }
for _, r := range claim.Status.Allocation.Devices.Results { for _, result := range claim.Status.Allocation.Devices.Results {
if r.Driver == nvidiaGPUDRADriver { if result.Driver != nvidiaGPUDRADriver {
allocated = append(allocated, struct { continue
driver string
pool string
device string
}{r.Driver, r.Pool, r.Device})
} }
hasNVIDIADRAAllocation = true
allocated = append(allocated, allocatedDRADevice{
pool: result.Pool,
device: result.Device,
})
} }
} }
if len(allocated) == 0 {
return nil, nil return allocated, pod.Spec.NodeName, hasNVIDIADRAAllocation, nil
}
// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
// It also reports whether the pod is using NVIDIA DRA GPU allocations at all.
func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace string, log logr.Logger) ([]string, bool, error) {
allocated, nodeName, hasNVIDIADRAAllocation, err := getAllocatedNVIDIADRADevices(ctx, clientset, podName, podNamespace, log)
if err != nil {
return nil, hasNVIDIADRAAllocation, err
}
if !hasNVIDIADRAAllocation || len(allocated) == 0 {
return nil, hasNVIDIADRAAllocation, nil
} }
slices, err := clientset.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{ slices, err := clientset.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{
FieldSelector: fmt.Sprintf("spec.driver=%s,spec.nodeName=%s", nvidiaGPUDRADriver, nodeName), FieldSelector: fmt.Sprintf("spec.driver=%s,spec.nodeName=%s", nvidiaGPUDRADriver, nodeName),
}) })
if err != nil { if err != nil {
return nil, fmt.Errorf("list resource slices for node %s: %w", nodeName, err) return nil, true, fmt.Errorf("list resource slices for node %s: %w", nodeName, err)
} }
poolDeviceToUUID := make(map[string]map[string]string) poolDeviceToUUID := make(map[string]map[string]string)
...@@ -92,15 +119,15 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p ...@@ -92,15 +119,15 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
} }
var uuids []string var uuids []string
for _, a := range allocated { for _, device := range allocated {
devMap := poolDeviceToUUID[a.pool] devMap := poolDeviceToUUID[device.pool]
if devMap == nil { if devMap == nil {
log.V(1).Info("no ResourceSlice found for pool", "pool", a.pool, "device", a.device) log.V(1).Info("no ResourceSlice found for pool", "pool", device.pool, "device", device.device)
continue continue
} }
uuid, ok := devMap[a.device] uuid, ok := devMap[device.device]
if !ok || uuid == "" { if !ok || uuid == "" {
log.V(1).Info("device has no UUID in ResourceSlice", "pool", a.pool, "device", a.device) log.V(1).Info("device has no UUID in ResourceSlice", "pool", device.pool, "device", device.device)
continue continue
} }
uuids = append(uuids, uuid) uuids = append(uuids, uuid)
...@@ -108,7 +135,7 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p ...@@ -108,7 +135,7 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
if len(uuids) > 0 { if len(uuids) > 0 {
log.Info("resolved GPU UUIDs via DRA API", "uuids", uuids) log.Info("resolved GPU UUIDs via DRA API", "uuids", uuids)
} }
return uuids, nil return uuids, true, nil
} }
func deviceUUIDFromAttributes(attrs map[resourcev1.QualifiedName]resourcev1.DeviceAttribute) string { func deviceUUIDFromAttributes(attrs map[resourcev1.QualifiedName]resourcev1.DeviceAttribute) string {
......
...@@ -60,10 +60,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) { ...@@ -60,10 +60,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
log := logr.Discard() log := logr.Discard()
t.Run("nil clientset returns nil without error", func(t *testing.T) { t.Run("nil clientset returns nil without error", func(t *testing.T) {
got, err := GetGPUUUIDsViaDRAAPI(ctx, nil, "pod", "ns", log) got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, nil, "pod", "ns", log)
if err != nil { if err != nil {
t.Fatalf("unexpected error: %v", err) t.Fatalf("unexpected error: %v", err)
} }
if hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be false")
}
if got != nil { if got != nil {
t.Errorf("got %v, want nil", got) t.Errorf("got %v, want nil", got)
} }
...@@ -71,10 +74,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) { ...@@ -71,10 +74,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
t.Run("empty pod name returns nil", func(t *testing.T) { t.Run("empty pod name returns nil", func(t *testing.T) {
client := fake.NewSimpleClientset() client := fake.NewSimpleClientset()
got, err := GetGPUUUIDsViaDRAAPI(ctx, client, "", "ns", log) got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, "", "ns", log)
if err != nil { if err != nil {
t.Fatalf("unexpected error: %v", err) t.Fatalf("unexpected error: %v", err)
} }
if hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be false")
}
if got != nil { if got != nil {
t.Errorf("got %v, want nil", got) t.Errorf("got %v, want nil", got)
} }
...@@ -82,7 +88,7 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) { ...@@ -82,7 +88,7 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
t.Run("pod not found returns error", func(t *testing.T) { t.Run("pod not found returns error", func(t *testing.T) {
client := fake.NewSimpleClientset() client := fake.NewSimpleClientset()
_, err := GetGPUUUIDsViaDRAAPI(ctx, client, "missing", "default", log) _, _, err := GetGPUUUIDsViaDRAAPI(ctx, client, "missing", "default", log)
if err == nil { if err == nil {
t.Fatal("expected error when pod not found") t.Fatal("expected error when pod not found")
} }
...@@ -146,10 +152,213 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) { ...@@ -146,10 +152,213 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
} }
client := fake.NewSimpleClientset(pod, claim, slice) client := fake.NewSimpleClientset(pod, claim, slice)
got, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log) got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
if err != nil {
t.Fatalf("GetGPUUUIDsViaDRAAPI: %v", err)
}
if !hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be true")
}
want := []string{uuid1, uuid2}
if len(got) != len(want) {
t.Fatalf("got %v (len %d), want %v (len %d)", got, len(got), want, len(want))
}
for i := range want {
if got[i] != want[i] {
t.Errorf("got[%d] = %q, want %q", i, got[i], want[i])
}
}
})
t.Run("pod with template-backed DRA claims resolves UUIDs via pod status", func(t *testing.T) {
nodeName := "node-1"
poolName := "pool-node-1"
namespace := "default"
podName := "test-pod"
generatedClaimName := "generated-gpu-claim"
uuid1 := "GPU-cccccccc-1111-2222-3333-444444444444"
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace},
Spec: corev1.PodSpec{
NodeName: nodeName,
ResourceClaims: []corev1.PodResourceClaim{
{
Name: "gpu",
},
},
},
Status: corev1.PodStatus{
ResourceClaimStatuses: []corev1.PodResourceClaimStatus{
{
Name: "gpu",
ResourceClaimName: ptr(generatedClaimName),
},
},
},
}
claim := &resourcev1.ResourceClaim{
ObjectMeta: metav1.ObjectMeta{Name: generatedClaimName, Namespace: namespace},
Status: resourcev1.ResourceClaimStatus{
Allocation: &resourcev1.AllocationResult{
Devices: resourcev1.DeviceAllocationResult{
Results: []resourcev1.DeviceRequestAllocationResult{
{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-0", Request: "gpu"},
},
},
},
},
}
slice := &resourcev1.ResourceSlice{
ObjectMeta: metav1.ObjectMeta{Name: poolName + "-gpu.nvidia.com-xxx"},
Spec: resourcev1.ResourceSliceSpec{
Driver: nvidiaGPUDRADriver,
NodeName: &nodeName,
Pool: resourcev1.ResourcePool{Name: poolName},
Devices: []resourcev1.Device{
{
Name: "gpu-0",
Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
resourcev1.QualifiedName("uuid"): {StringValue: &uuid1},
},
},
},
},
}
client := fake.NewSimpleClientset(pod, claim, slice)
got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
if err != nil {
t.Fatalf("GetGPUUUIDsViaDRAAPI: %v", err)
}
if !hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be true")
}
want := []string{uuid1}
if len(got) != len(want) {
t.Fatalf("got %v (len %d), want %v (len %d)", got, len(got), want, len(want))
}
for i := range want {
if got[i] != want[i] {
t.Errorf("got[%d] = %q, want %q", i, got[i], want[i])
}
}
})
t.Run("pod with unresolved resource claim returns nil", func(t *testing.T) {
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "pod", Namespace: "default"},
Spec: corev1.PodSpec{
NodeName: "node-1",
ResourceClaims: []corev1.PodResourceClaim{
{
Name: "gpu",
},
},
},
}
client := fake.NewSimpleClientset(pod)
got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, "pod", "default", log)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be false")
}
if got != nil {
t.Errorf("got %v, want nil", got)
}
})
t.Run("pod with direct and template-backed claims resolves UUIDs from both", func(t *testing.T) {
nodeName := "node-1"
poolName := "pool-node-1"
namespace := "default"
podName := "test-pod"
directClaimName := "direct-gpu-claim"
generatedClaimName := "generated-gpu-claim"
uuid1 := "GPU-dddddddd-1111-2222-3333-444444444444"
uuid2 := "GPU-eeeeeeee-5555-6666-7777-888888888888"
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace},
Spec: corev1.PodSpec{
NodeName: nodeName,
ResourceClaims: []corev1.PodResourceClaim{
{
Name: "gpu-direct",
ResourceClaimName: ptr(directClaimName),
},
{
Name: "gpu-template",
},
},
},
Status: corev1.PodStatus{
ResourceClaimStatuses: []corev1.PodResourceClaimStatus{
{
Name: "gpu-template",
ResourceClaimName: ptr(generatedClaimName),
},
},
},
}
directClaim := &resourcev1.ResourceClaim{
ObjectMeta: metav1.ObjectMeta{Name: directClaimName, Namespace: namespace},
Status: resourcev1.ResourceClaimStatus{
Allocation: &resourcev1.AllocationResult{
Devices: resourcev1.DeviceAllocationResult{
Results: []resourcev1.DeviceRequestAllocationResult{
{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-0", Request: "gpu-direct"},
},
},
},
},
}
generatedClaim := &resourcev1.ResourceClaim{
ObjectMeta: metav1.ObjectMeta{Name: generatedClaimName, Namespace: namespace},
Status: resourcev1.ResourceClaimStatus{
Allocation: &resourcev1.AllocationResult{
Devices: resourcev1.DeviceAllocationResult{
Results: []resourcev1.DeviceRequestAllocationResult{
{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-1", Request: "gpu-template"},
},
},
},
},
}
slice := &resourcev1.ResourceSlice{
ObjectMeta: metav1.ObjectMeta{Name: poolName + "-gpu.nvidia.com-xxx"},
Spec: resourcev1.ResourceSliceSpec{
Driver: nvidiaGPUDRADriver,
NodeName: &nodeName,
Pool: resourcev1.ResourcePool{Name: poolName},
Devices: []resourcev1.Device{
{
Name: "gpu-0",
Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
resourcev1.QualifiedName("uuid"): {StringValue: &uuid1},
},
},
{
Name: "gpu-1",
Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
resourcev1.QualifiedName("uuid"): {StringValue: &uuid2},
},
},
},
},
}
client := fake.NewSimpleClientset(pod, directClaim, generatedClaim, slice)
got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
if err != nil { if err != nil {
t.Fatalf("GetGPUUUIDsViaDRAAPI: %v", err) t.Fatalf("GetGPUUUIDsViaDRAAPI: %v", err)
} }
if !hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be true")
}
want := []string{uuid1, uuid2} want := []string{uuid1, uuid2}
if len(got) != len(want) { if len(got) != len(want) {
t.Fatalf("got %v (len %d), want %v (len %d)", got, len(got), want, len(want)) t.Fatalf("got %v (len %d), want %v (len %d)", got, len(got), want, len(want))
...@@ -167,10 +376,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) { ...@@ -167,10 +376,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
Spec: corev1.PodSpec{NodeName: "node-1"}, Spec: corev1.PodSpec{NodeName: "node-1"},
} }
client := fake.NewSimpleClientset(pod) client := fake.NewSimpleClientset(pod)
got, err := GetGPUUUIDsViaDRAAPI(ctx, client, "pod", "default", log) got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, "pod", "default", log)
if err != nil { if err != nil {
t.Fatalf("unexpected error: %v", err) t.Fatalf("unexpected error: %v", err)
} }
if hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be false")
}
if got != nil { if got != nil {
t.Errorf("got %v, want nil", got) t.Errorf("got %v, want nil", got)
} }
......
...@@ -191,18 +191,19 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log ...@@ -191,18 +191,19 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
} }
var gpuUUIDs []string var gpuUUIDs []string
if len(cudaHostPIDs) > 0 { if len(cudaHostPIDs) > 0 {
gpuUUIDs, err = cuda.GetPodGPUUUIDs(ctx, req.PodName, req.PodNamespace, req.ContainerName) gpuUUIDs, err = cuda.DiscoverGPUUUIDs(
ctx,
req.Clientset,
req.PodName,
req.PodNamespace,
req.ContainerName,
snapshotruntime.HostProcPath,
pid,
log,
)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to discover source GPU UUIDs: %w", err) return nil, fmt.Errorf("failed to discover source GPU UUIDs: %w", err)
} }
if len(gpuUUIDs) == 0 {
log.Info("PodResources API returned no GPU UUIDs, falling back to nvidia-smi", "pid", pid)
gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, pid)
if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err)
}
log.Info("nvidia-smi fallback discovered GPU UUIDs", "uuids", gpuUUIDs)
}
} }
return &types.CheckpointContainerSnapshot{ return &types.CheckpointContainerSnapshot{
......
...@@ -142,18 +142,19 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge ...@@ -142,18 +142,19 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
if len(m.CUDA.SourceGPUUUIDs) == 0 { if len(m.CUDA.SourceGPUUUIDs) == 0 {
return nil, fmt.Errorf("missing source GPU UUIDs in checkpoint manifest") return nil, fmt.Errorf("missing source GPU UUIDs in checkpoint manifest")
} }
targetGPUUUIDs, err := cuda.GetPodGPUUUIDs(ctx, req.PodName, req.PodNamespace, containerName) targetGPUUUIDs, err := cuda.DiscoverGPUUUIDs(
ctx,
req.Clientset,
req.PodName,
req.PodNamespace,
containerName,
snapshotruntime.HostProcPath,
placeholderPID,
log,
)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get target GPU UUIDs: %w", err) return nil, fmt.Errorf("failed to get target GPU UUIDs: %w", err)
} }
if len(targetGPUUUIDs) == 0 {
log.Info("PodResources API returned no target GPU UUIDs, falling back to nvidia-smi", "pid", placeholderPID)
targetGPUUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, placeholderPID)
if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed for restore target: %w", err)
}
log.Info("nvidia-smi fallback discovered target GPU UUIDs", "uuids", targetGPUUUIDs)
}
if len(targetGPUUUIDs) == 0 { if len(targetGPUUUIDs) == 0 {
return nil, fmt.Errorf("missing target GPU UUIDs for %s/%s container %s", req.PodNamespace, req.PodName, containerName) return nil, fmt.Errorf("missing target GPU UUIDs for %s/%s container %s", req.PodNamespace, req.PodName, containerName)
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment