fix(snapshot): resolve DRA GPU UUIDs from claims (#8292)

2cc6d1e2 · Schwinn Saereesitthipitak · GitHub · 8428c65f · 2cc6d1e2 · 2cc6d1e2
Unverified commit 2cc6d1e2, authored Apr 17, 2026 by Schwinn Saereesitthipitak; committed by GitHub on Apr 17, 2026.
6 changed files
--- a/deploy/snapshot/internal/cuda/cuda.go
+++ b/deploy/snapshot/internal/cuda/cuda.go
@@ -13,7 +13,7 @@ import (
 	"github.com/go-logr/logr"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/credentials/insecure"
-
+	"k8s.io/client-go/kubernetes"
 	podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
 )

@@ -104,6 +104,45 @@ func GetGPUUUIDsViaNvidiaSmi(ctx context.Context, hostProcPath string, pid int)
 	return uuids, nil
 }

+// DiscoverGPUUUIDs resolves GPU UUIDs according to the pod's allocation mode:
+// DRA-backed pods use the DRA API, classic nvidia.com/gpu pods use PodResources,
+// and nvidia-smi remains the last fallback for either path.
+//
+// Lookup order:
+//  1. GetGPUUUIDsViaDRAAPI. A failure is logged rather than returned, and the
+//     remaining paths are still tried.
+//  2. GetPodGPUUUIDs, only when the DRA lookup reported no NVIDIA DRA
+//     allocation. A failure is returned to the caller.
+//  3. GetGPUUUIDsViaNvidiaSmi using hostProcPath and pid. A failure is
+//     returned to the caller.
+//
+// The first path that yields at least one UUID wins. The nvidia-smi result is
+// returned without a length check, so it may be empty with a nil error.
+func DiscoverGPUUUIDs(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace, containerName, hostProcPath string, pid int, log logr.Logger) ([]string, error) {
+	gpuUUIDs, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, clientset, podName, podNamespace, log)
+	// Reason reported in the nvidia-smi fallback log line; refined below as
+	// more specific causes become known.
+	fallbackReason := "DRA API returned no GPU UUIDs"
+	if err != nil {
+		// Best effort: record the failure and keep going with the other paths.
+		log.Error(
+			err,
+			"DRA API GPU UUID lookup failed, trying other discovery paths",
+			"pod", podNamespace+"/"+podName,
+			"has_nvidia_dra_allocation", hasNVIDIADRAAllocation,
+		)
+		// Never use a result that came back alongside an error.
+		gpuUUIDs = nil
+		fallbackReason = "DRA API GPU UUID lookup failed"
+	}
+	if len(gpuUUIDs) > 0 {
+		return gpuUUIDs, nil
+	}
+	// PodResources is consulted only when the DRA lookup did not report an
+	// NVIDIA DRA allocation for this pod.
+	if !hasNVIDIADRAAllocation {
+		gpuUUIDs, err = GetPodGPUUUIDs(ctx, podName, podNamespace, containerName)
+		if err != nil {
+			return nil, fmt.Errorf("PodResources GPU UUID lookup failed: %w", err)
+		}
+		if len(gpuUUIDs) > 0 {
+			return gpuUUIDs, nil
+		}
+		fallbackReason = "PodResources API returned no GPU UUIDs"
+	}
+
+	// Last resort for both allocation modes.
+	log.Info(fallbackReason+", falling back to nvidia-smi", "pid", pid)
+	gpuUUIDs, err = GetGPUUUIDsViaNvidiaSmi(ctx, hostProcPath, pid)
+	if err != nil {
+		return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err)
+	}
+	log.Info("nvidia-smi fallback discovered GPU UUIDs", "uuids", gpuUUIDs)
+	return gpuUUIDs, nil
+}
+
 // FilterProcesses returns the subset of candidate PIDs that hold actual CUDA contexts.
 // Uses --get-restore-tid (the same technique as the CRIU CUDA plugin) instead of
 // --get-state, because --get-state incorrectly matches coordinator processes like

--- a/deploy/snapshot/internal/cuda/cuda_test.go
+++ b/deploy/snapshot/internal/cuda/cuda_test.go
@@ -13,7 +13,10 @@ import (
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/status"
-
+	corev1 "k8s.io/api/core/v1"
+	resourcev1 "k8s.io/api/resource/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes/fake"
 	podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
 )

@@ -93,7 +96,7 @@ func (s *testPodResourcesServer) Get(context.Context, *podresourcesv1.GetPodReso
 	return nil, status.Error(codes.Unimplemented, "not implemented in test")
 }

-func TestGetPodGPUUUIDs(t *testing.T) {
+func installTestPodResourcesServer(t *testing.T, resp *podresourcesv1.ListPodResourcesResponse) {
 	socketDir := t.TempDir()
 	socketPath := filepath.Join(socketDir, "kubelet.sock")

@@ -101,11 +104,34 @@ func TestGetPodGPUUUIDs(t *testing.T) {
 	if err != nil {
 		t.Fatalf("listen unix socket: %v", err)
 	}
-	defer listener.Close()

 	server := grpc.NewServer()
 	podresourcesv1.RegisterPodResourcesListerServer(server, &testPodResourcesServer{
-		resp: &podresourcesv1.ListPodResourcesResponse{
+		resp: resp,
+	})
+
+	go func() {
+		if serveErr := server.Serve(listener); serveErr != nil {
+			if errors.Is(serveErr, grpc.ErrServerStopped) || strings.Contains(serveErr.Error(), "use of closed network connection") {
+				return
+			}
+			t.Errorf("serve test pod-resources gRPC server: %v", serveErr)
+		}
+	}()
+	t.Cleanup(server.Stop)
+	t.Cleanup(func() {
+		_ = listener.Close()
+	})
+
+	previousSocketPath := podResourcesSocketPath
+	podResourcesSocketPath = socketPath
+	t.Cleanup(func() {
+		podResourcesSocketPath = previousSocketPath
+	})
+}
+
+func TestGetPodGPUUUIDs(t *testing.T) {
+	installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{
 		PodResources: []*podresourcesv1.PodResources{
 			{
 				Name:      "other-pod",
@@ -155,34 +181,66 @@ func TestGetPodGPUUUIDs(t *testing.T) {
 				},
 			},
 		},
-		},
 	})

-	go func() {
-		if serveErr := server.Serve(listener); serveErr != nil {
-			if errors.Is(serveErr, grpc.ErrServerStopped) || strings.Contains(serveErr.Error(), "use of closed network connection") {
-				return
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	got, err := GetPodGPUUUIDs(ctx, "test-pod", "default", "main")
+	if err != nil {
+		t.Fatalf("GetPodGPUUUIDs: %v", err)
 	}
-			t.Errorf("serve test pod-resources gRPC server: %v", serveErr)
+
+	want := []string{"GPU-a", "GPU-b", "GPU-c"}
+	if len(got) != len(want) {
+		t.Fatalf("got %v, want %v", got, want)
 	}
-	}()
-	t.Cleanup(server.Stop)
+	for i := range want {
+		if got[i] != want[i] {
+			t.Fatalf("got %v, want %v", got, want)
+		}
+	}
+}

-	previousSocketPath := podResourcesSocketPath
-	podResourcesSocketPath = socketPath
-	t.Cleanup(func() {
-		podResourcesSocketPath = previousSocketPath
+func TestDiscoverGPUUUIDsUsesPodResourcesForClassicPod(t *testing.T) {
+	installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{
+		PodResources: []*podresourcesv1.PodResources{
+			{
+				Name:      "test-pod",
+				Namespace: "default",
+				Containers: []*podresourcesv1.ContainerResources{
+					{
+						Name: "main",
+						Devices: []*podresourcesv1.ContainerDevices{
+							{
+								ResourceName: nvidiaGPUResource,
+								DeviceIds:    []string{"GPU-a", "GPU-b"},
+							},
+						},
+					},
+				},
+			},
+		},
 	})

 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer cancel()

-	got, err := GetPodGPUUUIDs(ctx, "test-pod", "default", "main")
+	got, err := DiscoverGPUUUIDs(
+		ctx,
+		nil,
+		"test-pod",
+		"default",
+		"main",
+		"/proc",
+		123,
+		logr.Discard(),
+	)
 	if err != nil {
-		t.Fatalf("GetPodGPUUUIDs: %v", err)
+		t.Fatalf("DiscoverGPUUUIDs: %v", err)
 	}

-	want := []string{"GPU-a", "GPU-b", "GPU-c"}
+	want := []string{"GPU-a", "GPU-b"}
 	if len(got) != len(want) {
 		t.Fatalf("got %v, want %v", got, want)
 	}
@@ -192,3 +250,123 @@ func TestGetPodGPUUUIDs(t *testing.T) {
 		}
 	}
 }
+
+// TestDiscoverGPUUUIDsFallsBackToPodResourcesAfterDRAAPILookupError checks that
+// DiscoverGPUUUIDs still returns the UUIDs served by the PodResources test
+// server when the DRA lookup returns an error.
+func TestDiscoverGPUUUIDsFallsBackToPodResourcesAfterDRAAPILookupError(t *testing.T) {
+	installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{
+		PodResources: []*podresourcesv1.PodResources{
+			{
+				Name:      "test-pod",
+				Namespace: "default",
+				Containers: []*podresourcesv1.ContainerResources{
+					{
+						Name: "main",
+						Devices: []*podresourcesv1.ContainerDevices{
+							{
+								ResourceName: nvidiaGPUResource,
+								DeviceIds:    []string{"GPU-a"},
+							},
+						},
+					},
+				},
+			},
+		},
+	})
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	got, err := DiscoverGPUUUIDs(
+		ctx,
+		// The fake clientset holds no objects, so fetching "test-pod" in the
+		// DRA path fails and triggers the fallback under test.
+		fake.NewSimpleClientset(),
+		"test-pod",
+		"default",
+		"main",
+		"/proc",
+		123,
+		logr.Discard(),
+	)
+	if err != nil {
+		t.Fatalf("DiscoverGPUUUIDs: %v", err)
+	}
+	if len(got) != 1 || got[0] != "GPU-a" {
+		t.Fatalf("got %v, want [GPU-a]", got)
+	}
+}
+
+// TestDiscoverGPUUUIDsPrefersDRAForDRAPod checks that DiscoverGPUUUIDs returns
+// the UUID published in the ResourceSlice for a pod whose ResourceClaim has an
+// allocation from nvidiaGPUDRADriver.
+func TestDiscoverGPUUUIDsPrefersDRAForDRAPod(t *testing.T) {
+	// Point the PodResources client at a socket that does not exist.
+	// NOTE(review): presumably this makes any PodResources lookup fail, so the
+	// test only passes when the DRA path supplies the UUID — confirm against
+	// GetPodGPUUUIDs.
+	previousSocketPath := podResourcesSocketPath
+	podResourcesSocketPath = filepath.Join(t.TempDir(), "missing-kubelet.sock")
+	t.Cleanup(func() {
+		podResourcesSocketPath = previousSocketPath
+	})
+
+	nodeName := "node-1"
+	poolName := "pool-node-1"
+	namespace := "default"
+	podName := "test-pod"
+	claimName := "gpu-claim"
+	uuid := "GPU-ffffffff-1111-2222-3333-444444444444"
+
+	// Pod referencing the claim directly by name in its spec.
+	pod := &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace},
+		Spec: corev1.PodSpec{
+			NodeName: nodeName,
+			ResourceClaims: []corev1.PodResourceClaim{
+				{
+					Name:              "gpu",
+					ResourceClaimName: &claimName,
+				},
+			},
+		},
+	}
+	// Claim whose allocation names device "gpu-0" in the pool.
+	claim := &resourcev1.ResourceClaim{
+		ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace},
+		Status: resourcev1.ResourceClaimStatus{
+			Allocation: &resourcev1.AllocationResult{
+				Devices: resourcev1.DeviceAllocationResult{
+					Results: []resourcev1.DeviceRequestAllocationResult{
+						{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-0", Request: "gpu"},
+					},
+				},
+			},
+		},
+	}
+	// Slice that maps device "gpu-0" in the same pool to the expected UUID.
+	slice := &resourcev1.ResourceSlice{
+		ObjectMeta: metav1.ObjectMeta{Name: poolName + "-gpu.nvidia.com-xxx"},
+		Spec: resourcev1.ResourceSliceSpec{
+			Driver:   nvidiaGPUDRADriver,
+			NodeName: &nodeName,
+			Pool:     resourcev1.ResourcePool{Name: poolName},
+			Devices: []resourcev1.Device{
+				{
+					Name: "gpu-0",
+					Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
+						resourcev1.QualifiedName("uuid"): {StringValue: &uuid},
+					},
+				},
+			},
+		},
+	}
+
+	client := fake.NewSimpleClientset(pod, claim, slice)
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	got, err := DiscoverGPUUUIDs(
+		ctx,
+		client,
+		podName,
+		namespace,
+		"main",
+		"/proc",
+		123,
+		logr.Discard(),
+	)
+	if err != nil {
+		t.Fatalf("DiscoverGPUUUIDs: %v", err)
+	}
+	if len(got) != 1 || got[0] != uuid {
+		t.Fatalf("got %v, want [%s]", got, uuid)
+	}
+}
--- a/deploy/snapshot/internal/cuda/dra.go
+++ b/deploy/snapshot/internal/cuda/dra.go
@@ -14,66 +14,93 @@ const (
 	resourceAttributeUUID = "uuid"
 )

-// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
-// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
-// Returns nil without error if the pod has no DRA claims or the driver is not gpu.nvidia.com.
-func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace string, log logr.Logger) ([]string, error) {
+// allocatedDRADevice identifies one device taken from a ResourceClaim
+// allocation result by its pool name and device name.
+type allocatedDRADevice struct {
+	// pool is the Pool field of the allocation result.
+	pool   string
+	// device is the Device field of the allocation result.
+	device string
+}
+
+// getAllocatedNVIDIADRADevices looks up the pod and its ResourceClaims and
+// collects every allocation result whose driver is nvidiaGPUDRADriver.
+//
+// It returns the allocated devices, the pod's node name, whether at least one
+// NVIDIA DRA allocation result was seen, and an error if the pod or one of its
+// claims could not be fetched. A nil clientset, an empty pod name or namespace,
+// a pod without resource claims, or a pod without a node name all yield no
+// devices and a nil error.
+func getAllocatedNVIDIADRADevices(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace string, log logr.Logger) ([]allocatedDRADevice, string, bool, error) {
 	if clientset == nil {
-		return nil, nil
+		return nil, "", false, nil
 	}
 	if podName == "" || podNamespace == "" {
-		return nil, nil
+		return nil, "", false, nil
 	}

 	pod, err := clientset.CoreV1().Pods(podNamespace).Get(ctx, podName, metav1.GetOptions{})
 	if err != nil {
-		return nil, fmt.Errorf("get pod %s/%s: %w", podNamespace, podName, err)
+		return nil, "", false, fmt.Errorf("get pod %s/%s: %w", podNamespace, podName, err)
 	}
 	if len(pod.Spec.ResourceClaims) == 0 {
-		return nil, nil
+		return nil, pod.Spec.NodeName, false, nil
 	}
-	nodeName := pod.Spec.NodeName
-	if nodeName == "" {
+	if pod.Spec.NodeName == "" {
 		log.V(1).Info("pod has no node name, skipping DRA API lookup")
-		return nil, nil
+		return nil, "", false, nil
 	}

-	var allocated []struct {
-		driver string
-		pool   string
-		device string
+	// Map each pod-level claim reference to a ResourceClaim object name. Names
+	// set directly in the pod spec take precedence; names recorded in the pod
+	// status only fill in references the spec left unresolved.
+	claimNamesByPodRef := make(map[string]string, len(pod.Spec.ResourceClaims))
+	for _, ref := range pod.Spec.ResourceClaims {
+		if ref.ResourceClaimName != nil && *ref.ResourceClaimName != "" {
+			claimNamesByPodRef[ref.Name] = *ref.ResourceClaimName
+		}
+	}
+	for _, status := range pod.Status.ResourceClaimStatuses {
+		if status.ResourceClaimName == nil || *status.ResourceClaimName == "" {
+			continue
+		}
+		if _, exists := claimNamesByPodRef[status.Name]; !exists {
+			claimNamesByPodRef[status.Name] = *status.ResourceClaimName
 		}
+	}
+
+	var allocated []allocatedDRADevice
+	hasNVIDIADRAAllocation := false
 	for _, ref := range pod.Spec.ResourceClaims {
-		if ref.ResourceClaimName == nil || *ref.ResourceClaimName == "" {
+		claimName := claimNamesByPodRef[ref.Name]
+		if claimName == "" {
+			log.V(1).Info("pod resource claim has no resolved claim name", "pod_claim", ref.Name)
 			continue
 		}
-		claimName := *ref.ResourceClaimName
 		claim, err := clientset.ResourceV1().ResourceClaims(podNamespace).Get(ctx, claimName, metav1.GetOptions{})
 		if err != nil {
-			return nil, fmt.Errorf("get resource claim %s/%s: %w", podNamespace, claimName, err)
+			return nil, pod.Spec.NodeName, hasNVIDIADRAAllocation, fmt.Errorf("get resource claim %s/%s: %w", podNamespace, claimName, err)
 		}
 		if claim.Status.Allocation == nil || len(claim.Status.Allocation.Devices.Results) == 0 {
 			continue
 		}
-		for _, r := range claim.Status.Allocation.Devices.Results {
-			if r.Driver == nvidiaGPUDRADriver {
-				allocated = append(allocated, struct {
-					driver string
-					pool   string
-					device string
-				}{r.Driver, r.Pool, r.Device})
+		for _, result := range claim.Status.Allocation.Devices.Results {
+			// Skip devices allocated by any other DRA driver.
+			if result.Driver != nvidiaGPUDRADriver {
+				continue
+			}
+			hasNVIDIADRAAllocation = true
+			allocated = append(allocated, allocatedDRADevice{
+				pool:   result.Pool,
+				device: result.Device,
+			})
 		}
 	}
+
+	return allocated, pod.Spec.NodeName, hasNVIDIADRAAllocation, nil
+}
+
+// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
+// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
+// It also reports whether the pod is using NVIDIA DRA GPU allocations at all.
+func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace string, log logr.Logger) ([]string, bool, error) {
+	allocated, nodeName, hasNVIDIADRAAllocation, err := getAllocatedNVIDIADRADevices(ctx, clientset, podName, podNamespace, log)
+	if err != nil {
+		return nil, hasNVIDIADRAAllocation, err
 	}
-	if len(allocated) == 0 {
-		return nil, nil
+	if !hasNVIDIADRAAllocation || len(allocated) == 0 {
+		return nil, hasNVIDIADRAAllocation, nil
 	}

 	slices, err := clientset.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{
 		FieldSelector: fmt.Sprintf("spec.driver=%s,spec.nodeName=%s", nvidiaGPUDRADriver, nodeName),
 	})
 	if err != nil {
-		return nil, fmt.Errorf("list resource slices for node %s: %w", nodeName, err)
+		return nil, true, fmt.Errorf("list resource slices for node %s: %w", nodeName, err)
 	}

 	poolDeviceToUUID := make(map[string]map[string]string)
@@ -92,15 +119,15 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
 	}

 	var uuids []string
-	for _, a := range allocated {
-		devMap := poolDeviceToUUID[a.pool]
+	for _, device := range allocated {
+		devMap := poolDeviceToUUID[device.pool]
 		if devMap == nil {
-			log.V(1).Info("no ResourceSlice found for pool", "pool", a.pool, "device", a.device)
+			log.V(1).Info("no ResourceSlice found for pool", "pool", device.pool, "device", device.device)
 			continue
 		}
-		uuid, ok := devMap[a.device]
+		uuid, ok := devMap[device.device]
 		if !ok || uuid == "" {
-			log.V(1).Info("device has no UUID in ResourceSlice", "pool", a.pool, "device", a.device)
+			log.V(1).Info("device has no UUID in ResourceSlice", "pool", device.pool, "device", device.device)
 			continue
 		}
 		uuids = append(uuids, uuid)
@@ -108,7 +135,7 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
 	if len(uuids) > 0 {
 		log.Info("resolved GPU UUIDs via DRA API", "uuids", uuids)
 	}
-	return uuids, nil
+	return uuids, true, nil
 }

 func deviceUUIDFromAttributes(attrs map[resourcev1.QualifiedName]resourcev1.DeviceAttribute) string {

--- a/deploy/snapshot/internal/cuda/dra_test.go
+++ b/deploy/snapshot/internal/cuda/dra_test.go
@@ -60,10 +60,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
 	log := logr.Discard()

 	t.Run("nil clientset returns nil without error", func(t *testing.T) {
-		got, err := GetGPUUUIDsViaDRAAPI(ctx, nil, "pod", "ns", log)
+		got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, nil, "pod", "ns", log)
 		if err != nil {
 			t.Fatalf("unexpected error: %v", err)
 		}
+		if hasNVIDIADRAAllocation {
+			t.Fatal("expected hasNVIDIADRAAllocation to be false")
+		}
 		if got != nil {
 			t.Errorf("got %v, want nil", got)
 		}
@@ -71,10 +74,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {

 	t.Run("empty pod name returns nil", func(t *testing.T) {
 		client := fake.NewSimpleClientset()
-		got, err := GetGPUUUIDsViaDRAAPI(ctx, client, "", "ns", log)
+		got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, "", "ns", log)
 		if err != nil {
 			t.Fatalf("unexpected error: %v", err)
 		}
+		if hasNVIDIADRAAllocation {
+			t.Fatal("expected hasNVIDIADRAAllocation to be false")
+		}
 		if got != nil {
 			t.Errorf("got %v, want nil", got)
 		}
@@ -82,7 +88,7 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {

 	t.Run("pod not found returns error", func(t *testing.T) {
 		client := fake.NewSimpleClientset()
-		_, err := GetGPUUUIDsViaDRAAPI(ctx, client, "missing", "default", log)
+		_, _, err := GetGPUUUIDsViaDRAAPI(ctx, client, "missing", "default", log)
 		if err == nil {
 			t.Fatal("expected error when pod not found")
 		}
@@ -146,10 +152,213 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
 		}

 		client := fake.NewSimpleClientset(pod, claim, slice)
-		got, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
+		got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
+		if err != nil {
+			t.Fatalf("GetGPUUUIDsViaDRAAPI: %v", err)
+		}
+		if !hasNVIDIADRAAllocation {
+			t.Fatal("expected hasNVIDIADRAAllocation to be true")
+		}
+		want := []string{uuid1, uuid2}
+		if len(got) != len(want) {
+			t.Fatalf("got %v (len %d), want %v (len %d)", got, len(got), want, len(want))
+		}
+		for i := range want {
+			if got[i] != want[i] {
+				t.Errorf("got[%d] = %q, want %q", i, got[i], want[i])
+			}
+		}
+	})
+
+	t.Run("pod with template-backed DRA claims resolves UUIDs via pod status", func(t *testing.T) {
+		nodeName := "node-1"
+		poolName := "pool-node-1"
+		namespace := "default"
+		podName := "test-pod"
+		generatedClaimName := "generated-gpu-claim"
+		uuid1 := "GPU-cccccccc-1111-2222-3333-444444444444"
+
+		pod := &corev1.Pod{
+			ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace},
+			Spec: corev1.PodSpec{
+				NodeName: nodeName,
+				ResourceClaims: []corev1.PodResourceClaim{
+					{
+						Name: "gpu",
+					},
+				},
+			},
+			Status: corev1.PodStatus{
+				ResourceClaimStatuses: []corev1.PodResourceClaimStatus{
+					{
+						Name:              "gpu",
+						ResourceClaimName: ptr(generatedClaimName),
+					},
+				},
+			},
+		}
+		claim := &resourcev1.ResourceClaim{
+			ObjectMeta: metav1.ObjectMeta{Name: generatedClaimName, Namespace: namespace},
+			Status: resourcev1.ResourceClaimStatus{
+				Allocation: &resourcev1.AllocationResult{
+					Devices: resourcev1.DeviceAllocationResult{
+						Results: []resourcev1.DeviceRequestAllocationResult{
+							{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-0", Request: "gpu"},
+						},
+					},
+				},
+			},
+		}
+		slice := &resourcev1.ResourceSlice{
+			ObjectMeta: metav1.ObjectMeta{Name: poolName + "-gpu.nvidia.com-xxx"},
+			Spec: resourcev1.ResourceSliceSpec{
+				Driver:   nvidiaGPUDRADriver,
+				NodeName: &nodeName,
+				Pool:     resourcev1.ResourcePool{Name: poolName},
+				Devices: []resourcev1.Device{
+					{
+						Name: "gpu-0",
+						Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
+							resourcev1.QualifiedName("uuid"): {StringValue: &uuid1},
+						},
+					},
+				},
+			},
+		}
+
+		client := fake.NewSimpleClientset(pod, claim, slice)
+		got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
+		if err != nil {
+			t.Fatalf("GetGPUUUIDsViaDRAAPI: %v", err)
+		}
+		if !hasNVIDIADRAAllocation {
+			t.Fatal("expected hasNVIDIADRAAllocation to be true")
+		}
+		want := []string{uuid1}
+		if len(got) != len(want) {
+			t.Fatalf("got %v (len %d), want %v (len %d)", got, len(got), want, len(want))
+		}
+		for i := range want {
+			if got[i] != want[i] {
+				t.Errorf("got[%d] = %q, want %q", i, got[i], want[i])
+			}
+		}
+	})
+
+	t.Run("pod with unresolved resource claim returns nil", func(t *testing.T) {
+		pod := &corev1.Pod{
+			ObjectMeta: metav1.ObjectMeta{Name: "pod", Namespace: "default"},
+			Spec: corev1.PodSpec{
+				NodeName: "node-1",
+				ResourceClaims: []corev1.PodResourceClaim{
+					{
+						Name: "gpu",
+					},
+				},
+			},
+		}
+
+		client := fake.NewSimpleClientset(pod)
+		got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, "pod", "default", log)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if hasNVIDIADRAAllocation {
+			t.Fatal("expected hasNVIDIADRAAllocation to be false")
+		}
+		if got != nil {
+			t.Errorf("got %v, want nil", got)
+		}
+	})
+
+	t.Run("pod with direct and template-backed claims resolves UUIDs from both", func(t *testing.T) {
+		nodeName := "node-1"
+		poolName := "pool-node-1"
+		namespace := "default"
+		podName := "test-pod"
+		directClaimName := "direct-gpu-claim"
+		generatedClaimName := "generated-gpu-claim"
+		uuid1 := "GPU-dddddddd-1111-2222-3333-444444444444"
+		uuid2 := "GPU-eeeeeeee-5555-6666-7777-888888888888"
+
+		pod := &corev1.Pod{
+			ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace},
+			Spec: corev1.PodSpec{
+				NodeName: nodeName,
+				ResourceClaims: []corev1.PodResourceClaim{
+					{
+						Name:              "gpu-direct",
+						ResourceClaimName: ptr(directClaimName),
+					},
+					{
+						Name: "gpu-template",
+					},
+				},
+			},
+			Status: corev1.PodStatus{
+				ResourceClaimStatuses: []corev1.PodResourceClaimStatus{
+					{
+						Name:              "gpu-template",
+						ResourceClaimName: ptr(generatedClaimName),
+					},
+				},
+			},
+		}
+		directClaim := &resourcev1.ResourceClaim{
+			ObjectMeta: metav1.ObjectMeta{Name: directClaimName, Namespace: namespace},
+			Status: resourcev1.ResourceClaimStatus{
+				Allocation: &resourcev1.AllocationResult{
+					Devices: resourcev1.DeviceAllocationResult{
+						Results: []resourcev1.DeviceRequestAllocationResult{
+							{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-0", Request: "gpu-direct"},
+						},
+					},
+				},
+			},
+		}
+		generatedClaim := &resourcev1.ResourceClaim{
+			ObjectMeta: metav1.ObjectMeta{Name: generatedClaimName, Namespace: namespace},
+			Status: resourcev1.ResourceClaimStatus{
+				Allocation: &resourcev1.AllocationResult{
+					Devices: resourcev1.DeviceAllocationResult{
+						Results: []resourcev1.DeviceRequestAllocationResult{
+							{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-1", Request: "gpu-template"},
+						},
+					},
+				},
+			},
+		}
+		slice := &resourcev1.ResourceSlice{
+			ObjectMeta: metav1.ObjectMeta{Name: poolName + "-gpu.nvidia.com-xxx"},
+			Spec: resourcev1.ResourceSliceSpec{
+				Driver:   nvidiaGPUDRADriver,
+				NodeName: &nodeName,
+				Pool:     resourcev1.ResourcePool{Name: poolName},
+				Devices: []resourcev1.Device{
+					{
+						Name: "gpu-0",
+						Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
+							resourcev1.QualifiedName("uuid"): {StringValue: &uuid1},
+						},
+					},
+					{
+						Name: "gpu-1",
+						Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
+							resourcev1.QualifiedName("uuid"): {StringValue: &uuid2},
+						},
+					},
+				},
+			},
+		}
+
+		client := fake.NewSimpleClientset(pod, directClaim, generatedClaim, slice)
+		got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
 		if err != nil {
 			t.Fatalf("GetGPUUUIDsViaDRAAPI: %v", err)
 		}
+		if !hasNVIDIADRAAllocation {
+			t.Fatal("expected hasNVIDIADRAAllocation to be true")
+		}
 		want := []string{uuid1, uuid2}
 		if len(got) != len(want) {
 			t.Fatalf("got %v (len %d), want %v (len %d)", got, len(got), want, len(want))
@@ -167,10 +376,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
 			Spec:       corev1.PodSpec{NodeName: "node-1"},
 		}
 		client := fake.NewSimpleClientset(pod)
-		got, err := GetGPUUUIDsViaDRAAPI(ctx, client, "pod", "default", log)
+		got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, "pod", "default", log)
 		if err != nil {
 			t.Fatalf("unexpected error: %v", err)
 		}
+		if hasNVIDIADRAAllocation {
+			t.Fatal("expected hasNVIDIADRAAllocation to be false")
+		}
 		if got != nil {
 			t.Errorf("got %v, want nil", got)
 		}

--- a/deploy/snapshot/internal/executor/checkpoint.go
+++ b/deploy/snapshot/internal/executor/checkpoint.go
@@ -191,18 +191,19 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
 	}
 	var gpuUUIDs []string
 	if len(cudaHostPIDs) > 0 {
-		gpuUUIDs, err = cuda.GetPodGPUUUIDs(ctx, req.PodName, req.PodNamespace, req.ContainerName)
+		gpuUUIDs, err = cuda.DiscoverGPUUUIDs(
+			ctx,
+			req.Clientset,
+			req.PodName,
+			req.PodNamespace,
+			req.ContainerName,
+			snapshotruntime.HostProcPath,
+			pid,
+			log,
+		)
 		if err != nil {
 			return nil, fmt.Errorf("failed to discover source GPU UUIDs: %w", err)
 		}
-		if len(gpuUUIDs) == 0 {
-			log.Info("PodResources API returned no GPU UUIDs, falling back to nvidia-smi", "pid", pid)
-			gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, pid)
-			if err != nil {
-				return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err)
-			}
-			log.Info("nvidia-smi fallback discovered GPU UUIDs", "uuids", gpuUUIDs)
-		}
 	}

 	return &types.CheckpointContainerSnapshot{

--- a/deploy/snapshot/internal/executor/restore.go
+++ b/deploy/snapshot/internal/executor/restore.go
@@ -142,18 +142,19 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
 		if len(m.CUDA.SourceGPUUUIDs) == 0 {
 			return nil, fmt.Errorf("missing source GPU UUIDs in checkpoint manifest")
 		}
-		targetGPUUUIDs, err := cuda.GetPodGPUUUIDs(ctx, req.PodName, req.PodNamespace, containerName)
+		targetGPUUUIDs, err := cuda.DiscoverGPUUUIDs(
+			ctx,
+			req.Clientset,
+			req.PodName,
+			req.PodNamespace,
+			containerName,
+			snapshotruntime.HostProcPath,
+			placeholderPID,
+			log,
+		)
 		if err != nil {
 			return nil, fmt.Errorf("failed to get target GPU UUIDs: %w", err)
 		}
-		if len(targetGPUUUIDs) == 0 {
-			log.Info("PodResources API returned no target GPU UUIDs, falling back to nvidia-smi", "pid", placeholderPID)
-			targetGPUUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, placeholderPID)
-			if err != nil {
-				return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed for restore target: %w", err)
-			}
-			log.Info("nvidia-smi fallback discovered target GPU UUIDs", "uuids", targetGPUUUIDs)
-		}
 		if len(targetGPUUUIDs) == 0 {
 			return nil, fmt.Errorf("missing target GPU UUIDs for %s/%s container %s", req.PodNamespace, req.PodName, containerName)
 		}