Unverified commit 2cc6d1e2, authored by Schwinn Saereesitthipitak and committed by GitHub
Browse files

fix(snapshot): resolve DRA GPU UUIDs from claims (#8292)

parent 8428c65f
......@@ -13,7 +13,7 @@ import (
"github.com/go-logr/logr"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/client-go/kubernetes"
podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
)
......@@ -104,6 +104,45 @@ func GetGPUUUIDsViaNvidiaSmi(ctx context.Context, hostProcPath string, pid int)
return uuids, nil
}
// DiscoverGPUUUIDs returns the GPU UUIDs assigned to a pod's container, trying
// the available discovery paths in order:
//
//  1. The DRA API (GetGPUUUIDsViaDRAAPI). A lookup error is logged, not
//     returned, and discovery continues with the remaining paths.
//  2. The PodResources API (GetPodGPUUUIDs), consulted only when the DRA lookup
//     did not report an NVIDIA DRA allocation for the pod. An error from this
//     lookup is returned immediately without trying nvidia-smi.
//  3. nvidia-smi (GetGPUUUIDsViaNvidiaSmi) for the given pid under hostProcPath,
//     used when the earlier paths produced no UUIDs.
//
// The first path that yields a non-empty UUID list wins.
func DiscoverGPUUUIDs(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace, containerName, hostProcPath string, pid int, log logr.Logger) ([]string, error) {
	// reason records why the final nvidia-smi fallback was needed; it is
	// refined as each discovery path is exhausted.
	reason := "DRA API returned no GPU UUIDs"

	draUUIDs, usesNVIDIADRA, draErr := GetGPUUUIDsViaDRAAPI(ctx, clientset, podName, podNamespace, log)
	switch {
	case draErr != nil:
		// A failed DRA lookup is not fatal: whatever it returned is ignored
		// and the other discovery paths get a chance.
		log.Error(
			draErr,
			"DRA API GPU UUID lookup failed, trying other discovery paths",
			"pod", podNamespace+"/"+podName,
			"has_nvidia_dra_allocation", usesNVIDIADRA,
		)
		reason = "DRA API GPU UUID lookup failed"
	case len(draUUIDs) > 0:
		return draUUIDs, nil
	}

	// PodResources is only consulted for pods without an NVIDIA DRA allocation.
	if !usesNVIDIADRA {
		podUUIDs, podErr := GetPodGPUUUIDs(ctx, podName, podNamespace, containerName)
		if podErr != nil {
			return nil, fmt.Errorf("PodResources GPU UUID lookup failed: %w", podErr)
		}
		if len(podUUIDs) > 0 {
			return podUUIDs, nil
		}
		reason = "PodResources API returned no GPU UUIDs"
	}

	log.Info(reason+", falling back to nvidia-smi", "pid", pid)
	smiUUIDs, smiErr := GetGPUUUIDsViaNvidiaSmi(ctx, hostProcPath, pid)
	if smiErr != nil {
		return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", smiErr)
	}
	log.Info("nvidia-smi fallback discovered GPU UUIDs", "uuids", smiUUIDs)
	return smiUUIDs, nil
}
// FilterProcesses returns the subset of candidate PIDs that hold actual CUDA contexts.
// Uses --get-restore-tid (the same technique as the CRIU CUDA plugin) instead of
// --get-state, because --get-state incorrectly matches coordinator processes like
......
......@@ -13,7 +13,10 @@ import (
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
corev1 "k8s.io/api/core/v1"
resourcev1 "k8s.io/api/resource/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
)
......@@ -93,7 +96,7 @@ func (s *testPodResourcesServer) Get(context.Context, *podresourcesv1.GetPodReso
return nil, status.Error(codes.Unimplemented, "not implemented in test")
}
func TestGetPodGPUUUIDs(t *testing.T) {
func installTestPodResourcesServer(t *testing.T, resp *podresourcesv1.ListPodResourcesResponse) {
socketDir := t.TempDir()
socketPath := filepath.Join(socketDir, "kubelet.sock")
......@@ -101,11 +104,34 @@ func TestGetPodGPUUUIDs(t *testing.T) {
if err != nil {
t.Fatalf("listen unix socket: %v", err)
}
defer listener.Close()
server := grpc.NewServer()
podresourcesv1.RegisterPodResourcesListerServer(server, &testPodResourcesServer{
resp: &podresourcesv1.ListPodResourcesResponse{
resp: resp,
})
go func() {
if serveErr := server.Serve(listener); serveErr != nil {
if errors.Is(serveErr, grpc.ErrServerStopped) || strings.Contains(serveErr.Error(), "use of closed network connection") {
return
}
t.Errorf("serve test pod-resources gRPC server: %v", serveErr)
}
}()
t.Cleanup(server.Stop)
t.Cleanup(func() {
_ = listener.Close()
})
previousSocketPath := podResourcesSocketPath
podResourcesSocketPath = socketPath
t.Cleanup(func() {
podResourcesSocketPath = previousSocketPath
})
}
func TestGetPodGPUUUIDs(t *testing.T) {
installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{
PodResources: []*podresourcesv1.PodResources{
{
Name: "other-pod",
......@@ -155,34 +181,66 @@ func TestGetPodGPUUUIDs(t *testing.T) {
},
},
},
},
})
go func() {
if serveErr := server.Serve(listener); serveErr != nil {
if errors.Is(serveErr, grpc.ErrServerStopped) || strings.Contains(serveErr.Error(), "use of closed network connection") {
return
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
got, err := GetPodGPUUUIDs(ctx, "test-pod", "default", "main")
if err != nil {
t.Fatalf("GetPodGPUUUIDs: %v", err)
}
t.Errorf("serve test pod-resources gRPC server: %v", serveErr)
want := []string{"GPU-a", "GPU-b", "GPU-c"}
if len(got) != len(want) {
t.Fatalf("got %v, want %v", got, want)
}
}()
t.Cleanup(server.Stop)
for i := range want {
if got[i] != want[i] {
t.Fatalf("got %v, want %v", got, want)
}
}
}
previousSocketPath := podResourcesSocketPath
podResourcesSocketPath = socketPath
t.Cleanup(func() {
podResourcesSocketPath = previousSocketPath
func TestDiscoverGPUUUIDsUsesPodResourcesForClassicPod(t *testing.T) {
installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{
PodResources: []*podresourcesv1.PodResources{
{
Name: "test-pod",
Namespace: "default",
Containers: []*podresourcesv1.ContainerResources{
{
Name: "main",
Devices: []*podresourcesv1.ContainerDevices{
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-a", "GPU-b"},
},
},
},
},
},
},
})
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
got, err := GetPodGPUUUIDs(ctx, "test-pod", "default", "main")
got, err := DiscoverGPUUUIDs(
ctx,
nil,
"test-pod",
"default",
"main",
"/proc",
123,
logr.Discard(),
)
if err != nil {
t.Fatalf("GetPodGPUUUIDs: %v", err)
t.Fatalf("DiscoverGPUUUIDs: %v", err)
}
want := []string{"GPU-a", "GPU-b", "GPU-c"}
want := []string{"GPU-a", "GPU-b"}
if len(got) != len(want) {
t.Fatalf("got %v, want %v", got, want)
}
......@@ -192,3 +250,123 @@ func TestGetPodGPUUUIDs(t *testing.T) {
}
}
}
func TestDiscoverGPUUUIDsFallsBackToPodResourcesAfterDRAAPILookupError(t *testing.T) {
installTestPodResourcesServer(t, &podresourcesv1.ListPodResourcesResponse{
PodResources: []*podresourcesv1.PodResources{
{
Name: "test-pod",
Namespace: "default",
Containers: []*podresourcesv1.ContainerResources{
{
Name: "main",
Devices: []*podresourcesv1.ContainerDevices{
{
ResourceName: nvidiaGPUResource,
DeviceIds: []string{"GPU-a"},
},
},
},
},
},
},
})
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
got, err := DiscoverGPUUUIDs(
ctx,
fake.NewSimpleClientset(),
"test-pod",
"default",
"main",
"/proc",
123,
logr.Discard(),
)
if err != nil {
t.Fatalf("DiscoverGPUUUIDs: %v", err)
}
if len(got) != 1 || got[0] != "GPU-a" {
t.Fatalf("got %v, want [GPU-a]", got)
}
}
// TestDiscoverGPUUUIDsPrefersDRAForDRAPod checks that a pod whose resource claim
// is allocated by the NVIDIA DRA driver gets its GPU UUID from the DRA API
// objects (Pod -> ResourceClaim -> ResourceSlice). The PodResources socket path
// is pointed at a file that is never created, presumably so that a PodResources
// lookup could not succeed if it were attempted.
func TestDiscoverGPUUUIDsPrefersDRAForDRAPod(t *testing.T) {
	// These are variables rather than constants because several of them are
	// passed by address into the API objects below.
	var (
		namespace = "default"
		podName   = "test-pod"
		nodeName  = "node-1"
		poolName  = "pool-node-1"
		claimName = "gpu-claim"
		uuid      = "GPU-ffffffff-1111-2222-3333-444444444444"
	)

	savedSocketPath := podResourcesSocketPath
	podResourcesSocketPath = filepath.Join(t.TempDir(), "missing-kubelet.sock")
	t.Cleanup(func() { podResourcesSocketPath = savedSocketPath })

	// Pod referencing a pre-existing ResourceClaim by name.
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace},
		Spec: corev1.PodSpec{
			NodeName: nodeName,
			ResourceClaims: []corev1.PodResourceClaim{
				{Name: "gpu", ResourceClaimName: &claimName},
			},
		},
	}

	// Claim allocated one device ("gpu-0") from the node's pool.
	allocation := resourcev1.DeviceRequestAllocationResult{
		Driver:  nvidiaGPUDRADriver,
		Pool:    poolName,
		Device:  "gpu-0",
		Request: "gpu",
	}
	claim := &resourcev1.ResourceClaim{
		ObjectMeta: metav1.ObjectMeta{Name: claimName, Namespace: namespace},
		Status: resourcev1.ResourceClaimStatus{
			Allocation: &resourcev1.AllocationResult{
				Devices: resourcev1.DeviceAllocationResult{
					Results: []resourcev1.DeviceRequestAllocationResult{allocation},
				},
			},
		},
	}

	// ResourceSlice publishing the UUID attribute for "gpu-0".
	gpuDevice := resourcev1.Device{
		Name: "gpu-0",
		Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
			resourcev1.QualifiedName("uuid"): {StringValue: &uuid},
		},
	}
	slice := &resourcev1.ResourceSlice{
		ObjectMeta: metav1.ObjectMeta{Name: poolName + "-gpu.nvidia.com-xxx"},
		Spec: resourcev1.ResourceSliceSpec{
			Driver:   nvidiaGPUDRADriver,
			NodeName: &nodeName,
			Pool:     resourcev1.ResourcePool{Name: poolName},
			Devices:  []resourcev1.Device{gpuDevice},
		},
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	client := fake.NewSimpleClientset(pod, claim, slice)
	got, err := DiscoverGPUUUIDs(ctx, client, podName, namespace, "main", "/proc", 123, logr.Discard())
	if err != nil {
		t.Fatalf("DiscoverGPUUUIDs: %v", err)
	}
	if len(got) == 1 && got[0] == uuid {
		return
	}
	t.Fatalf("got %v, want [%s]", got, uuid)
}
......@@ -14,66 +14,93 @@ const (
resourceAttributeUUID = "uuid"
)
// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
// Returns nil without error if the pod has no DRA claims or the driver is not gpu.nvidia.com.
func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace string, log logr.Logger) ([]string, error) {
// allocatedDRADevice identifies one device taken from a ResourceClaim
// allocation result: the pool it was allocated from and the device name
// within that pool.
type allocatedDRADevice struct {
// pool is the Pool field of the allocation result.
pool string
// device is the Device field of the allocation result.
device string
}
// getAllocatedNVIDIADRADevices fetches the pod, resolves each of its resource
// claim references to a ResourceClaim name, and collects the pool/device pairs
// those claims allocated through nvidiaGPUDRADriver.
//
// It returns the allocated devices, the pod's node name, whether any allocation
// result used nvidiaGPUDRADriver, and an error when the pod or one of its
// claims cannot be fetched. A nil clientset, an empty pod name or namespace, a
// pod without resource claims, or a pod without a node name yields no devices
// and no error.
func getAllocatedNVIDIADRADevices(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace string, log logr.Logger) ([]allocatedDRADevice, string, bool, error) {
if clientset == nil {
return nil, nil
return nil, "", false, nil
}
if podName == "" || podNamespace == "" {
return nil, nil
return nil, "", false, nil
}
pod, err := clientset.CoreV1().Pods(podNamespace).Get(ctx, podName, metav1.GetOptions{})
if err != nil {
return nil, fmt.Errorf("get pod %s/%s: %w", podNamespace, podName, err)
return nil, "", false, fmt.Errorf("get pod %s/%s: %w", podNamespace, podName, err)
}
if len(pod.Spec.ResourceClaims) == 0 {
return nil, nil
return nil, pod.Spec.NodeName, false, nil
}
nodeName := pod.Spec.NodeName
if nodeName == "" {
if pod.Spec.NodeName == "" {
log.V(1).Info("pod has no node name, skipping DRA API lookup")
return nil, nil
return nil, "", false, nil
}
var allocated []struct {
driver string
pool string
device string
// Map each pod-level claim reference name to the ResourceClaim object name.
// Names given directly in the pod spec are recorded first.
claimNamesByPodRef := make(map[string]string, len(pod.Spec.ResourceClaims))
for _, ref := range pod.Spec.ResourceClaims {
if ref.ResourceClaimName != nil && *ref.ResourceClaimName != "" {
claimNamesByPodRef[ref.Name] = *ref.ResourceClaimName
}
}
// Names reported in the pod status only fill in references that the spec did
// not already resolve; they never override a spec-provided name.
for _, status := range pod.Status.ResourceClaimStatuses {
if status.ResourceClaimName == nil || *status.ResourceClaimName == "" {
continue
}
if _, exists := claimNamesByPodRef[status.Name]; !exists {
claimNamesByPodRef[status.Name] = *status.ResourceClaimName
}
}
var allocated []allocatedDRADevice
hasNVIDIADRAAllocation := false
for _, ref := range pod.Spec.ResourceClaims {
if ref.ResourceClaimName == nil || *ref.ResourceClaimName == "" {
claimName := claimNamesByPodRef[ref.Name]
if claimName == "" {
// Neither the spec nor the status supplied a claim name for this reference.
log.V(1).Info("pod resource claim has no resolved claim name", "pod_claim", ref.Name)
continue
}
claimName := *ref.ResourceClaimName
claim, err := clientset.ResourceV1().ResourceClaims(podNamespace).Get(ctx, claimName, metav1.GetOptions{})
if err != nil {
return nil, fmt.Errorf("get resource claim %s/%s: %w", podNamespace, claimName, err)
return nil, pod.Spec.NodeName, hasNVIDIADRAAllocation, fmt.Errorf("get resource claim %s/%s: %w", podNamespace, claimName, err)
}
if claim.Status.Allocation == nil || len(claim.Status.Allocation.Devices.Results) == 0 {
continue
}
for _, r := range claim.Status.Allocation.Devices.Results {
if r.Driver == nvidiaGPUDRADriver {
allocated = append(allocated, struct {
driver string
pool string
device string
}{r.Driver, r.Pool, r.Device})
for _, result := range claim.Status.Allocation.Devices.Results {
// Only allocation results from the NVIDIA GPU DRA driver are collected.
if result.Driver != nvidiaGPUDRADriver {
continue
}
hasNVIDIADRAAllocation = true
allocated = append(allocated, allocatedDRADevice{
pool: result.Pool,
device: result.Device,
})
}
}
return allocated, pod.Spec.NodeName, hasNVIDIADRAAllocation, nil
}
// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
// It also reports whether the pod is using NVIDIA DRA GPU allocations at all.
func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, podName, podNamespace string, log logr.Logger) ([]string, bool, error) {
allocated, nodeName, hasNVIDIADRAAllocation, err := getAllocatedNVIDIADRADevices(ctx, clientset, podName, podNamespace, log)
if err != nil {
return nil, hasNVIDIADRAAllocation, err
}
if len(allocated) == 0 {
return nil, nil
if !hasNVIDIADRAAllocation || len(allocated) == 0 {
return nil, hasNVIDIADRAAllocation, nil
}
slices, err := clientset.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{
FieldSelector: fmt.Sprintf("spec.driver=%s,spec.nodeName=%s", nvidiaGPUDRADriver, nodeName),
})
if err != nil {
return nil, fmt.Errorf("list resource slices for node %s: %w", nodeName, err)
return nil, true, fmt.Errorf("list resource slices for node %s: %w", nodeName, err)
}
poolDeviceToUUID := make(map[string]map[string]string)
......@@ -92,15 +119,15 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
}
var uuids []string
for _, a := range allocated {
devMap := poolDeviceToUUID[a.pool]
for _, device := range allocated {
devMap := poolDeviceToUUID[device.pool]
if devMap == nil {
log.V(1).Info("no ResourceSlice found for pool", "pool", a.pool, "device", a.device)
log.V(1).Info("no ResourceSlice found for pool", "pool", device.pool, "device", device.device)
continue
}
uuid, ok := devMap[a.device]
uuid, ok := devMap[device.device]
if !ok || uuid == "" {
log.V(1).Info("device has no UUID in ResourceSlice", "pool", a.pool, "device", a.device)
log.V(1).Info("device has no UUID in ResourceSlice", "pool", device.pool, "device", device.device)
continue
}
uuids = append(uuids, uuid)
......@@ -108,7 +135,7 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
if len(uuids) > 0 {
log.Info("resolved GPU UUIDs via DRA API", "uuids", uuids)
}
return uuids, nil
return uuids, true, nil
}
func deviceUUIDFromAttributes(attrs map[resourcev1.QualifiedName]resourcev1.DeviceAttribute) string {
......
......@@ -60,10 +60,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
log := logr.Discard()
t.Run("nil clientset returns nil without error", func(t *testing.T) {
got, err := GetGPUUUIDsViaDRAAPI(ctx, nil, "pod", "ns", log)
got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, nil, "pod", "ns", log)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be false")
}
if got != nil {
t.Errorf("got %v, want nil", got)
}
......@@ -71,10 +74,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
t.Run("empty pod name returns nil", func(t *testing.T) {
client := fake.NewSimpleClientset()
got, err := GetGPUUUIDsViaDRAAPI(ctx, client, "", "ns", log)
got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, "", "ns", log)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be false")
}
if got != nil {
t.Errorf("got %v, want nil", got)
}
......@@ -82,7 +88,7 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
t.Run("pod not found returns error", func(t *testing.T) {
client := fake.NewSimpleClientset()
_, err := GetGPUUUIDsViaDRAAPI(ctx, client, "missing", "default", log)
_, _, err := GetGPUUUIDsViaDRAAPI(ctx, client, "missing", "default", log)
if err == nil {
t.Fatal("expected error when pod not found")
}
......@@ -146,10 +152,213 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
}
client := fake.NewSimpleClientset(pod, claim, slice)
got, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
if err != nil {
t.Fatalf("GetGPUUUIDsViaDRAAPI: %v", err)
}
if !hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be true")
}
want := []string{uuid1, uuid2}
if len(got) != len(want) {
t.Fatalf("got %v (len %d), want %v (len %d)", got, len(got), want, len(want))
}
for i := range want {
if got[i] != want[i] {
t.Errorf("got[%d] = %q, want %q", i, got[i], want[i])
}
}
})
t.Run("pod with template-backed DRA claims resolves UUIDs via pod status", func(t *testing.T) {
nodeName := "node-1"
poolName := "pool-node-1"
namespace := "default"
podName := "test-pod"
generatedClaimName := "generated-gpu-claim"
uuid1 := "GPU-cccccccc-1111-2222-3333-444444444444"
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace},
Spec: corev1.PodSpec{
NodeName: nodeName,
ResourceClaims: []corev1.PodResourceClaim{
{
Name: "gpu",
},
},
},
Status: corev1.PodStatus{
ResourceClaimStatuses: []corev1.PodResourceClaimStatus{
{
Name: "gpu",
ResourceClaimName: ptr(generatedClaimName),
},
},
},
}
claim := &resourcev1.ResourceClaim{
ObjectMeta: metav1.ObjectMeta{Name: generatedClaimName, Namespace: namespace},
Status: resourcev1.ResourceClaimStatus{
Allocation: &resourcev1.AllocationResult{
Devices: resourcev1.DeviceAllocationResult{
Results: []resourcev1.DeviceRequestAllocationResult{
{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-0", Request: "gpu"},
},
},
},
},
}
slice := &resourcev1.ResourceSlice{
ObjectMeta: metav1.ObjectMeta{Name: poolName + "-gpu.nvidia.com-xxx"},
Spec: resourcev1.ResourceSliceSpec{
Driver: nvidiaGPUDRADriver,
NodeName: &nodeName,
Pool: resourcev1.ResourcePool{Name: poolName},
Devices: []resourcev1.Device{
{
Name: "gpu-0",
Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
resourcev1.QualifiedName("uuid"): {StringValue: &uuid1},
},
},
},
},
}
client := fake.NewSimpleClientset(pod, claim, slice)
got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
if err != nil {
t.Fatalf("GetGPUUUIDsViaDRAAPI: %v", err)
}
if !hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be true")
}
want := []string{uuid1}
if len(got) != len(want) {
t.Fatalf("got %v (len %d), want %v (len %d)", got, len(got), want, len(want))
}
for i := range want {
if got[i] != want[i] {
t.Errorf("got[%d] = %q, want %q", i, got[i], want[i])
}
}
})
t.Run("pod with unresolved resource claim returns nil", func(t *testing.T) {
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "pod", Namespace: "default"},
Spec: corev1.PodSpec{
NodeName: "node-1",
ResourceClaims: []corev1.PodResourceClaim{
{
Name: "gpu",
},
},
},
}
client := fake.NewSimpleClientset(pod)
got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, "pod", "default", log)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be false")
}
if got != nil {
t.Errorf("got %v, want nil", got)
}
})
t.Run("pod with direct and template-backed claims resolves UUIDs from both", func(t *testing.T) {
nodeName := "node-1"
poolName := "pool-node-1"
namespace := "default"
podName := "test-pod"
directClaimName := "direct-gpu-claim"
generatedClaimName := "generated-gpu-claim"
uuid1 := "GPU-dddddddd-1111-2222-3333-444444444444"
uuid2 := "GPU-eeeeeeee-5555-6666-7777-888888888888"
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: podName, Namespace: namespace},
Spec: corev1.PodSpec{
NodeName: nodeName,
ResourceClaims: []corev1.PodResourceClaim{
{
Name: "gpu-direct",
ResourceClaimName: ptr(directClaimName),
},
{
Name: "gpu-template",
},
},
},
Status: corev1.PodStatus{
ResourceClaimStatuses: []corev1.PodResourceClaimStatus{
{
Name: "gpu-template",
ResourceClaimName: ptr(generatedClaimName),
},
},
},
}
directClaim := &resourcev1.ResourceClaim{
ObjectMeta: metav1.ObjectMeta{Name: directClaimName, Namespace: namespace},
Status: resourcev1.ResourceClaimStatus{
Allocation: &resourcev1.AllocationResult{
Devices: resourcev1.DeviceAllocationResult{
Results: []resourcev1.DeviceRequestAllocationResult{
{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-0", Request: "gpu-direct"},
},
},
},
},
}
generatedClaim := &resourcev1.ResourceClaim{
ObjectMeta: metav1.ObjectMeta{Name: generatedClaimName, Namespace: namespace},
Status: resourcev1.ResourceClaimStatus{
Allocation: &resourcev1.AllocationResult{
Devices: resourcev1.DeviceAllocationResult{
Results: []resourcev1.DeviceRequestAllocationResult{
{Driver: nvidiaGPUDRADriver, Pool: poolName, Device: "gpu-1", Request: "gpu-template"},
},
},
},
},
}
slice := &resourcev1.ResourceSlice{
ObjectMeta: metav1.ObjectMeta{Name: poolName + "-gpu.nvidia.com-xxx"},
Spec: resourcev1.ResourceSliceSpec{
Driver: nvidiaGPUDRADriver,
NodeName: &nodeName,
Pool: resourcev1.ResourcePool{Name: poolName},
Devices: []resourcev1.Device{
{
Name: "gpu-0",
Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
resourcev1.QualifiedName("uuid"): {StringValue: &uuid1},
},
},
{
Name: "gpu-1",
Attributes: map[resourcev1.QualifiedName]resourcev1.DeviceAttribute{
resourcev1.QualifiedName("uuid"): {StringValue: &uuid2},
},
},
},
},
}
client := fake.NewSimpleClientset(pod, directClaim, generatedClaim, slice)
got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, podName, namespace, log)
if err != nil {
t.Fatalf("GetGPUUUIDsViaDRAAPI: %v", err)
}
if !hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be true")
}
want := []string{uuid1, uuid2}
if len(got) != len(want) {
t.Fatalf("got %v (len %d), want %v (len %d)", got, len(got), want, len(want))
......@@ -167,10 +376,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
Spec: corev1.PodSpec{NodeName: "node-1"},
}
client := fake.NewSimpleClientset(pod)
got, err := GetGPUUUIDsViaDRAAPI(ctx, client, "pod", "default", log)
got, hasNVIDIADRAAllocation, err := GetGPUUUIDsViaDRAAPI(ctx, client, "pod", "default", log)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if hasNVIDIADRAAllocation {
t.Fatal("expected hasNVIDIADRAAllocation to be false")
}
if got != nil {
t.Errorf("got %v, want nil", got)
}
......
......@@ -191,18 +191,19 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
}
var gpuUUIDs []string
if len(cudaHostPIDs) > 0 {
gpuUUIDs, err = cuda.GetPodGPUUUIDs(ctx, req.PodName, req.PodNamespace, req.ContainerName)
gpuUUIDs, err = cuda.DiscoverGPUUUIDs(
ctx,
req.Clientset,
req.PodName,
req.PodNamespace,
req.ContainerName,
snapshotruntime.HostProcPath,
pid,
log,
)
if err != nil {
return nil, fmt.Errorf("failed to discover source GPU UUIDs: %w", err)
}
if len(gpuUUIDs) == 0 {
log.Info("PodResources API returned no GPU UUIDs, falling back to nvidia-smi", "pid", pid)
gpuUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, pid)
if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed: %w", err)
}
log.Info("nvidia-smi fallback discovered GPU UUIDs", "uuids", gpuUUIDs)
}
}
return &types.CheckpointContainerSnapshot{
......
......@@ -142,18 +142,19 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
if len(m.CUDA.SourceGPUUUIDs) == 0 {
return nil, fmt.Errorf("missing source GPU UUIDs in checkpoint manifest")
}
targetGPUUUIDs, err := cuda.GetPodGPUUUIDs(ctx, req.PodName, req.PodNamespace, containerName)
targetGPUUUIDs, err := cuda.DiscoverGPUUUIDs(
ctx,
req.Clientset,
req.PodName,
req.PodNamespace,
containerName,
snapshotruntime.HostProcPath,
placeholderPID,
log,
)
if err != nil {
return nil, fmt.Errorf("failed to get target GPU UUIDs: %w", err)
}
if len(targetGPUUUIDs) == 0 {
log.Info("PodResources API returned no target GPU UUIDs, falling back to nvidia-smi", "pid", placeholderPID)
targetGPUUUIDs, err = cuda.GetGPUUUIDsViaNvidiaSmi(ctx, snapshotruntime.HostProcPath, placeholderPID)
if err != nil {
return nil, fmt.Errorf("nvidia-smi GPU UUID fallback failed for restore target: %w", err)
}
log.Info("nvidia-smi fallback discovered target GPU UUIDs", "uuids", targetGPUUUIDs)
}
if len(targetGPUUUIDs) == 0 {
return nil, fmt.Errorf("missing target GPU UUIDs for %s/%s container %s", req.PodNamespace, req.PodName, containerName)
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment