Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2cc6d1e2
Unverified
Commit
2cc6d1e2
authored
Apr 17, 2026
by
Schwinn Saereesitthipitak
Committed by
GitHub
Apr 17, 2026
Browse files
fix(snapshot): resolve DRA GPU UUIDs from claims (#8292)
parent
8428c65f
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
570 additions
and
112 deletions
+570
-112
deploy/snapshot/internal/cuda/cuda.go
deploy/snapshot/internal/cuda/cuda.go
+40
-1
deploy/snapshot/internal/cuda/cuda_test.go
deploy/snapshot/internal/cuda/cuda_test.go
+232
-54
deploy/snapshot/internal/cuda/dra.go
deploy/snapshot/internal/cuda/dra.go
+61
-34
deploy/snapshot/internal/cuda/dra_test.go
deploy/snapshot/internal/cuda/dra_test.go
+217
-5
deploy/snapshot/internal/executor/checkpoint.go
deploy/snapshot/internal/executor/checkpoint.go
+10
-9
deploy/snapshot/internal/executor/restore.go
deploy/snapshot/internal/executor/restore.go
+10
-9
No files found.
deploy/snapshot/internal/cuda/cuda.go
View file @
2cc6d1e2
...
...
@@ -13,7 +13,7 @@ import (
"github.com/go-logr/logr"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/client-go/kubernetes"
podresourcesv1
"k8s.io/kubelet/pkg/apis/podresources/v1"
)
...
...
@@ -104,6 +104,45 @@ func GetGPUUUIDsViaNvidiaSmi(ctx context.Context, hostProcPath string, pid int)
return
uuids
,
nil
}
// DiscoverGPUUUIDs resolves GPU UUIDs according to the pod's allocation mode:
// DRA-backed pods use the DRA API, classic nvidia.com/gpu pods use PodResources,
// and nvidia-smi remains the last fallback for either path.
func
DiscoverGPUUUIDs
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
,
containerName
,
hostProcPath
string
,
pid
int
,
log
logr
.
Logger
)
([]
string
,
error
)
{
gpuUUIDs
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
clientset
,
podName
,
podNamespace
,
log
)
fallbackReason
:=
"DRA API returned no GPU UUIDs"
if
err
!=
nil
{
log
.
Error
(
err
,
"DRA API GPU UUID lookup failed, trying other discovery paths"
,
"pod"
,
podNamespace
+
"/"
+
podName
,
"has_nvidia_dra_allocation"
,
hasNVIDIADRAAllocation
,
)
gpuUUIDs
=
nil
fallbackReason
=
"DRA API GPU UUID lookup failed"
}
if
len
(
gpuUUIDs
)
>
0
{
return
gpuUUIDs
,
nil
}
if
!
hasNVIDIADRAAllocation
{
gpuUUIDs
,
err
=
GetPodGPUUUIDs
(
ctx
,
podName
,
podNamespace
,
containerName
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"PodResources GPU UUID lookup failed: %w"
,
err
)
}
if
len
(
gpuUUIDs
)
>
0
{
return
gpuUUIDs
,
nil
}
fallbackReason
=
"PodResources API returned no GPU UUIDs"
}
log
.
Info
(
fallbackReason
+
", falling back to nvidia-smi"
,
"pid"
,
pid
)
gpuUUIDs
,
err
=
GetGPUUUIDsViaNvidiaSmi
(
ctx
,
hostProcPath
,
pid
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"nvidia-smi GPU UUID fallback failed: %w"
,
err
)
}
log
.
Info
(
"nvidia-smi fallback discovered GPU UUIDs"
,
"uuids"
,
gpuUUIDs
)
return
gpuUUIDs
,
nil
}
// FilterProcesses returns the subset of candidate PIDs that hold actual CUDA contexts.
// Uses --get-restore-tid (the same technique as the CRIU CUDA plugin) instead of
// --get-state, because --get-state incorrectly matches coordinator processes like
...
...
deploy/snapshot/internal/cuda/cuda_test.go
View file @
2cc6d1e2
...
...
@@ -13,7 +13,10 @@ import (
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
corev1
"k8s.io/api/core/v1"
resourcev1
"k8s.io/api/resource/v1"
metav1
"k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
podresourcesv1
"k8s.io/kubelet/pkg/apis/podresources/v1"
)
...
...
@@ -93,7 +96,7 @@ func (s *testPodResourcesServer) Get(context.Context, *podresourcesv1.GetPodReso
return
nil
,
status
.
Error
(
codes
.
Unimplemented
,
"not implemented in test"
)
}
func
TestGetPodGPUUUIDs
(
t
*
testing
.
T
)
{
func
installTestPodResourcesServer
(
t
*
testing
.
T
,
resp
*
podresourcesv1
.
ListPodResourcesResponse
)
{
socketDir
:=
t
.
TempDir
()
socketPath
:=
filepath
.
Join
(
socketDir
,
"kubelet.sock"
)
...
...
@@ -101,61 +104,10 @@ func TestGetPodGPUUUIDs(t *testing.T) {
if
err
!=
nil
{
t
.
Fatalf
(
"listen unix socket: %v"
,
err
)
}
defer
listener
.
Close
()
server
:=
grpc
.
NewServer
()
podresourcesv1
.
RegisterPodResourcesListerServer
(
server
,
&
testPodResourcesServer
{
resp
:
&
podresourcesv1
.
ListPodResourcesResponse
{
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"other-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-ignore"
},
},
},
},
},
},
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"sidecar"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-sidecar"
},
},
},
},
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
,
"GPU-b"
},
},
{
ResourceName
:
"example.com/fpga"
,
DeviceIds
:
[]
string
{
"FPGA-ignore"
},
},
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-c"
},
},
},
},
},
},
},
},
resp
:
resp
,
})
go
func
()
{
...
...
@@ -167,12 +119,69 @@ func TestGetPodGPUUUIDs(t *testing.T) {
}
}()
t
.
Cleanup
(
server
.
Stop
)
t
.
Cleanup
(
func
()
{
_
=
listener
.
Close
()
})
previousSocketPath
:=
podResourcesSocketPath
podResourcesSocketPath
=
socketPath
t
.
Cleanup
(
func
()
{
podResourcesSocketPath
=
previousSocketPath
})
}
func
TestGetPodGPUUUIDs
(
t
*
testing
.
T
)
{
installTestPodResourcesServer
(
t
,
&
podresourcesv1
.
ListPodResourcesResponse
{
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"other-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-ignore"
},
},
},
},
},
},
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"sidecar"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-sidecar"
},
},
},
},
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
,
"GPU-b"
},
},
{
ResourceName
:
"example.com/fpga"
,
DeviceIds
:
[]
string
{
"FPGA-ignore"
},
},
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-c"
},
},
},
},
},
},
},
})
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
...
...
@@ -192,3 +201,172 @@ func TestGetPodGPUUUIDs(t *testing.T) {
}
}
}
func
TestDiscoverGPUUUIDsUsesPodResourcesForClassicPod
(
t
*
testing
.
T
)
{
installTestPodResourcesServer
(
t
,
&
podresourcesv1
.
ListPodResourcesResponse
{
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
,
"GPU-b"
},
},
},
},
},
},
},
})
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
got
,
err
:=
DiscoverGPUUUIDs
(
ctx
,
nil
,
"test-pod"
,
"default"
,
"main"
,
"/proc"
,
123
,
logr
.
Discard
(),
)
if
err
!=
nil
{
t
.
Fatalf
(
"DiscoverGPUUUIDs: %v"
,
err
)
}
want
:=
[]
string
{
"GPU-a"
,
"GPU-b"
}
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v, want %v"
,
got
,
want
)
}
for
i
:=
range
want
{
if
got
[
i
]
!=
want
[
i
]
{
t
.
Fatalf
(
"got %v, want %v"
,
got
,
want
)
}
}
}
func
TestDiscoverGPUUUIDsFallsBackToPodResourcesAfterDRAAPILookupError
(
t
*
testing
.
T
)
{
installTestPodResourcesServer
(
t
,
&
podresourcesv1
.
ListPodResourcesResponse
{
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
},
},
},
},
},
},
},
})
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
got
,
err
:=
DiscoverGPUUUIDs
(
ctx
,
fake
.
NewSimpleClientset
(),
"test-pod"
,
"default"
,
"main"
,
"/proc"
,
123
,
logr
.
Discard
(),
)
if
err
!=
nil
{
t
.
Fatalf
(
"DiscoverGPUUUIDs: %v"
,
err
)
}
if
len
(
got
)
!=
1
||
got
[
0
]
!=
"GPU-a"
{
t
.
Fatalf
(
"got %v, want [GPU-a]"
,
got
)
}
}
func
TestDiscoverGPUUUIDsPrefersDRAForDRAPod
(
t
*
testing
.
T
)
{
previousSocketPath
:=
podResourcesSocketPath
podResourcesSocketPath
=
filepath
.
Join
(
t
.
TempDir
(),
"missing-kubelet.sock"
)
t
.
Cleanup
(
func
()
{
podResourcesSocketPath
=
previousSocketPath
})
nodeName
:=
"node-1"
poolName
:=
"pool-node-1"
namespace
:=
"default"
podName
:=
"test-pod"
claimName
:=
"gpu-claim"
uuid
:=
"GPU-ffffffff-1111-2222-3333-444444444444"
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
podName
,
Namespace
:
namespace
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
nodeName
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu"
,
ResourceClaimName
:
&
claimName
,
},
},
},
}
claim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
claimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-0"
,
Request
:
"gpu"
},
},
},
},
},
}
slice
:=
&
resourcev1
.
ResourceSlice
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
poolName
+
"-gpu.nvidia.com-xxx"
},
Spec
:
resourcev1
.
ResourceSliceSpec
{
Driver
:
nvidiaGPUDRADriver
,
NodeName
:
&
nodeName
,
Pool
:
resourcev1
.
ResourcePool
{
Name
:
poolName
},
Devices
:
[]
resourcev1
.
Device
{
{
Name
:
"gpu-0"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid
},
},
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
got
,
err
:=
DiscoverGPUUUIDs
(
ctx
,
client
,
podName
,
namespace
,
"main"
,
"/proc"
,
123
,
logr
.
Discard
(),
)
if
err
!=
nil
{
t
.
Fatalf
(
"DiscoverGPUUUIDs: %v"
,
err
)
}
if
len
(
got
)
!=
1
||
got
[
0
]
!=
uuid
{
t
.
Fatalf
(
"got %v, want [%s]"
,
got
,
uuid
)
}
}
deploy/snapshot/internal/cuda/dra.go
View file @
2cc6d1e2
...
...
@@ -14,66 +14,93 @@ const (
resourceAttributeUUID
=
"uuid"
)
// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
// Returns nil without error if the pod has no DRA claims or the driver is not gpu.nvidia.com.
func
GetGPUUUIDsViaDRAAPI
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
string
,
log
logr
.
Logger
)
([]
string
,
error
)
{
type
allocatedDRADevice
struct
{
pool
string
device
string
}
func
getAllocatedNVIDIADRADevices
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
string
,
log
logr
.
Logger
)
([]
allocatedDRADevice
,
string
,
bool
,
error
)
{
if
clientset
==
nil
{
return
nil
,
nil
return
nil
,
""
,
false
,
nil
}
if
podName
==
""
||
podNamespace
==
""
{
return
nil
,
nil
return
nil
,
""
,
false
,
nil
}
pod
,
err
:=
clientset
.
CoreV1
()
.
Pods
(
podNamespace
)
.
Get
(
ctx
,
podName
,
metav1
.
GetOptions
{})
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"get pod %s/%s: %w"
,
podNamespace
,
podName
,
err
)
return
nil
,
""
,
false
,
fmt
.
Errorf
(
"get pod %s/%s: %w"
,
podNamespace
,
podName
,
err
)
}
if
len
(
pod
.
Spec
.
ResourceClaims
)
==
0
{
return
nil
,
nil
return
nil
,
pod
.
Spec
.
NodeName
,
false
,
nil
}
nodeName
:=
pod
.
Spec
.
NodeName
if
nodeName
==
""
{
if
pod
.
Spec
.
NodeName
==
""
{
log
.
V
(
1
)
.
Info
(
"pod has no node name, skipping DRA API lookup"
)
return
nil
,
nil
return
nil
,
""
,
false
,
nil
}
var
allocated
[]
struct
{
driver
string
pool
string
device
string
claimNamesByPodRef
:=
make
(
map
[
string
]
string
,
len
(
pod
.
Spec
.
ResourceClaims
))
for
_
,
ref
:=
range
pod
.
Spec
.
ResourceClaims
{
if
ref
.
ResourceClaimName
!=
nil
&&
*
ref
.
ResourceClaimName
!=
""
{
claimNamesByPodRef
[
ref
.
Name
]
=
*
ref
.
ResourceClaimName
}
}
for
_
,
status
:=
range
pod
.
Status
.
ResourceClaimStatuses
{
if
status
.
ResourceClaimName
==
nil
||
*
status
.
ResourceClaimName
==
""
{
continue
}
if
_
,
exists
:=
claimNamesByPodRef
[
status
.
Name
];
!
exists
{
claimNamesByPodRef
[
status
.
Name
]
=
*
status
.
ResourceClaimName
}
}
var
allocated
[]
allocatedDRADevice
hasNVIDIADRAAllocation
:=
false
for
_
,
ref
:=
range
pod
.
Spec
.
ResourceClaims
{
if
ref
.
ResourceClaimName
==
nil
||
*
ref
.
ResourceClaimName
==
""
{
claimName
:=
claimNamesByPodRef
[
ref
.
Name
]
if
claimName
==
""
{
log
.
V
(
1
)
.
Info
(
"pod resource claim has no resolved claim name"
,
"pod_claim"
,
ref
.
Name
)
continue
}
claimName
:=
*
ref
.
ResourceClaimName
claim
,
err
:=
clientset
.
ResourceV1
()
.
ResourceClaims
(
podNamespace
)
.
Get
(
ctx
,
claimName
,
metav1
.
GetOptions
{})
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"get resource claim %s/%s: %w"
,
podNamespace
,
claimName
,
err
)
return
nil
,
pod
.
Spec
.
NodeName
,
hasNVIDIADRAAllocation
,
fmt
.
Errorf
(
"get resource claim %s/%s: %w"
,
podNamespace
,
claimName
,
err
)
}
if
claim
.
Status
.
Allocation
==
nil
||
len
(
claim
.
Status
.
Allocation
.
Devices
.
Results
)
==
0
{
continue
}
for
_
,
r
:=
range
claim
.
Status
.
Allocation
.
Devices
.
Results
{
if
r
.
Driver
==
nvidiaGPUDRADriver
{
allocated
=
append
(
allocated
,
struct
{
driver
string
pool
string
device
string
}{
r
.
Driver
,
r
.
Pool
,
r
.
Device
})
for
_
,
result
:=
range
claim
.
Status
.
Allocation
.
Devices
.
Results
{
if
result
.
Driver
!=
nvidiaGPUDRADriver
{
continue
}
hasNVIDIADRAAllocation
=
true
allocated
=
append
(
allocated
,
allocatedDRADevice
{
pool
:
result
.
Pool
,
device
:
result
.
Device
,
})
}
}
if
len
(
allocated
)
==
0
{
return
nil
,
nil
return
allocated
,
pod
.
Spec
.
NodeName
,
hasNVIDIADRAAllocation
,
nil
}
// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
// It also reports whether the pod is using NVIDIA DRA GPU allocations at all.
func
GetGPUUUIDsViaDRAAPI
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
string
,
log
logr
.
Logger
)
([]
string
,
bool
,
error
)
{
allocated
,
nodeName
,
hasNVIDIADRAAllocation
,
err
:=
getAllocatedNVIDIADRADevices
(
ctx
,
clientset
,
podName
,
podNamespace
,
log
)
if
err
!=
nil
{
return
nil
,
hasNVIDIADRAAllocation
,
err
}
if
!
hasNVIDIADRAAllocation
||
len
(
allocated
)
==
0
{
return
nil
,
hasNVIDIADRAAllocation
,
nil
}
slices
,
err
:=
clientset
.
ResourceV1
()
.
ResourceSlices
()
.
List
(
ctx
,
metav1
.
ListOptions
{
FieldSelector
:
fmt
.
Sprintf
(
"spec.driver=%s,spec.nodeName=%s"
,
nvidiaGPUDRADriver
,
nodeName
),
})
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"list resource slices for node %s: %w"
,
nodeName
,
err
)
return
nil
,
true
,
fmt
.
Errorf
(
"list resource slices for node %s: %w"
,
nodeName
,
err
)
}
poolDeviceToUUID
:=
make
(
map
[
string
]
map
[
string
]
string
)
...
...
@@ -92,15 +119,15 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
}
var
uuids
[]
string
for
_
,
a
:=
range
allocated
{
devMap
:=
poolDeviceToUUID
[
a
.
pool
]
for
_
,
device
:=
range
allocated
{
devMap
:=
poolDeviceToUUID
[
device
.
pool
]
if
devMap
==
nil
{
log
.
V
(
1
)
.
Info
(
"no ResourceSlice found for pool"
,
"pool"
,
a
.
pool
,
"device"
,
a
.
device
)
log
.
V
(
1
)
.
Info
(
"no ResourceSlice found for pool"
,
"pool"
,
device
.
pool
,
"device"
,
device
.
device
)
continue
}
uuid
,
ok
:=
devMap
[
a
.
device
]
uuid
,
ok
:=
devMap
[
device
.
device
]
if
!
ok
||
uuid
==
""
{
log
.
V
(
1
)
.
Info
(
"device has no UUID in ResourceSlice"
,
"pool"
,
a
.
pool
,
"device"
,
a
.
device
)
log
.
V
(
1
)
.
Info
(
"device has no UUID in ResourceSlice"
,
"pool"
,
device
.
pool
,
"device"
,
device
.
device
)
continue
}
uuids
=
append
(
uuids
,
uuid
)
...
...
@@ -108,7 +135,7 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
if
len
(
uuids
)
>
0
{
log
.
Info
(
"resolved GPU UUIDs via DRA API"
,
"uuids"
,
uuids
)
}
return
uuids
,
nil
return
uuids
,
true
,
nil
}
func
deviceUUIDFromAttributes
(
attrs
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
)
string
{
...
...
deploy/snapshot/internal/cuda/dra_test.go
View file @
2cc6d1e2
...
...
@@ -60,10 +60,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
log
:=
logr
.
Discard
()
t
.
Run
(
"nil clientset returns nil without error"
,
func
(
t
*
testing
.
T
)
{
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
nil
,
"pod"
,
"ns"
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
nil
,
"pod"
,
"ns"
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
...
...
@@ -71,10 +74,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
t
.
Run
(
"empty pod name returns nil"
,
func
(
t
*
testing
.
T
)
{
client
:=
fake
.
NewSimpleClientset
()
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
""
,
"ns"
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
""
,
"ns"
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
...
...
@@ -82,7 +88,7 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
t
.
Run
(
"pod not found returns error"
,
func
(
t
*
testing
.
T
)
{
client
:=
fake
.
NewSimpleClientset
()
_
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"missing"
,
"default"
,
log
)
_
,
_
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"missing"
,
"default"
,
log
)
if
err
==
nil
{
t
.
Fatal
(
"expected error when pod not found"
)
}
...
...
@@ -146,10 +152,213 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
}
if
!
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be true"
)
}
want
:=
[]
string
{
uuid1
,
uuid2
}
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
}
for
i
:=
range
want
{
if
got
[
i
]
!=
want
[
i
]
{
t
.
Errorf
(
"got[%d] = %q, want %q"
,
i
,
got
[
i
],
want
[
i
])
}
}
})
t
.
Run
(
"pod with template-backed DRA claims resolves UUIDs via pod status"
,
func
(
t
*
testing
.
T
)
{
nodeName
:=
"node-1"
poolName
:=
"pool-node-1"
namespace
:=
"default"
podName
:=
"test-pod"
generatedClaimName
:=
"generated-gpu-claim"
uuid1
:=
"GPU-cccccccc-1111-2222-3333-444444444444"
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
podName
,
Namespace
:
namespace
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
nodeName
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu"
,
},
},
},
Status
:
corev1
.
PodStatus
{
ResourceClaimStatuses
:
[]
corev1
.
PodResourceClaimStatus
{
{
Name
:
"gpu"
,
ResourceClaimName
:
ptr
(
generatedClaimName
),
},
},
},
}
claim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
generatedClaimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-0"
,
Request
:
"gpu"
},
},
},
},
},
}
slice
:=
&
resourcev1
.
ResourceSlice
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
poolName
+
"-gpu.nvidia.com-xxx"
},
Spec
:
resourcev1
.
ResourceSliceSpec
{
Driver
:
nvidiaGPUDRADriver
,
NodeName
:
&
nodeName
,
Pool
:
resourcev1
.
ResourcePool
{
Name
:
poolName
},
Devices
:
[]
resourcev1
.
Device
{
{
Name
:
"gpu-0"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid1
},
},
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
}
if
!
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be true"
)
}
want
:=
[]
string
{
uuid1
}
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
}
for
i
:=
range
want
{
if
got
[
i
]
!=
want
[
i
]
{
t
.
Errorf
(
"got[%d] = %q, want %q"
,
i
,
got
[
i
],
want
[
i
])
}
}
})
t
.
Run
(
"pod with unresolved resource claim returns nil"
,
func
(
t
*
testing
.
T
)
{
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"pod"
,
Namespace
:
"default"
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
"node-1"
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu"
,
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"pod"
,
"default"
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
})
t
.
Run
(
"pod with direct and template-backed claims resolves UUIDs from both"
,
func
(
t
*
testing
.
T
)
{
nodeName
:=
"node-1"
poolName
:=
"pool-node-1"
namespace
:=
"default"
podName
:=
"test-pod"
directClaimName
:=
"direct-gpu-claim"
generatedClaimName
:=
"generated-gpu-claim"
uuid1
:=
"GPU-dddddddd-1111-2222-3333-444444444444"
uuid2
:=
"GPU-eeeeeeee-5555-6666-7777-888888888888"
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
podName
,
Namespace
:
namespace
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
nodeName
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu-direct"
,
ResourceClaimName
:
ptr
(
directClaimName
),
},
{
Name
:
"gpu-template"
,
},
},
},
Status
:
corev1
.
PodStatus
{
ResourceClaimStatuses
:
[]
corev1
.
PodResourceClaimStatus
{
{
Name
:
"gpu-template"
,
ResourceClaimName
:
ptr
(
generatedClaimName
),
},
},
},
}
directClaim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
directClaimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-0"
,
Request
:
"gpu-direct"
},
},
},
},
},
}
generatedClaim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
generatedClaimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-1"
,
Request
:
"gpu-template"
},
},
},
},
},
}
slice
:=
&
resourcev1
.
ResourceSlice
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
poolName
+
"-gpu.nvidia.com-xxx"
},
Spec
:
resourcev1
.
ResourceSliceSpec
{
Driver
:
nvidiaGPUDRADriver
,
NodeName
:
&
nodeName
,
Pool
:
resourcev1
.
ResourcePool
{
Name
:
poolName
},
Devices
:
[]
resourcev1
.
Device
{
{
Name
:
"gpu-0"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid1
},
},
},
{
Name
:
"gpu-1"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid2
},
},
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
directClaim
,
generatedClaim
,
slice
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
}
if
!
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be true"
)
}
want
:=
[]
string
{
uuid1
,
uuid2
}
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
...
...
@@ -167,10 +376,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
Spec
:
corev1
.
PodSpec
{
NodeName
:
"node-1"
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
)
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"pod"
,
"default"
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"pod"
,
"default"
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
...
...
deploy/snapshot/internal/executor/checkpoint.go
View file @
2cc6d1e2
...
...
@@ -191,18 +191,19 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
}
var
gpuUUIDs
[]
string
if
len
(
cudaHostPIDs
)
>
0
{
gpuUUIDs
,
err
=
cuda
.
GetPodGPUUUIDs
(
ctx
,
req
.
PodName
,
req
.
PodNamespace
,
req
.
ContainerName
)
gpuUUIDs
,
err
=
cuda
.
DiscoverGPUUUIDs
(
ctx
,
req
.
Clientset
,
req
.
PodName
,
req
.
PodNamespace
,
req
.
ContainerName
,
snapshotruntime
.
HostProcPath
,
pid
,
log
,
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to discover source GPU UUIDs: %w"
,
err
)
}
if
len
(
gpuUUIDs
)
==
0
{
log
.
Info
(
"PodResources API returned no GPU UUIDs, falling back to nvidia-smi"
,
"pid"
,
pid
)
gpuUUIDs
,
err
=
cuda
.
GetGPUUUIDsViaNvidiaSmi
(
ctx
,
snapshotruntime
.
HostProcPath
,
pid
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"nvidia-smi GPU UUID fallback failed: %w"
,
err
)
}
log
.
Info
(
"nvidia-smi fallback discovered GPU UUIDs"
,
"uuids"
,
gpuUUIDs
)
}
}
return
&
types
.
CheckpointContainerSnapshot
{
...
...
deploy/snapshot/internal/executor/restore.go
View file @
2cc6d1e2
...
...
@@ -142,18 +142,19 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
if
len
(
m
.
CUDA
.
SourceGPUUUIDs
)
==
0
{
return
nil
,
fmt
.
Errorf
(
"missing source GPU UUIDs in checkpoint manifest"
)
}
targetGPUUUIDs
,
err
:=
cuda
.
GetPodGPUUUIDs
(
ctx
,
req
.
PodName
,
req
.
PodNamespace
,
containerName
)
targetGPUUUIDs
,
err
:=
cuda
.
DiscoverGPUUUIDs
(
ctx
,
req
.
Clientset
,
req
.
PodName
,
req
.
PodNamespace
,
containerName
,
snapshotruntime
.
HostProcPath
,
placeholderPID
,
log
,
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to get target GPU UUIDs: %w"
,
err
)
}
if
len
(
targetGPUUUIDs
)
==
0
{
log
.
Info
(
"PodResources API returned no target GPU UUIDs, falling back to nvidia-smi"
,
"pid"
,
placeholderPID
)
targetGPUUUIDs
,
err
=
cuda
.
GetGPUUUIDsViaNvidiaSmi
(
ctx
,
snapshotruntime
.
HostProcPath
,
placeholderPID
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"nvidia-smi GPU UUID fallback failed for restore target: %w"
,
err
)
}
log
.
Info
(
"nvidia-smi fallback discovered target GPU UUIDs"
,
"uuids"
,
targetGPUUUIDs
)
}
if
len
(
targetGPUUUIDs
)
==
0
{
return
nil
,
fmt
.
Errorf
(
"missing target GPU UUIDs for %s/%s container %s"
,
req
.
PodNamespace
,
req
.
PodName
,
containerName
)
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment