Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2cc6d1e2
Unverified
Commit
2cc6d1e2
authored
Apr 17, 2026
by
Schwinn Saereesitthipitak
Committed by
GitHub
Apr 17, 2026
Browse files
fix(snapshot): resolve DRA GPU UUIDs from claims (#8292)
parent
8428c65f
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
570 additions
and
112 deletions
+570
-112
deploy/snapshot/internal/cuda/cuda.go
deploy/snapshot/internal/cuda/cuda.go
+40
-1
deploy/snapshot/internal/cuda/cuda_test.go
deploy/snapshot/internal/cuda/cuda_test.go
+232
-54
deploy/snapshot/internal/cuda/dra.go
deploy/snapshot/internal/cuda/dra.go
+61
-34
deploy/snapshot/internal/cuda/dra_test.go
deploy/snapshot/internal/cuda/dra_test.go
+217
-5
deploy/snapshot/internal/executor/checkpoint.go
deploy/snapshot/internal/executor/checkpoint.go
+10
-9
deploy/snapshot/internal/executor/restore.go
deploy/snapshot/internal/executor/restore.go
+10
-9
No files found.
deploy/snapshot/internal/cuda/cuda.go
View file @
2cc6d1e2
...
@@ -13,7 +13,7 @@ import (
...
@@ -13,7 +13,7 @@ import (
"github.com/go-logr/logr"
"github.com/go-logr/logr"
"google.golang.org/grpc"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/client-go/kubernetes"
podresourcesv1
"k8s.io/kubelet/pkg/apis/podresources/v1"
podresourcesv1
"k8s.io/kubelet/pkg/apis/podresources/v1"
)
)
...
@@ -104,6 +104,45 @@ func GetGPUUUIDsViaNvidiaSmi(ctx context.Context, hostProcPath string, pid int)
...
@@ -104,6 +104,45 @@ func GetGPUUUIDsViaNvidiaSmi(ctx context.Context, hostProcPath string, pid int)
return
uuids
,
nil
return
uuids
,
nil
}
}
// DiscoverGPUUUIDs resolves GPU UUIDs according to the pod's allocation mode:
// DRA-backed pods use the DRA API, classic nvidia.com/gpu pods use PodResources,
// and nvidia-smi remains the last fallback for either path.
func
DiscoverGPUUUIDs
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
,
containerName
,
hostProcPath
string
,
pid
int
,
log
logr
.
Logger
)
([]
string
,
error
)
{
gpuUUIDs
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
clientset
,
podName
,
podNamespace
,
log
)
fallbackReason
:=
"DRA API returned no GPU UUIDs"
if
err
!=
nil
{
log
.
Error
(
err
,
"DRA API GPU UUID lookup failed, trying other discovery paths"
,
"pod"
,
podNamespace
+
"/"
+
podName
,
"has_nvidia_dra_allocation"
,
hasNVIDIADRAAllocation
,
)
gpuUUIDs
=
nil
fallbackReason
=
"DRA API GPU UUID lookup failed"
}
if
len
(
gpuUUIDs
)
>
0
{
return
gpuUUIDs
,
nil
}
if
!
hasNVIDIADRAAllocation
{
gpuUUIDs
,
err
=
GetPodGPUUUIDs
(
ctx
,
podName
,
podNamespace
,
containerName
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"PodResources GPU UUID lookup failed: %w"
,
err
)
}
if
len
(
gpuUUIDs
)
>
0
{
return
gpuUUIDs
,
nil
}
fallbackReason
=
"PodResources API returned no GPU UUIDs"
}
log
.
Info
(
fallbackReason
+
", falling back to nvidia-smi"
,
"pid"
,
pid
)
gpuUUIDs
,
err
=
GetGPUUUIDsViaNvidiaSmi
(
ctx
,
hostProcPath
,
pid
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"nvidia-smi GPU UUID fallback failed: %w"
,
err
)
}
log
.
Info
(
"nvidia-smi fallback discovered GPU UUIDs"
,
"uuids"
,
gpuUUIDs
)
return
gpuUUIDs
,
nil
}
// FilterProcesses returns the subset of candidate PIDs that hold actual CUDA contexts.
// FilterProcesses returns the subset of candidate PIDs that hold actual CUDA contexts.
// Uses --get-restore-tid (the same technique as the CRIU CUDA plugin) instead of
// Uses --get-restore-tid (the same technique as the CRIU CUDA plugin) instead of
// --get-state, because --get-state incorrectly matches coordinator processes like
// --get-state, because --get-state incorrectly matches coordinator processes like
...
...
deploy/snapshot/internal/cuda/cuda_test.go
View file @
2cc6d1e2
...
@@ -13,7 +13,10 @@ import (
...
@@ -13,7 +13,10 @@ import (
"google.golang.org/grpc"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/grpc/status"
corev1
"k8s.io/api/core/v1"
resourcev1
"k8s.io/api/resource/v1"
metav1
"k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
podresourcesv1
"k8s.io/kubelet/pkg/apis/podresources/v1"
podresourcesv1
"k8s.io/kubelet/pkg/apis/podresources/v1"
)
)
...
@@ -93,7 +96,7 @@ func (s *testPodResourcesServer) Get(context.Context, *podresourcesv1.GetPodReso
...
@@ -93,7 +96,7 @@ func (s *testPodResourcesServer) Get(context.Context, *podresourcesv1.GetPodReso
return
nil
,
status
.
Error
(
codes
.
Unimplemented
,
"not implemented in test"
)
return
nil
,
status
.
Error
(
codes
.
Unimplemented
,
"not implemented in test"
)
}
}
func
TestGetPodGPUUUIDs
(
t
*
testing
.
T
)
{
func
installTestPodResourcesServer
(
t
*
testing
.
T
,
resp
*
podresourcesv1
.
ListPodResourcesResponse
)
{
socketDir
:=
t
.
TempDir
()
socketDir
:=
t
.
TempDir
()
socketPath
:=
filepath
.
Join
(
socketDir
,
"kubelet.sock"
)
socketPath
:=
filepath
.
Join
(
socketDir
,
"kubelet.sock"
)
...
@@ -101,61 +104,10 @@ func TestGetPodGPUUUIDs(t *testing.T) {
...
@@ -101,61 +104,10 @@ func TestGetPodGPUUUIDs(t *testing.T) {
if
err
!=
nil
{
if
err
!=
nil
{
t
.
Fatalf
(
"listen unix socket: %v"
,
err
)
t
.
Fatalf
(
"listen unix socket: %v"
,
err
)
}
}
defer
listener
.
Close
()
server
:=
grpc
.
NewServer
()
server
:=
grpc
.
NewServer
()
podresourcesv1
.
RegisterPodResourcesListerServer
(
server
,
&
testPodResourcesServer
{
podresourcesv1
.
RegisterPodResourcesListerServer
(
server
,
&
testPodResourcesServer
{
resp
:
&
podresourcesv1
.
ListPodResourcesResponse
{
resp
:
resp
,
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"other-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-ignore"
},
},
},
},
},
},
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"sidecar"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-sidecar"
},
},
},
},
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
,
"GPU-b"
},
},
{
ResourceName
:
"example.com/fpga"
,
DeviceIds
:
[]
string
{
"FPGA-ignore"
},
},
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-c"
},
},
},
},
},
},
},
},
})
})
go
func
()
{
go
func
()
{
...
@@ -167,12 +119,69 @@ func TestGetPodGPUUUIDs(t *testing.T) {
...
@@ -167,12 +119,69 @@ func TestGetPodGPUUUIDs(t *testing.T) {
}
}
}()
}()
t
.
Cleanup
(
server
.
Stop
)
t
.
Cleanup
(
server
.
Stop
)
t
.
Cleanup
(
func
()
{
_
=
listener
.
Close
()
})
previousSocketPath
:=
podResourcesSocketPath
previousSocketPath
:=
podResourcesSocketPath
podResourcesSocketPath
=
socketPath
podResourcesSocketPath
=
socketPath
t
.
Cleanup
(
func
()
{
t
.
Cleanup
(
func
()
{
podResourcesSocketPath
=
previousSocketPath
podResourcesSocketPath
=
previousSocketPath
})
})
}
func
TestGetPodGPUUUIDs
(
t
*
testing
.
T
)
{
installTestPodResourcesServer
(
t
,
&
podresourcesv1
.
ListPodResourcesResponse
{
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"other-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-ignore"
},
},
},
},
},
},
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"sidecar"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-sidecar"
},
},
},
},
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
,
"GPU-b"
},
},
{
ResourceName
:
"example.com/fpga"
,
DeviceIds
:
[]
string
{
"FPGA-ignore"
},
},
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-c"
},
},
},
},
},
},
},
})
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
defer
cancel
()
...
@@ -192,3 +201,172 @@ func TestGetPodGPUUUIDs(t *testing.T) {
...
@@ -192,3 +201,172 @@ func TestGetPodGPUUUIDs(t *testing.T) {
}
}
}
}
}
}
func
TestDiscoverGPUUUIDsUsesPodResourcesForClassicPod
(
t
*
testing
.
T
)
{
installTestPodResourcesServer
(
t
,
&
podresourcesv1
.
ListPodResourcesResponse
{
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
,
"GPU-b"
},
},
},
},
},
},
},
})
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
got
,
err
:=
DiscoverGPUUUIDs
(
ctx
,
nil
,
"test-pod"
,
"default"
,
"main"
,
"/proc"
,
123
,
logr
.
Discard
(),
)
if
err
!=
nil
{
t
.
Fatalf
(
"DiscoverGPUUUIDs: %v"
,
err
)
}
want
:=
[]
string
{
"GPU-a"
,
"GPU-b"
}
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v, want %v"
,
got
,
want
)
}
for
i
:=
range
want
{
if
got
[
i
]
!=
want
[
i
]
{
t
.
Fatalf
(
"got %v, want %v"
,
got
,
want
)
}
}
}
func
TestDiscoverGPUUUIDsFallsBackToPodResourcesAfterDRAAPILookupError
(
t
*
testing
.
T
)
{
installTestPodResourcesServer
(
t
,
&
podresourcesv1
.
ListPodResourcesResponse
{
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
},
},
},
},
},
},
},
})
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
got
,
err
:=
DiscoverGPUUUIDs
(
ctx
,
fake
.
NewSimpleClientset
(),
"test-pod"
,
"default"
,
"main"
,
"/proc"
,
123
,
logr
.
Discard
(),
)
if
err
!=
nil
{
t
.
Fatalf
(
"DiscoverGPUUUIDs: %v"
,
err
)
}
if
len
(
got
)
!=
1
||
got
[
0
]
!=
"GPU-a"
{
t
.
Fatalf
(
"got %v, want [GPU-a]"
,
got
)
}
}
func
TestDiscoverGPUUUIDsPrefersDRAForDRAPod
(
t
*
testing
.
T
)
{
previousSocketPath
:=
podResourcesSocketPath
podResourcesSocketPath
=
filepath
.
Join
(
t
.
TempDir
(),
"missing-kubelet.sock"
)
t
.
Cleanup
(
func
()
{
podResourcesSocketPath
=
previousSocketPath
})
nodeName
:=
"node-1"
poolName
:=
"pool-node-1"
namespace
:=
"default"
podName
:=
"test-pod"
claimName
:=
"gpu-claim"
uuid
:=
"GPU-ffffffff-1111-2222-3333-444444444444"
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
podName
,
Namespace
:
namespace
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
nodeName
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu"
,
ResourceClaimName
:
&
claimName
,
},
},
},
}
claim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
claimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-0"
,
Request
:
"gpu"
},
},
},
},
},
}
slice
:=
&
resourcev1
.
ResourceSlice
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
poolName
+
"-gpu.nvidia.com-xxx"
},
Spec
:
resourcev1
.
ResourceSliceSpec
{
Driver
:
nvidiaGPUDRADriver
,
NodeName
:
&
nodeName
,
Pool
:
resourcev1
.
ResourcePool
{
Name
:
poolName
},
Devices
:
[]
resourcev1
.
Device
{
{
Name
:
"gpu-0"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid
},
},
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
got
,
err
:=
DiscoverGPUUUIDs
(
ctx
,
client
,
podName
,
namespace
,
"main"
,
"/proc"
,
123
,
logr
.
Discard
(),
)
if
err
!=
nil
{
t
.
Fatalf
(
"DiscoverGPUUUIDs: %v"
,
err
)
}
if
len
(
got
)
!=
1
||
got
[
0
]
!=
uuid
{
t
.
Fatalf
(
"got %v, want [%s]"
,
got
,
uuid
)
}
}
deploy/snapshot/internal/cuda/dra.go
View file @
2cc6d1e2
...
@@ -14,66 +14,93 @@ const (
...
@@ -14,66 +14,93 @@ const (
resourceAttributeUUID
=
"uuid"
resourceAttributeUUID
=
"uuid"
)
)
// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
type
allocatedDRADevice
struct
{
// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
pool
string
// Returns nil without error if the pod has no DRA claims or the driver is not gpu.nvidia.com.
device
string
func
GetGPUUUIDsViaDRAAPI
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
string
,
log
logr
.
Logger
)
([]
string
,
error
)
{
}
func
getAllocatedNVIDIADRADevices
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
string
,
log
logr
.
Logger
)
([]
allocatedDRADevice
,
string
,
bool
,
error
)
{
if
clientset
==
nil
{
if
clientset
==
nil
{
return
nil
,
nil
return
nil
,
""
,
false
,
nil
}
}
if
podName
==
""
||
podNamespace
==
""
{
if
podName
==
""
||
podNamespace
==
""
{
return
nil
,
nil
return
nil
,
""
,
false
,
nil
}
}
pod
,
err
:=
clientset
.
CoreV1
()
.
Pods
(
podNamespace
)
.
Get
(
ctx
,
podName
,
metav1
.
GetOptions
{})
pod
,
err
:=
clientset
.
CoreV1
()
.
Pods
(
podNamespace
)
.
Get
(
ctx
,
podName
,
metav1
.
GetOptions
{})
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"get pod %s/%s: %w"
,
podNamespace
,
podName
,
err
)
return
nil
,
""
,
false
,
fmt
.
Errorf
(
"get pod %s/%s: %w"
,
podNamespace
,
podName
,
err
)
}
}
if
len
(
pod
.
Spec
.
ResourceClaims
)
==
0
{
if
len
(
pod
.
Spec
.
ResourceClaims
)
==
0
{
return
nil
,
nil
return
nil
,
pod
.
Spec
.
NodeName
,
false
,
nil
}
}
nodeName
:=
pod
.
Spec
.
NodeName
if
pod
.
Spec
.
NodeName
==
""
{
if
nodeName
==
""
{
log
.
V
(
1
)
.
Info
(
"pod has no node name, skipping DRA API lookup"
)
log
.
V
(
1
)
.
Info
(
"pod has no node name, skipping DRA API lookup"
)
return
nil
,
nil
return
nil
,
""
,
false
,
nil
}
}
var
allocated
[]
struct
{
claimNamesByPodRef
:=
make
(
map
[
string
]
string
,
len
(
pod
.
Spec
.
ResourceClaims
))
driver
string
for
_
,
ref
:=
range
pod
.
Spec
.
ResourceClaims
{
pool
string
if
ref
.
ResourceClaimName
!=
nil
&&
*
ref
.
ResourceClaimName
!=
""
{
device
string
claimNamesByPodRef
[
ref
.
Name
]
=
*
ref
.
ResourceClaimName
}
}
for
_
,
status
:=
range
pod
.
Status
.
ResourceClaimStatuses
{
if
status
.
ResourceClaimName
==
nil
||
*
status
.
ResourceClaimName
==
""
{
continue
}
if
_
,
exists
:=
claimNamesByPodRef
[
status
.
Name
];
!
exists
{
claimNamesByPodRef
[
status
.
Name
]
=
*
status
.
ResourceClaimName
}
}
}
var
allocated
[]
allocatedDRADevice
hasNVIDIADRAAllocation
:=
false
for
_
,
ref
:=
range
pod
.
Spec
.
ResourceClaims
{
for
_
,
ref
:=
range
pod
.
Spec
.
ResourceClaims
{
if
ref
.
ResourceClaimName
==
nil
||
*
ref
.
ResourceClaimName
==
""
{
claimName
:=
claimNamesByPodRef
[
ref
.
Name
]
if
claimName
==
""
{
log
.
V
(
1
)
.
Info
(
"pod resource claim has no resolved claim name"
,
"pod_claim"
,
ref
.
Name
)
continue
continue
}
}
claimName
:=
*
ref
.
ResourceClaimName
claim
,
err
:=
clientset
.
ResourceV1
()
.
ResourceClaims
(
podNamespace
)
.
Get
(
ctx
,
claimName
,
metav1
.
GetOptions
{})
claim
,
err
:=
clientset
.
ResourceV1
()
.
ResourceClaims
(
podNamespace
)
.
Get
(
ctx
,
claimName
,
metav1
.
GetOptions
{})
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"get resource claim %s/%s: %w"
,
podNamespace
,
claimName
,
err
)
return
nil
,
pod
.
Spec
.
NodeName
,
hasNVIDIADRAAllocation
,
fmt
.
Errorf
(
"get resource claim %s/%s: %w"
,
podNamespace
,
claimName
,
err
)
}
}
if
claim
.
Status
.
Allocation
==
nil
||
len
(
claim
.
Status
.
Allocation
.
Devices
.
Results
)
==
0
{
if
claim
.
Status
.
Allocation
==
nil
||
len
(
claim
.
Status
.
Allocation
.
Devices
.
Results
)
==
0
{
continue
continue
}
}
for
_
,
r
:=
range
claim
.
Status
.
Allocation
.
Devices
.
Results
{
for
_
,
result
:=
range
claim
.
Status
.
Allocation
.
Devices
.
Results
{
if
r
.
Driver
==
nvidiaGPUDRADriver
{
if
result
.
Driver
!=
nvidiaGPUDRADriver
{
allocated
=
append
(
allocated
,
struct
{
continue
driver
string
pool
string
device
string
}{
r
.
Driver
,
r
.
Pool
,
r
.
Device
})
}
}
hasNVIDIADRAAllocation
=
true
allocated
=
append
(
allocated
,
allocatedDRADevice
{
pool
:
result
.
Pool
,
device
:
result
.
Device
,
})
}
}
}
}
if
len
(
allocated
)
==
0
{
return
nil
,
nil
return
allocated
,
pod
.
Spec
.
NodeName
,
hasNVIDIADRAAllocation
,
nil
}
// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
// It also reports whether the pod is using NVIDIA DRA GPU allocations at all.
func
GetGPUUUIDsViaDRAAPI
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
string
,
log
logr
.
Logger
)
([]
string
,
bool
,
error
)
{
allocated
,
nodeName
,
hasNVIDIADRAAllocation
,
err
:=
getAllocatedNVIDIADRADevices
(
ctx
,
clientset
,
podName
,
podNamespace
,
log
)
if
err
!=
nil
{
return
nil
,
hasNVIDIADRAAllocation
,
err
}
if
!
hasNVIDIADRAAllocation
||
len
(
allocated
)
==
0
{
return
nil
,
hasNVIDIADRAAllocation
,
nil
}
}
slices
,
err
:=
clientset
.
ResourceV1
()
.
ResourceSlices
()
.
List
(
ctx
,
metav1
.
ListOptions
{
slices
,
err
:=
clientset
.
ResourceV1
()
.
ResourceSlices
()
.
List
(
ctx
,
metav1
.
ListOptions
{
FieldSelector
:
fmt
.
Sprintf
(
"spec.driver=%s,spec.nodeName=%s"
,
nvidiaGPUDRADriver
,
nodeName
),
FieldSelector
:
fmt
.
Sprintf
(
"spec.driver=%s,spec.nodeName=%s"
,
nvidiaGPUDRADriver
,
nodeName
),
})
})
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"list resource slices for node %s: %w"
,
nodeName
,
err
)
return
nil
,
true
,
fmt
.
Errorf
(
"list resource slices for node %s: %w"
,
nodeName
,
err
)
}
}
poolDeviceToUUID
:=
make
(
map
[
string
]
map
[
string
]
string
)
poolDeviceToUUID
:=
make
(
map
[
string
]
map
[
string
]
string
)
...
@@ -92,15 +119,15 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
...
@@ -92,15 +119,15 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
}
}
var
uuids
[]
string
var
uuids
[]
string
for
_
,
a
:=
range
allocated
{
for
_
,
device
:=
range
allocated
{
devMap
:=
poolDeviceToUUID
[
a
.
pool
]
devMap
:=
poolDeviceToUUID
[
device
.
pool
]
if
devMap
==
nil
{
if
devMap
==
nil
{
log
.
V
(
1
)
.
Info
(
"no ResourceSlice found for pool"
,
"pool"
,
a
.
pool
,
"device"
,
a
.
device
)
log
.
V
(
1
)
.
Info
(
"no ResourceSlice found for pool"
,
"pool"
,
device
.
pool
,
"device"
,
device
.
device
)
continue
continue
}
}
uuid
,
ok
:=
devMap
[
a
.
device
]
uuid
,
ok
:=
devMap
[
device
.
device
]
if
!
ok
||
uuid
==
""
{
if
!
ok
||
uuid
==
""
{
log
.
V
(
1
)
.
Info
(
"device has no UUID in ResourceSlice"
,
"pool"
,
a
.
pool
,
"device"
,
a
.
device
)
log
.
V
(
1
)
.
Info
(
"device has no UUID in ResourceSlice"
,
"pool"
,
device
.
pool
,
"device"
,
device
.
device
)
continue
continue
}
}
uuids
=
append
(
uuids
,
uuid
)
uuids
=
append
(
uuids
,
uuid
)
...
@@ -108,7 +135,7 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
...
@@ -108,7 +135,7 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
if
len
(
uuids
)
>
0
{
if
len
(
uuids
)
>
0
{
log
.
Info
(
"resolved GPU UUIDs via DRA API"
,
"uuids"
,
uuids
)
log
.
Info
(
"resolved GPU UUIDs via DRA API"
,
"uuids"
,
uuids
)
}
}
return
uuids
,
nil
return
uuids
,
true
,
nil
}
}
func
deviceUUIDFromAttributes
(
attrs
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
)
string
{
func
deviceUUIDFromAttributes
(
attrs
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
)
string
{
...
...
deploy/snapshot/internal/cuda/dra_test.go
View file @
2cc6d1e2
...
@@ -60,10 +60,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
...
@@ -60,10 +60,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
log
:=
logr
.
Discard
()
log
:=
logr
.
Discard
()
t
.
Run
(
"nil clientset returns nil without error"
,
func
(
t
*
testing
.
T
)
{
t
.
Run
(
"nil clientset returns nil without error"
,
func
(
t
*
testing
.
T
)
{
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
nil
,
"pod"
,
"ns"
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
nil
,
"pod"
,
"ns"
,
log
)
if
err
!=
nil
{
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
}
...
@@ -71,10 +74,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
...
@@ -71,10 +74,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
t
.
Run
(
"empty pod name returns nil"
,
func
(
t
*
testing
.
T
)
{
t
.
Run
(
"empty pod name returns nil"
,
func
(
t
*
testing
.
T
)
{
client
:=
fake
.
NewSimpleClientset
()
client
:=
fake
.
NewSimpleClientset
()
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
""
,
"ns"
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
""
,
"ns"
,
log
)
if
err
!=
nil
{
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
}
...
@@ -82,7 +88,7 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
...
@@ -82,7 +88,7 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
t
.
Run
(
"pod not found returns error"
,
func
(
t
*
testing
.
T
)
{
t
.
Run
(
"pod not found returns error"
,
func
(
t
*
testing
.
T
)
{
client
:=
fake
.
NewSimpleClientset
()
client
:=
fake
.
NewSimpleClientset
()
_
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"missing"
,
"default"
,
log
)
_
,
_
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"missing"
,
"default"
,
log
)
if
err
==
nil
{
if
err
==
nil
{
t
.
Fatal
(
"expected error when pod not found"
)
t
.
Fatal
(
"expected error when pod not found"
)
}
}
...
@@ -146,10 +152,213 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
...
@@ -146,10 +152,213 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
}
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
}
if
!
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be true"
)
}
want
:=
[]
string
{
uuid1
,
uuid2
}
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
}
for
i
:=
range
want
{
if
got
[
i
]
!=
want
[
i
]
{
t
.
Errorf
(
"got[%d] = %q, want %q"
,
i
,
got
[
i
],
want
[
i
])
}
}
})
t
.
Run
(
"pod with template-backed DRA claims resolves UUIDs via pod status"
,
func
(
t
*
testing
.
T
)
{
nodeName
:=
"node-1"
poolName
:=
"pool-node-1"
namespace
:=
"default"
podName
:=
"test-pod"
generatedClaimName
:=
"generated-gpu-claim"
uuid1
:=
"GPU-cccccccc-1111-2222-3333-444444444444"
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
podName
,
Namespace
:
namespace
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
nodeName
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu"
,
},
},
},
Status
:
corev1
.
PodStatus
{
ResourceClaimStatuses
:
[]
corev1
.
PodResourceClaimStatus
{
{
Name
:
"gpu"
,
ResourceClaimName
:
ptr
(
generatedClaimName
),
},
},
},
}
claim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
generatedClaimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-0"
,
Request
:
"gpu"
},
},
},
},
},
}
slice
:=
&
resourcev1
.
ResourceSlice
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
poolName
+
"-gpu.nvidia.com-xxx"
},
Spec
:
resourcev1
.
ResourceSliceSpec
{
Driver
:
nvidiaGPUDRADriver
,
NodeName
:
&
nodeName
,
Pool
:
resourcev1
.
ResourcePool
{
Name
:
poolName
},
Devices
:
[]
resourcev1
.
Device
{
{
Name
:
"gpu-0"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid1
},
},
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
}
if
!
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be true"
)
}
want
:=
[]
string
{
uuid1
}
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
}
for
i
:=
range
want
{
if
got
[
i
]
!=
want
[
i
]
{
t
.
Errorf
(
"got[%d] = %q, want %q"
,
i
,
got
[
i
],
want
[
i
])
}
}
})
t
.
Run
(
"pod with unresolved resource claim returns nil"
,
func
(
t
*
testing
.
T
)
{
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"pod"
,
Namespace
:
"default"
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
"node-1"
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu"
,
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"pod"
,
"default"
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
})
t
.
Run
(
"pod with direct and template-backed claims resolves UUIDs from both"
,
func
(
t
*
testing
.
T
)
{
nodeName
:=
"node-1"
poolName
:=
"pool-node-1"
namespace
:=
"default"
podName
:=
"test-pod"
directClaimName
:=
"direct-gpu-claim"
generatedClaimName
:=
"generated-gpu-claim"
uuid1
:=
"GPU-dddddddd-1111-2222-3333-444444444444"
uuid2
:=
"GPU-eeeeeeee-5555-6666-7777-888888888888"
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
podName
,
Namespace
:
namespace
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
nodeName
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu-direct"
,
ResourceClaimName
:
ptr
(
directClaimName
),
},
{
Name
:
"gpu-template"
,
},
},
},
Status
:
corev1
.
PodStatus
{
ResourceClaimStatuses
:
[]
corev1
.
PodResourceClaimStatus
{
{
Name
:
"gpu-template"
,
ResourceClaimName
:
ptr
(
generatedClaimName
),
},
},
},
}
directClaim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
directClaimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-0"
,
Request
:
"gpu-direct"
},
},
},
},
},
}
generatedClaim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
generatedClaimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-1"
,
Request
:
"gpu-template"
},
},
},
},
},
}
slice
:=
&
resourcev1
.
ResourceSlice
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
poolName
+
"-gpu.nvidia.com-xxx"
},
Spec
:
resourcev1
.
ResourceSliceSpec
{
Driver
:
nvidiaGPUDRADriver
,
NodeName
:
&
nodeName
,
Pool
:
resourcev1
.
ResourcePool
{
Name
:
poolName
},
Devices
:
[]
resourcev1
.
Device
{
{
Name
:
"gpu-0"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid1
},
},
},
{
Name
:
"gpu-1"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid2
},
},
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
directClaim
,
generatedClaim
,
slice
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
if
err
!=
nil
{
if
err
!=
nil
{
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
}
}
if
!
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be true"
)
}
want
:=
[]
string
{
uuid1
,
uuid2
}
want
:=
[]
string
{
uuid1
,
uuid2
}
if
len
(
got
)
!=
len
(
want
)
{
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
...
@@ -167,10 +376,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
...
@@ -167,10 +376,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
Spec
:
corev1
.
PodSpec
{
NodeName
:
"node-1"
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
"node-1"
},
}
}
client
:=
fake
.
NewSimpleClientset
(
pod
)
client
:=
fake
.
NewSimpleClientset
(
pod
)
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"pod"
,
"default"
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"pod"
,
"default"
,
log
)
if
err
!=
nil
{
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
}
...
...
deploy/snapshot/internal/executor/checkpoint.go
View file @
2cc6d1e2
...
@@ -191,18 +191,19 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
...
@@ -191,18 +191,19 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
}
}
var
gpuUUIDs
[]
string
var
gpuUUIDs
[]
string
if
len
(
cudaHostPIDs
)
>
0
{
if
len
(
cudaHostPIDs
)
>
0
{
gpuUUIDs
,
err
=
cuda
.
GetPodGPUUUIDs
(
ctx
,
req
.
PodName
,
req
.
PodNamespace
,
req
.
ContainerName
)
gpuUUIDs
,
err
=
cuda
.
DiscoverGPUUUIDs
(
ctx
,
req
.
Clientset
,
req
.
PodName
,
req
.
PodNamespace
,
req
.
ContainerName
,
snapshotruntime
.
HostProcPath
,
pid
,
log
,
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to discover source GPU UUIDs: %w"
,
err
)
return
nil
,
fmt
.
Errorf
(
"failed to discover source GPU UUIDs: %w"
,
err
)
}
}
if
len
(
gpuUUIDs
)
==
0
{
log
.
Info
(
"PodResources API returned no GPU UUIDs, falling back to nvidia-smi"
,
"pid"
,
pid
)
gpuUUIDs
,
err
=
cuda
.
GetGPUUUIDsViaNvidiaSmi
(
ctx
,
snapshotruntime
.
HostProcPath
,
pid
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"nvidia-smi GPU UUID fallback failed: %w"
,
err
)
}
log
.
Info
(
"nvidia-smi fallback discovered GPU UUIDs"
,
"uuids"
,
gpuUUIDs
)
}
}
}
return
&
types
.
CheckpointContainerSnapshot
{
return
&
types
.
CheckpointContainerSnapshot
{
...
...
deploy/snapshot/internal/executor/restore.go
View file @
2cc6d1e2
...
@@ -142,18 +142,19 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
...
@@ -142,18 +142,19 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
if
len
(
m
.
CUDA
.
SourceGPUUUIDs
)
==
0
{
if
len
(
m
.
CUDA
.
SourceGPUUUIDs
)
==
0
{
return
nil
,
fmt
.
Errorf
(
"missing source GPU UUIDs in checkpoint manifest"
)
return
nil
,
fmt
.
Errorf
(
"missing source GPU UUIDs in checkpoint manifest"
)
}
}
targetGPUUUIDs
,
err
:=
cuda
.
GetPodGPUUUIDs
(
ctx
,
req
.
PodName
,
req
.
PodNamespace
,
containerName
)
targetGPUUUIDs
,
err
:=
cuda
.
DiscoverGPUUUIDs
(
ctx
,
req
.
Clientset
,
req
.
PodName
,
req
.
PodNamespace
,
containerName
,
snapshotruntime
.
HostProcPath
,
placeholderPID
,
log
,
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to get target GPU UUIDs: %w"
,
err
)
return
nil
,
fmt
.
Errorf
(
"failed to get target GPU UUIDs: %w"
,
err
)
}
}
if
len
(
targetGPUUUIDs
)
==
0
{
log
.
Info
(
"PodResources API returned no target GPU UUIDs, falling back to nvidia-smi"
,
"pid"
,
placeholderPID
)
targetGPUUUIDs
,
err
=
cuda
.
GetGPUUUIDsViaNvidiaSmi
(
ctx
,
snapshotruntime
.
HostProcPath
,
placeholderPID
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"nvidia-smi GPU UUID fallback failed for restore target: %w"
,
err
)
}
log
.
Info
(
"nvidia-smi fallback discovered target GPU UUIDs"
,
"uuids"
,
targetGPUUUIDs
)
}
if
len
(
targetGPUUUIDs
)
==
0
{
if
len
(
targetGPUUUIDs
)
==
0
{
return
nil
,
fmt
.
Errorf
(
"missing target GPU UUIDs for %s/%s container %s"
,
req
.
PodNamespace
,
req
.
PodName
,
containerName
)
return
nil
,
fmt
.
Errorf
(
"missing target GPU UUIDs for %s/%s container %s"
,
req
.
PodNamespace
,
req
.
PodName
,
containerName
)
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment