Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
2cc6d1e2
"...ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "a207b4be80c285795374f3f47584fdc0dbc70fd2"
Unverified
Commit
2cc6d1e2
authored
Apr 17, 2026
by
Schwinn Saereesitthipitak
Committed by
GitHub
Apr 17, 2026
Browse files
fix(snapshot): resolve DRA GPU UUIDs from claims (#8292)
parent
8428c65f
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
570 additions
and
112 deletions
+570
-112
deploy/snapshot/internal/cuda/cuda.go
deploy/snapshot/internal/cuda/cuda.go
+40
-1
deploy/snapshot/internal/cuda/cuda_test.go
deploy/snapshot/internal/cuda/cuda_test.go
+232
-54
deploy/snapshot/internal/cuda/dra.go
deploy/snapshot/internal/cuda/dra.go
+61
-34
deploy/snapshot/internal/cuda/dra_test.go
deploy/snapshot/internal/cuda/dra_test.go
+217
-5
deploy/snapshot/internal/executor/checkpoint.go
deploy/snapshot/internal/executor/checkpoint.go
+10
-9
deploy/snapshot/internal/executor/restore.go
deploy/snapshot/internal/executor/restore.go
+10
-9
No files found.
deploy/snapshot/internal/cuda/cuda.go
View file @
2cc6d1e2
...
@@ -13,7 +13,7 @@ import (
...
@@ -13,7 +13,7 @@ import (
"github.com/go-logr/logr"
"github.com/go-logr/logr"
"google.golang.org/grpc"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/client-go/kubernetes"
podresourcesv1
"k8s.io/kubelet/pkg/apis/podresources/v1"
podresourcesv1
"k8s.io/kubelet/pkg/apis/podresources/v1"
)
)
...
@@ -104,6 +104,45 @@ func GetGPUUUIDsViaNvidiaSmi(ctx context.Context, hostProcPath string, pid int)
...
@@ -104,6 +104,45 @@ func GetGPUUUIDsViaNvidiaSmi(ctx context.Context, hostProcPath string, pid int)
return
uuids
,
nil
return
uuids
,
nil
}
}
// DiscoverGPUUUIDs resolves GPU UUIDs according to the pod's allocation mode:
// DRA-backed pods use the DRA API, classic nvidia.com/gpu pods use PodResources,
// and nvidia-smi remains the last fallback for either path.
func
DiscoverGPUUUIDs
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
,
containerName
,
hostProcPath
string
,
pid
int
,
log
logr
.
Logger
)
([]
string
,
error
)
{
gpuUUIDs
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
clientset
,
podName
,
podNamespace
,
log
)
fallbackReason
:=
"DRA API returned no GPU UUIDs"
if
err
!=
nil
{
log
.
Error
(
err
,
"DRA API GPU UUID lookup failed, trying other discovery paths"
,
"pod"
,
podNamespace
+
"/"
+
podName
,
"has_nvidia_dra_allocation"
,
hasNVIDIADRAAllocation
,
)
gpuUUIDs
=
nil
fallbackReason
=
"DRA API GPU UUID lookup failed"
}
if
len
(
gpuUUIDs
)
>
0
{
return
gpuUUIDs
,
nil
}
if
!
hasNVIDIADRAAllocation
{
gpuUUIDs
,
err
=
GetPodGPUUUIDs
(
ctx
,
podName
,
podNamespace
,
containerName
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"PodResources GPU UUID lookup failed: %w"
,
err
)
}
if
len
(
gpuUUIDs
)
>
0
{
return
gpuUUIDs
,
nil
}
fallbackReason
=
"PodResources API returned no GPU UUIDs"
}
log
.
Info
(
fallbackReason
+
", falling back to nvidia-smi"
,
"pid"
,
pid
)
gpuUUIDs
,
err
=
GetGPUUUIDsViaNvidiaSmi
(
ctx
,
hostProcPath
,
pid
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"nvidia-smi GPU UUID fallback failed: %w"
,
err
)
}
log
.
Info
(
"nvidia-smi fallback discovered GPU UUIDs"
,
"uuids"
,
gpuUUIDs
)
return
gpuUUIDs
,
nil
}
// FilterProcesses returns the subset of candidate PIDs that hold actual CUDA contexts.
// FilterProcesses returns the subset of candidate PIDs that hold actual CUDA contexts.
// Uses --get-restore-tid (the same technique as the CRIU CUDA plugin) instead of
// Uses --get-restore-tid (the same technique as the CRIU CUDA plugin) instead of
// --get-state, because --get-state incorrectly matches coordinator processes like
// --get-state, because --get-state incorrectly matches coordinator processes like
...
...
deploy/snapshot/internal/cuda/cuda_test.go
View file @
2cc6d1e2
...
@@ -13,7 +13,10 @@ import (
...
@@ -13,7 +13,10 @@ import (
"google.golang.org/grpc"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/grpc/status"
corev1
"k8s.io/api/core/v1"
resourcev1
"k8s.io/api/resource/v1"
metav1
"k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
podresourcesv1
"k8s.io/kubelet/pkg/apis/podresources/v1"
podresourcesv1
"k8s.io/kubelet/pkg/apis/podresources/v1"
)
)
...
@@ -93,7 +96,7 @@ func (s *testPodResourcesServer) Get(context.Context, *podresourcesv1.GetPodReso
...
@@ -93,7 +96,7 @@ func (s *testPodResourcesServer) Get(context.Context, *podresourcesv1.GetPodReso
return
nil
,
status
.
Error
(
codes
.
Unimplemented
,
"not implemented in test"
)
return
nil
,
status
.
Error
(
codes
.
Unimplemented
,
"not implemented in test"
)
}
}
func
TestGetPodGPUUUIDs
(
t
*
testing
.
T
)
{
func
installTestPodResourcesServer
(
t
*
testing
.
T
,
resp
*
podresourcesv1
.
ListPodResourcesResponse
)
{
socketDir
:=
t
.
TempDir
()
socketDir
:=
t
.
TempDir
()
socketPath
:=
filepath
.
Join
(
socketDir
,
"kubelet.sock"
)
socketPath
:=
filepath
.
Join
(
socketDir
,
"kubelet.sock"
)
...
@@ -101,61 +104,10 @@ func TestGetPodGPUUUIDs(t *testing.T) {
...
@@ -101,61 +104,10 @@ func TestGetPodGPUUUIDs(t *testing.T) {
if
err
!=
nil
{
if
err
!=
nil
{
t
.
Fatalf
(
"listen unix socket: %v"
,
err
)
t
.
Fatalf
(
"listen unix socket: %v"
,
err
)
}
}
defer
listener
.
Close
()
server
:=
grpc
.
NewServer
()
server
:=
grpc
.
NewServer
()
podresourcesv1
.
RegisterPodResourcesListerServer
(
server
,
&
testPodResourcesServer
{
podresourcesv1
.
RegisterPodResourcesListerServer
(
server
,
&
testPodResourcesServer
{
resp
:
&
podresourcesv1
.
ListPodResourcesResponse
{
resp
:
resp
,
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"other-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-ignore"
},
},
},
},
},
},
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"sidecar"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-sidecar"
},
},
},
},
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
,
"GPU-b"
},
},
{
ResourceName
:
"example.com/fpga"
,
DeviceIds
:
[]
string
{
"FPGA-ignore"
},
},
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-c"
},
},
},
},
},
},
},
},
})
})
go
func
()
{
go
func
()
{
...
@@ -167,12 +119,69 @@ func TestGetPodGPUUUIDs(t *testing.T) {
...
@@ -167,12 +119,69 @@ func TestGetPodGPUUUIDs(t *testing.T) {
}
}
}()
}()
t
.
Cleanup
(
server
.
Stop
)
t
.
Cleanup
(
server
.
Stop
)
t
.
Cleanup
(
func
()
{
_
=
listener
.
Close
()
})
previousSocketPath
:=
podResourcesSocketPath
previousSocketPath
:=
podResourcesSocketPath
podResourcesSocketPath
=
socketPath
podResourcesSocketPath
=
socketPath
t
.
Cleanup
(
func
()
{
t
.
Cleanup
(
func
()
{
podResourcesSocketPath
=
previousSocketPath
podResourcesSocketPath
=
previousSocketPath
})
})
}
func
TestGetPodGPUUUIDs
(
t
*
testing
.
T
)
{
installTestPodResourcesServer
(
t
,
&
podresourcesv1
.
ListPodResourcesResponse
{
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"other-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-ignore"
},
},
},
},
},
},
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"sidecar"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-sidecar"
},
},
},
},
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
,
"GPU-b"
},
},
{
ResourceName
:
"example.com/fpga"
,
DeviceIds
:
[]
string
{
"FPGA-ignore"
},
},
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-c"
},
},
},
},
},
},
},
})
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
defer
cancel
()
...
@@ -192,3 +201,172 @@ func TestGetPodGPUUUIDs(t *testing.T) {
...
@@ -192,3 +201,172 @@ func TestGetPodGPUUUIDs(t *testing.T) {
}
}
}
}
}
}
func
TestDiscoverGPUUUIDsUsesPodResourcesForClassicPod
(
t
*
testing
.
T
)
{
installTestPodResourcesServer
(
t
,
&
podresourcesv1
.
ListPodResourcesResponse
{
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
,
"GPU-b"
},
},
},
},
},
},
},
})
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
got
,
err
:=
DiscoverGPUUUIDs
(
ctx
,
nil
,
"test-pod"
,
"default"
,
"main"
,
"/proc"
,
123
,
logr
.
Discard
(),
)
if
err
!=
nil
{
t
.
Fatalf
(
"DiscoverGPUUUIDs: %v"
,
err
)
}
want
:=
[]
string
{
"GPU-a"
,
"GPU-b"
}
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v, want %v"
,
got
,
want
)
}
for
i
:=
range
want
{
if
got
[
i
]
!=
want
[
i
]
{
t
.
Fatalf
(
"got %v, want %v"
,
got
,
want
)
}
}
}
func
TestDiscoverGPUUUIDsFallsBackToPodResourcesAfterDRAAPILookupError
(
t
*
testing
.
T
)
{
installTestPodResourcesServer
(
t
,
&
podresourcesv1
.
ListPodResourcesResponse
{
PodResources
:
[]
*
podresourcesv1
.
PodResources
{
{
Name
:
"test-pod"
,
Namespace
:
"default"
,
Containers
:
[]
*
podresourcesv1
.
ContainerResources
{
{
Name
:
"main"
,
Devices
:
[]
*
podresourcesv1
.
ContainerDevices
{
{
ResourceName
:
nvidiaGPUResource
,
DeviceIds
:
[]
string
{
"GPU-a"
},
},
},
},
},
},
},
})
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
got
,
err
:=
DiscoverGPUUUIDs
(
ctx
,
fake
.
NewSimpleClientset
(),
"test-pod"
,
"default"
,
"main"
,
"/proc"
,
123
,
logr
.
Discard
(),
)
if
err
!=
nil
{
t
.
Fatalf
(
"DiscoverGPUUUIDs: %v"
,
err
)
}
if
len
(
got
)
!=
1
||
got
[
0
]
!=
"GPU-a"
{
t
.
Fatalf
(
"got %v, want [GPU-a]"
,
got
)
}
}
func
TestDiscoverGPUUUIDsPrefersDRAForDRAPod
(
t
*
testing
.
T
)
{
previousSocketPath
:=
podResourcesSocketPath
podResourcesSocketPath
=
filepath
.
Join
(
t
.
TempDir
(),
"missing-kubelet.sock"
)
t
.
Cleanup
(
func
()
{
podResourcesSocketPath
=
previousSocketPath
})
nodeName
:=
"node-1"
poolName
:=
"pool-node-1"
namespace
:=
"default"
podName
:=
"test-pod"
claimName
:=
"gpu-claim"
uuid
:=
"GPU-ffffffff-1111-2222-3333-444444444444"
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
podName
,
Namespace
:
namespace
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
nodeName
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu"
,
ResourceClaimName
:
&
claimName
,
},
},
},
}
claim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
claimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-0"
,
Request
:
"gpu"
},
},
},
},
},
}
slice
:=
&
resourcev1
.
ResourceSlice
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
poolName
+
"-gpu.nvidia.com-xxx"
},
Spec
:
resourcev1
.
ResourceSliceSpec
{
Driver
:
nvidiaGPUDRADriver
,
NodeName
:
&
nodeName
,
Pool
:
resourcev1
.
ResourcePool
{
Name
:
poolName
},
Devices
:
[]
resourcev1
.
Device
{
{
Name
:
"gpu-0"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid
},
},
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
ctx
,
cancel
:=
context
.
WithTimeout
(
context
.
Background
(),
5
*
time
.
Second
)
defer
cancel
()
got
,
err
:=
DiscoverGPUUUIDs
(
ctx
,
client
,
podName
,
namespace
,
"main"
,
"/proc"
,
123
,
logr
.
Discard
(),
)
if
err
!=
nil
{
t
.
Fatalf
(
"DiscoverGPUUUIDs: %v"
,
err
)
}
if
len
(
got
)
!=
1
||
got
[
0
]
!=
uuid
{
t
.
Fatalf
(
"got %v, want [%s]"
,
got
,
uuid
)
}
}
deploy/snapshot/internal/cuda/dra.go
View file @
2cc6d1e2
...
@@ -14,66 +14,93 @@ const (
...
@@ -14,66 +14,93 @@ const (
resourceAttributeUUID
=
"uuid"
resourceAttributeUUID
=
"uuid"
)
)
// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
type
allocatedDRADevice
struct
{
// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
pool
string
// Returns nil without error if the pod has no DRA claims or the driver is not gpu.nvidia.com.
device
string
func
GetGPUUUIDsViaDRAAPI
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
string
,
log
logr
.
Logger
)
([]
string
,
error
)
{
}
func
getAllocatedNVIDIADRADevices
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
string
,
log
logr
.
Logger
)
([]
allocatedDRADevice
,
string
,
bool
,
error
)
{
if
clientset
==
nil
{
if
clientset
==
nil
{
return
nil
,
nil
return
nil
,
""
,
false
,
nil
}
}
if
podName
==
""
||
podNamespace
==
""
{
if
podName
==
""
||
podNamespace
==
""
{
return
nil
,
nil
return
nil
,
""
,
false
,
nil
}
}
pod
,
err
:=
clientset
.
CoreV1
()
.
Pods
(
podNamespace
)
.
Get
(
ctx
,
podName
,
metav1
.
GetOptions
{})
pod
,
err
:=
clientset
.
CoreV1
()
.
Pods
(
podNamespace
)
.
Get
(
ctx
,
podName
,
metav1
.
GetOptions
{})
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"get pod %s/%s: %w"
,
podNamespace
,
podName
,
err
)
return
nil
,
""
,
false
,
fmt
.
Errorf
(
"get pod %s/%s: %w"
,
podNamespace
,
podName
,
err
)
}
}
if
len
(
pod
.
Spec
.
ResourceClaims
)
==
0
{
if
len
(
pod
.
Spec
.
ResourceClaims
)
==
0
{
return
nil
,
nil
return
nil
,
pod
.
Spec
.
NodeName
,
false
,
nil
}
}
nodeName
:=
pod
.
Spec
.
NodeName
if
pod
.
Spec
.
NodeName
==
""
{
if
nodeName
==
""
{
log
.
V
(
1
)
.
Info
(
"pod has no node name, skipping DRA API lookup"
)
log
.
V
(
1
)
.
Info
(
"pod has no node name, skipping DRA API lookup"
)
return
nil
,
nil
return
nil
,
""
,
false
,
nil
}
}
var
allocated
[]
struct
{
claimNamesByPodRef
:=
make
(
map
[
string
]
string
,
len
(
pod
.
Spec
.
ResourceClaims
))
driver
string
for
_
,
ref
:=
range
pod
.
Spec
.
ResourceClaims
{
pool
string
if
ref
.
ResourceClaimName
!=
nil
&&
*
ref
.
ResourceClaimName
!=
""
{
device
string
claimNamesByPodRef
[
ref
.
Name
]
=
*
ref
.
ResourceClaimName
}
}
for
_
,
status
:=
range
pod
.
Status
.
ResourceClaimStatuses
{
if
status
.
ResourceClaimName
==
nil
||
*
status
.
ResourceClaimName
==
""
{
continue
}
if
_
,
exists
:=
claimNamesByPodRef
[
status
.
Name
];
!
exists
{
claimNamesByPodRef
[
status
.
Name
]
=
*
status
.
ResourceClaimName
}
}
}
var
allocated
[]
allocatedDRADevice
hasNVIDIADRAAllocation
:=
false
for
_
,
ref
:=
range
pod
.
Spec
.
ResourceClaims
{
for
_
,
ref
:=
range
pod
.
Spec
.
ResourceClaims
{
if
ref
.
ResourceClaimName
==
nil
||
*
ref
.
ResourceClaimName
==
""
{
claimName
:=
claimNamesByPodRef
[
ref
.
Name
]
if
claimName
==
""
{
log
.
V
(
1
)
.
Info
(
"pod resource claim has no resolved claim name"
,
"pod_claim"
,
ref
.
Name
)
continue
continue
}
}
claimName
:=
*
ref
.
ResourceClaimName
claim
,
err
:=
clientset
.
ResourceV1
()
.
ResourceClaims
(
podNamespace
)
.
Get
(
ctx
,
claimName
,
metav1
.
GetOptions
{})
claim
,
err
:=
clientset
.
ResourceV1
()
.
ResourceClaims
(
podNamespace
)
.
Get
(
ctx
,
claimName
,
metav1
.
GetOptions
{})
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"get resource claim %s/%s: %w"
,
podNamespace
,
claimName
,
err
)
return
nil
,
pod
.
Spec
.
NodeName
,
hasNVIDIADRAAllocation
,
fmt
.
Errorf
(
"get resource claim %s/%s: %w"
,
podNamespace
,
claimName
,
err
)
}
}
if
claim
.
Status
.
Allocation
==
nil
||
len
(
claim
.
Status
.
Allocation
.
Devices
.
Results
)
==
0
{
if
claim
.
Status
.
Allocation
==
nil
||
len
(
claim
.
Status
.
Allocation
.
Devices
.
Results
)
==
0
{
continue
continue
}
}
for
_
,
r
:=
range
claim
.
Status
.
Allocation
.
Devices
.
Results
{
for
_
,
result
:=
range
claim
.
Status
.
Allocation
.
Devices
.
Results
{
if
r
.
Driver
==
nvidiaGPUDRADriver
{
if
result
.
Driver
!=
nvidiaGPUDRADriver
{
allocated
=
append
(
allocated
,
struct
{
continue
driver
string
pool
string
device
string
}{
r
.
Driver
,
r
.
Pool
,
r
.
Device
})
}
}
hasNVIDIADRAAllocation
=
true
allocated
=
append
(
allocated
,
allocatedDRADevice
{
pool
:
result
.
Pool
,
device
:
result
.
Device
,
})
}
}
}
}
if
len
(
allocated
)
==
0
{
return
nil
,
nil
return
allocated
,
pod
.
Spec
.
NodeName
,
hasNVIDIADRAAllocation
,
nil
}
// GetGPUUUIDsViaDRAAPI resolves GPU UUIDs for a pod by querying the Kubernetes API:
// Pod (resource claim refs) -> ResourceClaim (allocation results) -> ResourceSlice (device attributes).
// It also reports whether the pod is using NVIDIA DRA GPU allocations at all.
func
GetGPUUUIDsViaDRAAPI
(
ctx
context
.
Context
,
clientset
kubernetes
.
Interface
,
podName
,
podNamespace
string
,
log
logr
.
Logger
)
([]
string
,
bool
,
error
)
{
allocated
,
nodeName
,
hasNVIDIADRAAllocation
,
err
:=
getAllocatedNVIDIADRADevices
(
ctx
,
clientset
,
podName
,
podNamespace
,
log
)
if
err
!=
nil
{
return
nil
,
hasNVIDIADRAAllocation
,
err
}
if
!
hasNVIDIADRAAllocation
||
len
(
allocated
)
==
0
{
return
nil
,
hasNVIDIADRAAllocation
,
nil
}
}
slices
,
err
:=
clientset
.
ResourceV1
()
.
ResourceSlices
()
.
List
(
ctx
,
metav1
.
ListOptions
{
slices
,
err
:=
clientset
.
ResourceV1
()
.
ResourceSlices
()
.
List
(
ctx
,
metav1
.
ListOptions
{
FieldSelector
:
fmt
.
Sprintf
(
"spec.driver=%s,spec.nodeName=%s"
,
nvidiaGPUDRADriver
,
nodeName
),
FieldSelector
:
fmt
.
Sprintf
(
"spec.driver=%s,spec.nodeName=%s"
,
nvidiaGPUDRADriver
,
nodeName
),
})
})
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"list resource slices for node %s: %w"
,
nodeName
,
err
)
return
nil
,
true
,
fmt
.
Errorf
(
"list resource slices for node %s: %w"
,
nodeName
,
err
)
}
}
poolDeviceToUUID
:=
make
(
map
[
string
]
map
[
string
]
string
)
poolDeviceToUUID
:=
make
(
map
[
string
]
map
[
string
]
string
)
...
@@ -92,15 +119,15 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
...
@@ -92,15 +119,15 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
}
}
var
uuids
[]
string
var
uuids
[]
string
for
_
,
a
:=
range
allocated
{
for
_
,
device
:=
range
allocated
{
devMap
:=
poolDeviceToUUID
[
a
.
pool
]
devMap
:=
poolDeviceToUUID
[
device
.
pool
]
if
devMap
==
nil
{
if
devMap
==
nil
{
log
.
V
(
1
)
.
Info
(
"no ResourceSlice found for pool"
,
"pool"
,
a
.
pool
,
"device"
,
a
.
device
)
log
.
V
(
1
)
.
Info
(
"no ResourceSlice found for pool"
,
"pool"
,
device
.
pool
,
"device"
,
device
.
device
)
continue
continue
}
}
uuid
,
ok
:=
devMap
[
a
.
device
]
uuid
,
ok
:=
devMap
[
device
.
device
]
if
!
ok
||
uuid
==
""
{
if
!
ok
||
uuid
==
""
{
log
.
V
(
1
)
.
Info
(
"device has no UUID in ResourceSlice"
,
"pool"
,
a
.
pool
,
"device"
,
a
.
device
)
log
.
V
(
1
)
.
Info
(
"device has no UUID in ResourceSlice"
,
"pool"
,
device
.
pool
,
"device"
,
device
.
device
)
continue
continue
}
}
uuids
=
append
(
uuids
,
uuid
)
uuids
=
append
(
uuids
,
uuid
)
...
@@ -108,7 +135,7 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
...
@@ -108,7 +135,7 @@ func GetGPUUUIDsViaDRAAPI(ctx context.Context, clientset kubernetes.Interface, p
if
len
(
uuids
)
>
0
{
if
len
(
uuids
)
>
0
{
log
.
Info
(
"resolved GPU UUIDs via DRA API"
,
"uuids"
,
uuids
)
log
.
Info
(
"resolved GPU UUIDs via DRA API"
,
"uuids"
,
uuids
)
}
}
return
uuids
,
nil
return
uuids
,
true
,
nil
}
}
func
deviceUUIDFromAttributes
(
attrs
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
)
string
{
func
deviceUUIDFromAttributes
(
attrs
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
)
string
{
...
...
deploy/snapshot/internal/cuda/dra_test.go
View file @
2cc6d1e2
...
@@ -60,10 +60,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
...
@@ -60,10 +60,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
log
:=
logr
.
Discard
()
log
:=
logr
.
Discard
()
t
.
Run
(
"nil clientset returns nil without error"
,
func
(
t
*
testing
.
T
)
{
t
.
Run
(
"nil clientset returns nil without error"
,
func
(
t
*
testing
.
T
)
{
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
nil
,
"pod"
,
"ns"
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
nil
,
"pod"
,
"ns"
,
log
)
if
err
!=
nil
{
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
}
...
@@ -71,10 +74,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
...
@@ -71,10 +74,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
t
.
Run
(
"empty pod name returns nil"
,
func
(
t
*
testing
.
T
)
{
t
.
Run
(
"empty pod name returns nil"
,
func
(
t
*
testing
.
T
)
{
client
:=
fake
.
NewSimpleClientset
()
client
:=
fake
.
NewSimpleClientset
()
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
""
,
"ns"
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
""
,
"ns"
,
log
)
if
err
!=
nil
{
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
}
...
@@ -82,7 +88,7 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
...
@@ -82,7 +88,7 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
t
.
Run
(
"pod not found returns error"
,
func
(
t
*
testing
.
T
)
{
t
.
Run
(
"pod not found returns error"
,
func
(
t
*
testing
.
T
)
{
client
:=
fake
.
NewSimpleClientset
()
client
:=
fake
.
NewSimpleClientset
()
_
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"missing"
,
"default"
,
log
)
_
,
_
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"missing"
,
"default"
,
log
)
if
err
==
nil
{
if
err
==
nil
{
t
.
Fatal
(
"expected error when pod not found"
)
t
.
Fatal
(
"expected error when pod not found"
)
}
}
...
@@ -146,10 +152,213 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
...
@@ -146,10 +152,213 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
}
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
}
if
!
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be true"
)
}
want
:=
[]
string
{
uuid1
,
uuid2
}
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
}
for
i
:=
range
want
{
if
got
[
i
]
!=
want
[
i
]
{
t
.
Errorf
(
"got[%d] = %q, want %q"
,
i
,
got
[
i
],
want
[
i
])
}
}
})
t
.
Run
(
"pod with template-backed DRA claims resolves UUIDs via pod status"
,
func
(
t
*
testing
.
T
)
{
nodeName
:=
"node-1"
poolName
:=
"pool-node-1"
namespace
:=
"default"
podName
:=
"test-pod"
generatedClaimName
:=
"generated-gpu-claim"
uuid1
:=
"GPU-cccccccc-1111-2222-3333-444444444444"
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
podName
,
Namespace
:
namespace
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
nodeName
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu"
,
},
},
},
Status
:
corev1
.
PodStatus
{
ResourceClaimStatuses
:
[]
corev1
.
PodResourceClaimStatus
{
{
Name
:
"gpu"
,
ResourceClaimName
:
ptr
(
generatedClaimName
),
},
},
},
}
claim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
generatedClaimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-0"
,
Request
:
"gpu"
},
},
},
},
},
}
slice
:=
&
resourcev1
.
ResourceSlice
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
poolName
+
"-gpu.nvidia.com-xxx"
},
Spec
:
resourcev1
.
ResourceSliceSpec
{
Driver
:
nvidiaGPUDRADriver
,
NodeName
:
&
nodeName
,
Pool
:
resourcev1
.
ResourcePool
{
Name
:
poolName
},
Devices
:
[]
resourcev1
.
Device
{
{
Name
:
"gpu-0"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid1
},
},
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
claim
,
slice
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
}
if
!
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be true"
)
}
want
:=
[]
string
{
uuid1
}
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
}
for
i
:=
range
want
{
if
got
[
i
]
!=
want
[
i
]
{
t
.
Errorf
(
"got[%d] = %q, want %q"
,
i
,
got
[
i
],
want
[
i
])
}
}
})
t
.
Run
(
"pod with unresolved resource claim returns nil"
,
func
(
t
*
testing
.
T
)
{
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
"pod"
,
Namespace
:
"default"
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
"node-1"
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu"
,
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"pod"
,
"default"
,
log
)
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
})
t
.
Run
(
"pod with direct and template-backed claims resolves UUIDs from both"
,
func
(
t
*
testing
.
T
)
{
nodeName
:=
"node-1"
poolName
:=
"pool-node-1"
namespace
:=
"default"
podName
:=
"test-pod"
directClaimName
:=
"direct-gpu-claim"
generatedClaimName
:=
"generated-gpu-claim"
uuid1
:=
"GPU-dddddddd-1111-2222-3333-444444444444"
uuid2
:=
"GPU-eeeeeeee-5555-6666-7777-888888888888"
pod
:=
&
corev1
.
Pod
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
podName
,
Namespace
:
namespace
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
nodeName
,
ResourceClaims
:
[]
corev1
.
PodResourceClaim
{
{
Name
:
"gpu-direct"
,
ResourceClaimName
:
ptr
(
directClaimName
),
},
{
Name
:
"gpu-template"
,
},
},
},
Status
:
corev1
.
PodStatus
{
ResourceClaimStatuses
:
[]
corev1
.
PodResourceClaimStatus
{
{
Name
:
"gpu-template"
,
ResourceClaimName
:
ptr
(
generatedClaimName
),
},
},
},
}
directClaim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
directClaimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-0"
,
Request
:
"gpu-direct"
},
},
},
},
},
}
generatedClaim
:=
&
resourcev1
.
ResourceClaim
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
generatedClaimName
,
Namespace
:
namespace
},
Status
:
resourcev1
.
ResourceClaimStatus
{
Allocation
:
&
resourcev1
.
AllocationResult
{
Devices
:
resourcev1
.
DeviceAllocationResult
{
Results
:
[]
resourcev1
.
DeviceRequestAllocationResult
{
{
Driver
:
nvidiaGPUDRADriver
,
Pool
:
poolName
,
Device
:
"gpu-1"
,
Request
:
"gpu-template"
},
},
},
},
},
}
slice
:=
&
resourcev1
.
ResourceSlice
{
ObjectMeta
:
metav1
.
ObjectMeta
{
Name
:
poolName
+
"-gpu.nvidia.com-xxx"
},
Spec
:
resourcev1
.
ResourceSliceSpec
{
Driver
:
nvidiaGPUDRADriver
,
NodeName
:
&
nodeName
,
Pool
:
resourcev1
.
ResourcePool
{
Name
:
poolName
},
Devices
:
[]
resourcev1
.
Device
{
{
Name
:
"gpu-0"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid1
},
},
},
{
Name
:
"gpu-1"
,
Attributes
:
map
[
resourcev1
.
QualifiedName
]
resourcev1
.
DeviceAttribute
{
resourcev1
.
QualifiedName
(
"uuid"
)
:
{
StringValue
:
&
uuid2
},
},
},
},
},
}
client
:=
fake
.
NewSimpleClientset
(
pod
,
directClaim
,
generatedClaim
,
slice
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
podName
,
namespace
,
log
)
if
err
!=
nil
{
if
err
!=
nil
{
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
t
.
Fatalf
(
"GetGPUUUIDsViaDRAAPI: %v"
,
err
)
}
}
if
!
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be true"
)
}
want
:=
[]
string
{
uuid1
,
uuid2
}
want
:=
[]
string
{
uuid1
,
uuid2
}
if
len
(
got
)
!=
len
(
want
)
{
if
len
(
got
)
!=
len
(
want
)
{
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
t
.
Fatalf
(
"got %v (len %d), want %v (len %d)"
,
got
,
len
(
got
),
want
,
len
(
want
))
...
@@ -167,10 +376,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
...
@@ -167,10 +376,13 @@ func TestGetGPUUUIDsViaDRAAPI(t *testing.T) {
Spec
:
corev1
.
PodSpec
{
NodeName
:
"node-1"
},
Spec
:
corev1
.
PodSpec
{
NodeName
:
"node-1"
},
}
}
client
:=
fake
.
NewSimpleClientset
(
pod
)
client
:=
fake
.
NewSimpleClientset
(
pod
)
got
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"pod"
,
"default"
,
log
)
got
,
hasNVIDIADRAAllocation
,
err
:=
GetGPUUUIDsViaDRAAPI
(
ctx
,
client
,
"pod"
,
"default"
,
log
)
if
err
!=
nil
{
if
err
!=
nil
{
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
t
.
Fatalf
(
"unexpected error: %v"
,
err
)
}
}
if
hasNVIDIADRAAllocation
{
t
.
Fatal
(
"expected hasNVIDIADRAAllocation to be false"
)
}
if
got
!=
nil
{
if
got
!=
nil
{
t
.
Errorf
(
"got %v, want nil"
,
got
)
t
.
Errorf
(
"got %v, want nil"
,
got
)
}
}
...
...
deploy/snapshot/internal/executor/checkpoint.go
View file @
2cc6d1e2
...
@@ -191,18 +191,19 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
...
@@ -191,18 +191,19 @@ func inspectContainer(ctx context.Context, ctrd *containerd.Client, log logr.Log
}
}
var
gpuUUIDs
[]
string
var
gpuUUIDs
[]
string
if
len
(
cudaHostPIDs
)
>
0
{
if
len
(
cudaHostPIDs
)
>
0
{
gpuUUIDs
,
err
=
cuda
.
GetPodGPUUUIDs
(
ctx
,
req
.
PodName
,
req
.
PodNamespace
,
req
.
ContainerName
)
gpuUUIDs
,
err
=
cuda
.
DiscoverGPUUUIDs
(
ctx
,
req
.
Clientset
,
req
.
PodName
,
req
.
PodNamespace
,
req
.
ContainerName
,
snapshotruntime
.
HostProcPath
,
pid
,
log
,
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to discover source GPU UUIDs: %w"
,
err
)
return
nil
,
fmt
.
Errorf
(
"failed to discover source GPU UUIDs: %w"
,
err
)
}
}
if
len
(
gpuUUIDs
)
==
0
{
log
.
Info
(
"PodResources API returned no GPU UUIDs, falling back to nvidia-smi"
,
"pid"
,
pid
)
gpuUUIDs
,
err
=
cuda
.
GetGPUUUIDsViaNvidiaSmi
(
ctx
,
snapshotruntime
.
HostProcPath
,
pid
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"nvidia-smi GPU UUID fallback failed: %w"
,
err
)
}
log
.
Info
(
"nvidia-smi fallback discovered GPU UUIDs"
,
"uuids"
,
gpuUUIDs
)
}
}
}
return
&
types
.
CheckpointContainerSnapshot
{
return
&
types
.
CheckpointContainerSnapshot
{
...
...
deploy/snapshot/internal/executor/restore.go
View file @
2cc6d1e2
...
@@ -142,18 +142,19 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
...
@@ -142,18 +142,19 @@ func inspectRestore(ctx context.Context, ctrd *containerd.Client, log logr.Logge
if
len
(
m
.
CUDA
.
SourceGPUUUIDs
)
==
0
{
if
len
(
m
.
CUDA
.
SourceGPUUUIDs
)
==
0
{
return
nil
,
fmt
.
Errorf
(
"missing source GPU UUIDs in checkpoint manifest"
)
return
nil
,
fmt
.
Errorf
(
"missing source GPU UUIDs in checkpoint manifest"
)
}
}
targetGPUUUIDs
,
err
:=
cuda
.
GetPodGPUUUIDs
(
ctx
,
req
.
PodName
,
req
.
PodNamespace
,
containerName
)
targetGPUUUIDs
,
err
:=
cuda
.
DiscoverGPUUUIDs
(
ctx
,
req
.
Clientset
,
req
.
PodName
,
req
.
PodNamespace
,
containerName
,
snapshotruntime
.
HostProcPath
,
placeholderPID
,
log
,
)
if
err
!=
nil
{
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"failed to get target GPU UUIDs: %w"
,
err
)
return
nil
,
fmt
.
Errorf
(
"failed to get target GPU UUIDs: %w"
,
err
)
}
}
if
len
(
targetGPUUUIDs
)
==
0
{
log
.
Info
(
"PodResources API returned no target GPU UUIDs, falling back to nvidia-smi"
,
"pid"
,
placeholderPID
)
targetGPUUUIDs
,
err
=
cuda
.
GetGPUUUIDsViaNvidiaSmi
(
ctx
,
snapshotruntime
.
HostProcPath
,
placeholderPID
)
if
err
!=
nil
{
return
nil
,
fmt
.
Errorf
(
"nvidia-smi GPU UUID fallback failed for restore target: %w"
,
err
)
}
log
.
Info
(
"nvidia-smi fallback discovered target GPU UUIDs"
,
"uuids"
,
targetGPUUUIDs
)
}
if
len
(
targetGPUUUIDs
)
==
0
{
if
len
(
targetGPUUUIDs
)
==
0
{
return
nil
,
fmt
.
Errorf
(
"missing target GPU UUIDs for %s/%s container %s"
,
req
.
PodNamespace
,
req
.
PodName
,
containerName
)
return
nil
,
fmt
.
Errorf
(
"missing target GPU UUIDs for %s/%s container %s"
,
req
.
PodNamespace
,
req
.
PodName
,
containerName
)
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment