Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
71be641d
Unverified
Commit
71be641d
authored Feb 23, 2026 by Schwinn Saereesitthipitak
Committed by GitHub, Feb 23, 2026
Browse files
fix(chrek): fix multi-GPU UUID mapping and CUDA PID discovery (#6492)
parent
045eedeb
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing 1 changed file with 52 additions and 8 deletions
+52
-8
deploy/chrek/pkg/cuda/cuda.go
deploy/chrek/pkg/cuda/cuda.go
+52
-8
No files found.
deploy/chrek/pkg/cuda/cuda.go
View file @
71be641d
...
...
@@ -21,6 +21,8 @@ const (
)
// GetPodGPUUUIDs resolves GPU UUIDs for a pod/container from the kubelet PodResources API.
// All nvidia.com/gpu device entries are accumulated in case the kubelet splits them
// across multiple entries (observed in some runtimes with multi-GPU pods).
func
GetPodGPUUUIDs
(
ctx
context
.
Context
,
podName
,
podNamespace
,
containerName
string
)
([]
string
,
error
)
{
if
podName
==
""
||
podNamespace
==
""
{
return
nil
,
nil
...
...
@@ -43,6 +45,7 @@ func GetPodGPUUUIDs(ctx context.Context, podName, podNamespace, containerName st
return
nil
,
err
}
var
uuids
[]
string
for
_
,
pod
:=
range
resp
.
GetPodResources
()
{
if
pod
.
GetName
()
!=
podName
||
pod
.
GetNamespace
()
!=
podNamespace
{
continue
...
...
@@ -53,36 +56,46 @@ func GetPodGPUUUIDs(ctx context.Context, podName, podNamespace, containerName st
}
for
_
,
device
:=
range
container
.
GetDevices
()
{
if
device
.
GetResourceName
()
==
nvidiaGPUResource
{
return
device
.
GetDeviceIds
()
,
nil
uuids
=
append
(
uuids
,
device
.
GetDeviceIds
()
...
)
}
}
}
}
return
nil
,
nil
return
uuids
,
nil
}
// FilterProcesses returns the subset of candidate PIDs that report CUDA state.
// FilterProcesses returns the subset of candidate PIDs that hold actual CUDA contexts.
// Uses --get-restore-tid (the same technique as the CRIU CUDA plugin) instead of
// --get-state, because --get-state incorrectly matches coordinator processes like
// cuda-checkpoint --launch-job that share a /proc namespace with CUDA processes but
// don't hold CUDA contexts themselves.
func
FilterProcesses
(
ctx
context
.
Context
,
allPIDs
[]
int
,
log
logr
.
Logger
)
[]
int
{
cudaPIDs
:=
make
([]
int
,
0
,
len
(
allPIDs
))
for
_
,
pid
:=
range
allPIDs
{
if
pid
<=
0
{
continue
}
cmd
:=
exec
.
CommandContext
(
ctx
,
cudaCheckpointBinary
,
"--get-state"
,
"--pid"
,
strconv
.
Itoa
(
pid
))
if
err
:=
cmd
.
Run
();
err
!=
nil
{
cmd
:=
exec
.
CommandContext
(
ctx
,
cudaCheckpointBinary
,
"--get-restore-tid"
,
"--pid"
,
strconv
.
Itoa
(
pid
))
output
,
err
:=
cmd
.
CombinedOutput
()
if
err
!=
nil
{
if
ctx
.
Err
()
!=
nil
{
break
}
log
.
V
(
1
)
.
Info
(
"CUDA state probe failed"
,
"pid"
,
pid
,
"error"
,
err
)
log
.
V
(
1
)
.
Info
(
"CUDA restore-tid probe negative"
,
"pid"
,
pid
)
continue
}
tid
:=
strings
.
TrimSpace
(
string
(
output
))
log
.
V
(
1
)
.
Info
(
"CUDA restore-tid probe positive"
,
"pid"
,
pid
,
"tid"
,
tid
)
cudaPIDs
=
append
(
cudaPIDs
,
pid
)
}
return
cudaPIDs
}
// BuildDeviceMap creates a cuda-checkpoint --device-map value from source and target GPU UUID lists.
// When a source UUID exists in the target set, it maps to itself (identity mapping) to avoid
// unnecessary cross-GPU restore on same-node restores where kubelet returns GPUs in different order.
// Remaining unmatched source UUIDs are paired with remaining unmatched target UUIDs positionally.
func
BuildDeviceMap
(
sourceUUIDs
,
targetUUIDs
[]
string
)
(
string
,
error
)
{
if
len
(
sourceUUIDs
)
!=
len
(
targetUUIDs
)
{
return
""
,
fmt
.
Errorf
(
"GPU count mismatch: source has %d, target has %d"
,
len
(
sourceUUIDs
),
len
(
targetUUIDs
))
...
...
@@ -90,9 +103,40 @@ func BuildDeviceMap(sourceUUIDs, targetUUIDs []string) (string, error) {
if
len
(
sourceUUIDs
)
==
0
{
return
""
,
fmt
.
Errorf
(
"GPU UUID list is empty"
)
}
targetSet
:=
make
(
map
[
string
]
bool
,
len
(
targetUUIDs
))
for
_
,
t
:=
range
targetUUIDs
{
targetSet
[
t
]
=
true
}
// First pass: identity-map any source UUID that exists in the target set
mapping
:=
make
(
map
[
string
]
string
,
len
(
sourceUUIDs
))
usedTargets
:=
make
(
map
[
string
]
bool
,
len
(
targetUUIDs
))
for
_
,
src
:=
range
sourceUUIDs
{
if
targetSet
[
src
]
{
mapping
[
src
]
=
src
usedTargets
[
src
]
=
true
}
}
// Second pass: pair remaining source UUIDs with remaining target UUIDs positionally
var
remainingTargets
[]
string
for
_
,
t
:=
range
targetUUIDs
{
if
!
usedTargets
[
t
]
{
remainingTargets
=
append
(
remainingTargets
,
t
)
}
}
idx
:=
0
for
_
,
src
:=
range
sourceUUIDs
{
if
_
,
ok
:=
mapping
[
src
];
!
ok
{
mapping
[
src
]
=
remainingTargets
[
idx
]
idx
++
}
}
pairs
:=
make
([]
string
,
len
(
sourceUUIDs
))
for
i
:=
range
sourceUUIDs
{
pairs
[
i
]
=
sourceUUIDs
[
i
]
+
"="
+
targetUUIDs
[
i
]
for
i
,
src
:=
range
sourceUUIDs
{
pairs
[
i
]
=
src
+
"="
+
mapping
[
src
]
}
return
strings
.
Join
(
pairs
,
","
),
nil
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment.