Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
c0992522
Unverified
Commit
c0992522
authored
Dec 03, 2025
by
Tzu-Ling Kan
Committed by
GitHub
Dec 03, 2025
Browse files
feat: Add GPU discovery utilities (#4695)
Signed-off-by:
tzulingk@nvidia.com
<
tzulingk@nvidia.com
>
parent
5b24b429
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
298 additions
and
0 deletions
+298
-0
tests/fault_tolerance/hardware/fault_injection_service/helpers/__init__.py
...ance/hardware/fault_injection_service/helpers/__init__.py
+15
-0
tests/fault_tolerance/hardware/fault_injection_service/helpers/gpu_discovery.py
...hardware/fault_injection_service/helpers/gpu_discovery.py
+283
-0
No files found.
tests/fault_tolerance/hardware/fault_injection_service/helpers/__init__.py
View file @
c0992522
...
@@ -9,11 +9,26 @@ This package provides reusable utilities for testing fault tolerance scenarios.
...
@@ -9,11 +9,26 @@ This package provides reusable utilities for testing fault tolerance scenarios.
"""
"""
__all__
=
[
__all__
=
[
# GPU discovery utilities
"get_available_gpu_ids"
,
"get_gpu_id_for_process"
,
"get_gpu_pci_address"
,
"get_gpu_info"
,
"get_processes_on_gpu"
,
# Inference testing utilities
"InferenceLoadTester"
,
"InferenceLoadTester"
,
"get_inference_endpoint"
,
"get_inference_endpoint"
,
# Kubernetes operations utilities
"NodeOperations"
,
"NodeOperations"
,
"PodOperations"
,
"PodOperations"
,
]
]
from
.gpu_discovery
import
(
get_available_gpu_ids
,
get_gpu_id_for_process
,
get_gpu_info
,
get_gpu_pci_address
,
get_processes_on_gpu
,
)
from
.inference_testing
import
InferenceLoadTester
,
get_inference_endpoint
from
.inference_testing
import
InferenceLoadTester
,
get_inference_endpoint
from
.k8s_operations
import
NodeOperations
,
PodOperations
from
.k8s_operations
import
NodeOperations
,
PodOperations
tests/fault_tolerance/hardware/fault_injection_service/helpers/gpu_discovery.py
0 → 100644
View file @
c0992522
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
"""
GPU discovery utilities for fault tolerance testing.
Provides functions to discover GPU information from Kubernetes pods,
including mapping processes to GPUs and handling CUDA_VISIBLE_DEVICES remapping.
"""
import
logging
from
typing
import
List
,
Optional
from
kr8s.objects
import
Pod
logger
=
logging
.
getLogger
(
__name__
)
def get_available_gpu_ids(pod: Pod) -> List[int]:
    """
    List the GPU indices that nvidia-smi reports inside the pod.

    Runs ``nvidia-smi --query-gpu=index --format=csv,noheader`` through
    ``pod.exec`` and keeps every output line that is purely numeric, so the
    result mirrors whatever indices are printed (gaps such as [0, 1, 3, 7]
    are preserved as-is).

    Args:
        pod: Kubernetes pod object (kr8s pod with exec() method)

    Returns:
        GPU indices in the order they were printed. An empty list is returned
        when no numeric line is found or when any step raises.

    Example:
        >>> gpu_ids = get_available_gpu_ids(pod)
        >>> print(gpu_ids)
        [0, 1, 2, 3]
    """
    query = ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"]
    try:
        raw_text = pod.exec(query).stdout.decode()

        # Keep only lines that are plain digits; anything else is ignored.
        stripped_lines = (row.strip() for row in raw_text.splitlines())
        found = [int(row) for row in stripped_lines if row.isdigit()]

        if found:
            logger.debug(f"Available GPU IDs in pod {pod.name}: {found}")
            return found

        logger.warning(f"No GPUs found in pod {pod.name}")
        return []
    except Exception as e:
        # Best-effort helper: failures are logged and reported as "no GPUs".
        logger.error(f"Failed to get GPU IDs from pod {pod.name}: {e}")
        return []
def get_gpu_id_for_process(pod: Pod, process_pid: int) -> int:
    """
    Find which GPU a process is using.

    Walks the GPU indices returned by ``get_available_gpu_ids`` and, for each
    one, asks ``nvidia-smi -i <gpu> --query-compute-apps=pid`` for the compute
    PIDs on that GPU. The first GPU (in that iteration order) whose listing
    contains ``process_pid`` is returned, so a process that appears on several
    GPUs resolves to the earliest one.

    Args:
        pod: Kubernetes pod object (kr8s pod with exec() method)
        process_pid: Process ID to find GPU for

    Returns:
        The GPU index on which the PID was listed. Fallbacks:
        - the first available GPU index if the PID is not listed on any GPU
          (a warning is logged);
        - 0 if no GPUs are reported or if any step raises.

    Example:
        >>> gpu_id = get_gpu_id_for_process(pod, 603)
        >>> print(gpu_id)
        1  # Process 603 is running on GPU 1
    """
    try:
        # Use the indices actually reported rather than assuming 0..N-1.
        candidates = get_available_gpu_ids(pod)
        if not candidates:
            logger.error(f"No GPUs found in pod {pod.name}!")
            return 0

        logger.debug(
            f"Searching for PID {process_pid} across {len(candidates)} GPUs: {candidates}"
        )

        wanted = str(process_pid)
        for candidate in candidates:
            query = [
                "nvidia-smi",
                "-i",
                str(candidate),
                "--query-compute-apps=pid",
                "--format=csv,noheader",
            ]
            listing = pod.exec(query).stdout.decode().strip()

            # One PID per line; entries may carry surrounding whitespace and
            # the listing may be empty, single-line or multi-line.
            if any(entry.strip() == wanted for entry in listing.split("\n")):
                logger.info(
                    f"PID {process_pid} found on GPU {candidate} in pod {pod.name}"
                )
                return candidate

        # PID was not listed anywhere: fall back to the first reported GPU.
        logger.warning(
            f"PID {process_pid} not found on any GPU in pod {pod.name}. "
            f"This may happen if the process hasn't initialized CUDA yet or "
            f"if nvidia-smi doesn't track multi-process CUDA apps. "
            f"Defaulting to first GPU: {candidates[0]}"
        )
        return candidates[0]
    except Exception as e:
        logger.error(
            f"GPU discovery failed for PID {process_pid} in pod {pod.name}: {e}"
        )
        return 0
def get_gpu_pci_address(pod: Pod, gpu_id: int) -> Optional[str]:
    """
    Look up the PCI bus address of one GPU.

    Executes ``nvidia-smi -i <gpu_id> --query-gpu=pci.bus_id`` in the pod and
    returns the whitespace-stripped output. The PCI address is used in kernel
    XID messages and identifies the physical hardware location of the GPU.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N) as shown by nvidia-smi

    Returns:
        PCI address (e.g., "00000000:8D:00.0"), or None when the command
        prints nothing or any step raises.

    Example:
        >>> pci_addr = get_gpu_pci_address(pod, 1)
        >>> print(pci_addr)
        00000000:91:00.0
    """
    query = [
        "nvidia-smi",
        "-i",
        str(gpu_id),
        "--query-gpu=pci.bus_id",
        "--format=csv,noheader",
    ]
    try:
        address = pod.exec(query).stdout.decode().strip()

        if address:
            logger.debug(f"GPU {gpu_id} in pod {pod.name} has PCI address: {address}")
            return address

        logger.error(f"Empty PCI address for GPU {gpu_id}")
        return None
    except Exception as e:
        # Failures are logged and surfaced to the caller as None.
        logger.error(
            f"Failed to get PCI address for GPU {gpu_id} in pod {pod.name}: {e}"
        )
        return None
def get_gpu_info(pod: Pod, gpu_id: int) -> Optional[dict]:
    """
    Get comprehensive information about a GPU.

    Executes
    ``nvidia-smi -i <gpu_id> --query-gpu=index,name,pci.bus_id,memory.total,driver_version``
    in the pod and parses the single comma-separated line it prints.

    The line is split on bare commas, so a comma inside the product name
    would otherwise shift every later field. To stay correct in that case the
    index is taken from the first field, the PCI bus id / memory / driver
    version from the last three fields, and whatever lies in between is
    rejoined as the name. For the usual five-field output the result is
    identical to a plain positional split.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N)

    Returns:
        Dict with keys: index, name, pci_bus_id, memory_total, driver_version
        or None if the output has fewer than five fields or any step raises
        (including a non-numeric index field).

    Example:
        >>> info = get_gpu_info(pod, 0)
        >>> print(info)
        {
            'index': 0,
            'name': 'NVIDIA H200',
            'pci_bus_id': '00000000:8D:00.0',
            'memory_total': '143771 MiB',
            'driver_version': '550.163.01'
        }
    """
    try:
        result = pod.exec(
            [
                "nvidia-smi",
                "-i",
                str(gpu_id),
                "--query-gpu=index,name,pci.bus_id,memory.total,driver_version",
                "--format=csv,noheader",
            ]
        )

        output = result.stdout.decode().strip()
        parts = [p.strip() for p in output.split(",")]

        if len(parts) < 5:
            logger.error(f"Unexpected nvidia-smi output format: {output}")
            return None

        # Anchor the fixed fields at both ends; the middle (one or more
        # pieces) is the product name, rejoined with the same ", " separator.
        pci_bus_id, memory_total, driver_version = parts[-3:]
        name = ", ".join(parts[1:-3])

        return {
            "index": int(parts[0]),
            "name": name,
            "pci_bus_id": pci_bus_id,
            "memory_total": memory_total,
            "driver_version": driver_version,
        }

    except Exception as e:
        # Best-effort helper: log and report failure as None.
        logger.error(f"Failed to get GPU info for GPU {gpu_id}: {e}")
        return None
def get_processes_on_gpu(pod: Pod, gpu_id: int) -> List[int]:
    """
    List the compute process IDs that nvidia-smi reports for one GPU.

    Executes ``nvidia-smi -i <gpu_id> --query-compute-apps=pid`` in the pod
    and converts every purely numeric output line to an int; other lines are
    ignored.

    Args:
        pod: Kubernetes pod object
        gpu_id: GPU index (0-N)

    Returns:
        List of PIDs running on this GPU, in output order. An empty list is
        returned when the command prints nothing or when any step raises.

    Example:
        >>> pids = get_processes_on_gpu(pod, 1)
        >>> print(pids)
        [602, 603]
    """
    query = [
        "nvidia-smi",
        "-i",
        str(gpu_id),
        "--query-compute-apps=pid",
        "--format=csv,noheader",
    ]
    try:
        listing = pod.exec(query).stdout.decode().strip()

        if listing == "":
            logger.debug(f"No processes found on GPU {gpu_id} in pod {pod.name}")
            return []

        # One PID per line; several processes may share the same GPU.
        entries = (row.strip() for row in listing.split("\n"))
        found = [int(row) for row in entries if row.isdigit()]

        logger.debug(f"GPU {gpu_id} in pod {pod.name} has processes: {found}")
        return found
    except Exception as e:
        # Failures are logged and reported as "no processes".
        logger.error(f"Failed to get processes for GPU {gpu_id}: {e}")
        return []
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment