[Bugfix] Mapping physical device indices for e2e test utils (#8290)

40c39653 · shangmingc · GitHub · 5ec9c0fb · 40c39653
Unverified Commit 40c39653 authored Sep 13, 2024 by shangmingc Committed by GitHub Sep 13, 2024
Show whitespace changes
Inline Side-by-side

Showing with 11 additions and 0 deletions

tests/utils.py tests/utils.py +11 -0

No files found.
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -356,12 +356,23 @@ def error_on_warning():
        yield
def get_physical_device_indices(devices: List[int]) -> List[int]:
    """Translate logical device indices into physical device indices.

    When ``CUDA_VISIBLE_DEVICES`` is set, logical index ``i`` is mapped to
    the integer found at position ``i`` of that comma-separated list.

    Args:
        devices: Logical device indices to translate.

    Returns:
        ``devices`` unchanged when ``CUDA_VISIBLE_DEVICES`` is unset.
        Otherwise a new list holding the physical index of every logical
        index that has an entry in ``CUDA_VISIBLE_DEVICES``; logical indices
        without an entry are dropped.

    Raises:
        ValueError: If a non-blank entry of ``CUDA_VISIBLE_DEVICES`` is not
            an integer (for example a GPU UUID).
    """
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible_devices is None:
        return devices

    visible_indices: List[int] = []
    for entry in visible_devices.split(","):
        if not entry.strip():
            # A blank entry (empty variable, trailing or doubled comma) must
            # not reach int(), which would raise. Stop here: per the CUDA
            # documentation only devices listed before an invalid entry are
            # visible.
            break
        visible_indices.append(int(entry))

    index_mapping = dict(enumerate(visible_indices))
    return [index_mapping[i] for i in devices if i in index_mapping]
 @_nvml()
 def wait_for_gpu_memory_to_clear(devices: List[int],
                                 threshold_bytes: int,
                                 timeout_s: float = 120) -> None:
    # Use nvml instead of pytorch to reduce measurement error from torch cuda
    # context.
+    devices = get_physical_device_indices(devices)
    start_time = time.time()
    while True:
        output: Dict[int, str] = {}