[misc][distributed] error on invalid state (#6092)

f6662071 · youkaichao · GitHub · d830656a · f6662071 · f6662071
Unverified Commit f6662071 authored Jul 02, 2024 by youkaichao Committed by GitHub Jul 02, 2024
Showing 3 changed files with 29 additions and 1 deletion

vllm/executor/multiproc_gpu_executor.py vllm/executor/multiproc_gpu_executor.py +3 -0

vllm/executor/ray_gpu_executor.py vllm/executor/ray_gpu_executor.py +4 -1

vllm/utils.py vllm/utils.py +22 -0

No files found.
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -10,6 +10,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (cuda_device_count_stateless,
+                        error_on_invalid_device_count_status,
                        get_distributed_init_method, get_open_port,
                        get_vllm_instance_id, make_async,
                        update_environment_variables)
@@ -39,6 +40,8 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
        assert world_size <= cuda_device_count_stateless(), (
            "please set tensor_parallel_size to less than max local gpu count")
+        error_on_invalid_device_count_status()
        # Multiprocessing-based executor does not support multi-node setting.
        # Since it only works for single node, we can use the loopback address
        # 127.0.0.1 for communication.

--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -11,7 +11,8 @@ from vllm.executor.distributed_gpu_executor import (  # yapf: disable
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+from vllm.utils import (error_on_invalid_device_count_status,
+                        get_distributed_init_method, get_ip, get_open_port,
                        get_vllm_instance_id, make_async)
 if ray is not None:
@@ -175,6 +176,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
        distributed_init_method = get_distributed_init_method(
            driver_ip, get_open_port())
+        error_on_invalid_device_count_status()
        # Initialize the actual workers inside worker wrapper.
        init_worker_all_kwargs = [
            self._get_worker_kwargs(

--- a/vllm/utils.py
+++ b/vllm/utils.py
 import argparse
 import asyncio
+import contextlib
 import datetime
 import enum
 import gc
@@ -816,6 +817,27 @@ def cuda_device_count_stateless() -> int:
    return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
+def error_on_invalid_device_count_status():
+    """Raise if the remembered CUDA device count exceeds the current one.
+
+    Compares the value returned by ``torch.cuda.device_count()`` (which is
+    expected to come from its cache once the function has been called) with
+    the value returned by ``cuda_device_count_stateless()``. The comparison
+    is skipped when the cache of ``torch.cuda.device_count`` is empty, or
+    when inspecting that cache raises any exception.
+
+    Raises:
+        RuntimeError: If the remembered device count is greater than the
+            count reported by ``cuda_device_count_stateless()``.
+    """
+    # Default to "nothing cached" so the check below is skipped whenever
+    # the cache cannot be inspected.
+    cache_entries = 0
+    with contextlib.suppress(Exception):
+        # A future PyTorch release is expected to stop caching
+        # device_count; at that point `.cache_info().currsize` will raise,
+        # which is deliberately suppressed here.
+        cache_entries = torch.cuda.device_count.cache_info().currsize
+    if cache_entries != 0:
+        # device_count has already been called and its result is cached.
+        remembered = torch.cuda.device_count()
+        current = cuda_device_count_stateless()
+        # Only a remembered count larger than the current one is treated
+        # as an error; an equal or smaller remembered count passes silently.
+        if remembered > current:
+            raise RuntimeError(
+                "The number of CUDA devices has changed since the first "
+                "call to torch.cuda.device_count(). This is not allowed "
+                "and may result in undefined behavior. Please check out "
+                "https://github.com/vllm-project/vllm/issues/6056 to "
+                "find the first call to torch.cuda.device_count() "
+                "and defer it until the engine is up. Or you can set "
+                "CUDA_VISIBLE_DEVICES to the GPUs you want to use.")
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
 # all the related functions work on real physical device ids.